Coverage Report

Created: 2025-12-31 07:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/compressed_dc.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/compressed_dc.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cstdint>
12
#include <cstdlib>
13
#include <cstring>
14
#include <vector>
15
16
#include "lib/jxl/ac_context.h"
17
#include "lib/jxl/frame_header.h"
18
#include "lib/jxl/modular/modular_image.h"
19
20
#undef HWY_TARGET_INCLUDE
21
#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
22
#include <hwy/foreach_target.h>
23
#include <hwy/highway.h>
24
25
#include "lib/jxl/base/compiler_specific.h"
26
#include "lib/jxl/base/data_parallel.h"
27
#include "lib/jxl/base/rect.h"
28
#include "lib/jxl/base/status.h"
29
#include "lib/jxl/image.h"
30
HWY_BEFORE_NAMESPACE();
31
namespace jxl {
32
namespace HWY_NAMESPACE {
33
34
using D = HWY_FULL(float);
35
using DScalar = HWY_CAPPED(float, 1);
36
37
// These templates are not found via ADL.
38
using hwy::HWY_NAMESPACE::Abs;
39
using hwy::HWY_NAMESPACE::Add;
40
using hwy::HWY_NAMESPACE::Div;
41
using hwy::HWY_NAMESPACE::Max;
42
using hwy::HWY_NAMESPACE::Mul;
43
using hwy::HWY_NAMESPACE::MulAdd;
44
using hwy::HWY_NAMESPACE::Rebind;
45
using hwy::HWY_NAMESPACE::Sub;
46
using hwy::HWY_NAMESPACE::Vec;
47
using hwy::HWY_NAMESPACE::ZeroIfNegative;
48
49
// TODO(veluca): optimize constants.
50
const float w1 = 0.20345139757231578f;
51
const float w2 = 0.0334829185968739f;
52
const float w0 = 1.0f - 4.0f * (w1 + w2);
53
54
template <class V>
55
8.45M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
8.45M
  return Max(a, b);
61
8.45M
#endif
62
8.45M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::MaxWorkaround<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::MaxWorkaround<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::MaxWorkaround<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
55
6.39M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
6.39M
  return Max(a, b);
61
6.39M
#endif
62
6.39M
}
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::MaxWorkaround<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
55
2.05M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
2.05M
  return Max(a, b);
61
2.05M
#endif
62
2.05M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::MaxWorkaround<hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::MaxWorkaround<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::MaxWorkaround<hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Vec128<float, 1ul>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::MaxWorkaround<hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::MaxWorkaround<hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Vec128<float, 1ul>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::MaxWorkaround<hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::MaxWorkaround<hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Vec128<float, 1ul>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::MaxWorkaround<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
63
64
template <typename D>
65
JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
66
                                    const float* JXL_RESTRICT row_top,
67
                                    const float* JXL_RESTRICT row,
68
                                    const float* JXL_RESTRICT row_bottom,
69
                                    Vec<D>* JXL_RESTRICT mc,
70
                                    Vec<D>* JXL_RESTRICT sm,
71
8.45M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
8.45M
  const auto tl = LoadU(d, row_top + x - 1);
73
8.45M
  const auto tc = Load(d, row_top + x);
74
8.45M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
8.45M
  const auto ml = LoadU(d, row + x - 1);
77
8.45M
  *mc = Load(d, row + x);
78
8.45M
  const auto mr = LoadU(d, row + x + 1);
79
80
8.45M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
8.45M
  const auto bc = Load(d, row_bottom + x);
82
8.45M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
8.45M
  const auto w_center = Set(d, w0);
85
8.45M
  const auto w_side = Set(d, w1);
86
8.45M
  const auto w_corner = Set(d, w2);
87
88
8.45M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
8.45M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
8.45M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
8.45M
  const auto dc_quant = Set(d, dc_factor);
93
8.45M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
8.45M
}
Unexecuted instantiation: void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 1ul, 0> >(hwy::N_SSE4::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 4ul, 0> >(hwy::N_SSE4::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, unsigned long)
void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 1ul, 0> >(hwy::N_AVX2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, unsigned long)
Line
Count
Source
71
6.39M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
6.39M
  const auto tl = LoadU(d, row_top + x - 1);
73
6.39M
  const auto tc = Load(d, row_top + x);
74
6.39M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
6.39M
  const auto ml = LoadU(d, row + x - 1);
77
6.39M
  *mc = Load(d, row + x);
78
6.39M
  const auto mr = LoadU(d, row + x + 1);
79
80
6.39M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
6.39M
  const auto bc = Load(d, row_bottom + x);
82
6.39M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
6.39M
  const auto w_center = Set(d, w0);
85
6.39M
  const auto w_side = Set(d, w1);
86
6.39M
  const auto w_corner = Set(d, w2);
87
88
6.39M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
6.39M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
6.39M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
6.39M
  const auto dc_quant = Set(d, dc_factor);
93
6.39M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
6.39M
}
void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 8ul, 0> >(hwy::N_AVX2::Simd<float, 8ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, unsigned long)
Line
Count
Source
71
2.05M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
2.05M
  const auto tl = LoadU(d, row_top + x - 1);
73
2.05M
  const auto tc = Load(d, row_top + x);
74
2.05M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
2.05M
  const auto ml = LoadU(d, row + x - 1);
77
2.05M
  *mc = Load(d, row + x);
78
2.05M
  const auto mr = LoadU(d, row + x + 1);
79
80
2.05M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
2.05M
  const auto bc = Load(d, row_bottom + x);
82
2.05M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
2.05M
  const auto w_center = Set(d, w0);
85
2.05M
  const auto w_side = Set(d, w1);
86
2.05M
  const auto w_corner = Set(d, w2);
87
88
2.05M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
2.05M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
2.05M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
2.05M
  const auto dc_quant = Set(d, dc_factor);
93
2.05M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
2.05M
}
Unexecuted instantiation: void jxl::N_AVX3::ComputePixelChannel<hwy::N_AVX3::Simd<float, 1ul, 0> >(hwy::N_AVX3::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3::Simd<float, 1ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3::ComputePixelChannel<hwy::N_AVX3::Simd<float, 16ul, 0> >(hwy::N_AVX3::Simd<float, 16ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::ComputePixelChannel<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::ComputePixelChannel<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_SPR::ComputePixelChannel<hwy::N_AVX3_SPR::Simd<float, 1ul, 0> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 1ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_SPR::ComputePixelChannel<hwy::N_AVX3_SPR::Simd<float, 16ul, 0> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>)()))*, decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 1ul, 0> >(hwy::N_SSE2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 4ul, 0> >(hwy::N_SSE2::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, unsigned long)
95
96
template <typename D>
97
JXL_INLINE void ComputePixel(
98
    const float* JXL_RESTRICT dc_factors,
99
    const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
100
    const float* JXL_RESTRICT* JXL_RESTRICT rows,
101
    const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
102
2.81M
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
2.81M
  const D d;
104
2.81M
  auto mc_x = Undefined(d);
105
2.81M
  auto mc_y = Undefined(d);
106
2.81M
  auto mc_b = Undefined(d);
107
2.81M
  auto sm_x = Undefined(d);
108
2.81M
  auto sm_y = Undefined(d);
109
2.81M
  auto sm_b = Undefined(d);
110
2.81M
  auto gap = Set(d, 0.5f);
111
2.81M
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
2.81M
                      &mc_x, &sm_x, &gap, x);
113
2.81M
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
2.81M
                      &mc_y, &sm_y, &gap, x);
115
2.81M
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
2.81M
                      &mc_b, &sm_b, &gap, x);
117
2.81M
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
2.81M
  factor = ZeroIfNegative(factor);
119
120
2.81M
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
2.81M
  Store(out, d, out_rows[0] + x);
122
2.81M
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
2.81M
  Store(out, d, out_rows[1] + x);
124
2.81M
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
2.81M
  Store(out, d, out_rows[2] + x);
126
2.81M
}
Unexecuted instantiation: void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
2.13M
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
2.13M
  const D d;
104
2.13M
  auto mc_x = Undefined(d);
105
2.13M
  auto mc_y = Undefined(d);
106
2.13M
  auto mc_b = Undefined(d);
107
2.13M
  auto sm_x = Undefined(d);
108
2.13M
  auto sm_y = Undefined(d);
109
2.13M
  auto sm_b = Undefined(d);
110
2.13M
  auto gap = Set(d, 0.5f);
111
2.13M
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
2.13M
                      &mc_x, &sm_x, &gap, x);
113
2.13M
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
2.13M
                      &mc_y, &sm_y, &gap, x);
115
2.13M
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
2.13M
                      &mc_b, &sm_b, &gap, x);
117
2.13M
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
2.13M
  factor = ZeroIfNegative(factor);
119
120
2.13M
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
2.13M
  Store(out, d, out_rows[0] + x);
122
2.13M
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
2.13M
  Store(out, d, out_rows[1] + x);
124
2.13M
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
2.13M
  Store(out, d, out_rows[2] + x);
126
2.13M
}
void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 8ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
685k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
685k
  const D d;
104
685k
  auto mc_x = Undefined(d);
105
685k
  auto mc_y = Undefined(d);
106
685k
  auto mc_b = Undefined(d);
107
685k
  auto sm_x = Undefined(d);
108
685k
  auto sm_y = Undefined(d);
109
685k
  auto sm_b = Undefined(d);
110
685k
  auto gap = Set(d, 0.5f);
111
685k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
685k
                      &mc_x, &sm_x, &gap, x);
113
685k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
685k
                      &mc_y, &sm_y, &gap, x);
115
685k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
685k
                      &mc_b, &sm_b, &gap, x);
117
685k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
685k
  factor = ZeroIfNegative(factor);
119
120
685k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
685k
  Store(out, d, out_rows[0] + x);
122
685k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
685k
  Store(out, d, out_rows[1] + x);
124
685k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
685k
  Store(out, d, out_rows[2] + x);
126
685k
}
Unexecuted instantiation: void jxl::N_AVX3::ComputePixel<hwy::N_AVX3::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3::ComputePixel<hwy::N_AVX3::Simd<float, 16ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::ComputePixel<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_ZEN4::ComputePixel<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_SPR::ComputePixel<hwy::N_AVX3_SPR::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_AVX3_SPR::ComputePixel<hwy::N_AVX3_SPR::Simd<float, 16ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Unexecuted instantiation: void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
127
128
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
129
                           const float* dc_factors, Image3F* dc,
130
9.26k
                           ThreadPool* pool) {
131
9.26k
  const size_t xsize = dc->xsize();
132
9.26k
  const size_t ysize = dc->ysize();
133
9.26k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
6.49k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
12.9k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
12.9k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
25.9k
  for (size_t c = 0; c < 3; c++) {
144
38.9k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
38.9k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
38.9k
             xsize * sizeof(float));
147
38.9k
    }
148
19.4k
  }
149
203k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
203k
    const float* JXL_RESTRICT rows_top[3]{
151
203k
        dc->ConstPlaneRow(0, y - 1),
152
203k
        dc->ConstPlaneRow(1, y - 1),
153
203k
        dc->ConstPlaneRow(2, y - 1),
154
203k
    };
155
203k
    const float* JXL_RESTRICT rows[3] = {
156
203k
        dc->ConstPlaneRow(0, y),
157
203k
        dc->ConstPlaneRow(1, y),
158
203k
        dc->ConstPlaneRow(2, y),
159
203k
    };
160
203k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
203k
        dc->ConstPlaneRow(0, y + 1),
162
203k
        dc->ConstPlaneRow(1, y + 1),
163
203k
        dc->ConstPlaneRow(2, y + 1),
164
203k
    };
165
203k
    float* JXL_RESTRICT rows_out[3] = {
166
203k
        smoothed.PlaneRow(0, y),
167
203k
        smoothed.PlaneRow(1, y),
168
203k
        smoothed.PlaneRow(2, y),
169
203k
    };
170
406k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
1.62M
      for (size_t c = 0; c < 3; c++) {
172
1.21M
        rows_out[c][x] = rows[c][x];
173
1.21M
      }
174
406k
    }
175
176
203k
    size_t x = 1;
177
    // First pixels
178
203k
    const size_t N = Lanes(D());
179
1.60M
    for (; x < std::min(N, xsize - 1); x++) {
180
1.39M
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
1.39M
                            x);
182
1.39M
    }
183
    // Full vectors.
184
888k
    for (; x + N <= xsize - 1; x += N) {
185
685k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
685k
    }
187
    // Last pixels.
188
936k
    for (; x < xsize - 1; x++) {
189
733k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
733k
                            x);
191
733k
    }
192
203k
    return true;
193
203k
  };
Unexecuted instantiation: compressed_dc.cc:jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
compressed_dc.cc:jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
149
203k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
203k
    const float* JXL_RESTRICT rows_top[3]{
151
203k
        dc->ConstPlaneRow(0, y - 1),
152
203k
        dc->ConstPlaneRow(1, y - 1),
153
203k
        dc->ConstPlaneRow(2, y - 1),
154
203k
    };
155
203k
    const float* JXL_RESTRICT rows[3] = {
156
203k
        dc->ConstPlaneRow(0, y),
157
203k
        dc->ConstPlaneRow(1, y),
158
203k
        dc->ConstPlaneRow(2, y),
159
203k
    };
160
203k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
203k
        dc->ConstPlaneRow(0, y + 1),
162
203k
        dc->ConstPlaneRow(1, y + 1),
163
203k
        dc->ConstPlaneRow(2, y + 1),
164
203k
    };
165
203k
    float* JXL_RESTRICT rows_out[3] = {
166
203k
        smoothed.PlaneRow(0, y),
167
203k
        smoothed.PlaneRow(1, y),
168
203k
        smoothed.PlaneRow(2, y),
169
203k
    };
170
406k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
1.62M
      for (size_t c = 0; c < 3; c++) {
172
1.21M
        rows_out[c][x] = rows[c][x];
173
1.21M
      }
174
406k
    }
175
176
203k
    size_t x = 1;
177
    // First pixels
178
203k
    const size_t N = Lanes(D());
179
1.60M
    for (; x < std::min(N, xsize - 1); x++) {
180
1.39M
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
1.39M
                            x);
182
1.39M
    }
183
    // Full vectors.
184
888k
    for (; x + N <= xsize - 1; x += N) {
185
685k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
685k
    }
187
    // Last pixels.
188
936k
    for (; x < xsize - 1; x++) {
189
733k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
733k
                            x);
191
733k
    }
192
203k
    return true;
193
203k
  };
Unexecuted instantiation: compressed_dc.cc:jxl::N_AVX3::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: compressed_dc.cc:jxl::N_AVX3_ZEN4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: compressed_dc.cc:jxl::N_AVX3_SPR::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: compressed_dc.cc:jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
194
12.9k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
12.9k
                                process_row, "DCSmoothingRow"));
196
6.49k
  dc->Swap(smoothed);
197
6.49k
  return true;
198
12.9k
}
Unexecuted instantiation: jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Line
Count
Source
130
9.26k
                           ThreadPool* pool) {
131
9.26k
  const size_t xsize = dc->xsize();
132
9.26k
  const size_t ysize = dc->ysize();
133
9.26k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
6.49k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
12.9k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
12.9k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
25.9k
  for (size_t c = 0; c < 3; c++) {
144
38.9k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
38.9k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
38.9k
             xsize * sizeof(float));
147
38.9k
    }
148
19.4k
  }
149
12.9k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
12.9k
    const float* JXL_RESTRICT rows_top[3]{
151
12.9k
        dc->ConstPlaneRow(0, y - 1),
152
12.9k
        dc->ConstPlaneRow(1, y - 1),
153
12.9k
        dc->ConstPlaneRow(2, y - 1),
154
12.9k
    };
155
12.9k
    const float* JXL_RESTRICT rows[3] = {
156
12.9k
        dc->ConstPlaneRow(0, y),
157
12.9k
        dc->ConstPlaneRow(1, y),
158
12.9k
        dc->ConstPlaneRow(2, y),
159
12.9k
    };
160
12.9k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
12.9k
        dc->ConstPlaneRow(0, y + 1),
162
12.9k
        dc->ConstPlaneRow(1, y + 1),
163
12.9k
        dc->ConstPlaneRow(2, y + 1),
164
12.9k
    };
165
12.9k
    float* JXL_RESTRICT rows_out[3] = {
166
12.9k
        smoothed.PlaneRow(0, y),
167
12.9k
        smoothed.PlaneRow(1, y),
168
12.9k
        smoothed.PlaneRow(2, y),
169
12.9k
    };
170
12.9k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
12.9k
      for (size_t c = 0; c < 3; c++) {
172
12.9k
        rows_out[c][x] = rows[c][x];
173
12.9k
      }
174
12.9k
    }
175
176
12.9k
    size_t x = 1;
177
    // First pixels
178
12.9k
    const size_t N = Lanes(D());
179
12.9k
    for (; x < std::min(N, xsize - 1); x++) {
180
12.9k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
12.9k
                            x);
182
12.9k
    }
183
    // Full vectors.
184
12.9k
    for (; x + N <= xsize - 1; x += N) {
185
12.9k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
12.9k
    }
187
    // Last pixels.
188
12.9k
    for (; x < xsize - 1; x++) {
189
12.9k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
12.9k
                            x);
191
12.9k
    }
192
12.9k
    return true;
193
12.9k
  };
194
12.9k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
12.9k
                                process_row, "DCSmoothingRow"));
196
6.49k
  dc->Swap(smoothed);
197
6.49k
  return true;
198
12.9k
}
Unexecuted instantiation: jxl::N_AVX3::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_SPR::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
199
200
// DC dequantization.
201
// Converts the quantized integer DC samples in `in` to floats in `dc` and
// fills `quant_dc` with a per-sample DC context bucket.
//
//   r                  region of `dc` / `quant_dc` that is written; rows of
//                      `in` are indexed from 0, not through `r`.
//   dc                 output planes. Plane 0 is read from in.channel[1],
//                      plane 1 from in.channel[0], plane 2 from in.channel[2].
//   quant_dc           output bucket indices (uint8_t per sample).
//   dc_factors, mul    each plane c is scaled by dc_factors[c] * mul.
//   cfl_factors        [0] and [2] are multiplied by the scaled plane-1 value
//                      and added to planes 0 and 2 (4:4:4 path only).
//   chroma_subsampling selects the 4:4:4 path or the per-plane shifted path.
//   bctx               supplies num_dc_ctxs and the per-plane dc_thresholds.
//
// NOTE(review): the vector loops advance by Lanes(di) and always load/store a
// whole vector, including on the last iteration, so rows are presumably padded
// to a vector multiple and suitably aligned for Load/Store - confirm.
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
202
               const float* dc_factors, float mul, const float* cfl_factors,
203
               const YCbCrChromaSubsampling& chroma_subsampling,
204
14.9k
               const BlockCtxMap& bctx) {
205
14.9k
  const HWY_FULL(float) df;
206
14.9k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
14.9k
  if (chroma_subsampling.Is444()) {
    // 4:4:4: all three planes share the same geometry, so they are processed
    // together and the cfl correction can be applied in the same pass.
208
14.6k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
14.6k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
14.6k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
14.6k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
14.6k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
368k
    for (size_t y = 0; y < r.ysize(); y++) {
214
353k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
353k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
353k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
      // Input channels 0 and 1 are swapped relative to the output planes.
217
353k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
353k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
353k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
1.71M
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
1.36M
        const auto in_q_x = Load(di, quant_row_x + x);
222
1.36M
        const auto in_q_y = Load(di, quant_row_y + x);
223
1.36M
        const auto in_q_b = Load(di, quant_row_b + x);
224
1.36M
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
1.36M
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
1.36M
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
1.36M
        Store(in_y, df, dec_row_y + x);
        // Planes 0 and 2 get the scaled plane-1 value times their cfl factor
        // added on top of their own scaled value.
228
1.36M
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
1.36M
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
1.36M
      }
231
353k
    }
232
14.6k
  } else {
    // Subsampled: each plane has its own geometry, so each is dequantized
    // separately over `r` shifted by that plane's HShift/VShift. No cfl term
    // is applied on this path.
233
879
    for (size_t c : {1, 0, 2}) {
234
879
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
879
                r.y0() >> chroma_subsampling.VShift(c),
236
879
                r.xsize() >> chroma_subsampling.HShift(c),
237
879
                r.ysize() >> chroma_subsampling.VShift(c));
238
879
      const auto fac = Set(df, dc_factors[c] * mul);
      // c ^ 1 swaps indices 0 and 1 (same channel swap as the 4:4:4 path);
      // index 2 is used as is.
239
879
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
38.7k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
37.8k
        const int32_t* quant_row = ch.plane.Row(y);
242
37.8k
        float* row = rect.PlaneRow(dc, c, y);
243
316k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
278k
          const auto in_q = Load(di, quant_row + x);
245
278k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
278k
          Store(out, df, row + x);
247
278k
        }
248
37.8k
      }
249
879
    }
250
293
  }
251
14.9k
  if (bctx.num_dc_ctxs <= 1) {
    // At most one DC context: every sample gets bucket 0.
252
371k
    for (size_t y = 0; y < r.ysize(); y++) {
253
357k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
357k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
357k
    }
256
14.0k
  } else {
    // Several DC contexts: classify each sample of `r` by how many thresholds
    // the quantized value of each plane exceeds. Coordinates are shifted per
    // plane so that subsampled inputs are addressed at their own resolution.
257
10.2k
    for (size_t y = 0; y < r.ysize(); y++) {
258
9.27k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
259
9.27k
      const int32_t* quant_row_x =
260
9.27k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
261
9.27k
      const int32_t* quant_row_y =
262
9.27k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
263
9.27k
      const int32_t* quant_row_b =
264
9.27k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
265
948k
      for (size_t x = 0; x < r.xsize(); x++) {
266
939k
        int bucket_x = 0;
267
939k
        int bucket_y = 0;
268
939k
        int bucket_b = 0;
        // bucket_* = number of thresholds strictly below the sample value.
269
8.05M
        for (int t : bctx.dc_thresholds[0]) {
270
8.05M
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
271
8.05M
        }
272
939k
        for (int t : bctx.dc_thresholds[1]) {
273
107k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
274
107k
        }
275
939k
        for (int t : bctx.dc_thresholds[2]) {
276
219k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
277
219k
        }
        // Mixed-radix combination:
        //   (bucket_x * (n2 + 1) + bucket_b) * (n1 + 1) + bucket_y
        // where n1 / n2 are the threshold counts of planes 1 / 2.
278
939k
        int bucket = bucket_x;
279
939k
        bucket *= bctx.dc_thresholds[2].size() + 1;
280
939k
        bucket += bucket_b;
281
939k
        bucket *= bctx.dc_thresholds[1].size() + 1;
282
939k
        bucket += bucket_y;
        // Narrowed to uint8_t on store; presumably the number of buckets is
        // bounded elsewhere so this cannot truncate - confirm.
283
939k
        qdc_row_val[x] = bucket;
284
939k
      }
285
9.27k
    }
286
927
  }
287
14.9k
}
Unexecuted instantiation: jxl::N_SSE4::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
jxl::N_AVX2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Line
Count
Source
204
14.9k
               const BlockCtxMap& bctx) {
205
14.9k
  const HWY_FULL(float) df;
206
14.9k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
14.9k
  if (chroma_subsampling.Is444()) {
208
14.6k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
14.6k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
14.6k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
14.6k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
14.6k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
368k
    for (size_t y = 0; y < r.ysize(); y++) {
214
353k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
353k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
353k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
217
353k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
353k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
353k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
1.71M
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
1.36M
        const auto in_q_x = Load(di, quant_row_x + x);
222
1.36M
        const auto in_q_y = Load(di, quant_row_y + x);
223
1.36M
        const auto in_q_b = Load(di, quant_row_b + x);
224
1.36M
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
1.36M
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
1.36M
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
1.36M
        Store(in_y, df, dec_row_y + x);
228
1.36M
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
1.36M
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
1.36M
      }
231
353k
    }
232
14.6k
  } else {
233
879
    for (size_t c : {1, 0, 2}) {
234
879
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
879
                r.y0() >> chroma_subsampling.VShift(c),
236
879
                r.xsize() >> chroma_subsampling.HShift(c),
237
879
                r.ysize() >> chroma_subsampling.VShift(c));
238
879
      const auto fac = Set(df, dc_factors[c] * mul);
239
879
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
38.7k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
37.8k
        const int32_t* quant_row = ch.plane.Row(y);
242
37.8k
        float* row = rect.PlaneRow(dc, c, y);
243
316k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
278k
          const auto in_q = Load(di, quant_row + x);
245
278k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
278k
          Store(out, df, row + x);
247
278k
        }
248
37.8k
      }
249
879
    }
250
293
  }
251
14.9k
  if (bctx.num_dc_ctxs <= 1) {
252
371k
    for (size_t y = 0; y < r.ysize(); y++) {
253
357k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
357k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
357k
    }
256
14.0k
  } else {
257
10.2k
    for (size_t y = 0; y < r.ysize(); y++) {
258
9.27k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
259
9.27k
      const int32_t* quant_row_x =
260
9.27k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
261
9.27k
      const int32_t* quant_row_y =
262
9.27k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
263
9.27k
      const int32_t* quant_row_b =
264
9.27k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
265
948k
      for (size_t x = 0; x < r.xsize(); x++) {
266
939k
        int bucket_x = 0;
267
939k
        int bucket_y = 0;
268
939k
        int bucket_b = 0;
269
8.05M
        for (int t : bctx.dc_thresholds[0]) {
270
8.05M
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
271
8.05M
        }
272
939k
        for (int t : bctx.dc_thresholds[1]) {
273
107k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
274
107k
        }
275
939k
        for (int t : bctx.dc_thresholds[2]) {
276
219k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
277
219k
        }
278
939k
        int bucket = bucket_x;
279
939k
        bucket *= bctx.dc_thresholds[2].size() + 1;
280
939k
        bucket += bucket_b;
281
939k
        bucket *= bctx.dc_thresholds[1].size() + 1;
282
939k
        bucket += bucket_y;
283
939k
        qdc_row_val[x] = bucket;
284
939k
      }
285
9.27k
    }
286
927
  }
287
14.9k
}
Unexecuted instantiation: jxl::N_AVX3::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Unexecuted instantiation: jxl::N_AVX3_SPR::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Unexecuted instantiation: jxl::N_SSE2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
288
289
// NOLINTNEXTLINE(google-readability-namespace-comments)
290
}  // namespace HWY_NAMESPACE
291
}  // namespace jxl
292
HWY_AFTER_NAMESPACE();
293
294
#if HWY_ONCE
295
namespace jxl {
296
297
HWY_EXPORT(DequantDC);
298
HWY_EXPORT(AdaptiveDCSmoothing);
299
// Public entry point for adaptive DC smoothing. Forwards all arguments
// unchanged to the per-target AdaptiveDCSmoothing implementation chosen by
// HWY_DYNAMIC_DISPATCH and returns that implementation's Status.
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
300
                           const float* dc_factors, Image3F* dc,
301
9.26k
                           ThreadPool* pool) {
302
9.26k
  return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(memory_manager, dc_factors,
303
9.26k
                                                   dc, pool);
304
9.26k
}
305
306
// Public entry point for DC dequantization. Forwards all arguments unchanged
// to the per-target DequantDC implementation chosen by HWY_DYNAMIC_DISPATCH.
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
307
               const float* dc_factors, float mul, const float* cfl_factors,
308
               const YCbCrChromaSubsampling& chroma_subsampling,
309
14.9k
               const BlockCtxMap& bctx) {
310
14.9k
  HWY_DYNAMIC_DISPATCH(DequantDC)
311
14.9k
  (r, dc, quant_dc, in, dc_factors, mul, cfl_factors, chroma_subsampling, bctx);
312
14.9k
}
313
314
}  // namespace jxl
315
#endif  // HWY_ONCE