Coverage Report

Created: 2026-06-13 08:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/compressed_dc.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/compressed_dc.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cstdint>
12
#include <cstdlib>
13
#include <cstring>
14
#include <vector>
15
16
#include "lib/jxl/ac_context.h"
17
#include "lib/jxl/frame_header.h"
18
#include "lib/jxl/modular/modular_image.h"
19
20
#undef HWY_TARGET_INCLUDE
21
#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
22
#include <hwy/foreach_target.h>
23
#include <hwy/highway.h>
24
25
#include "lib/jxl/base/compiler_specific.h"
26
#include "lib/jxl/base/data_parallel.h"
27
#include "lib/jxl/base/rect.h"
28
#include "lib/jxl/base/status.h"
29
#include "lib/jxl/image.h"
30
HWY_BEFORE_NAMESPACE();
31
namespace jxl {
32
namespace HWY_NAMESPACE {
33
34
using D = HWY_FULL(float);
35
using DScalar = HWY_CAPPED(float, 1);
36
37
// These templates are not found via ADL.
38
using hwy::HWY_NAMESPACE::Abs;
39
using hwy::HWY_NAMESPACE::Add;
40
using hwy::HWY_NAMESPACE::Div;
41
using hwy::HWY_NAMESPACE::Max;
42
using hwy::HWY_NAMESPACE::Mul;
43
using hwy::HWY_NAMESPACE::MulAdd;
44
using hwy::HWY_NAMESPACE::Rebind;
45
using hwy::HWY_NAMESPACE::Sub;
46
using hwy::HWY_NAMESPACE::Vec;
47
using hwy::HWY_NAMESPACE::ZeroIfNegative;
48
49
// TODO(veluca): optimize constants.
50
const float w1 = 0.20345139757231578f;
51
const float w2 = 0.0334829185968739f;
52
const float w0 = 1.0f - 4.0f * (w1 + w2);
53
54
template <class V>
55
7.25M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
7.25M
  return Max(a, b);
61
7.25M
#endif
62
7.25M
}
hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::MaxWorkaround<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Line
Count
Source
55
768k
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
768k
  return Max(a, b);
61
768k
#endif
62
768k
}
hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::MaxWorkaround<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Line
Count
Source
55
1.07M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
1.07M
  return Max(a, b);
61
1.07M
#endif
62
1.07M
}
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::MaxWorkaround<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
55
2.26M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
2.26M
  return Max(a, b);
61
2.26M
#endif
62
2.26M
}
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::MaxWorkaround<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
55
875k
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
875k
  return Max(a, b);
61
875k
#endif
62
875k
}
hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::MaxWorkaround<hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Vec128<float, 1ul>, hwy::N_SSE2::Vec128<float, 1ul>)
Line
Count
Source
55
835k
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
835k
  return Max(a, b);
61
835k
#endif
62
835k
}
hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::MaxWorkaround<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
Line
Count
Source
55
1.42M
V MaxWorkaround(V a, V b) {
56
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57
  // Prevents "Do not know how to split the result of this operator" error
58
  return IfThenElse(a > b, a, b);
59
#else
60
1.42M
  return Max(a, b);
61
1.42M
#endif
62
1.42M
}
63
64
template <typename D>
65
JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
66
                                    const float* JXL_RESTRICT row_top,
67
                                    const float* JXL_RESTRICT row,
68
                                    const float* JXL_RESTRICT row_bottom,
69
                                    Vec<D>* JXL_RESTRICT mc,
70
                                    Vec<D>* JXL_RESTRICT sm,
71
7.04M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
7.04M
  const auto tl = LoadU(d, row_top + x - 1);
73
7.04M
  const auto tc = Load(d, row_top + x);
74
7.04M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
7.04M
  const auto ml = LoadU(d, row + x - 1);
77
7.04M
  *mc = Load(d, row + x);
78
7.04M
  const auto mr = LoadU(d, row + x + 1);
79
80
7.04M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
7.04M
  const auto bc = Load(d, row_bottom + x);
82
7.04M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
7.04M
  const auto w_center = Set(d, w0);
85
7.04M
  const auto w_side = Set(d, w1);
86
7.04M
  const auto w_corner = Set(d, w2);
87
88
7.04M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
7.04M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
7.04M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
7.04M
  const auto dc_quant = Set(d, dc_factor);
93
7.04M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
7.04M
}
void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 1ul, 0> >(hwy::N_SSE4::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, unsigned long)
Line
Count
Source
71
732k
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
732k
  const auto tl = LoadU(d, row_top + x - 1);
73
732k
  const auto tc = Load(d, row_top + x);
74
732k
  const auto tr = LoadU(d, row_top + x + 1);
75
76
732k
  const auto ml = LoadU(d, row + x - 1);
77
732k
  *mc = Load(d, row + x);
78
732k
  const auto mr = LoadU(d, row + x + 1);
79
80
732k
  const auto bl = LoadU(d, row_bottom + x - 1);
81
732k
  const auto bc = Load(d, row_bottom + x);
82
732k
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
732k
  const auto w_center = Set(d, w0);
85
732k
  const auto w_side = Set(d, w1);
86
732k
  const auto w_corner = Set(d, w2);
87
88
732k
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
732k
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
732k
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
732k
  const auto dc_quant = Set(d, dc_factor);
93
732k
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
732k
}
void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 4ul, 0> >(hwy::N_SSE4::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, unsigned long)
Line
Count
Source
71
1.08M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
1.08M
  const auto tl = LoadU(d, row_top + x - 1);
73
1.08M
  const auto tc = Load(d, row_top + x);
74
1.08M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
1.08M
  const auto ml = LoadU(d, row + x - 1);
77
1.08M
  *mc = Load(d, row + x);
78
1.08M
  const auto mr = LoadU(d, row + x + 1);
79
80
1.08M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
1.08M
  const auto bc = Load(d, row_bottom + x);
82
1.08M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
1.08M
  const auto w_center = Set(d, w0);
85
1.08M
  const auto w_side = Set(d, w1);
86
1.08M
  const auto w_corner = Set(d, w2);
87
88
1.08M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
1.08M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
1.08M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
1.08M
  const auto dc_quant = Set(d, dc_factor);
93
1.08M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
1.08M
}
void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 1ul, 0> >(hwy::N_AVX2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, unsigned long)
Line
Count
Source
71
2.11M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
2.11M
  const auto tl = LoadU(d, row_top + x - 1);
73
2.11M
  const auto tc = Load(d, row_top + x);
74
2.11M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
2.11M
  const auto ml = LoadU(d, row + x - 1);
77
2.11M
  *mc = Load(d, row + x);
78
2.11M
  const auto mr = LoadU(d, row + x + 1);
79
80
2.11M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
2.11M
  const auto bc = Load(d, row_bottom + x);
82
2.11M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
2.11M
  const auto w_center = Set(d, w0);
85
2.11M
  const auto w_side = Set(d, w1);
86
2.11M
  const auto w_corner = Set(d, w2);
87
88
2.11M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
2.11M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
2.11M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
2.11M
  const auto dc_quant = Set(d, dc_factor);
93
2.11M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
2.11M
}
void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 8ul, 0> >(hwy::N_AVX2::Simd<float, 8ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, unsigned long)
Line
Count
Source
71
868k
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
868k
  const auto tl = LoadU(d, row_top + x - 1);
73
868k
  const auto tc = Load(d, row_top + x);
74
868k
  const auto tr = LoadU(d, row_top + x + 1);
75
76
868k
  const auto ml = LoadU(d, row + x - 1);
77
868k
  *mc = Load(d, row + x);
78
868k
  const auto mr = LoadU(d, row + x + 1);
79
80
868k
  const auto bl = LoadU(d, row_bottom + x - 1);
81
868k
  const auto bc = Load(d, row_bottom + x);
82
868k
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
868k
  const auto w_center = Set(d, w0);
85
868k
  const auto w_side = Set(d, w1);
86
868k
  const auto w_corner = Set(d, w2);
87
88
868k
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
868k
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
868k
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
868k
  const auto dc_quant = Set(d, dc_factor);
93
868k
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
868k
}
void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 1ul, 0> >(hwy::N_SSE2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, unsigned long)
Line
Count
Source
71
779k
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
779k
  const auto tl = LoadU(d, row_top + x - 1);
73
779k
  const auto tc = Load(d, row_top + x);
74
779k
  const auto tr = LoadU(d, row_top + x + 1);
75
76
779k
  const auto ml = LoadU(d, row + x - 1);
77
779k
  *mc = Load(d, row + x);
78
779k
  const auto mr = LoadU(d, row + x + 1);
79
80
779k
  const auto bl = LoadU(d, row_bottom + x - 1);
81
779k
  const auto bc = Load(d, row_bottom + x);
82
779k
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
779k
  const auto w_center = Set(d, w0);
85
779k
  const auto w_side = Set(d, w1);
86
779k
  const auto w_corner = Set(d, w2);
87
88
779k
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
779k
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
779k
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
779k
  const auto dc_quant = Set(d, dc_factor);
93
779k
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
779k
}
void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 4ul, 0> >(hwy::N_SSE2::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, unsigned long)
Line
Count
Source
71
1.46M
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
72
1.46M
  const auto tl = LoadU(d, row_top + x - 1);
73
1.46M
  const auto tc = Load(d, row_top + x);
74
1.46M
  const auto tr = LoadU(d, row_top + x + 1);
75
76
1.46M
  const auto ml = LoadU(d, row + x - 1);
77
1.46M
  *mc = Load(d, row + x);
78
1.46M
  const auto mr = LoadU(d, row + x + 1);
79
80
1.46M
  const auto bl = LoadU(d, row_bottom + x - 1);
81
1.46M
  const auto bc = Load(d, row_bottom + x);
82
1.46M
  const auto br = LoadU(d, row_bottom + x + 1);
83
84
1.46M
  const auto w_center = Set(d, w0);
85
1.46M
  const auto w_side = Set(d, w1);
86
1.46M
  const auto w_corner = Set(d, w2);
87
88
1.46M
  const auto corner = Add(Add(tl, tr), Add(bl, br));
89
1.46M
  const auto side = Add(Add(ml, mr), Add(tc, bc));
90
1.46M
  *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92
1.46M
  const auto dc_quant = Set(d, dc_factor);
93
1.46M
  *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94
1.46M
}
95
96
template <typename D>
97
JXL_INLINE void ComputePixel(
98
    const float* JXL_RESTRICT dc_factors,
99
    const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
100
    const float* JXL_RESTRICT* JXL_RESTRICT rows,
101
    const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
102
2.50M
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
2.50M
  const D d;
104
2.50M
  auto mc_x = Undefined(d);
105
2.50M
  auto mc_y = Undefined(d);
106
2.50M
  auto mc_b = Undefined(d);
107
2.50M
  auto sm_x = Undefined(d);
108
2.50M
  auto sm_y = Undefined(d);
109
2.50M
  auto sm_b = Undefined(d);
110
2.50M
  auto gap = Set(d, 0.5f);
111
2.50M
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
2.50M
                      &mc_x, &sm_x, &gap, x);
113
2.50M
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
2.50M
                      &mc_y, &sm_y, &gap, x);
115
2.50M
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
2.50M
                      &mc_b, &sm_b, &gap, x);
117
2.50M
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
2.50M
  factor = ZeroIfNegative(factor);
119
120
2.50M
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
2.50M
  Store(out, d, out_rows[0] + x);
122
2.50M
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
2.50M
  Store(out, d, out_rows[1] + x);
124
2.50M
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
2.50M
  Store(out, d, out_rows[2] + x);
126
2.50M
}
void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
266k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
266k
  const D d;
104
266k
  auto mc_x = Undefined(d);
105
266k
  auto mc_y = Undefined(d);
106
266k
  auto mc_b = Undefined(d);
107
266k
  auto sm_x = Undefined(d);
108
266k
  auto sm_y = Undefined(d);
109
266k
  auto sm_b = Undefined(d);
110
266k
  auto gap = Set(d, 0.5f);
111
266k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
266k
                      &mc_x, &sm_x, &gap, x);
113
266k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
266k
                      &mc_y, &sm_y, &gap, x);
115
266k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
266k
                      &mc_b, &sm_b, &gap, x);
117
266k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
266k
  factor = ZeroIfNegative(factor);
119
120
266k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
266k
  Store(out, d, out_rows[0] + x);
122
266k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
266k
  Store(out, d, out_rows[1] + x);
124
266k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
266k
  Store(out, d, out_rows[2] + x);
126
266k
}
void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
369k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
369k
  const D d;
104
369k
  auto mc_x = Undefined(d);
105
369k
  auto mc_y = Undefined(d);
106
369k
  auto mc_b = Undefined(d);
107
369k
  auto sm_x = Undefined(d);
108
369k
  auto sm_y = Undefined(d);
109
369k
  auto sm_b = Undefined(d);
110
369k
  auto gap = Set(d, 0.5f);
111
369k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
369k
                      &mc_x, &sm_x, &gap, x);
113
369k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
369k
                      &mc_y, &sm_y, &gap, x);
115
369k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
369k
                      &mc_b, &sm_b, &gap, x);
117
369k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
369k
  factor = ZeroIfNegative(factor);
119
120
369k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
369k
  Store(out, d, out_rows[0] + x);
122
369k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
369k
  Store(out, d, out_rows[1] + x);
124
369k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
369k
  Store(out, d, out_rows[2] + x);
126
369k
}
void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
796k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
796k
  const D d;
104
796k
  auto mc_x = Undefined(d);
105
796k
  auto mc_y = Undefined(d);
106
796k
  auto mc_b = Undefined(d);
107
796k
  auto sm_x = Undefined(d);
108
796k
  auto sm_y = Undefined(d);
109
796k
  auto sm_b = Undefined(d);
110
796k
  auto gap = Set(d, 0.5f);
111
796k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
796k
                      &mc_x, &sm_x, &gap, x);
113
796k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
796k
                      &mc_y, &sm_y, &gap, x);
115
796k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
796k
                      &mc_b, &sm_b, &gap, x);
117
796k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
796k
  factor = ZeroIfNegative(factor);
119
120
796k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
796k
  Store(out, d, out_rows[0] + x);
122
796k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
796k
  Store(out, d, out_rows[1] + x);
124
796k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
796k
  Store(out, d, out_rows[2] + x);
126
796k
}
void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 8ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
296k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
296k
  const D d;
104
296k
  auto mc_x = Undefined(d);
105
296k
  auto mc_y = Undefined(d);
106
296k
  auto mc_b = Undefined(d);
107
296k
  auto sm_x = Undefined(d);
108
296k
  auto sm_y = Undefined(d);
109
296k
  auto sm_b = Undefined(d);
110
296k
  auto gap = Set(d, 0.5f);
111
296k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
296k
                      &mc_x, &sm_x, &gap, x);
113
296k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
296k
                      &mc_y, &sm_y, &gap, x);
115
296k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
296k
                      &mc_b, &sm_b, &gap, x);
117
296k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
296k
  factor = ZeroIfNegative(factor);
119
120
296k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
296k
  Store(out, d, out_rows[0] + x);
122
296k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
296k
  Store(out, d, out_rows[1] + x);
124
296k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
296k
  Store(out, d, out_rows[2] + x);
126
296k
}
void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
287k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
287k
  const D d;
104
287k
  auto mc_x = Undefined(d);
105
287k
  auto mc_y = Undefined(d);
106
287k
  auto mc_b = Undefined(d);
107
287k
  auto sm_x = Undefined(d);
108
287k
  auto sm_y = Undefined(d);
109
287k
  auto sm_b = Undefined(d);
110
287k
  auto gap = Set(d, 0.5f);
111
287k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
287k
                      &mc_x, &sm_x, &gap, x);
113
287k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
287k
                      &mc_y, &sm_y, &gap, x);
115
287k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
287k
                      &mc_b, &sm_b, &gap, x);
117
287k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
287k
  factor = ZeroIfNegative(factor);
119
120
287k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
287k
  Store(out, d, out_rows[0] + x);
122
287k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
287k
  Store(out, d, out_rows[1] + x);
124
287k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
287k
  Store(out, d, out_rows[2] + x);
126
287k
}
void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long)
Line
Count
Source
102
491k
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103
491k
  const D d;
104
491k
  auto mc_x = Undefined(d);
105
491k
  auto mc_y = Undefined(d);
106
491k
  auto mc_b = Undefined(d);
107
491k
  auto sm_x = Undefined(d);
108
491k
  auto sm_y = Undefined(d);
109
491k
  auto sm_b = Undefined(d);
110
491k
  auto gap = Set(d, 0.5f);
111
491k
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112
491k
                      &mc_x, &sm_x, &gap, x);
113
491k
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114
491k
                      &mc_y, &sm_y, &gap, x);
115
491k
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116
491k
                      &mc_b, &sm_b, &gap, x);
117
491k
  auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118
491k
  factor = ZeroIfNegative(factor);
119
120
491k
  auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121
491k
  Store(out, d, out_rows[0] + x);
122
491k
  out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123
491k
  Store(out, d, out_rows[1] + x);
124
491k
  out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125
491k
  Store(out, d, out_rows[2] + x);
126
491k
}
127
128
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
129
                           const float* dc_factors, Image3F* dc,
130
17.0k
                           ThreadPool* pool) {
131
17.0k
  const size_t xsize = dc->xsize();
132
17.0k
  const size_t ysize = dc->ysize();
133
17.0k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
7.81k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
15.6k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
15.6k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
31.2k
  for (size_t c = 0; c < 3; c++) {
144
46.9k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
46.9k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
46.9k
             xsize * sizeof(float));
147
46.9k
    }
148
23.4k
  }
149
258k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
258k
    const float* JXL_RESTRICT rows_top[3]{
151
258k
        dc->ConstPlaneRow(0, y - 1),
152
258k
        dc->ConstPlaneRow(1, y - 1),
153
258k
        dc->ConstPlaneRow(2, y - 1),
154
258k
    };
155
258k
    const float* JXL_RESTRICT rows[3] = {
156
258k
        dc->ConstPlaneRow(0, y),
157
258k
        dc->ConstPlaneRow(1, y),
158
258k
        dc->ConstPlaneRow(2, y),
159
258k
    };
160
258k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
258k
        dc->ConstPlaneRow(0, y + 1),
162
258k
        dc->ConstPlaneRow(1, y + 1),
163
258k
        dc->ConstPlaneRow(2, y + 1),
164
258k
    };
165
258k
    float* JXL_RESTRICT rows_out[3] = {
166
258k
        smoothed.PlaneRow(0, y),
167
258k
        smoothed.PlaneRow(1, y),
168
258k
        smoothed.PlaneRow(2, y),
169
258k
    };
170
506k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
2.02M
      for (size_t c = 0; c < 3; c++) {
172
1.51M
        rows_out[c][x] = rows[c][x];
173
1.51M
      }
174
506k
    }
175
176
258k
    size_t x = 1;
177
    // First pixels
178
258k
    const size_t N = Lanes(D());
179
1.33M
    for (; x < std::min(N, xsize - 1); x++) {
180
1.07M
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
1.07M
                            x);
182
1.07M
    }
183
    // Full vectors.
184
1.51M
    for (; x + N <= xsize - 1; x += N) {
185
1.25M
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
1.25M
    }
187
    // Last pixels.
188
567k
    for (; x < xsize - 1; x++) {
189
309k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
309k
                            x);
191
309k
    }
192
258k
    return true;
193
258k
  };
compressed_dc.cc:jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
149
70.7k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
70.7k
    const float* JXL_RESTRICT rows_top[3]{
151
70.7k
        dc->ConstPlaneRow(0, y - 1),
152
70.7k
        dc->ConstPlaneRow(1, y - 1),
153
70.7k
        dc->ConstPlaneRow(2, y - 1),
154
70.7k
    };
155
70.7k
    const float* JXL_RESTRICT rows[3] = {
156
70.7k
        dc->ConstPlaneRow(0, y),
157
70.7k
        dc->ConstPlaneRow(1, y),
158
70.7k
        dc->ConstPlaneRow(2, y),
159
70.7k
    };
160
70.7k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
70.7k
        dc->ConstPlaneRow(0, y + 1),
162
70.7k
        dc->ConstPlaneRow(1, y + 1),
163
70.7k
        dc->ConstPlaneRow(2, y + 1),
164
70.7k
    };
165
70.7k
    float* JXL_RESTRICT rows_out[3] = {
166
70.7k
        smoothed.PlaneRow(0, y),
167
70.7k
        smoothed.PlaneRow(1, y),
168
70.7k
        smoothed.PlaneRow(2, y),
169
70.7k
    };
170
138k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
552k
      for (size_t c = 0; c < 3; c++) {
172
413k
        rows_out[c][x] = rows[c][x];
173
413k
      }
174
138k
    }
175
176
70.7k
    size_t x = 1;
177
    // First pixels
178
70.7k
    const size_t N = Lanes(D());
179
270k
    for (; x < std::min(N, xsize - 1); x++) {
180
199k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
199k
                            x);
182
199k
    }
183
    // Full vectors.
184
502k
    for (; x + N <= xsize - 1; x += N) {
185
432k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
432k
    }
187
    // Last pixels.
188
144k
    for (; x < xsize - 1; x++) {
189
73.2k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
73.2k
                            x);
191
73.2k
    }
192
70.7k
    return true;
193
70.7k
  };
compressed_dc.cc:jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
149
109k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
109k
    const float* JXL_RESTRICT rows_top[3]{
151
109k
        dc->ConstPlaneRow(0, y - 1),
152
109k
        dc->ConstPlaneRow(1, y - 1),
153
109k
        dc->ConstPlaneRow(2, y - 1),
154
109k
    };
155
109k
    const float* JXL_RESTRICT rows[3] = {
156
109k
        dc->ConstPlaneRow(0, y),
157
109k
        dc->ConstPlaneRow(1, y),
158
109k
        dc->ConstPlaneRow(2, y),
159
109k
    };
160
109k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
109k
        dc->ConstPlaneRow(0, y + 1),
162
109k
        dc->ConstPlaneRow(1, y + 1),
163
109k
        dc->ConstPlaneRow(2, y + 1),
164
109k
    };
165
109k
    float* JXL_RESTRICT rows_out[3] = {
166
109k
        smoothed.PlaneRow(0, y),
167
109k
        smoothed.PlaneRow(1, y),
168
109k
        smoothed.PlaneRow(2, y),
169
109k
    };
170
217k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
867k
      for (size_t c = 0; c < 3; c++) {
172
650k
        rows_out[c][x] = rows[c][x];
173
650k
      }
174
217k
    }
175
176
109k
    size_t x = 1;
177
    // First pixels
178
109k
    const size_t N = Lanes(D());
179
769k
    for (; x < std::min(N, xsize - 1); x++) {
180
660k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
660k
                            x);
182
660k
    }
183
    // Full vectors.
184
416k
    for (; x + N <= xsize - 1; x += N) {
185
307k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
307k
    }
187
    // Last pixels.
188
262k
    for (; x < xsize - 1; x++) {
189
153k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
153k
                            x);
191
153k
    }
192
109k
    return true;
193
109k
  };
compressed_dc.cc:jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
149
78.5k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
78.5k
    const float* JXL_RESTRICT rows_top[3]{
151
78.5k
        dc->ConstPlaneRow(0, y - 1),
152
78.5k
        dc->ConstPlaneRow(1, y - 1),
153
78.5k
        dc->ConstPlaneRow(2, y - 1),
154
78.5k
    };
155
78.5k
    const float* JXL_RESTRICT rows[3] = {
156
78.5k
        dc->ConstPlaneRow(0, y),
157
78.5k
        dc->ConstPlaneRow(1, y),
158
78.5k
        dc->ConstPlaneRow(2, y),
159
78.5k
    };
160
78.5k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
78.5k
        dc->ConstPlaneRow(0, y + 1),
162
78.5k
        dc->ConstPlaneRow(1, y + 1),
163
78.5k
        dc->ConstPlaneRow(2, y + 1),
164
78.5k
    };
165
78.5k
    float* JXL_RESTRICT rows_out[3] = {
166
78.5k
        smoothed.PlaneRow(0, y),
167
78.5k
        smoothed.PlaneRow(1, y),
168
78.5k
        smoothed.PlaneRow(2, y),
169
78.5k
    };
170
151k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
605k
      for (size_t c = 0; c < 3; c++) {
172
453k
        rows_out[c][x] = rows[c][x];
173
453k
      }
174
151k
    }
175
176
78.5k
    size_t x = 1;
177
    // First pixels
178
78.5k
    const size_t N = Lanes(D());
179
295k
    for (; x < std::min(N, xsize - 1); x++) {
180
216k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
216k
                            x);
182
216k
    }
183
    // Full vectors.
184
595k
    for (; x + N <= xsize - 1; x += N) {
185
516k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
516k
    }
187
    // Last pixels.
188
161k
    for (; x < xsize - 1; x++) {
189
82.7k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
82.7k
                            x);
191
82.7k
    }
192
78.5k
    return true;
193
78.5k
  };
194
15.6k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
15.6k
                                process_row, "DCSmoothingRow"));
196
7.81k
  dc->Swap(smoothed);
197
7.81k
  return true;
198
15.6k
}
jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Line
Count
Source
130
1.55k
                           ThreadPool* pool) {
131
1.55k
  const size_t xsize = dc->xsize();
132
1.55k
  const size_t ysize = dc->ysize();
133
1.55k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
1.41k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
2.83k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
2.83k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
5.67k
  for (size_t c = 0; c < 3; c++) {
144
8.51k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
8.51k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
8.51k
             xsize * sizeof(float));
147
8.51k
    }
148
4.25k
  }
149
2.83k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
2.83k
    const float* JXL_RESTRICT rows_top[3]{
151
2.83k
        dc->ConstPlaneRow(0, y - 1),
152
2.83k
        dc->ConstPlaneRow(1, y - 1),
153
2.83k
        dc->ConstPlaneRow(2, y - 1),
154
2.83k
    };
155
2.83k
    const float* JXL_RESTRICT rows[3] = {
156
2.83k
        dc->ConstPlaneRow(0, y),
157
2.83k
        dc->ConstPlaneRow(1, y),
158
2.83k
        dc->ConstPlaneRow(2, y),
159
2.83k
    };
160
2.83k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
2.83k
        dc->ConstPlaneRow(0, y + 1),
162
2.83k
        dc->ConstPlaneRow(1, y + 1),
163
2.83k
        dc->ConstPlaneRow(2, y + 1),
164
2.83k
    };
165
2.83k
    float* JXL_RESTRICT rows_out[3] = {
166
2.83k
        smoothed.PlaneRow(0, y),
167
2.83k
        smoothed.PlaneRow(1, y),
168
2.83k
        smoothed.PlaneRow(2, y),
169
2.83k
    };
170
2.83k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
2.83k
      for (size_t c = 0; c < 3; c++) {
172
2.83k
        rows_out[c][x] = rows[c][x];
173
2.83k
      }
174
2.83k
    }
175
176
2.83k
    size_t x = 1;
177
    // First pixels
178
2.83k
    const size_t N = Lanes(D());
179
2.83k
    for (; x < std::min(N, xsize - 1); x++) {
180
2.83k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
2.83k
                            x);
182
2.83k
    }
183
    // Full vectors.
184
2.83k
    for (; x + N <= xsize - 1; x += N) {
185
2.83k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
2.83k
    }
187
    // Last pixels.
188
2.83k
    for (; x < xsize - 1; x++) {
189
2.83k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
2.83k
                            x);
191
2.83k
    }
192
2.83k
    return true;
193
2.83k
  };
194
2.83k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
2.83k
                                process_row, "DCSmoothingRow"));
196
1.41k
  dc->Swap(smoothed);
197
1.41k
  return true;
198
2.83k
}
jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Line
Count
Source
130
13.6k
                           ThreadPool* pool) {
131
13.6k
  const size_t xsize = dc->xsize();
132
13.6k
  const size_t ysize = dc->ysize();
133
13.6k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
4.66k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
9.32k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
9.32k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
18.6k
  for (size_t c = 0; c < 3; c++) {
144
27.9k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
27.9k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
27.9k
             xsize * sizeof(float));
147
27.9k
    }
148
13.9k
  }
149
9.32k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
9.32k
    const float* JXL_RESTRICT rows_top[3]{
151
9.32k
        dc->ConstPlaneRow(0, y - 1),
152
9.32k
        dc->ConstPlaneRow(1, y - 1),
153
9.32k
        dc->ConstPlaneRow(2, y - 1),
154
9.32k
    };
155
9.32k
    const float* JXL_RESTRICT rows[3] = {
156
9.32k
        dc->ConstPlaneRow(0, y),
157
9.32k
        dc->ConstPlaneRow(1, y),
158
9.32k
        dc->ConstPlaneRow(2, y),
159
9.32k
    };
160
9.32k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
9.32k
        dc->ConstPlaneRow(0, y + 1),
162
9.32k
        dc->ConstPlaneRow(1, y + 1),
163
9.32k
        dc->ConstPlaneRow(2, y + 1),
164
9.32k
    };
165
9.32k
    float* JXL_RESTRICT rows_out[3] = {
166
9.32k
        smoothed.PlaneRow(0, y),
167
9.32k
        smoothed.PlaneRow(1, y),
168
9.32k
        smoothed.PlaneRow(2, y),
169
9.32k
    };
170
9.32k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
9.32k
      for (size_t c = 0; c < 3; c++) {
172
9.32k
        rows_out[c][x] = rows[c][x];
173
9.32k
      }
174
9.32k
    }
175
176
9.32k
    size_t x = 1;
177
    // First pixels
178
9.32k
    const size_t N = Lanes(D());
179
9.32k
    for (; x < std::min(N, xsize - 1); x++) {
180
9.32k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
9.32k
                            x);
182
9.32k
    }
183
    // Full vectors.
184
9.32k
    for (; x + N <= xsize - 1; x += N) {
185
9.32k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
9.32k
    }
187
    // Last pixels.
188
9.32k
    for (; x < xsize - 1; x++) {
189
9.32k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
9.32k
                            x);
191
9.32k
    }
192
9.32k
    return true;
193
9.32k
  };
194
9.32k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
9.32k
                                process_row, "DCSmoothingRow"));
196
4.66k
  dc->Swap(smoothed);
197
4.66k
  return true;
198
9.32k
}
jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)
Line
Count
Source
130
1.87k
                           ThreadPool* pool) {
131
1.87k
  const size_t xsize = dc->xsize();
132
1.87k
  const size_t ysize = dc->ysize();
133
1.87k
  if (ysize <= 2 || xsize <= 2) return true;
134
135
  // TODO(veluca): use tile-based processing?
136
  // TODO(veluca): decide if changes to the y channel should be propagated to
137
  // the x and b channels through color correlation.
138
1.73k
  JXL_ENSURE(w1 + w2 < 0.25f);
139
140
3.47k
  JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141
3.47k
                       Image3F::Create(memory_manager, xsize, ysize));
142
  // Fill in borders that the loop below will not. First and last are unused.
143
6.95k
  for (size_t c = 0; c < 3; c++) {
144
10.4k
    for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145
10.4k
      memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146
10.4k
             xsize * sizeof(float));
147
10.4k
    }
148
5.21k
  }
149
3.47k
  auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150
3.47k
    const float* JXL_RESTRICT rows_top[3]{
151
3.47k
        dc->ConstPlaneRow(0, y - 1),
152
3.47k
        dc->ConstPlaneRow(1, y - 1),
153
3.47k
        dc->ConstPlaneRow(2, y - 1),
154
3.47k
    };
155
3.47k
    const float* JXL_RESTRICT rows[3] = {
156
3.47k
        dc->ConstPlaneRow(0, y),
157
3.47k
        dc->ConstPlaneRow(1, y),
158
3.47k
        dc->ConstPlaneRow(2, y),
159
3.47k
    };
160
3.47k
    const float* JXL_RESTRICT rows_bottom[3] = {
161
3.47k
        dc->ConstPlaneRow(0, y + 1),
162
3.47k
        dc->ConstPlaneRow(1, y + 1),
163
3.47k
        dc->ConstPlaneRow(2, y + 1),
164
3.47k
    };
165
3.47k
    float* JXL_RESTRICT rows_out[3] = {
166
3.47k
        smoothed.PlaneRow(0, y),
167
3.47k
        smoothed.PlaneRow(1, y),
168
3.47k
        smoothed.PlaneRow(2, y),
169
3.47k
    };
170
3.47k
    for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171
3.47k
      for (size_t c = 0; c < 3; c++) {
172
3.47k
        rows_out[c][x] = rows[c][x];
173
3.47k
      }
174
3.47k
    }
175
176
3.47k
    size_t x = 1;
177
    // First pixels
178
3.47k
    const size_t N = Lanes(D());
179
3.47k
    for (; x < std::min(N, xsize - 1); x++) {
180
3.47k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181
3.47k
                            x);
182
3.47k
    }
183
    // Full vectors.
184
3.47k
    for (; x + N <= xsize - 1; x += N) {
185
3.47k
      ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186
3.47k
    }
187
    // Last pixels.
188
3.47k
    for (; x < xsize - 1; x++) {
189
3.47k
      ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190
3.47k
                            x);
191
3.47k
    }
192
3.47k
    return true;
193
3.47k
  };
194
3.47k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195
3.47k
                                process_row, "DCSmoothingRow"));
196
1.73k
  dc->Swap(smoothed);
197
1.73k
  return true;
198
3.47k
}
199
200
// DC dequantization.
201
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
202
               const float* dc_factors, float mul, const float* cfl_factors,
203
               const YCbCrChromaSubsampling& chroma_subsampling,
204
24.6k
               const BlockCtxMap& bctx) {
205
24.6k
  const HWY_FULL(float) df;
206
24.6k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
24.6k
  if (chroma_subsampling.Is444()) {
208
19.7k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
19.7k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
19.7k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
19.7k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
19.7k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
451k
    for (size_t y = 0; y < r.ysize(); y++) {
214
431k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
431k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
431k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
217
431k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
431k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
431k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
2.72M
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
2.29M
        const auto in_q_x = Load(di, quant_row_x + x);
222
2.29M
        const auto in_q_y = Load(di, quant_row_y + x);
223
2.29M
        const auto in_q_b = Load(di, quant_row_b + x);
224
2.29M
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
2.29M
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
2.29M
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
2.29M
        Store(in_y, df, dec_row_y + x);
228
2.29M
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
2.29M
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
2.29M
      }
231
431k
    }
232
19.7k
  } else {
233
14.6k
    for (size_t c : {1, 0, 2}) {
234
14.6k
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
14.6k
                r.y0() >> chroma_subsampling.VShift(c),
236
14.6k
                r.xsize() >> chroma_subsampling.HShift(c),
237
14.6k
                r.ysize() >> chroma_subsampling.VShift(c));
238
14.6k
      const auto fac = Set(df, dc_factors[c] * mul);
239
14.6k
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
159k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
145k
        const int32_t* quant_row = ch.plane.Row(y);
242
145k
        float* row = rect.PlaneRow(dc, c, y);
243
608k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
463k
          const auto in_q = Load(di, quant_row + x);
245
463k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
463k
          Store(out, df, row + x);
247
463k
        }
248
145k
      }
249
14.6k
    }
250
4.86k
  }
251
24.6k
  if (bctx.num_dc_ctxs <= 1) {
252
496k
    for (size_t y = 0; y < r.ysize(); y++) {
253
475k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
475k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
475k
    }
256
21.7k
  } else {
257
2.88k
    JXL_DASSERT(r.ysize() == 0 ||
258
2.88k
                (r.ysize() - 1) >> chroma_subsampling.VShift(0) <
259
2.88k
                    in.channel[1].plane.ysize());
260
2.88k
    JXL_DASSERT(r.ysize() == 0 ||
261
2.88k
                (r.ysize() - 1) >> chroma_subsampling.VShift(1) <
262
2.88k
                    in.channel[0].plane.ysize());
263
2.88k
    JXL_DASSERT(r.ysize() == 0 ||
264
2.88k
                (r.ysize() - 1) >> chroma_subsampling.VShift(2) <
265
2.88k
                    in.channel[2].plane.ysize());
266
22.1k
    for (size_t y = 0; y < r.ysize(); y++) {
267
19.2k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
268
19.2k
      const int32_t* quant_row_x =
269
19.2k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
270
19.2k
      const int32_t* quant_row_y =
271
19.2k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
272
19.2k
      const int32_t* quant_row_b =
273
19.2k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
274
526k
      for (size_t x = 0; x < r.xsize(); x++) {
275
507k
        int bucket_x = 0;
276
507k
        int bucket_y = 0;
277
507k
        int bucket_b = 0;
278
1.49M
        for (int t : bctx.dc_thresholds[0]) {
279
1.49M
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
280
1.49M
        }
281
730k
        for (int t : bctx.dc_thresholds[1]) {
282
730k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
283
730k
        }
284
799k
        for (int t : bctx.dc_thresholds[2]) {
285
799k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
286
799k
        }
287
507k
        int bucket = bucket_x;
288
507k
        bucket *= bctx.dc_thresholds[2].size() + 1;
289
507k
        bucket += bucket_b;
290
507k
        bucket *= bctx.dc_thresholds[1].size() + 1;
291
507k
        bucket += bucket_y;
292
507k
        qdc_row_val[x] = bucket;
293
507k
      }
294
19.2k
    }
295
2.88k
  }
296
24.6k
}
jxl::N_SSE4::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Line
Count
Source
204
3.53k
               const BlockCtxMap& bctx) {
205
3.53k
  const HWY_FULL(float) df;
206
3.53k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
3.53k
  if (chroma_subsampling.Is444()) {
208
1.89k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
1.89k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
1.89k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
1.89k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
1.89k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
98.6k
    for (size_t y = 0; y < r.ysize(); y++) {
214
96.7k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
96.7k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
96.7k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
217
96.7k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
96.7k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
96.7k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
826k
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
730k
        const auto in_q_x = Load(di, quant_row_x + x);
222
730k
        const auto in_q_y = Load(di, quant_row_y + x);
223
730k
        const auto in_q_b = Load(di, quant_row_b + x);
224
730k
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
730k
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
730k
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
730k
        Store(in_y, df, dec_row_y + x);
228
730k
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
730k
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
730k
      }
231
96.7k
    }
232
1.89k
  } else {
233
4.89k
    for (size_t c : {1, 0, 2}) {
234
4.89k
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
4.89k
                r.y0() >> chroma_subsampling.VShift(c),
236
4.89k
                r.xsize() >> chroma_subsampling.HShift(c),
237
4.89k
                r.ysize() >> chroma_subsampling.VShift(c));
238
4.89k
      const auto fac = Set(df, dc_factors[c] * mul);
239
4.89k
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
45.9k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
41.0k
        const int32_t* quant_row = ch.plane.Row(y);
242
41.0k
        float* row = rect.PlaneRow(dc, c, y);
243
151k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
110k
          const auto in_q = Load(di, quant_row + x);
245
110k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
110k
          Store(out, df, row + x);
247
110k
        }
248
41.0k
      }
249
4.89k
    }
250
1.63k
  }
251
3.53k
  if (bctx.num_dc_ctxs <= 1) {
252
116k
    for (size_t y = 0; y < r.ysize(); y++) {
253
113k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
113k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
113k
    }
256
3.29k
  } else {
257
236
    JXL_DASSERT(r.ysize() == 0 ||
258
236
                (r.ysize() - 1) >> chroma_subsampling.VShift(0) <
259
236
                    in.channel[1].plane.ysize());
260
236
    JXL_DASSERT(r.ysize() == 0 ||
261
236
                (r.ysize() - 1) >> chroma_subsampling.VShift(1) <
262
236
                    in.channel[0].plane.ysize());
263
236
    JXL_DASSERT(r.ysize() == 0 ||
264
236
                (r.ysize() - 1) >> chroma_subsampling.VShift(2) <
265
236
                    in.channel[2].plane.ysize());
266
1.89k
    for (size_t y = 0; y < r.ysize(); y++) {
267
1.66k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
268
1.66k
      const int32_t* quant_row_x =
269
1.66k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
270
1.66k
      const int32_t* quant_row_y =
271
1.66k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
272
1.66k
      const int32_t* quant_row_b =
273
1.66k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
274
32.3k
      for (size_t x = 0; x < r.xsize(); x++) {
275
30.6k
        int bucket_x = 0;
276
30.6k
        int bucket_y = 0;
277
30.6k
        int bucket_b = 0;
278
49.2k
        for (int t : bctx.dc_thresholds[0]) {
279
49.2k
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
280
49.2k
        }
281
107k
        for (int t : bctx.dc_thresholds[1]) {
282
107k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
283
107k
        }
284
48.8k
        for (int t : bctx.dc_thresholds[2]) {
285
48.8k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
286
48.8k
        }
287
30.6k
        int bucket = bucket_x;
288
30.6k
        bucket *= bctx.dc_thresholds[2].size() + 1;
289
30.6k
        bucket += bucket_b;
290
30.6k
        bucket *= bctx.dc_thresholds[1].size() + 1;
291
30.6k
        bucket += bucket_y;
292
30.6k
        qdc_row_val[x] = bucket;
293
30.6k
      }
294
1.66k
    }
295
236
  }
296
3.53k
}
jxl::N_AVX2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Line
Count
Source
204
17.4k
               const BlockCtxMap& bctx) {
205
17.4k
  const HWY_FULL(float) df;
206
17.4k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
17.4k
  if (chroma_subsampling.Is444()) {
208
15.6k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
15.6k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
15.6k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
15.6k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
15.6k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
247k
    for (size_t y = 0; y < r.ysize(); y++) {
214
231k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
231k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
231k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
217
231k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
231k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
231k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
981k
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
750k
        const auto in_q_x = Load(di, quant_row_x + x);
222
750k
        const auto in_q_y = Load(di, quant_row_y + x);
223
750k
        const auto in_q_b = Load(di, quant_row_b + x);
224
750k
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
750k
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
750k
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
750k
        Store(in_y, df, dec_row_y + x);
228
750k
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
750k
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
750k
      }
231
231k
    }
232
15.6k
  } else {
233
5.43k
    for (size_t c : {1, 0, 2}) {
234
5.43k
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
5.43k
                r.y0() >> chroma_subsampling.VShift(c),
236
5.43k
                r.xsize() >> chroma_subsampling.HShift(c),
237
5.43k
                r.ysize() >> chroma_subsampling.VShift(c));
238
5.43k
      const auto fac = Set(df, dc_factors[c] * mul);
239
5.43k
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
73.7k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
68.3k
        const int32_t* quant_row = ch.plane.Row(y);
242
68.3k
        float* row = rect.PlaneRow(dc, c, y);
243
321k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
253k
          const auto in_q = Load(di, quant_row + x);
245
253k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
253k
          Store(out, df, row + x);
247
253k
        }
248
68.3k
      }
249
5.43k
    }
250
1.81k
  }
251
17.4k
  if (bctx.num_dc_ctxs <= 1) {
252
261k
    for (size_t y = 0; y < r.ysize(); y++) {
253
245k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
245k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
245k
    }
256
15.2k
  } else {
257
2.22k
    JXL_DASSERT(r.ysize() == 0 ||
258
2.22k
                (r.ysize() - 1) >> chroma_subsampling.VShift(0) <
259
2.22k
                    in.channel[1].plane.ysize());
260
2.22k
    JXL_DASSERT(r.ysize() == 0 ||
261
2.22k
                (r.ysize() - 1) >> chroma_subsampling.VShift(1) <
262
2.22k
                    in.channel[0].plane.ysize());
263
2.22k
    JXL_DASSERT(r.ysize() == 0 ||
264
2.22k
                (r.ysize() - 1) >> chroma_subsampling.VShift(2) <
265
2.22k
                    in.channel[2].plane.ysize());
266
17.7k
    for (size_t y = 0; y < r.ysize(); y++) {
267
15.5k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
268
15.5k
      const int32_t* quant_row_x =
269
15.5k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
270
15.5k
      const int32_t* quant_row_y =
271
15.5k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
272
15.5k
      const int32_t* quant_row_b =
273
15.5k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
274
462k
      for (size_t x = 0; x < r.xsize(); x++) {
275
446k
        int bucket_x = 0;
276
446k
        int bucket_y = 0;
277
446k
        int bucket_b = 0;
278
1.42M
        for (int t : bctx.dc_thresholds[0]) {
279
1.42M
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
280
1.42M
        }
281
530k
        for (int t : bctx.dc_thresholds[1]) {
282
530k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
283
530k
        }
284
714k
        for (int t : bctx.dc_thresholds[2]) {
285
714k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
286
714k
        }
287
446k
        int bucket = bucket_x;
288
446k
        bucket *= bctx.dc_thresholds[2].size() + 1;
289
446k
        bucket += bucket_b;
290
446k
        bucket *= bctx.dc_thresholds[1].size() + 1;
291
446k
        bucket += bucket_y;
292
446k
        qdc_row_val[x] = bucket;
293
446k
      }
294
15.5k
    }
295
2.22k
  }
296
17.4k
}
jxl::N_SSE2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&)
Line
Count
Source
204
3.64k
               const BlockCtxMap& bctx) {
205
3.64k
  const HWY_FULL(float) df;
206
3.64k
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
207
3.64k
  if (chroma_subsampling.Is444()) {
208
2.22k
    const auto fac_x = Set(df, dc_factors[0] * mul);
209
2.22k
    const auto fac_y = Set(df, dc_factors[1] * mul);
210
2.22k
    const auto fac_b = Set(df, dc_factors[2] * mul);
211
2.22k
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
212
2.22k
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
213
105k
    for (size_t y = 0; y < r.ysize(); y++) {
214
103k
      float* dec_row_x = r.PlaneRow(dc, 0, y);
215
103k
      float* dec_row_y = r.PlaneRow(dc, 1, y);
216
103k
      float* dec_row_b = r.PlaneRow(dc, 2, y);
217
103k
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218
103k
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219
103k
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220
913k
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221
810k
        const auto in_q_x = Load(di, quant_row_x + x);
222
810k
        const auto in_q_y = Load(di, quant_row_y + x);
223
810k
        const auto in_q_b = Load(di, quant_row_b + x);
224
810k
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225
810k
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226
810k
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227
810k
        Store(in_y, df, dec_row_y + x);
228
810k
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229
810k
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230
810k
      }
231
103k
    }
232
2.22k
  } else {
233
4.26k
    for (size_t c : {1, 0, 2}) {
234
4.26k
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235
4.26k
                r.y0() >> chroma_subsampling.VShift(c),
236
4.26k
                r.xsize() >> chroma_subsampling.HShift(c),
237
4.26k
                r.ysize() >> chroma_subsampling.VShift(c));
238
4.26k
      const auto fac = Set(df, dc_factors[c] * mul);
239
4.26k
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240
40.1k
      for (size_t y = 0; y < rect.ysize(); y++) {
241
35.8k
        const int32_t* quant_row = ch.plane.Row(y);
242
35.8k
        float* row = rect.PlaneRow(dc, c, y);
243
135k
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244
100k
          const auto in_q = Load(di, quant_row + x);
245
100k
          const auto out = Mul(ConvertTo(df, in_q), fac);
246
100k
          Store(out, df, row + x);
247
100k
        }
248
35.8k
      }
249
4.26k
    }
250
1.42k
  }
251
3.64k
  if (bctx.num_dc_ctxs <= 1) {
252
119k
    for (size_t y = 0; y < r.ysize(); y++) {
253
116k
      uint8_t* qdc_row = r.Row(quant_dc, y);
254
116k
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255
116k
    }
256
3.22k
  } else {
257
421
    JXL_DASSERT(r.ysize() == 0 ||
258
421
                (r.ysize() - 1) >> chroma_subsampling.VShift(0) <
259
421
                    in.channel[1].plane.ysize());
260
421
    JXL_DASSERT(r.ysize() == 0 ||
261
421
                (r.ysize() - 1) >> chroma_subsampling.VShift(1) <
262
421
                    in.channel[0].plane.ysize());
263
421
    JXL_DASSERT(r.ysize() == 0 ||
264
421
                (r.ysize() - 1) >> chroma_subsampling.VShift(2) <
265
421
                    in.channel[2].plane.ysize());
266
2.51k
    for (size_t y = 0; y < r.ysize(); y++) {
267
2.09k
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
268
2.09k
      const int32_t* quant_row_x =
269
2.09k
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
270
2.09k
      const int32_t* quant_row_y =
271
2.09k
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
272
2.09k
      const int32_t* quant_row_b =
273
2.09k
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
274
32.0k
      for (size_t x = 0; x < r.xsize(); x++) {
275
29.9k
        int bucket_x = 0;
276
29.9k
        int bucket_y = 0;
277
29.9k
        int bucket_b = 0;
278
29.9k
        for (int t : bctx.dc_thresholds[0]) {
279
16.2k
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
280
16.2k
        }
281
92.6k
        for (int t : bctx.dc_thresholds[1]) {
282
92.6k
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
283
92.6k
        }
284
35.3k
        for (int t : bctx.dc_thresholds[2]) {
285
35.3k
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
286
35.3k
        }
287
29.9k
        int bucket = bucket_x;
288
29.9k
        bucket *= bctx.dc_thresholds[2].size() + 1;
289
29.9k
        bucket += bucket_b;
290
29.9k
        bucket *= bctx.dc_thresholds[1].size() + 1;
291
29.9k
        bucket += bucket_y;
292
29.9k
        qdc_row_val[x] = bucket;
293
29.9k
      }
294
2.09k
    }
295
421
  }
296
3.64k
}
297
298
// NOLINTNEXTLINE(google-readability-namespace-comments)
299
}  // namespace HWY_NAMESPACE
300
}  // namespace jxl
301
HWY_AFTER_NAMESPACE();
302
303
#if HWY_ONCE
304
namespace jxl {
305
306
HWY_EXPORT(DequantDC);
307
HWY_EXPORT(AdaptiveDCSmoothing);
308
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
309
                           const float* dc_factors, Image3F* dc,
310
17.0k
                           ThreadPool* pool) {
311
17.0k
  return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(memory_manager, dc_factors,
312
17.0k
                                                   dc, pool);
313
17.0k
}
314
315
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
316
               const float* dc_factors, float mul, const float* cfl_factors,
317
               const YCbCrChromaSubsampling& chroma_subsampling,
318
24.6k
               const BlockCtxMap& bctx) {
319
24.6k
  HWY_DYNAMIC_DISPATCH(DequantDC)
320
24.6k
  (r, dc, quant_dc, in, dc_factors, mul, cfl_factors, chroma_subsampling, bctx);
321
24.6k
}
322
323
}  // namespace jxl
324
#endif  // HWY_ONCE