/src/libjxl/lib/jxl/compressed_dc.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/compressed_dc.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cstdint> |
12 | | #include <cstdlib> |
13 | | #include <cstring> |
14 | | #include <vector> |
15 | | |
16 | | #include "lib/jxl/ac_context.h" |
17 | | #include "lib/jxl/frame_header.h" |
18 | | #include "lib/jxl/modular/modular_image.h" |
19 | | |
20 | | #undef HWY_TARGET_INCLUDE |
21 | | #define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc" |
22 | | #include <hwy/foreach_target.h> |
23 | | #include <hwy/highway.h> |
24 | | |
25 | | #include "lib/jxl/base/compiler_specific.h" |
26 | | #include "lib/jxl/base/data_parallel.h" |
27 | | #include "lib/jxl/base/rect.h" |
28 | | #include "lib/jxl/base/status.h" |
29 | | #include "lib/jxl/image.h" |
30 | | HWY_BEFORE_NAMESPACE(); |
31 | | namespace jxl { |
32 | | namespace HWY_NAMESPACE { |
33 | | |
34 | | using D = HWY_FULL(float); |
35 | | using DScalar = HWY_CAPPED(float, 1); |
36 | | |
37 | | // These templates are not found via ADL. |
38 | | using hwy::HWY_NAMESPACE::Abs; |
39 | | using hwy::HWY_NAMESPACE::Add; |
40 | | using hwy::HWY_NAMESPACE::Div; |
41 | | using hwy::HWY_NAMESPACE::Max; |
42 | | using hwy::HWY_NAMESPACE::Mul; |
43 | | using hwy::HWY_NAMESPACE::MulAdd; |
44 | | using hwy::HWY_NAMESPACE::Rebind; |
45 | | using hwy::HWY_NAMESPACE::Sub; |
46 | | using hwy::HWY_NAMESPACE::Vec; |
47 | | using hwy::HWY_NAMESPACE::ZeroIfNegative; |
48 | | |
// Weights of the 3x3 smoothing kernel used by ComputePixelChannel(): w1 is
// applied to each of the four edge-adjacent neighbors, w2 to each of the four
// diagonal neighbors and w0 to the center sample.
// TODO(veluca): optimize constants.
constexpr float w1 = 0.20345139757231578f;
constexpr float w2 = 0.0334829185968739f;
// Derived so that the nine kernel weights add up to one.
constexpr float w0 = 1.0f - 4.0f * (w1 + w2);
// If the neighbor weights reached 1/4 in total, the center weight would no
// longer be positive; reject such constants at compile time.
static_assert(w1 + w2 < 0.25f, "neighbor weights leave no positive center weight");
53 | | |
// Returns the lane-wise maximum of `a` and `b`.
//
// Clang releases up to 8.0 fail on the AVX3 target with "Do not know how to
// split the result of this operator", so for those toolchains the maximum is
// built from a compare and a select instead of Max().
//
// HWY_COMPILER_CLANG evaluates to 0 in the preprocessor when the compiler is
// not Clang, which would satisfy a bare `<= 800` test; the version check is
// therefore gated on the macro being non-zero so that other compilers use
// Max() directly.
template <class V>
V MaxWorkaround(V a, V b) {
#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG && \
    (HWY_COMPILER_CLANG <= 800)
  return IfThenElse(a > b, a, b);
#else
  return Max(a, b);
#endif
}
63 | | |
64 | | template <typename D> |
65 | | JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor, |
66 | | const float* JXL_RESTRICT row_top, |
67 | | const float* JXL_RESTRICT row, |
68 | | const float* JXL_RESTRICT row_bottom, |
69 | | Vec<D>* JXL_RESTRICT mc, |
70 | | Vec<D>* JXL_RESTRICT sm, |
71 | 680k | Vec<D>* JXL_RESTRICT gap, size_t x) { |
72 | 680k | const auto tl = LoadU(d, row_top + x - 1); |
73 | 680k | const auto tc = Load(d, row_top + x); |
74 | 680k | const auto tr = LoadU(d, row_top + x + 1); |
75 | | |
76 | 680k | const auto ml = LoadU(d, row + x - 1); |
77 | 680k | *mc = Load(d, row + x); |
78 | 680k | const auto mr = LoadU(d, row + x + 1); |
79 | | |
80 | 680k | const auto bl = LoadU(d, row_bottom + x - 1); |
81 | 680k | const auto bc = Load(d, row_bottom + x); |
82 | 680k | const auto br = LoadU(d, row_bottom + x + 1); |
83 | | |
84 | 680k | const auto w_center = Set(d, w0); |
85 | 680k | const auto w_side = Set(d, w1); |
86 | 680k | const auto w_corner = Set(d, w2); |
87 | | |
88 | 680k | const auto corner = Add(Add(tl, tr), Add(bl, br)); |
89 | 680k | const auto side = Add(Add(ml, mr), Add(tc, bc)); |
90 | 680k | *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center))); |
91 | | |
92 | 680k | const auto dc_quant = Set(d, dc_factor); |
93 | 680k | *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant))); |
94 | 680k | } Unexecuted instantiation: void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 1ul, 0> >(hwy::N_SSE4::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)()))*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::ComputePixelChannel<hwy::N_SSE4::Simd<float, 4ul, 0> >(hwy::N_SSE4::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)()))*, unsigned long) void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 1ul, 0> >(hwy::N_AVX2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)()))*, unsigned long) Line | Count | Source | 71 | 511k | Vec<D>* JXL_RESTRICT gap, size_t x) { | 72 | 511k | const auto tl = LoadU(d, row_top + x - 1); | 73 | 511k | const auto tc = Load(d, row_top + x); | 74 | 511k | const auto tr = LoadU(d, row_top + x + 1); | 75 | | | 76 | 511k | const auto ml = LoadU(d, row + x - 1); | 77 | 511k | *mc = Load(d, row + x); | 78 | 511k | const auto mr = LoadU(d, row + x + 1); | 79 | | | 80 | 511k | const auto bl = LoadU(d, row_bottom + x - 1); | 81 | 511k | const auto bc = Load(d, row_bottom + x); | 82 | 511k | const auto br = LoadU(d, row_bottom + x + 1); | 83 | | | 84 | 511k | const auto w_center = Set(d, w0); | 85 | 511k | const auto w_side = Set(d, w1); | 86 | 511k | const auto w_corner = Set(d, w2); | 87 | | | 88 | 511k | const auto corner = Add(Add(tl, tr), Add(bl, br)); | 89 | 511k | const auto side = Add(Add(ml, mr), Add(tc, bc)); | 90 | 511k | *sm = MulAdd(corner, w_corner, 
MulAdd(side, w_side, Mul(*mc, w_center))); | 91 | | | 92 | 511k | const auto dc_quant = Set(d, dc_factor); | 93 | 511k | *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant))); | 94 | 511k | } |
void jxl::N_AVX2::ComputePixelChannel<hwy::N_AVX2::Simd<float, 8ul, 0> >(hwy::N_AVX2::Simd<float, 8ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)()))*, unsigned long) Line | Count | Source | 71 | 168k | Vec<D>* JXL_RESTRICT gap, size_t x) { | 72 | 168k | const auto tl = LoadU(d, row_top + x - 1); | 73 | 168k | const auto tc = Load(d, row_top + x); | 74 | 168k | const auto tr = LoadU(d, row_top + x + 1); | 75 | | | 76 | 168k | const auto ml = LoadU(d, row + x - 1); | 77 | 168k | *mc = Load(d, row + x); | 78 | 168k | const auto mr = LoadU(d, row + x + 1); | 79 | | | 80 | 168k | const auto bl = LoadU(d, row_bottom + x - 1); | 81 | 168k | const auto bc = Load(d, row_bottom + x); | 82 | 168k | const auto br = LoadU(d, row_bottom + x + 1); | 83 | | | 84 | 168k | const auto w_center = Set(d, w0); | 85 | 168k | const auto w_side = Set(d, w1); | 86 | 168k | const auto w_corner = Set(d, w2); | 87 | | | 88 | 168k | const auto corner = Add(Add(tl, tr), Add(bl, br)); | 89 | 168k | const auto side = Add(Add(ml, mr), Add(tc, bc)); | 90 | 168k | *sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center))); | 91 | | | 92 | 168k | const auto dc_quant = Set(d, dc_factor); | 93 | 168k | *gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant))); | 94 | 168k | } |
Unexecuted instantiation: void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 1ul, 0> >(hwy::N_SSE2::Simd<float, 1ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 1ul, 0>)()))*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::ComputePixelChannel<hwy::N_SSE2::Simd<float, 4ul, 0> >(hwy::N_SSE2::Simd<float, 4ul, 0>, float, float const*, float const*, float const*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, decltype (Zero((hwy::N_SSE2::Simd<float, 4ul, 0>)()))*, unsigned long) |
95 | | |
96 | | template <typename D> |
97 | | JXL_INLINE void ComputePixel( |
98 | | const float* JXL_RESTRICT dc_factors, |
99 | | const float* JXL_RESTRICT* JXL_RESTRICT rows_top, |
100 | | const float* JXL_RESTRICT* JXL_RESTRICT rows, |
101 | | const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom, |
102 | 226k | float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) { |
103 | 226k | const D d; |
104 | 226k | auto mc_x = Undefined(d); |
105 | 226k | auto mc_y = Undefined(d); |
106 | 226k | auto mc_b = Undefined(d); |
107 | 226k | auto sm_x = Undefined(d); |
108 | 226k | auto sm_y = Undefined(d); |
109 | 226k | auto sm_b = Undefined(d); |
110 | 226k | auto gap = Set(d, 0.5f); |
111 | 226k | ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0], |
112 | 226k | &mc_x, &sm_x, &gap, x); |
113 | 226k | ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1], |
114 | 226k | &mc_y, &sm_y, &gap, x); |
115 | 226k | ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2], |
116 | 226k | &mc_b, &sm_b, &gap, x); |
117 | 226k | auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f)); |
118 | 226k | factor = ZeroIfNegative(factor); |
119 | | |
120 | 226k | auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x); |
121 | 226k | Store(out, d, out_rows[0] + x); |
122 | 226k | out = MulAdd(Sub(sm_y, mc_y), factor, mc_y); |
123 | 226k | Store(out, d, out_rows[1] + x); |
124 | 226k | out = MulAdd(Sub(sm_b, mc_b), factor, mc_b); |
125 | 226k | Store(out, d, out_rows[2] + x); |
126 | 226k | } Unexecuted instantiation: void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) Unexecuted instantiation: void jxl::N_SSE4::ComputePixel<hwy::N_SSE4::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) Line | Count | Source | 102 | 170k | float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) { | 103 | 170k | const D d; | 104 | 170k | auto mc_x = Undefined(d); | 105 | 170k | auto mc_y = Undefined(d); | 106 | 170k | auto mc_b = Undefined(d); | 107 | 170k | auto sm_x = Undefined(d); | 108 | 170k | auto sm_y = Undefined(d); | 109 | 170k | auto sm_b = Undefined(d); | 110 | 170k | auto gap = Set(d, 0.5f); | 111 | 170k | ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0], | 112 | 170k | &mc_x, &sm_x, &gap, x); | 113 | 170k | ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1], | 114 | 170k | &mc_y, &sm_y, &gap, x); | 115 | 170k | ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2], | 116 | 170k | &mc_b, &sm_b, &gap, x); | 117 | 170k | auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f)); | 118 | 170k | factor = ZeroIfNegative(factor); | 119 | | | 120 | 170k | auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x); | 121 | 170k | Store(out, d, out_rows[0] + x); | 122 | 170k | out = MulAdd(Sub(sm_y, mc_y), factor, mc_y); | 123 | 170k | Store(out, d, out_rows[1] + x); | 124 | 170k | out = MulAdd(Sub(sm_b, mc_b), factor, mc_b); | 125 | 170k | Store(out, d, out_rows[2] + x); | 126 | 170k | } |
void jxl::N_AVX2::ComputePixel<hwy::N_AVX2::Simd<float, 8ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) Line | Count | Source | 102 | 56.1k | float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) { | 103 | 56.1k | const D d; | 104 | 56.1k | auto mc_x = Undefined(d); | 105 | 56.1k | auto mc_y = Undefined(d); | 106 | 56.1k | auto mc_b = Undefined(d); | 107 | 56.1k | auto sm_x = Undefined(d); | 108 | 56.1k | auto sm_y = Undefined(d); | 109 | 56.1k | auto sm_b = Undefined(d); | 110 | 56.1k | auto gap = Set(d, 0.5f); | 111 | 56.1k | ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0], | 112 | 56.1k | &mc_x, &sm_x, &gap, x); | 113 | 56.1k | ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1], | 114 | 56.1k | &mc_y, &sm_y, &gap, x); | 115 | 56.1k | ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2], | 116 | 56.1k | &mc_b, &sm_b, &gap, x); | 117 | 56.1k | auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f)); | 118 | 56.1k | factor = ZeroIfNegative(factor); | 119 | | | 120 | 56.1k | auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x); | 121 | 56.1k | Store(out, d, out_rows[0] + x); | 122 | 56.1k | out = MulAdd(Sub(sm_y, mc_y), factor, mc_y); | 123 | 56.1k | Store(out, d, out_rows[1] + x); | 124 | 56.1k | out = MulAdd(Sub(sm_b, mc_b), factor, mc_b); | 125 | 56.1k | Store(out, d, out_rows[2] + x); | 126 | 56.1k | } |
Unexecuted instantiation: void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 1ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) Unexecuted instantiation: void jxl::N_SSE2::ComputePixel<hwy::N_SSE2::Simd<float, 4ul, 0> >(float const*, float const* restrict*, float const* restrict*, float const* restrict*, float* restrict*, unsigned long) |
127 | | |
128 | | Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager, |
129 | | const float* dc_factors, Image3F* dc, |
130 | 2.19k | ThreadPool* pool) { |
131 | 2.19k | const size_t xsize = dc->xsize(); |
132 | 2.19k | const size_t ysize = dc->ysize(); |
133 | 2.19k | if (ysize <= 2 || xsize <= 2) return true; |
134 | | |
135 | | // TODO(veluca): use tile-based processing? |
136 | | // TODO(veluca): decide if changes to the y channel should be propagated to |
137 | | // the x and b channels through color correlation. |
138 | 868 | JXL_ENSURE(w1 + w2 < 0.25f); |
139 | | |
140 | 1.73k | JXL_ASSIGN_OR_RETURN(Image3F smoothed, |
141 | 1.73k | Image3F::Create(memory_manager, xsize, ysize)); |
142 | | // Fill in borders that the loop below will not. First and last are unused. |
143 | 3.47k | for (size_t c = 0; c < 3; c++) { |
144 | 5.20k | for (size_t y : {static_cast<size_t>(0), ysize - 1}) { |
145 | 5.20k | memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y), |
146 | 5.20k | xsize * sizeof(float)); |
147 | 5.20k | } |
148 | 2.60k | } |
149 | 17.3k | auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status { |
150 | 17.3k | const float* JXL_RESTRICT rows_top[3]{ |
151 | 17.3k | dc->ConstPlaneRow(0, y - 1), |
152 | 17.3k | dc->ConstPlaneRow(1, y - 1), |
153 | 17.3k | dc->ConstPlaneRow(2, y - 1), |
154 | 17.3k | }; |
155 | 17.3k | const float* JXL_RESTRICT rows[3] = { |
156 | 17.3k | dc->ConstPlaneRow(0, y), |
157 | 17.3k | dc->ConstPlaneRow(1, y), |
158 | 17.3k | dc->ConstPlaneRow(2, y), |
159 | 17.3k | }; |
160 | 17.3k | const float* JXL_RESTRICT rows_bottom[3] = { |
161 | 17.3k | dc->ConstPlaneRow(0, y + 1), |
162 | 17.3k | dc->ConstPlaneRow(1, y + 1), |
163 | 17.3k | dc->ConstPlaneRow(2, y + 1), |
164 | 17.3k | }; |
165 | 17.3k | float* JXL_RESTRICT rows_out[3] = { |
166 | 17.3k | smoothed.PlaneRow(0, y), |
167 | 17.3k | smoothed.PlaneRow(1, y), |
168 | 17.3k | smoothed.PlaneRow(2, y), |
169 | 17.3k | }; |
170 | 34.7k | for (size_t x : {static_cast<size_t>(0), xsize - 1}) { |
171 | 139k | for (size_t c = 0; c < 3; c++) { |
172 | 104k | rows_out[c][x] = rows[c][x]; |
173 | 104k | } |
174 | 34.7k | } |
175 | | |
176 | 17.3k | size_t x = 1; |
177 | | // First pixels |
178 | 17.3k | const size_t N = Lanes(D()); |
179 | 138k | for (; x < std::min(N, xsize - 1); x++) { |
180 | 121k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, |
181 | 121k | x); |
182 | 121k | } |
183 | | // Full vectors. |
184 | 73.4k | for (; x + N <= xsize - 1; x += N) { |
185 | 56.1k | ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x); |
186 | 56.1k | } |
187 | | // Last pixels. |
188 | 66.7k | for (; x < xsize - 1; x++) { |
189 | 49.3k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, |
190 | 49.3k | x); |
191 | 49.3k | } |
192 | 17.3k | return true; |
193 | 17.3k | }; Unexecuted instantiation: compressed_dc.cc:jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const compressed_dc.cc:jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 149 | 17.3k | auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status { | 150 | 17.3k | const float* JXL_RESTRICT rows_top[3]{ | 151 | 17.3k | dc->ConstPlaneRow(0, y - 1), | 152 | 17.3k | dc->ConstPlaneRow(1, y - 1), | 153 | 17.3k | dc->ConstPlaneRow(2, y - 1), | 154 | 17.3k | }; | 155 | 17.3k | const float* JXL_RESTRICT rows[3] = { | 156 | 17.3k | dc->ConstPlaneRow(0, y), | 157 | 17.3k | dc->ConstPlaneRow(1, y), | 158 | 17.3k | dc->ConstPlaneRow(2, y), | 159 | 17.3k | }; | 160 | 17.3k | const float* JXL_RESTRICT rows_bottom[3] = { | 161 | 17.3k | dc->ConstPlaneRow(0, y + 1), | 162 | 17.3k | dc->ConstPlaneRow(1, y + 1), | 163 | 17.3k | dc->ConstPlaneRow(2, y + 1), | 164 | 17.3k | }; | 165 | 17.3k | float* JXL_RESTRICT rows_out[3] = { | 166 | 17.3k | smoothed.PlaneRow(0, y), | 167 | 17.3k | smoothed.PlaneRow(1, y), | 168 | 17.3k | smoothed.PlaneRow(2, y), | 169 | 17.3k | }; | 170 | 34.7k | for (size_t x : {static_cast<size_t>(0), xsize - 1}) { | 171 | 139k | for (size_t c = 0; c < 3; c++) { | 172 | 104k | rows_out[c][x] = rows[c][x]; | 173 | 104k | } | 174 | 34.7k | } | 175 | | | 176 | 17.3k | size_t x = 1; | 177 | | // First pixels | 178 | 17.3k | const size_t N = Lanes(D()); | 179 | 138k | for (; x < std::min(N, xsize - 1); x++) { | 180 | 121k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, | 181 | 121k | x); | 182 | 121k | } | 183 | | // Full vectors. 
| 184 | 73.4k | for (; x + N <= xsize - 1; x += N) { | 185 | 56.1k | ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x); | 186 | 56.1k | } | 187 | | // Last pixels. | 188 | 66.7k | for (; x < xsize - 1; x++) { | 189 | 49.3k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, | 190 | 49.3k | x); | 191 | 49.3k | } | 192 | 17.3k | return true; | 193 | 17.3k | }; |
Unexecuted instantiation: compressed_dc.cc:jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const |
194 | 1.73k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit, |
195 | 1.73k | process_row, "DCSmoothingRow")); |
196 | 868 | dc->Swap(smoothed); |
197 | 868 | return true; |
198 | 1.73k | } Unexecuted instantiation: jxl::N_SSE4::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*) jxl::N_AVX2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*) Line | Count | Source | 130 | 2.19k | ThreadPool* pool) { | 131 | 2.19k | const size_t xsize = dc->xsize(); | 132 | 2.19k | const size_t ysize = dc->ysize(); | 133 | 2.19k | if (ysize <= 2 || xsize <= 2) return true; | 134 | | | 135 | | // TODO(veluca): use tile-based processing? | 136 | | // TODO(veluca): decide if changes to the y channel should be propagated to | 137 | | // the x and b channels through color correlation. | 138 | 868 | JXL_ENSURE(w1 + w2 < 0.25f); | 139 | | | 140 | 1.73k | JXL_ASSIGN_OR_RETURN(Image3F smoothed, | 141 | 1.73k | Image3F::Create(memory_manager, xsize, ysize)); | 142 | | // Fill in borders that the loop below will not. First and last are unused. | 143 | 3.47k | for (size_t c = 0; c < 3; c++) { | 144 | 5.20k | for (size_t y : {static_cast<size_t>(0), ysize - 1}) { | 145 | 5.20k | memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y), | 146 | 5.20k | xsize * sizeof(float)); | 147 | 5.20k | } | 148 | 2.60k | } | 149 | 1.73k | auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status { | 150 | 1.73k | const float* JXL_RESTRICT rows_top[3]{ | 151 | 1.73k | dc->ConstPlaneRow(0, y - 1), | 152 | 1.73k | dc->ConstPlaneRow(1, y - 1), | 153 | 1.73k | dc->ConstPlaneRow(2, y - 1), | 154 | 1.73k | }; | 155 | 1.73k | const float* JXL_RESTRICT rows[3] = { | 156 | 1.73k | dc->ConstPlaneRow(0, y), | 157 | 1.73k | dc->ConstPlaneRow(1, y), | 158 | 1.73k | dc->ConstPlaneRow(2, y), | 159 | 1.73k | }; | 160 | 1.73k | const float* JXL_RESTRICT rows_bottom[3] = { | 161 | 1.73k | dc->ConstPlaneRow(0, y + 1), | 162 | 1.73k | dc->ConstPlaneRow(1, y + 1), | 163 | 1.73k | dc->ConstPlaneRow(2, y + 1), | 164 | 1.73k | }; | 165 | 1.73k | float* JXL_RESTRICT rows_out[3] = { | 166 | 1.73k | 
smoothed.PlaneRow(0, y), | 167 | 1.73k | smoothed.PlaneRow(1, y), | 168 | 1.73k | smoothed.PlaneRow(2, y), | 169 | 1.73k | }; | 170 | 1.73k | for (size_t x : {static_cast<size_t>(0), xsize - 1}) { | 171 | 1.73k | for (size_t c = 0; c < 3; c++) { | 172 | 1.73k | rows_out[c][x] = rows[c][x]; | 173 | 1.73k | } | 174 | 1.73k | } | 175 | | | 176 | 1.73k | size_t x = 1; | 177 | | // First pixels | 178 | 1.73k | const size_t N = Lanes(D()); | 179 | 1.73k | for (; x < std::min(N, xsize - 1); x++) { | 180 | 1.73k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, | 181 | 1.73k | x); | 182 | 1.73k | } | 183 | | // Full vectors. | 184 | 1.73k | for (; x + N <= xsize - 1; x += N) { | 185 | 1.73k | ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x); | 186 | 1.73k | } | 187 | | // Last pixels. | 188 | 1.73k | for (; x < xsize - 1; x++) { | 189 | 1.73k | ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out, | 190 | 1.73k | x); | 191 | 1.73k | } | 192 | 1.73k | return true; | 193 | 1.73k | }; | 194 | 1.73k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit, | 195 | 1.73k | process_row, "DCSmoothingRow")); | 196 | 868 | dc->Swap(smoothed); | 197 | 868 | return true; | 198 | 1.73k | } |
Unexecuted instantiation: jxl::N_SSE2::AdaptiveDCSmoothing(JxlMemoryManagerStruct*, float const*, jxl::Image3<float>*, jxl::ThreadPool*) |
199 | | |
200 | | // DC dequantization. |
201 | | void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, |
202 | | const float* dc_factors, float mul, const float* cfl_factors, |
203 | | const YCbCrChromaSubsampling& chroma_subsampling, |
204 | 3.31k | const BlockCtxMap& bctx) { |
205 | 3.31k | const HWY_FULL(float) df; |
206 | 3.31k | const Rebind<pixel_type, HWY_FULL(float)> di; // assumes pixel_type <= float |
207 | 3.31k | if (chroma_subsampling.Is444()) { |
208 | 3.28k | const auto fac_x = Set(df, dc_factors[0] * mul); |
209 | 3.28k | const auto fac_y = Set(df, dc_factors[1] * mul); |
210 | 3.28k | const auto fac_b = Set(df, dc_factors[2] * mul); |
211 | 3.28k | const auto cfl_fac_x = Set(df, cfl_factors[0]); |
212 | 3.28k | const auto cfl_fac_b = Set(df, cfl_factors[2]); |
213 | 40.8k | for (size_t y = 0; y < r.ysize(); y++) { |
214 | 37.5k | float* dec_row_x = r.PlaneRow(dc, 0, y); |
215 | 37.5k | float* dec_row_y = r.PlaneRow(dc, 1, y); |
216 | 37.5k | float* dec_row_b = r.PlaneRow(dc, 2, y); |
217 | 37.5k | const int32_t* quant_row_x = in.channel[1].plane.Row(y); |
218 | 37.5k | const int32_t* quant_row_y = in.channel[0].plane.Row(y); |
219 | 37.5k | const int32_t* quant_row_b = in.channel[2].plane.Row(y); |
220 | 196k | for (size_t x = 0; x < r.xsize(); x += Lanes(di)) { |
221 | 158k | const auto in_q_x = Load(di, quant_row_x + x); |
222 | 158k | const auto in_q_y = Load(di, quant_row_y + x); |
223 | 158k | const auto in_q_b = Load(di, quant_row_b + x); |
224 | 158k | const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x); |
225 | 158k | const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y); |
226 | 158k | const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b); |
227 | 158k | Store(in_y, df, dec_row_y + x); |
228 | 158k | Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x); |
229 | 158k | Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x); |
230 | 158k | } |
231 | 37.5k | } |
232 | 3.28k | } else { |
233 | 90 | for (size_t c : {1, 0, 2}) { |
234 | 90 | Rect rect(r.x0() >> chroma_subsampling.HShift(c), |
235 | 90 | r.y0() >> chroma_subsampling.VShift(c), |
236 | 90 | r.xsize() >> chroma_subsampling.HShift(c), |
237 | 90 | r.ysize() >> chroma_subsampling.VShift(c)); |
238 | 90 | const auto fac = Set(df, dc_factors[c] * mul); |
239 | 90 | const Channel& ch = in.channel[c < 2 ? c ^ 1 : c]; |
240 | 509 | for (size_t y = 0; y < rect.ysize(); y++) { |
241 | 419 | const int32_t* quant_row = ch.plane.Row(y); |
242 | 419 | float* row = rect.PlaneRow(dc, c, y); |
243 | 1.12k | for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) { |
244 | 704 | const auto in_q = Load(di, quant_row + x); |
245 | 704 | const auto out = Mul(ConvertTo(df, in_q), fac); |
246 | 704 | Store(out, df, row + x); |
247 | 704 | } |
248 | 419 | } |
249 | 90 | } |
250 | 30 | } |
251 | 3.31k | if (bctx.num_dc_ctxs <= 1) { |
252 | 34.9k | for (size_t y = 0; y < r.ysize(); y++) { |
253 | 33.0k | uint8_t* qdc_row = r.Row(quant_dc, y); |
254 | 33.0k | memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize()); |
255 | 33.0k | } |
256 | 1.92k | } else { |
257 | 6.09k | for (size_t y = 0; y < r.ysize(); y++) { |
258 | 4.69k | uint8_t* qdc_row_val = r.Row(quant_dc, y); |
259 | 4.69k | const int32_t* quant_row_x = |
260 | 4.69k | in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0)); |
261 | 4.69k | const int32_t* quant_row_y = |
262 | 4.69k | in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1)); |
263 | 4.69k | const int32_t* quant_row_b = |
264 | 4.69k | in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2)); |
265 | 181k | for (size_t x = 0; x < r.xsize(); x++) { |
266 | 176k | int bucket_x = 0; |
267 | 176k | int bucket_y = 0; |
268 | 176k | int bucket_b = 0; |
269 | 329k | for (int t : bctx.dc_thresholds[0]) { |
270 | 329k | if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++; |
271 | 329k | } |
272 | 176k | for (int t : bctx.dc_thresholds[1]) { |
273 | 67.9k | if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++; |
274 | 67.9k | } |
275 | 427k | for (int t : bctx.dc_thresholds[2]) { |
276 | 427k | if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++; |
277 | 427k | } |
278 | 176k | int bucket = bucket_x; |
279 | 176k | bucket *= bctx.dc_thresholds[2].size() + 1; |
280 | 176k | bucket += bucket_b; |
281 | 176k | bucket *= bctx.dc_thresholds[1].size() + 1; |
282 | 176k | bucket += bucket_y; |
283 | 176k | qdc_row_val[x] = bucket; |
284 | 176k | } |
285 | 4.69k | } |
286 | 1.39k | } |
287 | 3.31k | } Unexecuted instantiation: jxl::N_SSE4::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&) jxl::N_AVX2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&) Line | Count | Source | 204 | 3.31k | const BlockCtxMap& bctx) { | 205 | 3.31k | const HWY_FULL(float) df; | 206 | 3.31k | const Rebind<pixel_type, HWY_FULL(float)> di; // assumes pixel_type <= float | 207 | 3.31k | if (chroma_subsampling.Is444()) { | 208 | 3.28k | const auto fac_x = Set(df, dc_factors[0] * mul); | 209 | 3.28k | const auto fac_y = Set(df, dc_factors[1] * mul); | 210 | 3.28k | const auto fac_b = Set(df, dc_factors[2] * mul); | 211 | 3.28k | const auto cfl_fac_x = Set(df, cfl_factors[0]); | 212 | 3.28k | const auto cfl_fac_b = Set(df, cfl_factors[2]); | 213 | 40.8k | for (size_t y = 0; y < r.ysize(); y++) { | 214 | 37.5k | float* dec_row_x = r.PlaneRow(dc, 0, y); | 215 | 37.5k | float* dec_row_y = r.PlaneRow(dc, 1, y); | 216 | 37.5k | float* dec_row_b = r.PlaneRow(dc, 2, y); | 217 | 37.5k | const int32_t* quant_row_x = in.channel[1].plane.Row(y); | 218 | 37.5k | const int32_t* quant_row_y = in.channel[0].plane.Row(y); | 219 | 37.5k | const int32_t* quant_row_b = in.channel[2].plane.Row(y); | 220 | 196k | for (size_t x = 0; x < r.xsize(); x += Lanes(di)) { | 221 | 158k | const auto in_q_x = Load(di, quant_row_x + x); | 222 | 158k | const auto in_q_y = Load(di, quant_row_y + x); | 223 | 158k | const auto in_q_b = Load(di, quant_row_b + x); | 224 | 158k | const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x); | 225 | 158k | const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y); | 226 | 158k | const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b); | 227 | 158k | Store(in_y, df, 
dec_row_y + x); | 228 | 158k | Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x); | 229 | 158k | Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x); | 230 | 158k | } | 231 | 37.5k | } | 232 | 3.28k | } else { | 233 | 90 | for (size_t c : {1, 0, 2}) { | 234 | 90 | Rect rect(r.x0() >> chroma_subsampling.HShift(c), | 235 | 90 | r.y0() >> chroma_subsampling.VShift(c), | 236 | 90 | r.xsize() >> chroma_subsampling.HShift(c), | 237 | 90 | r.ysize() >> chroma_subsampling.VShift(c)); | 238 | 90 | const auto fac = Set(df, dc_factors[c] * mul); | 239 | 90 | const Channel& ch = in.channel[c < 2 ? c ^ 1 : c]; | 240 | 509 | for (size_t y = 0; y < rect.ysize(); y++) { | 241 | 419 | const int32_t* quant_row = ch.plane.Row(y); | 242 | 419 | float* row = rect.PlaneRow(dc, c, y); | 243 | 1.12k | for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) { | 244 | 704 | const auto in_q = Load(di, quant_row + x); | 245 | 704 | const auto out = Mul(ConvertTo(df, in_q), fac); | 246 | 704 | Store(out, df, row + x); | 247 | 704 | } | 248 | 419 | } | 249 | 90 | } | 250 | 30 | } | 251 | 3.31k | if (bctx.num_dc_ctxs <= 1) { | 252 | 34.9k | for (size_t y = 0; y < r.ysize(); y++) { | 253 | 33.0k | uint8_t* qdc_row = r.Row(quant_dc, y); | 254 | 33.0k | memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize()); | 255 | 33.0k | } | 256 | 1.92k | } else { | 257 | 6.09k | for (size_t y = 0; y < r.ysize(); y++) { | 258 | 4.69k | uint8_t* qdc_row_val = r.Row(quant_dc, y); | 259 | 4.69k | const int32_t* quant_row_x = | 260 | 4.69k | in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0)); | 261 | 4.69k | const int32_t* quant_row_y = | 262 | 4.69k | in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1)); | 263 | 4.69k | const int32_t* quant_row_b = | 264 | 4.69k | in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2)); | 265 | 181k | for (size_t x = 0; x < r.xsize(); x++) { | 266 | 176k | int bucket_x = 0; | 267 | 176k | int bucket_y = 0; | 268 | 176k | int bucket_b = 0; | 269 | 329k | for (int t 
: bctx.dc_thresholds[0]) { | 270 | 329k | if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++; | 271 | 329k | } | 272 | 176k | for (int t : bctx.dc_thresholds[1]) { | 273 | 67.9k | if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++; | 274 | 67.9k | } | 275 | 427k | for (int t : bctx.dc_thresholds[2]) { | 276 | 427k | if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++; | 277 | 427k | } | 278 | 176k | int bucket = bucket_x; | 279 | 176k | bucket *= bctx.dc_thresholds[2].size() + 1; | 280 | 176k | bucket += bucket_b; | 281 | 176k | bucket *= bctx.dc_thresholds[1].size() + 1; | 282 | 176k | bucket += bucket_y; | 283 | 176k | qdc_row_val[x] = bucket; | 284 | 176k | } | 285 | 4.69k | } | 286 | 1.39k | } | 287 | 3.31k | } |
Unexecuted instantiation: jxl::N_SSE2::DequantDC(jxl::RectT<unsigned long> const&, jxl::Image3<float>*, jxl::Plane<unsigned char>*, jxl::Image const&, float const*, float, float const*, jxl::YCbCrChromaSubsampling const&, jxl::BlockCtxMap const&) |
288 | | |
289 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
290 | | } // namespace HWY_NAMESPACE |
291 | | } // namespace jxl |
292 | | HWY_AFTER_NAMESPACE(); |
293 | | |
294 | | #if HWY_ONCE |
295 | | namespace jxl { |
296 | | |
297 | | HWY_EXPORT(DequantDC); |
298 | | HWY_EXPORT(AdaptiveDCSmoothing); |
299 | | Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager, |
300 | | const float* dc_factors, Image3F* dc, |
301 | 2.19k | ThreadPool* pool) { |
302 | 2.19k | return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(memory_manager, dc_factors, |
303 | 2.19k | dc, pool); |
304 | 2.19k | } |
305 | | |
306 | | void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in, |
307 | | const float* dc_factors, float mul, const float* cfl_factors, |
308 | | const YCbCrChromaSubsampling& chroma_subsampling, |
309 | 3.31k | const BlockCtxMap& bctx) { |
310 | 3.31k | HWY_DYNAMIC_DISPATCH(DequantDC) |
311 | 3.31k | (r, dc, quant_dc, in, dc_factors, mul, cfl_factors, chroma_subsampling, bctx); |
312 | 3.31k | } |
313 | | |
314 | | } // namespace jxl |
315 | | #endif // HWY_ONCE |