/src/libjxl/lib/jxl/enc_chroma_from_luma.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_chroma_from_luma.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cmath> |
12 | | #include <cstdlib> |
13 | | #include <hwy/base.h> // HWY_ALIGN_MAX |
14 | | #include <limits> |
15 | | |
16 | | #include "lib/jxl/ac_strategy.h" |
17 | | #include "lib/jxl/base/compiler_specific.h" |
18 | | #include "lib/jxl/base/span.h" |
19 | | #include "lib/jxl/chroma_from_luma.h" |
20 | | #include "lib/jxl/coeff_order_fwd.h" |
21 | | #include "lib/jxl/enc_bit_writer.h" |
22 | | #include "lib/jxl/fields.h" |
23 | | #include "lib/jxl/frame_dimensions.h" |
24 | | #include "lib/jxl/image.h" |
25 | | #include "lib/jxl/quant_weights.h" |
26 | | |
27 | | #undef HWY_TARGET_INCLUDE |
28 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_chroma_from_luma.cc" |
29 | | #include <hwy/foreach_target.h> |
30 | | #include <hwy/highway.h> |
31 | | |
32 | | #include "lib/jxl/base/common.h" |
33 | | #include "lib/jxl/base/rect.h" |
34 | | #include "lib/jxl/base/status.h" |
35 | | #include "lib/jxl/cms/opsin_params.h" |
36 | | #include "lib/jxl/dec_transforms-inl.h" |
37 | | #include "lib/jxl/enc_aux_out.h" |
38 | | #include "lib/jxl/enc_params.h" |
39 | | #include "lib/jxl/enc_transforms-inl.h" |
40 | | #include "lib/jxl/quantizer.h" |
41 | | #include "lib/jxl/simd_util.h" |
42 | | HWY_BEFORE_NAMESPACE(); |
43 | | namespace jxl { |
44 | | namespace HWY_NAMESPACE { |
45 | | |
46 | | // These templates are not found via ADL. |
47 | | using hwy::HWY_NAMESPACE::Abs; |
48 | | using hwy::HWY_NAMESPACE::Ge; |
49 | | using hwy::HWY_NAMESPACE::GetLane; |
50 | | using hwy::HWY_NAMESPACE::IfThenElse; |
51 | | using hwy::HWY_NAMESPACE::Lt; |
52 | | |
53 | | static HWY_FULL(float) df; |
54 | | |
55 | | struct CFLFunction { |
56 | | static constexpr float kCoeff = 1.f / 3; |
57 | | static constexpr float kThres = 100.0f; |
58 | | static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; |
59 | | CFLFunction(const float* values_m, const float* values_s, size_t num, |
60 | | float base, float distance_mul) |
61 | 28.8k | : values_m(values_m), |
62 | 28.8k | values_s(values_s), |
63 | 28.8k | num(num), |
64 | 28.8k | base(base), |
65 | 28.8k | distance_mul(distance_mul) { |
66 | 28.8k | JXL_DASSERT(num % Lanes(df) == 0); |
67 | 28.8k | } Unexecuted instantiation: jxl::N_SSE4::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float) jxl::N_AVX2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float) Line | Count | Source | 61 | 28.8k | : values_m(values_m), | 62 | 28.8k | values_s(values_s), | 63 | 28.8k | num(num), | 64 | 28.8k | base(base), | 65 | 28.8k | distance_mul(distance_mul) { | 66 | 28.8k | JXL_DASSERT(num % Lanes(df) == 0); | 67 | 28.8k | } |
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::CFLFunction(float const*, float const*, unsigned long, float, float) |
68 | | |
69 | | // Returns f'(x), where f is 1/3 * sum ((|color residual| + 1)^2-1) + |
70 | | // distance_mul * x^2 * num. |
71 | 313k | float Compute(float x, float eps, float* fpeps, float* fmeps) const { |
72 | 313k | float first_derivative = 2 * distance_mul * num * x; |
73 | 313k | float first_derivative_peps = 2 * distance_mul * num * (x + eps); |
74 | 313k | float first_derivative_meps = 2 * distance_mul * num * (x - eps); |
75 | | |
76 | 313k | const auto inv_color_factor = Set(df, kInvColorFactor); |
77 | 313k | const auto thres = Set(df, kThres); |
78 | 313k | const auto coeffx2 = Set(df, kCoeff * 2.0f); |
79 | 313k | const auto one = Set(df, 1.0f); |
80 | 313k | const auto zero = Set(df, 0.0f); |
81 | 313k | const auto base_v = Set(df, base); |
82 | 313k | const auto x_v = Set(df, x); |
83 | 313k | const auto xpe_v = Set(df, x + eps); |
84 | 313k | const auto xme_v = Set(df, x - eps); |
85 | 313k | auto fd_v = Zero(df); |
86 | 313k | auto fdpe_v = Zero(df); |
87 | 313k | auto fdme_v = Zero(df); |
88 | | |
89 | 139M | for (size_t i = 0; i < num; i += Lanes(df)) { |
90 | | // color residual = ax + b |
91 | 138M | const auto a = Mul(inv_color_factor, Load(df, values_m + i)); |
92 | 138M | const auto b = |
93 | 138M | Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); |
94 | 138M | const auto v = MulAdd(a, x_v, b); |
95 | 138M | const auto vpe = MulAdd(a, xpe_v, b); |
96 | 138M | const auto vme = MulAdd(a, xme_v, b); |
97 | 138M | const auto av = Abs(v); |
98 | 138M | const auto avpe = Abs(vpe); |
99 | 138M | const auto avme = Abs(vme); |
100 | 138M | const auto acoeffx2 = Mul(coeffx2, a); |
101 | 138M | auto d = Mul(acoeffx2, Add(av, one)); |
102 | 138M | auto dpe = Mul(acoeffx2, Add(avpe, one)); |
103 | 138M | auto dme = Mul(acoeffx2, Add(avme, one)); |
104 | 138M | d = IfThenElse(Lt(v, zero), Sub(zero, d), d); |
105 | 138M | dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe); |
106 | 138M | dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme); |
107 | 138M | const auto above = Ge(av, thres); |
108 | | // TODO(eustas): use IfThenElseZero |
109 | 138M | fd_v = Add(fd_v, IfThenElse(above, zero, d)); |
110 | 138M | fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe)); |
111 | 138M | fdme_v = Add(fdme_v, IfThenElse(above, zero, dme)); |
112 | 138M | } |
113 | | |
114 | 313k | *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v)); |
115 | 313k | *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v)); |
116 | 313k | return first_derivative + GetLane(SumOfLanes(df, fd_v)); |
117 | 313k | } Unexecuted instantiation: jxl::N_SSE4::CFLFunction::Compute(float, float, float*, float*) const jxl::N_AVX2::CFLFunction::Compute(float, float, float*, float*) const Line | Count | Source | 71 | 313k | float Compute(float x, float eps, float* fpeps, float* fmeps) const { | 72 | 313k | float first_derivative = 2 * distance_mul * num * x; | 73 | 313k | float first_derivative_peps = 2 * distance_mul * num * (x + eps); | 74 | 313k | float first_derivative_meps = 2 * distance_mul * num * (x - eps); | 75 | | | 76 | 313k | const auto inv_color_factor = Set(df, kInvColorFactor); | 77 | 313k | const auto thres = Set(df, kThres); | 78 | 313k | const auto coeffx2 = Set(df, kCoeff * 2.0f); | 79 | 313k | const auto one = Set(df, 1.0f); | 80 | 313k | const auto zero = Set(df, 0.0f); | 81 | 313k | const auto base_v = Set(df, base); | 82 | 313k | const auto x_v = Set(df, x); | 83 | 313k | const auto xpe_v = Set(df, x + eps); | 84 | 313k | const auto xme_v = Set(df, x - eps); | 85 | 313k | auto fd_v = Zero(df); | 86 | 313k | auto fdpe_v = Zero(df); | 87 | 313k | auto fdme_v = Zero(df); | 88 | | | 89 | 139M | for (size_t i = 0; i < num; i += Lanes(df)) { | 90 | | // color residual = ax + b | 91 | 138M | const auto a = Mul(inv_color_factor, Load(df, values_m + i)); | 92 | 138M | const auto b = | 93 | 138M | Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); | 94 | 138M | const auto v = MulAdd(a, x_v, b); | 95 | 138M | const auto vpe = MulAdd(a, xpe_v, b); | 96 | 138M | const auto vme = MulAdd(a, xme_v, b); | 97 | 138M | const auto av = Abs(v); | 98 | 138M | const auto avpe = Abs(vpe); | 99 | 138M | const auto avme = Abs(vme); | 100 | 138M | const auto acoeffx2 = Mul(coeffx2, a); | 101 | 138M | auto d = Mul(acoeffx2, Add(av, one)); | 102 | 138M | auto dpe = Mul(acoeffx2, Add(avpe, one)); | 103 | 138M | auto dme = Mul(acoeffx2, Add(avme, one)); | 104 | 138M | d = IfThenElse(Lt(v, zero), Sub(zero, d), d); | 105 | 138M | dpe = IfThenElse(Lt(vpe, zero), Sub(zero, dpe), dpe); | 106 | 138M | dme = IfThenElse(Lt(vme, zero), Sub(zero, dme), dme); | 107 | 138M | const auto above = Ge(av, thres); | 108 | | // TODO(eustas): use IfThenElseZero | 109 | 138M | fd_v = Add(fd_v, IfThenElse(above, zero, d)); | 110 | 138M | fdpe_v = Add(fdpe_v, IfThenElse(above, zero, dpe)); | 111 | 138M | fdme_v = Add(fdme_v, IfThenElse(above, zero, dme)); | 112 | 138M | } | 113 | | | 114 | 313k | *fpeps = first_derivative_peps + GetLane(SumOfLanes(df, fdpe_v)); | 115 | 313k | *fmeps = first_derivative_meps + GetLane(SumOfLanes(df, fdme_v)); | 116 | 313k | return first_derivative + GetLane(SumOfLanes(df, fd_v)); | 117 | 313k | } |
Unexecuted instantiation: jxl::N_SSE2::CFLFunction::Compute(float, float, float*, float*) const |
118 | | |
119 | | const float* JXL_RESTRICT values_m; |
120 | | const float* JXL_RESTRICT values_s; |
121 | | size_t num; |
122 | | float base; |
123 | | float distance_mul; |
124 | | }; |
125 | | |
126 | | // Chroma-from-luma search, values_m will have luma -- and values_s chroma. |
127 | | int32_t FindBestMultiplier(const float* values_m, const float* values_s, |
128 | | size_t num, float base, float distance_mul, |
129 | 28.8k | bool fast) { |
130 | 28.8k | if (num == 0) { |
131 | 0 | return 0; |
132 | 0 | } |
133 | 28.8k | float x; |
134 | 28.8k | if (fast) { |
135 | 0 | static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; |
136 | 0 | auto ca = Zero(df); |
137 | 0 | auto cb = Zero(df); |
138 | 0 | const auto inv_color_factor = Set(df, kInvColorFactor); |
139 | 0 | const auto base_v = Set(df, base); |
140 | 0 | for (size_t i = 0; i < num; i += Lanes(df)) { |
141 | | // color residual = ax + b |
142 | 0 | const auto a = Mul(inv_color_factor, Load(df, values_m + i)); |
143 | 0 | const auto b = |
144 | 0 | Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); |
145 | 0 | ca = MulAdd(a, a, ca); |
146 | 0 | cb = MulAdd(a, b, cb); |
147 | 0 | } |
148 | | // + distance_mul * x^2 * num |
149 | 0 | x = -GetLane(SumOfLanes(df, cb)) / |
150 | 0 | (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f); |
151 | 28.8k | } else { |
152 | 28.8k | constexpr float eps = 100; |
153 | 28.8k | constexpr float kClamp = 20.0f; |
154 | 28.8k | CFLFunction fn(values_m, values_s, num, base, distance_mul); |
155 | 28.8k | x = 0; |
156 | | // Up to 20 Newton iterations, with approximate derivatives. |
157 | | // Derivatives are approximate due to the high amount of noise in the exact |
158 | | // derivatives. |
159 | 324k | for (size_t i = 0; i < 20; i++) { |
160 | 313k | float dfpeps; |
161 | 313k | float dfmeps; |
162 | 313k | float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps); |
163 | 313k | float ddf = (dfpeps - dfmeps) / (2 * eps); |
164 | 313k | float kExperimentalInsignificantStabilizer = 0.85; |
165 | 313k | float step = d_f / (ddf + kExperimentalInsignificantStabilizer); |
166 | 313k | x -= std::min(kClamp, std::max(-kClamp, step)); |
167 | 313k | if (std::abs(step) < 3e-3) break; |
168 | 313k | } |
169 | 28.8k | } |
170 | | // CFL seems to be tricky for larger transforms for HF components |
171 | | // close to zero. This heuristic brings the solutions closer to zero |
172 | | // and reduces red-green oscillations. A better approach would |
173 | | // look into variance of the multiplier within separate (e.g. 8x8) |
174 | | // areas and only apply this heuristic where there is a high variance. |
175 | | // This would give about 1 % more compression density. |
176 | 28.8k | float towards_zero = 2.6; |
177 | 28.8k | if (x >= towards_zero) { |
178 | 4.26k | x -= towards_zero; |
179 | 24.6k | } else if (x <= -towards_zero) { |
180 | 8.15k | x += towards_zero; |
181 | 16.4k | } else { |
182 | 16.4k | x = 0; |
183 | 16.4k | } |
184 | 28.8k | return jxl::Clamp1(std::round(x), -128.0f, 127.0f); |
185 | 28.8k | } Unexecuted instantiation: jxl::N_SSE4::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool) jxl::N_AVX2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool) Line | Count | Source | 129 | 28.8k | bool fast) { | 130 | 28.8k | if (num == 0) { | 131 | 0 | return 0; | 132 | 0 | } | 133 | 28.8k | float x; | 134 | 28.8k | if (fast) { | 135 | 0 | static constexpr float kInvColorFactor = 1.0f / kDefaultColorFactor; | 136 | 0 | auto ca = Zero(df); | 137 | 0 | auto cb = Zero(df); | 138 | 0 | const auto inv_color_factor = Set(df, kInvColorFactor); | 139 | 0 | const auto base_v = Set(df, base); | 140 | 0 | for (size_t i = 0; i < num; i += Lanes(df)) { | 141 | | // color residual = ax + b | 142 | 0 | const auto a = Mul(inv_color_factor, Load(df, values_m + i)); | 143 | 0 | const auto b = | 144 | 0 | Sub(Mul(base_v, Load(df, values_m + i)), Load(df, values_s + i)); | 145 | 0 | ca = MulAdd(a, a, ca); | 146 | 0 | cb = MulAdd(a, b, cb); | 147 | 0 | } | 148 | | // + distance_mul * x^2 * num | 149 | 0 | x = -GetLane(SumOfLanes(df, cb)) / | 150 | 0 | (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f); | 151 | 28.8k | } else { | 152 | 28.8k | constexpr float eps = 100; | 153 | 28.8k | constexpr float kClamp = 20.0f; | 154 | 28.8k | CFLFunction fn(values_m, values_s, num, base, distance_mul); | 155 | 28.8k | x = 0; | 156 | | // Up to 20 Newton iterations, with approximate derivatives. | 157 | | // Derivatives are approximate due to the high amount of noise in the exact | 158 | | // derivatives. | 159 | 324k | for (size_t i = 0; i < 20; i++) { | 160 | 313k | float dfpeps; | 161 | 313k | float dfmeps; | 162 | 313k | float d_f = fn.Compute(x, eps, &dfpeps, &dfmeps); | 163 | 313k | float ddf = (dfpeps - dfmeps) / (2 * eps); | 164 | 313k | float kExperimentalInsignificantStabilizer = 0.85; | 165 | 313k | float step = d_f / (ddf + kExperimentalInsignificantStabilizer); | 166 | 313k | x -= std::min(kClamp, std::max(-kClamp, step)); | 167 | 313k | if (std::abs(step) < 3e-3) break; | 168 | 313k | } | 169 | 28.8k | } | 170 | | // CFL seems to be tricky for larger transforms for HF components | 171 | | // close to zero. This heuristic brings the solutions closer to zero | 172 | | // and reduces red-green oscillations. A better approach would | 173 | | // look into variance of the multiplier within separate (e.g. 8x8) | 174 | | // areas and only apply this heuristic where there is a high variance. | 175 | | // This would give about 1 % more compression density. | 176 | 28.8k | float towards_zero = 2.6; | 177 | 28.8k | if (x >= towards_zero) { | 178 | 4.26k | x -= towards_zero; | 179 | 24.6k | } else if (x <= -towards_zero) { | 180 | 8.15k | x += towards_zero; | 181 | 16.4k | } else { | 182 | 16.4k | x = 0; | 183 | 16.4k | } | 184 | 28.8k | return jxl::Clamp1(std::round(x), -128.0f, 127.0f); | 185 | 28.8k | } |
Unexecuted instantiation: jxl::N_SSE2::FindBestMultiplier(float const*, float const*, unsigned long, float, float, bool) |
186 | | |
187 | | Status InitDCStorage(JxlMemoryManager* memory_manager, size_t num_blocks, |
188 | 186 | ImageF* dc_values) { |
189 | | // First row: Y channel |
190 | | // Second row: X channel |
191 | | // Third row: Y channel |
192 | | // Fourth row: B channel |
193 | 186 | JXL_ASSIGN_OR_RETURN( |
194 | 186 | *dc_values, |
195 | 186 | ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4)); |
196 | | |
197 | 186 | JXL_ENSURE(dc_values->xsize() != 0); |
198 | | // Zero-fill the last lanes |
199 | 930 | for (size_t y = 0; y < 4; y++) { |
200 | 6.69k | for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize(); |
201 | 5.95k | x++) { |
202 | 5.95k | dc_values->Row(y)[x] = 0; |
203 | 5.95k | } |
204 | 744 | } |
205 | 186 | return true; |
206 | 186 | } Unexecuted instantiation: jxl::N_SSE4::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*) jxl::N_AVX2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*) Line | Count | Source | 188 | 186 | ImageF* dc_values) { | 189 | | // First row: Y channel | 190 | | // Second row: X channel | 191 | | // Third row: Y channel | 192 | | // Fourth row: B channel | 193 | 186 | JXL_ASSIGN_OR_RETURN( | 194 | 186 | *dc_values, | 195 | 186 | ImageF::Create(memory_manager, RoundUpTo(num_blocks, Lanes(df)), 4)); | 196 | | | 197 | 186 | JXL_ENSURE(dc_values->xsize() != 0); | 198 | | // Zero-fill the last lanes | 199 | 930 | for (size_t y = 0; y < 4; y++) { | 200 | 6.69k | for (size_t x = dc_values->xsize() - Lanes(df); x < dc_values->xsize(); | 201 | 5.95k | x++) { | 202 | 5.95k | dc_values->Row(y)[x] = 0; | 203 | 5.95k | } | 204 | 744 | } | 205 | 186 | return true; | 206 | 186 | } |
Unexecuted instantiation: jxl::N_SSE2::InitDCStorage(JxlMemoryManagerStruct*, unsigned long, jxl::Plane<float>*) |
207 | | |
208 | | Status ComputeTile(const Image3F& opsin, const Rect& opsin_rect, |
209 | | const DequantMatrices& dequant, |
210 | | const AcStrategyImage* ac_strategy, |
211 | | const ImageI* raw_quant_field, const Quantizer* quantizer, |
212 | | const Rect& rect, bool fast, bool use_dct8, ImageSB* map_x, |
213 | 14.4k | ImageSB* map_b, ImageF* dc_values, Span<float> mem) { |
214 | 14.4k | static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, |
215 | 14.4k | "Invalid color tile dim"); |
216 | 14.4k | size_t xsize_blocks = opsin_rect.xsize() / kBlockDim; |
217 | 14.4k | constexpr float kDistanceMultiplierAC = 1e-9f; |
218 | 14.4k | const size_t dct_scratch_size = |
219 | 14.4k | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; |
220 | | |
221 | 14.4k | const size_t y0 = rect.y0(); |
222 | 14.4k | const size_t x0 = rect.x0(); |
223 | 14.4k | const size_t x1 = rect.x0() + rect.xsize(); |
224 | 14.4k | const size_t y1 = rect.y0() + rect.ysize(); |
225 | | |
226 | 14.4k | int ty = y0 / kColorTileDimInBlocks; |
227 | 14.4k | int tx = x0 / kColorTileDimInBlocks; |
228 | | |
229 | 14.4k | int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); |
230 | 14.4k | int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); |
231 | | |
232 | 14.4k | float* JXL_RESTRICT dc_values_yx = dc_values->Row(0); |
233 | 14.4k | float* JXL_RESTRICT dc_values_x = dc_values->Row(1); |
234 | 14.4k | float* JXL_RESTRICT dc_values_yb = dc_values->Row(2); |
235 | 14.4k | float* JXL_RESTRICT dc_values_b = dc_values->Row(3); |
236 | | |
237 | | // All are aligned. |
238 | 14.4k | float* HWY_RESTRICT block_y = mem.begin(); |
239 | 14.4k | float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea; |
240 | 14.4k | float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea; |
241 | 14.4k | JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea)); |
242 | 14.4k | float* HWY_RESTRICT coeffs_yx = mem.begin(); |
243 | 14.4k | float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim; |
244 | 14.4k | float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim; |
245 | 14.4k | float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim; |
246 | 14.4k | JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim)); |
247 | 14.4k | constexpr size_t dc_size = |
248 | 14.4k | AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks; |
249 | 14.4k | float* HWY_RESTRICT dc_y = mem.begin(); |
250 | 14.4k | float* HWY_RESTRICT dc_x = dc_y + dc_size; |
251 | 14.4k | float* HWY_RESTRICT dc_b = dc_x + dc_size; |
252 | 14.4k | JXL_ENSURE(mem.remove_prefix(3 * dc_size)); |
253 | 14.4k | float* HWY_RESTRICT scratch_space = mem.begin(); |
254 | 14.4k | JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size); |
255 | | |
256 | 14.4k | size_t num_ac = 0; |
257 | | |
258 | 121k | for (size_t y = y0; y < y1; ++y) { |
259 | 106k | const float* JXL_RESTRICT row_y = |
260 | 106k | opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim); |
261 | 106k | const float* JXL_RESTRICT row_x = |
262 | 106k | opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim); |
263 | 106k | const float* JXL_RESTRICT row_b = |
264 | 106k | opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim); |
265 | 106k | size_t stride = opsin.PixelsPerRow(); |
266 | | |
267 | 903k | for (size_t x = x0; x < x1; x++) { |
268 | 796k | AcStrategy acs = use_dct8 |
269 | 796k | ? AcStrategy::FromRawStrategy(AcStrategyType::DCT) |
270 | 796k | : ac_strategy->ConstRow(y)[x]; |
271 | 796k | if (!acs.IsFirstBlock()) continue; |
272 | 639k | size_t xs = acs.covered_blocks_x(); |
273 | 639k | TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride, |
274 | 639k | block_y, scratch_space); |
275 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space); |
276 | 639k | TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride, |
277 | 639k | block_x, scratch_space); |
278 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space); |
279 | 639k | TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride, |
280 | 639k | block_b, scratch_space); |
281 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space); |
282 | 639k | const float* const JXL_RESTRICT qm_x = |
283 | 639k | dequant.InvMatrix(acs.Strategy(), 0); |
284 | 639k | const float* const JXL_RESTRICT qm_b = |
285 | 639k | dequant.InvMatrix(acs.Strategy(), 2); |
286 | 639k | float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0); |
287 | 639k | float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2); |
288 | | |
289 | | // Copy DCs in dc_values. |
290 | 1.31M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { |
291 | 1.47M | for (size_t ix = 0; ix < xs; ix++) { |
292 | 796k | dc_values_yx[(iy + y) * xsize_blocks + ix + x] = |
293 | 796k | dc_y[iy * xs + ix] * q_dc_x; |
294 | 796k | dc_values_x[(iy + y) * xsize_blocks + ix + x] = |
295 | 796k | dc_x[iy * xs + ix] * q_dc_x; |
296 | 796k | dc_values_yb[(iy + y) * xsize_blocks + ix + x] = |
297 | 796k | dc_y[iy * xs + ix] * q_dc_b; |
298 | 796k | dc_values_b[(iy + y) * xsize_blocks + ix + x] = |
299 | 796k | dc_b[iy * xs + ix] * q_dc_b; |
300 | 796k | } |
301 | 678k | } |
302 | | |
303 | | // Do not use this block for computing AC CfL. |
304 | 639k | if (acs.covered_blocks_x() + x0 > x1 || |
305 | 639k | acs.covered_blocks_y() + y0 > y1) { |
306 | 0 | continue; |
307 | 0 | } |
308 | | |
309 | | // Copy AC coefficients in the local block. The order in which |
310 | | // coefficients get stored does not matter. |
311 | 639k | size_t cx = acs.covered_blocks_x(); |
312 | 639k | size_t cy = acs.covered_blocks_y(); |
313 | 639k | CoefficientLayout(&cy, &cx); |
314 | | // Zero out LFs. This introduces terms in the optimization loop that |
315 | | // don't affect the result, as they are all 0, but allow for simpler |
316 | | // SIMDfication. |
317 | 1.30M | for (size_t iy = 0; iy < cy; iy++) { |
318 | 1.46M | for (size_t ix = 0; ix < cx; ix++) { |
319 | 796k | block_y[cx * kBlockDim * iy + ix] = 0; |
320 | 796k | block_x[cx * kBlockDim * iy + ix] = 0; |
321 | 796k | block_b[cx * kBlockDim * iy + ix] = 0; |
322 | 796k | } |
323 | 665k | } |
324 | | // Unclear why this is like it is. (This works slightly better |
325 | | // than the previous approach which was also a hack.) |
326 | 639k | const float qq = |
327 | 639k | (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x]; |
328 | | // Experimentally values 128-130 seem best -- I don't know why we |
329 | | // need this multiplier. |
330 | 639k | const float kStrangeMultiplier = 128; |
331 | 639k | float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq; |
332 | 639k | const auto qv = Set(df, q); |
333 | 7.00M | for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) { |
334 | 6.36M | const auto b_y = Load(df, block_y + i); |
335 | 6.36M | const auto b_x = Load(df, block_x + i); |
336 | 6.36M | const auto b_b = Load(df, block_b + i); |
337 | 6.36M | const auto qqm_x = Mul(qv, Load(df, qm_x + i)); |
338 | 6.36M | const auto qqm_b = Mul(qv, Load(df, qm_b + i)); |
339 | 6.36M | Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac); |
340 | 6.36M | Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac); |
341 | 6.36M | Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac); |
342 | 6.36M | Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac); |
343 | 6.36M | num_ac += Lanes(df); |
344 | 6.36M | } |
345 | 639k | } |
346 | 106k | } |
347 | 14.4k | JXL_ENSURE(num_ac % Lanes(df) == 0); |
348 | 14.4k | row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f, |
349 | 14.4k | kDistanceMultiplierAC, fast); |
350 | 14.4k | row_out_b[tx] = |
351 | 14.4k | FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio, |
352 | 14.4k | kDistanceMultiplierAC, fast); |
353 | 14.4k | return true; |
354 | 14.4k | } Unexecuted instantiation: jxl::N_SSE4::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>) jxl::N_AVX2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>) Line | Count | Source | 213 | 14.4k | ImageSB* map_b, ImageF* dc_values, Span<float> mem) { | 214 | 14.4k | static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks, | 215 | 14.4k | "Invalid color tile dim"); | 216 | 14.4k | size_t xsize_blocks = opsin_rect.xsize() / kBlockDim; | 217 | 14.4k | constexpr float kDistanceMultiplierAC = 1e-9f; | 218 | 14.4k | const size_t dct_scratch_size = | 219 | 14.4k | 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim; | 220 | | | 221 | 14.4k | const size_t y0 = rect.y0(); | 222 | 14.4k | const size_t x0 = rect.x0(); | 223 | 14.4k | const size_t x1 = rect.x0() + rect.xsize(); | 224 | 14.4k | const size_t y1 = rect.y0() + rect.ysize(); | 225 | | | 226 | 14.4k | int ty = y0 / kColorTileDimInBlocks; | 227 | 14.4k | int tx = x0 / kColorTileDimInBlocks; | 228 | | | 229 | 14.4k | int8_t* JXL_RESTRICT row_out_x = map_x->Row(ty); | 230 | 14.4k | int8_t* JXL_RESTRICT row_out_b = map_b->Row(ty); | 231 | | | 232 | 14.4k | float* JXL_RESTRICT dc_values_yx = dc_values->Row(0); | 233 | 14.4k | float* JXL_RESTRICT dc_values_x = dc_values->Row(1); | 234 | 14.4k | float* JXL_RESTRICT dc_values_yb = dc_values->Row(2); | 235 | 14.4k | float* JXL_RESTRICT dc_values_b = dc_values->Row(3); | 236 | | | 237 | | // All are aligned. | 238 | 14.4k | float* HWY_RESTRICT block_y = mem.begin(); | 239 | 14.4k | float* HWY_RESTRICT block_x = block_y + AcStrategy::kMaxCoeffArea; | 240 | 14.4k | float* HWY_RESTRICT block_b = block_x + AcStrategy::kMaxCoeffArea; | 241 | 14.4k | JXL_ENSURE(mem.remove_prefix(3 * AcStrategy::kMaxCoeffArea)); | 242 | 14.4k | float* HWY_RESTRICT coeffs_yx = mem.begin(); | 243 | 14.4k | float* HWY_RESTRICT coeffs_x = coeffs_yx + kColorTileDim * kColorTileDim; | 244 | 14.4k | float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim; | 245 | 14.4k | float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim; | 246 | 14.4k | JXL_ENSURE(mem.remove_prefix(4 * kColorTileDim * kColorTileDim)); | 247 | 14.4k | constexpr size_t dc_size = | 248 | 14.4k | AcStrategy::kMaxCoeffBlocks * AcStrategy::kMaxCoeffBlocks; | 249 | 14.4k | float* HWY_RESTRICT dc_y = mem.begin(); | 250 | 14.4k | float* HWY_RESTRICT dc_x = dc_y + dc_size; | 251 | 14.4k | float* HWY_RESTRICT dc_b = dc_x + dc_size; | 252 | 14.4k | JXL_ENSURE(mem.remove_prefix(3 * dc_size)); | 253 | 14.4k | float* HWY_RESTRICT scratch_space = mem.begin(); | 254 | 14.4k | JXL_ENSURE(mem.size() == 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size); | 255 | | | 256 | 14.4k | size_t num_ac = 0; | 257 | | | 258 | 121k | for (size_t y = y0; y < y1; ++y) { | 259 | 106k | const float* JXL_RESTRICT row_y = | 260 | 106k | opsin_rect.ConstPlaneRow(opsin, 1, y * kBlockDim); | 261 | 106k | const float* JXL_RESTRICT row_x = | 262 | 106k | opsin_rect.ConstPlaneRow(opsin, 0, y * kBlockDim); | 263 | 106k | const float* JXL_RESTRICT row_b = | 264 | 106k | opsin_rect.ConstPlaneRow(opsin, 2, y * kBlockDim); | 265 | 106k | size_t stride = opsin.PixelsPerRow(); | 266 | | | 267 | 903k | for (size_t x = x0; x < x1; x++) { | 268 | 796k | AcStrategy acs = use_dct8 | 269 | 796k | ? AcStrategy::FromRawStrategy(AcStrategyType::DCT) | 270 | 796k | : ac_strategy->ConstRow(y)[x]; | 271 | 796k | if (!acs.IsFirstBlock()) continue; | 272 | 639k | size_t xs = acs.covered_blocks_x(); | 273 | 639k | TransformFromPixels(acs.Strategy(), row_y + x * kBlockDim, stride, | 274 | 639k | block_y, scratch_space); | 275 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_y, dc_y, xs, scratch_space); | 276 | 639k | TransformFromPixels(acs.Strategy(), row_x + x * kBlockDim, stride, | 277 | 639k | block_x, scratch_space); | 278 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_x, dc_x, xs, scratch_space); | 279 | 639k | TransformFromPixels(acs.Strategy(), row_b + x * kBlockDim, stride, | 280 | 639k | block_b, scratch_space); | 281 | 639k | DCFromLowestFrequencies(acs.Strategy(), block_b, dc_b, xs, scratch_space); | 282 | 639k | const float* const JXL_RESTRICT qm_x = | 283 | 639k | dequant.InvMatrix(acs.Strategy(), 0); | 284 | 639k | const float* const JXL_RESTRICT qm_b = | 285 | 639k | dequant.InvMatrix(acs.Strategy(), 2); | 286 | 639k | float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0); | 287 | 639k | float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2); | 288 | | | 289 | | // Copy DCs in dc_values. | 290 | 1.31M | for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) { | 291 | 1.47M | for (size_t ix = 0; ix < xs; ix++) { | 292 | 796k | dc_values_yx[(iy + y) * xsize_blocks + ix + x] = | 293 | 796k | dc_y[iy * xs + ix] * q_dc_x; | 294 | 796k | dc_values_x[(iy + y) * xsize_blocks + ix + x] = | 295 | 796k | dc_x[iy * xs + ix] * q_dc_x; | 296 | 796k | dc_values_yb[(iy + y) * xsize_blocks + ix + x] = | 297 | 796k | dc_y[iy * xs + ix] * q_dc_b; | 298 | 796k | dc_values_b[(iy + y) * xsize_blocks + ix + x] = | 299 | 796k | dc_b[iy * xs + ix] * q_dc_b; | 300 | 796k | } | 301 | 678k | } | 302 | | | 303 | | // Do not use this block for computing AC CfL. | 304 | 639k | if (acs.covered_blocks_x() + x0 > x1 || | 305 | 639k | acs.covered_blocks_y() + y0 > y1) { | 306 | 0 | continue; | 307 | 0 | } | 308 | | | 309 | | // Copy AC coefficients in the local block. The order in which | 310 | | // coefficients get stored does not matter. | 311 | 639k | size_t cx = acs.covered_blocks_x(); | 312 | 639k | size_t cy = acs.covered_blocks_y(); | 313 | 639k | CoefficientLayout(&cy, &cx); | 314 | | // Zero out LFs. This introduces terms in the optimization loop that | 315 | | // don't affect the result, as they are all 0, but allow for simpler | 316 | | // SIMDfication. | 317 | 1.30M | for (size_t iy = 0; iy < cy; iy++) { | 318 | 1.46M | for (size_t ix = 0; ix < cx; ix++) { | 319 | 796k | block_y[cx * kBlockDim * iy + ix] = 0; | 320 | 796k | block_x[cx * kBlockDim * iy + ix] = 0; | 321 | 796k | block_b[cx * kBlockDim * iy + ix] = 0; | 322 | 796k | } | 323 | 665k | } | 324 | | // Unclear why this is like it is. (This works slightly better | 325 | | // than the previous approach which was also a hack.) | 326 | 639k | const float qq = | 327 | 639k | (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x]; | 328 | | // Experimentally values 128-130 seem best -- I don't know why we | 329 | | // need this multiplier. | 330 | 639k | const float kStrangeMultiplier = 128; | 331 | 639k | float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq; | 332 | 639k | const auto qv = Set(df, q); | 333 | 7.00M | for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) { | 334 | 6.36M | const auto b_y = Load(df, block_y + i); | 335 | 6.36M | const auto b_x = Load(df, block_x + i); | 336 | 6.36M | const auto b_b = Load(df, block_b + i); | 337 | 6.36M | const auto qqm_x = Mul(qv, Load(df, qm_x + i)); | 338 | 6.36M | const auto qqm_b = Mul(qv, Load(df, qm_b + i)); | 339 | 6.36M | Store(Mul(b_y, qqm_x), df, coeffs_yx + num_ac); | 340 | 6.36M | Store(Mul(b_x, qqm_x), df, coeffs_x + num_ac); | 341 | 6.36M | Store(Mul(b_y, qqm_b), df, coeffs_yb + num_ac); | 342 | 6.36M | Store(Mul(b_b, qqm_b), df, coeffs_b + num_ac); | 343 | 6.36M | num_ac += Lanes(df); | 344 | 6.36M | } | 345 | 639k | } | 346 | 106k | } | 347 | 14.4k | JXL_ENSURE(num_ac % Lanes(df) == 0); | 348 | 14.4k | row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f, | 349 | 14.4k | kDistanceMultiplierAC, fast); | 350 | 14.4k | row_out_b[tx] = | 351 | 14.4k | FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio, | 352 | 14.4k | kDistanceMultiplierAC, fast); | 353 | 14.4k | return true; | 354 | 14.4k | } |
Unexecuted instantiation: jxl::N_SSE2::ComputeTile(jxl::Image3<float> const&, jxl::RectT<unsigned long> const&, jxl::DequantMatrices const&, jxl::AcStrategyImage const*, jxl::Plane<int> const*, jxl::Quantizer const*, jxl::RectT<unsigned long> const&, bool, bool, jxl::Plane<signed char>*, jxl::Plane<signed char>*, jxl::Plane<float>*, jxl::Span<float>) |
355 | | |
356 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
357 | | } // namespace HWY_NAMESPACE |
358 | | } // namespace jxl |
359 | | HWY_AFTER_NAMESPACE(); |
360 | | |
361 | | #if HWY_ONCE |
362 | | namespace jxl { |
363 | | |
364 | | HWY_EXPORT(InitDCStorage); |
365 | | HWY_EXPORT(ComputeTile); |
366 | | |
367 | 186 | Status CfLHeuristics::Init(const Rect& rect) { |
368 | 186 | size_t xsize_blocks = rect.xsize() / kBlockDim; |
369 | 186 | size_t ysize_blocks = rect.ysize() / kBlockDim; |
370 | 186 | return HWY_DYNAMIC_DISPATCH(InitDCStorage)( |
371 | 186 | memory_manager, xsize_blocks * ysize_blocks, &dc_values); |
372 | 186 | } |
373 | | |
374 | | Status CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin, |
375 | | const Rect& opsin_rect, |
376 | | const DequantMatrices& dequant, |
377 | | const AcStrategyImage* ac_strategy, |
378 | | const ImageI* raw_quant_field, |
379 | | const Quantizer* quantizer, bool fast, |
380 | 14.4k | size_t thread, ColorCorrelationMap* cmap) { |
381 | 14.4k | bool use_dct8 = ac_strategy == nullptr; |
382 | 14.4k | Span<float> scratch(mem.address<float>() + thread * ItemsPerThread(), |
383 | 14.4k | ItemsPerThread()); |
384 | 14.4k | return HWY_DYNAMIC_DISPATCH(ComputeTile)( |
385 | 14.4k | opsin, opsin_rect, dequant, ac_strategy, raw_quant_field, quantizer, r, |
386 | 14.4k | fast, use_dct8, &cmap->ytox_map, &cmap->ytob_map, &dc_values, scratch); |
387 | 14.4k | } |
388 | | |
389 | | Status ColorCorrelationEncodeDC(const ColorCorrelation& color_correlation, |
390 | | BitWriter* writer, LayerType layer, |
391 | 186 | AuxOut* aux_out) { |
392 | 186 | float color_factor = color_correlation.GetColorFactor(); |
393 | 186 | float base_correlation_x = color_correlation.GetBaseCorrelationX(); |
394 | 186 | float base_correlation_b = color_correlation.GetBaseCorrelationB(); |
395 | 186 | int32_t ytox_dc = color_correlation.GetYToXDC(); |
396 | 186 | int32_t ytob_dc = color_correlation.GetYToBDC(); |
397 | | |
398 | 186 | return writer->WithMaxBits( |
399 | 186 | 1 + 2 * kBitsPerByte + 12 + 32, layer, aux_out, [&]() -> Status { |
400 | 186 | if (ytox_dc == 0 && ytob_dc == 0 && |
401 | 186 | color_factor == kDefaultColorFactor && base_correlation_x == 0.0f && |
402 | 186 | base_correlation_b == jxl::cms::kYToBRatio) { |
403 | 186 | writer->Write(1, 1); |
404 | 186 | return true; |
405 | 186 | } |
406 | 0 | writer->Write(1, 0); |
407 | 0 | JXL_RETURN_IF_ERROR( |
408 | 0 | U32Coder::Write(kColorFactorDist, color_factor, writer)); |
409 | 0 | JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_x, writer)); |
410 | 0 | JXL_RETURN_IF_ERROR(F16Coder::Write(base_correlation_b, writer)); |
411 | 0 | writer->Write(kBitsPerByte, |
412 | 0 | ytox_dc - std::numeric_limits<int8_t>::min()); |
413 | 0 | writer->Write(kBitsPerByte, |
414 | 0 | ytob_dc - std::numeric_limits<int8_t>::min()); |
415 | 0 | return true; |
416 | 0 | }); |
417 | 186 | } |
418 | | |
419 | | } // namespace jxl |
420 | | #endif // HWY_ONCE |