/src/libjxl/lib/jxl/enc_modular_simd.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_modular_simd.h" |
7 | | |
8 | | #include <cstdint> |
9 | | |
10 | | #include "lib/jxl/base/common.h" |
11 | | #include "lib/jxl/base/status.h" |
12 | | #include "lib/jxl/dec_ans.h" |
13 | | #include "lib/jxl/enc_ans_params.h" |
14 | | #include "lib/jxl/memory_manager_internal.h" |
15 | | #include "lib/jxl/modular/modular_image.h" |
16 | | |
17 | | #undef HWY_TARGET_INCLUDE |
18 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_modular_simd.cc" |
19 | | #include <hwy/foreach_target.h> |
20 | | #include <hwy/highway.h> |
21 | | |
22 | | #if HWY_TARGET == HWY_SCALAR |
23 | | #include "lib/jxl/modular/encoding/context_predict.h" |
24 | | #include "lib/jxl/pack_signed.h" |
25 | | #endif |
26 | | |
27 | | HWY_BEFORE_NAMESPACE(); |
28 | | namespace jxl { |
29 | | namespace HWY_NAMESPACE { |
30 | | |
31 | | // These templates are not found via ADL. |
32 | | using hwy::HWY_NAMESPACE::Add; |
33 | | using hwy::HWY_NAMESPACE::And; |
34 | | using hwy::HWY_NAMESPACE::Ge; |
35 | | using hwy::HWY_NAMESPACE::GetLane; |
36 | | using hwy::HWY_NAMESPACE::Gt; |
37 | | using hwy::HWY_NAMESPACE::IfThenElse; |
38 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
39 | | using hwy::HWY_NAMESPACE::Iota; |
40 | | using hwy::HWY_NAMESPACE::Load; |
41 | | using hwy::HWY_NAMESPACE::LoadU; |
42 | | using hwy::HWY_NAMESPACE::Lt; |
43 | | using hwy::HWY_NAMESPACE::Max; |
44 | | using hwy::HWY_NAMESPACE::Min; |
45 | | using hwy::HWY_NAMESPACE::Mul; |
46 | | using hwy::HWY_NAMESPACE::Not; |
47 | | using hwy::HWY_NAMESPACE::Set; |
48 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
49 | | using hwy::HWY_NAMESPACE::ShiftRight; |
50 | | using hwy::HWY_NAMESPACE::Store; |
51 | | using hwy::HWY_NAMESPACE::StoreU; |
52 | | using hwy::HWY_NAMESPACE::Sub; |
53 | | using hwy::HWY_NAMESPACE::Xor; |
54 | | using hwy::HWY_NAMESPACE::Zero; |
55 | | |
// Estimates the cost of coding `img` with a clamped-gradient predictor.
// For every channel, each pixel's prediction residual is sign-packed and
// tokenized; tokens are counted in per-context histograms (the context is
// derived from the spread of the left / top / top-left neighbours), and the
// number of extra (non-token) bits is summed. The result is
//   extra_bits + sum over channels and contexts of ShannonEntropy(histogram),
// returned as a float inside StatusOr. The only failure path visible here is
// the AlignedMemory::Create allocation in the vector branch.
// This function is compiled once per Highway target (see foreach_target.h
// above): HWY_SCALAR takes the per-pixel reference loop, every other target
// takes the vectorized loop.
56 | 0 | StatusOr<float> EstimateCost(const Image& img) {
// Entropy is accumulated as an integer part plus a separate fractional part
// (presumably to limit float rounding on large sums).
57 | 0 | size_t histo_cost = 0;
58 | 0 | float histo_cost_frac = 0.0f;
59 | 0 | size_t extra_bits = 0;
60 | |
|
61 | | #if HWY_TARGET == HWY_SCALAR
62 | | HybridUintConfig config;
// Thresholds on max_diff; the context is the number of cutoffs that are
// strictly greater than max_diff (hence one more histogram than cutoffs).
63 | | uint32_t cutoffs[] = {0, 1, 3, 5, 7, 11, 15, 23, 31,
64 | | 47, 63, 95, 127, 191, 255, 392, 500};
65 | | constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
66 | | Histogram histo[nc] = {};
67 | | for (const Channel& ch : img.channel) {
// Row stride in pixels, used to address the row above via r - onerow.
68 | | const intptr_t onerow = ch.plane.PixelsPerRow();
69 | | for (size_t y = 0; y < ch.h; y++) {
70 | | const pixel_type* JXL_RESTRICT r = ch.Row(y);
71 | | for (size_t x = 0; x < ch.w; x++) {
// Border handling: in the first column `left` falls back to the pixel above
// (or 0 for the very first pixel); on the first row `top` falls back to
// `left`; on the first row or column `topleft` falls back to `left`.
72 | | pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
73 | | pixel_type_w top = (y ? *(r + x - onerow) : left);
74 | | pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
75 | | size_t max_diff =
76 | | std::max({left, top, topleft}) - std::min({left, top, topleft});
77 | | size_t ctx = 0;
78 | | for (uint32_t c : cutoffs) {
79 | | ctx += (max_diff < c) ? 1 : 0;
80 | | }
81 | | pixel_type res = r[x] - ClampedGradient(top, left, topleft);
82 | | uint32_t token;
83 | | uint32_t nbits;
84 | | uint32_t bits;
// Only the token and the extra-bit count are used; `bits` is discarded.
85 | | config.Encode(PackSigned(res), &token, &nbits, &bits);
86 | | histo[ctx].Add(token);
87 | | extra_bits += nbits;
88 | | }
89 | | }
// Histograms are per channel: add their entropy, then reset them.
90 | | for (auto& h : histo) {
91 | | float f_cost = h.ShannonEntropy();
92 | | size_t i_cost = f_cost;
93 | | histo_cost += i_cost;
94 | | histo_cost_frac += f_cost - i_cost;
95 | | h.Clear();
96 | | }
97 | | }
98 | | #else
99 | 0 | JxlMemoryManager* memory_manager = img.memory_manager();
// Lookup table from capped max_diff to histogram index (used for rows >= 1).
100 | 0 | const auto& ctx_map = estimate_cost_detail::ContextMap();
101 | 0 | const HWY_FULL(int32_t) di;
102 | 0 | const HWY_FULL(uint32_t) du;
103 | 0 | const HWY_FULL(float) df;
// Constants for the vectorized tokenization below: packed values below kSplit
// are used as the token directly; larger ones get
//   token = kTokenBias + kTokenMul * eb + (two bits selected by kMsbMask),
// with eb extra bits.
// NOTE(review): presumably this mirrors what a default HybridUintConfig's
// Encode() produces in the scalar branch; confirm if that default changes.
104 | 0 | const auto kOne = Set(du, 1);
105 | 0 | const auto kSplit = Set(du, 16);
106 | 0 | const auto kExpOffset2 = Set(du, 129); // 127 + 2
107 | 0 | const auto kTokenBias = Set(du, 8);
108 | 0 | const auto kTokenMul = Set(du, 4);
109 | 0 | const auto kMsbMask = Set(du, 3);
// max_diff is clamped to this value so it is a valid index into ctx_map.
110 | 0 | const auto kMaxDiffCap = Set(du, estimate_cost_detail::kLastThreshold - 1);
111 | 0 | const auto kLanes = Set(du, Lanes(du));
112 | 0 | const auto kIota = Iota(du, 0);
113 | |
|
// Scratch rows are sized for the widest channel that has at least one row,
// rounded up to a whole number of vectors and never less than two vectors
// (the primer stores below write Lanes + 1 elements).
114 | 0 | size_t max_w = 0;
115 | 0 | for (const Channel& ch : img.channel) {
116 | 0 | if (ch.h == 0) continue;
117 | 0 | max_w = std::max(max_w, ch.w);
118 | 0 | }
119 | 0 | max_w = RoundUpTo(max_w, Lanes(du));
120 | 0 | max_w = std::max(max_w, 2 * Lanes(du));
121 | |
|
// One allocation holding two rows of max_w 32-bit values.
// NOTE: `primer` has the same address as `max_diff_row` and `top_primer` the
// same address as `token_row` (int32 views of the same memory). This is only
// correct because each primer is loaded in the first loop iteration before
// the Store to the aliased row at x == 0 in that same iteration; keep that
// statement order intact.
122 | 0 | JXL_ASSIGN_OR_RETURN(
123 | 0 | AlignedMemory buffer,
124 | 0 | AlignedMemory::Create(memory_manager, max_w * 2 * sizeof(uint32_t)));
125 | 0 | uint32_t* max_diff_row = buffer.address<uint32_t>();
126 | 0 | uint32_t* token_row = max_diff_row + max_w;
127 | 0 | int32_t* primer = buffer.address<int32_t>();
128 | 0 | int32_t* top_primer = primer + max_w;
129 | |
|
// NOTE(review): `config` does not appear to be referenced anywhere in this
// (non-scalar) branch.
130 | 0 | HybridUintConfig config;
131 | |
|
132 | 0 | Histogram histo[estimate_cost_detail::kLastCtx + 1] = {};
// Per-lane running sum of extra bits; reduced with SumOfLanes at the end.
133 | 0 | auto extra_bits_lanes = Zero(du);
134 | 0 | for (const Channel& ch : img.channel) {
135 | 0 | if (ch.h == 0 || ch.w == 0) continue;
136 | 0 | for (auto& h : histo) {
137 | 0 | h.EnsureCapacity(32 * 4);
138 | 0 | }
// ---- Row 0: the prediction is simply the left neighbour. ----
// primer = {0, r[0], r[1], ...}, so the first LoadU yields the left
// neighbours of the first vector, with 0 to the left of pixel 0.
139 | 0 | const pixel_type* JXL_RESTRICT r = ch.Row(0);
140 | 0 | const pixel_type* JXL_RESTRICT last = primer;
141 | 0 | primer[0] = 0;
142 | 0 | StoreU(Load(di, r), di, primer + 1);
// `pos` holds the x coordinate of every lane; lanes with pos >= ch.w are
// padding and are masked out of the extra-bit sum.
143 | 0 | auto pos = kIota;
144 | 0 | const auto last_pos = Set(du, ch.w);
145 | 0 | for (size_t x = 0; x < ch.w; x += Lanes(di)) {
146 | 0 | const auto left = LoadU(di, last);
147 | 0 | const auto central = Load(di, r + x);
148 | 0 | const auto ures = BitCast(du, Sub(central, left));
// Sign packing: a non-negative residual v becomes 2 * v, a negative one
// becomes ~(2 * v), so small magnitudes map to small unsigned values.
149 | 0 | const auto packed =
150 | 0 | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
151 | 0 | const auto not_literal = Ge(packed, kSplit);
// Convert to float and reinterpret the bits: >> 23 gives the biased exponent
// (i.e. floor(log2(packed)) + 127), >> 21 exposes the top two mantissa bits.
// NOTE(review): values needing more than 24 significant bits are rounded by
// the conversion, so the exponent may come out one too high for them;
// presumably acceptable for an estimate.
152 | 0 | const auto v = BitCast(du, ConvertTo(df, packed));
153 | 0 | const auto eb = Sub(ShiftRight<23>(v), kExpOffset2);
154 | 0 | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
155 | 0 | And(ShiftRight<21>(v), kMsbMask));
156 | 0 | const auto tail_mask = Lt(pos, last_pos);
// Literals (packed < kSplit) carry no extra bits and are their own token.
157 | 0 | const auto eb_fixed = IfThenElseZero(not_literal, eb);
158 | 0 | const auto token_fixed = IfThenElse(not_literal, token, packed);
159 | 0 | extra_bits_lanes =
160 | 0 | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
// Tokens of padding lanes are stored too, but never read back (the histogram
// loop below stops at ch.w).
161 | 0 | Store(token_fixed, du, token_row + x);
162 | 0 | pos = Add(pos, kLanes);
// From the second vector on, left neighbours are read straight from the row,
// starting one pixel before the next vector.
163 | 0 | last = r + x + Lanes(di) - 1;
164 | 0 | }
// Every pixel of row 0 is counted in context 0.
165 | 0 | for (size_t x = 0; x < ch.w; x++) {
166 | 0 | histo[0].FastAdd(token_row[x]);
167 | 0 | }
// ---- Rows 1..h-1: clamped-gradient prediction from left/top/top-left. ----
168 | 0 | for (size_t y = 1; y < ch.h; y++) {
169 | 0 | r = ch.Row(y);
170 | 0 | const pixel_type* JXL_RESTRICT t = ch.Row(y - 1);
// For pixel 0 both the left and the top-left neighbour are replaced by the
// pixel directly above (t[0]); the remaining primer entries are the first
// vector of the current row and of the row above, shifted by one.
171 | 0 | last = primer;
172 | 0 | primer[0] = t[0];
173 | 0 | StoreU(Load(di, r), di, primer + 1);
174 | 0 | top_primer[0] = t[0];
175 | 0 | StoreU(Load(di, t), di, top_primer + 1);
176 | 0 | const pixel_type* JXL_RESTRICT top_last = top_primer;
177 | 0 | pos = kIota;
178 | 0 | for (size_t x = 0; x < ch.w; x += Lanes(di)) {
// Both primer loads happen here, before the stores to max_diff_row and
// token_row further down (which overwrite the primers when x == 0).
179 | 0 | const auto left = LoadU(di, last);
180 | 0 | const auto central = Load(di, r + x);
181 | 0 | const auto topleft = LoadU(di, top_last);
182 | 0 | const auto top = Load(di, t + x);
// m / M: min and max of (left, top); minn / maxx additionally include topleft.
183 | 0 | const auto l_ge_t = Ge(left, top);
184 | 0 | const auto m = IfThenElse(l_ge_t, top, left);
185 | 0 | const auto M = IfThenElse(l_ge_t, left, top);
186 | 0 | const auto maxx = Max(topleft, M);
187 | 0 | const auto minn = Min(topleft, m);
// Neighbour spread as an unsigned value, capped so it can index ctx_map.
188 | 0 | const auto max_diff = BitCast(du, Sub(maxx, minn));
189 | 0 | Store(Min(max_diff, kMaxDiffCap), du, max_diff_row + x);
// Clamped gradient: top + left - topleft, limited to [m, M]. If topleft < m
// the gradient exceeds M, if topleft > M it falls below m. The sum is formed
// on unsigned lanes (presumably so that wraparound is well defined).
190 | 0 | const auto overshoot = Lt(topleft, m);
191 | 0 | const auto undershoot = Gt(topleft, M);
192 | 0 | const auto grad =
193 | 0 | BitCast(di, Sub(Add(BitCast(du, top), BitCast(du, left)),
194 | 0 | BitCast(du, topleft)));
195 | 0 | const auto prediction =
196 | 0 | IfThenElse(undershoot, m, IfThenElse(overshoot, M, grad));
// Same sign packing and tokenization as in the row-0 loop above.
197 | 0 | const auto ures = BitCast(du, Sub(central, prediction));
198 | 0 | const auto packed =
199 | 0 | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
200 | 0 | const auto not_literal = Ge(packed, kSplit);
201 | 0 | const auto v = BitCast(du, ConvertTo(df, packed));
202 | 0 | const auto eb = Sub(ShiftRight<23>(v), kExpOffset2);
203 | 0 | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
204 | 0 | And(ShiftRight<21>(v), kMsbMask));
205 | 0 | const auto tail_mask = Lt(pos, last_pos);
206 | 0 | const auto eb_fixed = IfThenElseZero(not_literal, eb);
207 | 0 | const auto token_fixed = IfThenElse(not_literal, token, packed);
208 | 0 | extra_bits_lanes =
209 | 0 | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
210 | 0 | Store(token_fixed, du, token_row + x);
211 | 0 | pos = Add(pos, kLanes);
212 | 0 | last = r + x + Lanes(di) - 1;
213 | 0 | top_last = t + x + Lanes(di) - 1;
214 | 0 | }
// Scalar pass over the row: pick the histogram from the capped max_diff and
// count the token there.
215 | 0 | for (size_t x = 0; x < ch.w; x++) {
216 | 0 | size_t ctx = ctx_map[max_diff_row[x]];
217 | 0 | histo[ctx].FastAdd(token_row[x]);
218 | 0 | }
219 | 0 | }
// Histograms are per channel: Condition() is called on each one before its
// entropy is read, then the histogram is reset for the next channel.
220 | 0 | for (auto& h : histo) {
221 | 0 | h.Condition();
222 | 0 | float f_cost = h.ShannonEntropy();
223 | 0 | size_t i_cost = f_cost;
224 | 0 | histo_cost += i_cost;
225 | 0 | histo_cost_frac += f_cost - i_cost;
226 | 0 | h.Clear();
227 | 0 | }
228 | 0 | }
// Horizontal reduction of the per-lane extra-bit counters (32-bit lanes).
229 | 0 | extra_bits = GetLane(SumOfLanes(du, extra_bits_lanes));
230 | 0 | #endif
// The accumulated fractional part is truncated to an integer here; the
// size_t total is then converted to float by the return.
231 | 0 | size_t total_cost =
232 | 0 | extra_bits + histo_cost + static_cast<size_t>(histo_cost_frac);
233 | 0 | return total_cost;
234 | 0 | } Unexecuted instantiation: jxl::N_SSE4::EstimateCost(jxl::Image const&) Unexecuted instantiation: jxl::N_AVX2::EstimateCost(jxl::Image const&) Unexecuted instantiation: jxl::N_AVX3::EstimateCost(jxl::Image const&) Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateCost(jxl::Image const&) Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateCost(jxl::Image const&) Unexecuted instantiation: jxl::N_SSE2::EstimateCost(jxl::Image const&) |
235 | | |
236 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
237 | | } // namespace HWY_NAMESPACE |
238 | | } // namespace jxl |
239 | | HWY_AFTER_NAMESPACE(); |
240 | | |
241 | | #if HWY_ONCE |
242 | | namespace jxl { |
243 | | |
244 | | HWY_EXPORT(EstimateCost); |
245 | | |
// Public, target-independent entry point. Forwards `img` to the per-target
// EstimateCost implementation selected by HWY_DYNAMIC_DISPATCH (made available
// through the HWY_EXPORT above) and returns its StatusOr<float> unchanged.
246 | 0 | StatusOr<float> EstimateCost(const Image& img) {
247 | 0 | return HWY_DYNAMIC_DISPATCH(EstimateCost)(img);
248 | 0 | }
249 | | |
250 | | namespace estimate_cost_detail { |
251 | | /* |
252 | | cutoffs = [0, 1, 3, 5, 7, 11, 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500] |
253 | | ctx_map = [[c for c,v in enumerate(cutoffs) if v <= i][-1] for i in range(501)] |
254 | | */ |
// Returns a reference to a function-local static lookup table of type
// std::array<uint8_t, kLastThreshold>. The vectorized EstimateCost indexes it
// with a pixel's neighbour spread (max_diff, capped to kLastThreshold - 1) to
// get the histogram index. Entry i is the index of the largest cutoff that is
// <= i, so the listed values run from 0 (for i == 0) up to 16 (last entry).
255 | 0 | const std::array<uint8_t, kLastThreshold>& ContextMap() {
256 | 0 | static const std::array<uint8_t, kLastThreshold> kCtxMap = {
257 | 0 | 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6,
258 | 0 | 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
259 | 0 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
260 | 0 | 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10,
261 | 0 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
262 | 0 | 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
263 | 0 | 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
264 | 0 | 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
265 | 0 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
266 | 0 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
267 | 0 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
268 | 0 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
269 | 0 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
270 | 0 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
271 | 0 | 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
272 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
273 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
274 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
275 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
276 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
277 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
278 | 0 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
279 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
280 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
281 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
282 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
283 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
284 | 0 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16};
285 | 0 | return kCtxMap;
286 | 0 | }
287 | | } // namespace estimate_cost_detail |
288 | | |
289 | | } // namespace jxl |
290 | | #endif |