/src/libjxl/lib/jxl/enc_modular_simd.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_modular_simd.h" |
7 | | |
8 | | #include <cstdint> |
9 | | |
10 | | #include "lib/jxl/base/common.h" |
11 | | #include "lib/jxl/base/status.h" |
12 | | #include "lib/jxl/dec_ans.h" |
13 | | #include "lib/jxl/enc_ans_params.h" |
14 | | #include "lib/jxl/memory_manager_internal.h" |
15 | | #include "lib/jxl/modular/modular_image.h" |
16 | | |
17 | | #undef HWY_TARGET_INCLUDE |
18 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_modular_simd.cc" |
19 | | #include <hwy/foreach_target.h> |
20 | | #include <hwy/highway.h> |
21 | | |
22 | | #if HWY_TARGET == HWY_SCALAR |
23 | | #include "lib/jxl/modular/encoding/context_predict.h" |
24 | | #include "lib/jxl/pack_signed.h" |
25 | | #endif |
26 | | |
27 | | HWY_BEFORE_NAMESPACE(); |
28 | | namespace jxl { |
29 | | namespace HWY_NAMESPACE { |
30 | | |
31 | | // These templates are not found via ADL. |
32 | | using hwy::HWY_NAMESPACE::Add; |
33 | | using hwy::HWY_NAMESPACE::And; |
34 | | using hwy::HWY_NAMESPACE::Ge; |
35 | | using hwy::HWY_NAMESPACE::GetLane; |
36 | | using hwy::HWY_NAMESPACE::Gt; |
37 | | using hwy::HWY_NAMESPACE::IfThenElse; |
38 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
39 | | using hwy::HWY_NAMESPACE::Iota; |
40 | | using hwy::HWY_NAMESPACE::Load; |
41 | | using hwy::HWY_NAMESPACE::LoadU; |
42 | | using hwy::HWY_NAMESPACE::Lt; |
43 | | using hwy::HWY_NAMESPACE::Max; |
44 | | using hwy::HWY_NAMESPACE::Min; |
45 | | using hwy::HWY_NAMESPACE::Mul; |
46 | | using hwy::HWY_NAMESPACE::Not; |
47 | | using hwy::HWY_NAMESPACE::Set; |
48 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
49 | | using hwy::HWY_NAMESPACE::ShiftRight; |
50 | | using hwy::HWY_NAMESPACE::Store; |
51 | | using hwy::HWY_NAMESPACE::StoreU; |
52 | | using hwy::HWY_NAMESPACE::Sub; |
53 | | using hwy::HWY_NAMESPACE::Xor; |
54 | | using hwy::HWY_NAMESPACE::Zero; |
55 | | |
// Estimates the cost of coding all channels of `img`: the hybrid-uint extra
// bits of every sample plus the summed ShannonEntropy() of per-context token
// histograms. Each sample is predicted from its left/top/topleft neighbours
// with a clamped gradient, the residual is sign-packed and tokenized, and the
// histogram is selected from the spread (max - min) of the three neighbours.
// The scalar target uses a plain per-pixel loop; the other targets compute
// tokens and contexts a vector at a time, using scratch rows allocated through
// img.memory_manager() (an allocation failure is returned via
// JXL_ASSIGN_OR_RETURN). The total is accumulated as size_t and converted to
// the StatusOr<float> return type on return.
56 | 11 | StatusOr<float> EstimateCost(const Image& img) { |
57 | 11 | size_t histo_cost = 0; |
58 | 11 | float histo_cost_frac = 0.0f; |
59 | 11 | size_t extra_bits = 0; |
60 | | |
// Scalar target: per-pixel loop.
61 | | #if HWY_TARGET == HWY_SCALAR |
62 | | HybridUintConfig config; |
63 | | uint32_t cutoffs[] = {0, 1, 3, 5, 7, 11, 15, 23, 31, |
64 | | 47, 63, 95, 127, 191, 255, 392, 500}; |
65 | | constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; |
66 | | Histogram histo[nc] = {}; |
67 | | for (const Channel& ch : img.channel) { |
68 | | const intptr_t onerow = ch.plane.PixelsPerRow(); |
69 | | for (size_t y = 0; y < ch.h; y++) { |
70 | | const pixel_type* JXL_RESTRICT r = ch.Row(y); |
71 | | for (size_t x = 0; x < ch.w; x++) { |
// Neighbours that do not exist fall back: left -> the sample above (0 at the
// origin); top and topleft -> left.
72 | | pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); |
73 | | pixel_type_w top = (y ? *(r + x - onerow) : left); |
74 | | pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); |
75 | | size_t max_diff = |
76 | | std::max({left, top, topleft}) - std::min({left, top, topleft}); |
// ctx counts the cutoffs that are strictly greater than max_diff.
77 | | size_t ctx = 0; |
78 | | for (uint32_t c : cutoffs) { |
79 | | ctx += (max_diff < c) ? 1 : 0; |
80 | | } |
81 | | pixel_type res = r[x] - ClampedGradient(top, left, topleft); |
82 | | uint32_t token; |
83 | | uint32_t nbits; |
84 | | uint32_t bits; |
85 | | config.Encode(PackSigned(res), &token, &nbits, &bits); |
86 | | histo[ctx].Add(token); |
87 | | extra_bits += nbits; |
88 | | } |
89 | | } |
// Per channel: add each histogram's entropy, keeping the integer and the
// fractional parts in separate accumulators, then reset the histogram.
90 | | for (auto& h : histo) { |
91 | | float f_cost = h.ShannonEntropy(); |
92 | | size_t i_cost = f_cost; |
93 | | histo_cost += i_cost; |
94 | | histo_cost_frac += f_cost - i_cost; |
95 | | h.Clear(); |
96 | | } |
97 | | } |
98 | | #else |
// Vector targets.
99 | 11 | JxlMemoryManager* memory_manager = img.memory_manager(); |
100 | 11 | const auto& ctx_map = estimate_cost_detail::ContextMap(); |
101 | 11 | const HWY_FULL(int32_t) di; |
102 | 11 | const HWY_FULL(uint32_t) du; |
103 | 11 | const HWY_FULL(float) df; |
104 | 11 | const auto kOne = Set(du, 1); |
105 | 11 | const auto kSplit = Set(du, 16); |
106 | 11 | const auto kExpOffset2 = Set(du, 129); // 127 + 2 |
107 | 11 | const auto kTokenBias = Set(du, 8); |
108 | 11 | const auto kTokenMul = Set(du, 4); |
109 | 11 | const auto kMsbMask = Set(du, 3); |
110 | 11 | const auto kMaxDiffCap = Set(du, estimate_cost_detail::kLastThreshold - 1); |
111 | 11 | const auto kLanes = Set(du, Lanes(du)); |
112 | 11 | const auto kIota = Iota(du, 0); |
113 | 11 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); |
114 | 11 | constexpr size_t kLargeShiftVal = 10; |
115 | 11 | const auto kLargeShift = Set(du, kLargeShiftVal); |
116 | | |
// Scratch rows must fit the widest channel that has a non-zero height, rounded
// up to whole vectors and never shorter than two vectors.
117 | 11 | size_t max_w = 0; |
118 | 11 | for (const Channel& ch : img.channel) { |
119 | 11 | if (ch.h == 0) continue; |
120 | 11 | max_w = std::max(max_w, ch.w); |
121 | 11 | } |
122 | 11 | max_w = RoundUpTo(max_w, Lanes(du)); |
123 | 11 | max_w = std::max(max_w, 2 * Lanes(du)); |
124 | | |
125 | 11 | JXL_ASSIGN_OR_RETURN( |
126 | 11 | AlignedMemory buffer, |
127 | 11 | AlignedMemory::Create(memory_manager, max_w * 2 * sizeof(uint32_t))); |
128 | 11 | uint32_t* max_diff_row = buffer.address<uint32_t>(); |
129 | 11 | uint32_t* token_row = max_diff_row + max_w; |
// NOTE: primer / top_primer point at the same memory as max_diff_row /
// token_row. In each x loop below they are loaded in the first iteration before
// that iteration's Store calls overwrite them. Writing Lanes values at
// primer + 1 needs Lanes + 1 elements per row - presumably the reason for the
// two-vector minimum above.
130 | 11 | int32_t* primer = buffer.address<int32_t>(); |
131 | 11 | int32_t* top_primer = primer + max_w; |
132 | | |
133 | 11 | HybridUintConfig config; |
134 | | |
135 | 11 | Histogram histo[estimate_cost_detail::kLastCtx + 1] = {}; |
136 | 11 | auto extra_bits_lanes = Zero(du); |
137 | 11 | for (const Channel& ch : img.channel) { |
138 | 11 | if (ch.h == 0 || ch.w == 0) continue; |
139 | 187 | for (auto& h : histo) { |
140 | 187 | h.EnsureCapacity(32 * 4); |
141 | 187 | } |
// Row 0: only the left neighbour exists, so the residual is central - left and
// every token is added to histo[0].
142 | 11 | const pixel_type* JXL_RESTRICT r = ch.Row(0); |
143 | 11 | const pixel_type* JXL_RESTRICT last = primer; |
// primer = {0, r[0], r[1], ...}: an unaligned load from it is the "left" vector
// for x = 0.
144 | 11 | primer[0] = 0; |
145 | 11 | StoreU(Load(di, r), di, primer + 1); |
146 | 11 | auto pos = kIota; |
147 | 11 | const auto last_pos = Set(du, ch.w); |
148 | 122 | for (size_t x = 0; x < ch.w; x += Lanes(di)) { |
149 | 111 | const auto left = LoadU(di, last); |
150 | 111 | const auto central = Load(di, r + x); |
151 | 111 | const auto ures = BitCast(du, Sub(central, left)); |
// Sign packing: ures << 1 for non-negative residuals, ~(ures << 1) for negative
// ones.
152 | 111 | const auto packed = |
153 | 111 | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne)); |
// Values above kLargeThreshold are shifted right by kLargeShiftVal before the
// float conversion; kLargeShift is added back to the exponent-derived count.
154 | 111 | const auto is_large = Gt(packed, kLargeThreshold); |
155 | 111 | const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed); |
156 | 111 | const auto not_literal = Ge(packed, kSplit); |
157 | 111 | const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed); |
158 | 111 | const auto v = BitCast(du, ConvertTo(df, packed_fixed)); |
159 | 111 | const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2); |
160 | 111 | const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw); |
161 | 111 | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)), |
162 | 111 | And(ShiftRight<21>(v), kMsbMask)); |
// tail_mask is false for lanes at x >= ch.w; their extra bits are not counted.
163 | 111 | const auto tail_mask = Lt(pos, last_pos); |
// Packed values below kSplit are used as the token directly and contribute no
// extra bits.
164 | 111 | const auto eb_fixed = IfThenElseZero(not_literal, eb); |
165 | 111 | const auto token_fixed = IfThenElse(not_literal, token, packed); |
166 | 111 | extra_bits_lanes = |
167 | 111 | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed)); |
168 | 111 | Store(token_fixed, du, token_row + x); |
169 | 111 | pos = Add(pos, kLanes); |
// Subsequent "left" vectors are read straight from the row, starting one sample
// before the next x.
170 | 111 | last = r + x + Lanes(di) - 1; |
171 | 111 | } |
172 | 867 | for (size_t x = 0; x < ch.w; x++) { |
173 | 856 | histo[0].FastAdd(token_row[x]); |
174 | 856 | } |
// Remaining rows: full left / top / topleft neighbourhood.
175 | 515 | for (size_t y = 1; y < ch.h; y++) { |
176 | 504 | r = ch.Row(y); |
177 | 504 | const pixel_type* JXL_RESTRICT t = ch.Row(y - 1); |
178 | 504 | last = primer; |
// For x = 0 both left and topleft are t[0], the sample directly above.
179 | 504 | primer[0] = t[0]; |
180 | 504 | StoreU(Load(di, r), di, primer + 1); |
181 | 504 | top_primer[0] = t[0]; |
182 | 504 | StoreU(Load(di, t), di, top_primer + 1); |
183 | 504 | const pixel_type* JXL_RESTRICT top_last = top_primer; |
184 | 504 | pos = kIota; |
185 | 12.7k | for (size_t x = 0; x < ch.w; x += Lanes(di)) { |
186 | 12.2k | const auto left = LoadU(di, last); |
187 | 12.2k | const auto central = Load(di, r + x); |
188 | 12.2k | const auto topleft = LoadU(di, top_last); |
189 | 12.2k | const auto top = Load(di, t + x); |
// m / M: minimum / maximum of left and top.
190 | 12.2k | const auto l_ge_t = Ge(left, top); |
191 | 12.2k | const auto m = IfThenElse(l_ge_t, top, left); |
192 | 12.2k | const auto M = IfThenElse(l_ge_t, left, top); |
193 | 12.2k | const auto maxx = Max(topleft, M); |
194 | 12.2k | const auto minn = Min(topleft, m); |
195 | 12.2k | const auto max_diff = BitCast(du, Sub(maxx, minn)); |
// max_diff is capped at kLastThreshold - 1 before it is used to index ctx_map
// in the scalar loop that follows this x loop.
196 | 12.2k | Store(Min(max_diff, kMaxDiffCap), du, max_diff_row + x); |
// Clamped gradient: top + left - topleft, replaced by m when topleft > M and by
// M when topleft < m.
197 | 12.2k | const auto overshoot = Lt(topleft, m); |
198 | 12.2k | const auto undershoot = Gt(topleft, M); |
199 | 12.2k | const auto grad = |
200 | 12.2k | BitCast(di, Sub(Add(BitCast(du, top), BitCast(du, left)), |
201 | 12.2k | BitCast(du, topleft))); |
202 | 12.2k | const auto prediction = |
203 | 12.2k | IfThenElse(undershoot, m, IfThenElse(overshoot, M, grad)); |
204 | 12.2k | const auto ures = BitCast(du, Sub(central, prediction)); |
// Same sign packing and tokenization as in row 0.
205 | 12.2k | const auto packed = |
206 | 12.2k | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne)); |
207 | 12.2k | const auto is_large = Gt(packed, kLargeThreshold); |
208 | 12.2k | const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed); |
209 | 12.2k | const auto not_literal = Ge(packed, kSplit); |
210 | 12.2k | const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed); |
211 | 12.2k | const auto v = BitCast(du, ConvertTo(df, packed_fixed)); |
212 | 12.2k | const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2); |
213 | 12.2k | const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw); |
214 | 12.2k | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)), |
215 | 12.2k | And(ShiftRight<21>(v), kMsbMask)); |
216 | 12.2k | const auto tail_mask = Lt(pos, last_pos); |
217 | 12.2k | const auto eb_fixed = IfThenElseZero(not_literal, eb); |
218 | 12.2k | const auto token_fixed = IfThenElse(not_literal, token, packed); |
219 | 12.2k | extra_bits_lanes = |
220 | 12.2k | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed)); |
221 | 12.2k | Store(token_fixed, du, token_row + x); |
222 | 12.2k | pos = Add(pos, kLanes); |
223 | 12.2k | last = r + x + Lanes(di) - 1; |
224 | 12.2k | top_last = t + x + Lanes(di) - 1; |
225 | 12.2k | } |
// Only the first ch.w entries of the scratch rows are valid samples.
226 | 96.0k | for (size_t x = 0; x < ch.w; x++) { |
227 | 95.5k | size_t ctx = ctx_map[max_diff_row[x]]; |
228 | 95.5k | histo[ctx].FastAdd(token_row[x]); |
229 | 95.5k | } |
230 | 504 | } |
// Per channel: Condition() each histogram, add its entropy (integer and
// fractional parts kept in separate accumulators), then reset it.
231 | 187 | for (auto& h : histo) { |
232 | 187 | h.Condition(); |
233 | 187 | float f_cost = h.ShannonEntropy(); |
234 | 187 | size_t i_cost = f_cost; |
235 | 187 | histo_cost += i_cost; |
236 | 187 | histo_cost_frac += f_cost - i_cost; |
237 | 187 | h.Clear(); |
238 | 187 | } |
239 | 11 | } |
// Reduce the per-lane extra-bit counters to a single total.
240 | 11 | extra_bits = GetLane(SumOfLanes(du, extra_bits_lanes)); |
241 | 11 | #endif |
242 | 11 | size_t total_cost = |
243 | 11 | extra_bits + histo_cost + static_cast<size_t>(histo_cost_frac); |
244 | 11 | return total_cost; |
245 | 11 | } Unexecuted instantiation: jxl::N_SSE4::EstimateCost(jxl::Image const&) jxl::N_AVX2::EstimateCost(jxl::Image const&) Line | Count | Source | 56 | 11 | StatusOr<float> EstimateCost(const Image& img) { | 57 | 11 | size_t histo_cost = 0; | 58 | 11 | float histo_cost_frac = 0.0f; | 59 | 11 | size_t extra_bits = 0; | 60 | | | 61 | | #if HWY_TARGET == HWY_SCALAR | 62 | | HybridUintConfig config; | 63 | | uint32_t cutoffs[] = {0, 1, 3, 5, 7, 11, 15, 23, 31, | 64 | | 47, 63, 95, 127, 191, 255, 392, 500}; | 65 | | constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1; | 66 | | Histogram histo[nc] = {}; | 67 | | for (const Channel& ch : img.channel) { | 68 | | const intptr_t onerow = ch.plane.PixelsPerRow(); | 69 | | for (size_t y = 0; y < ch.h; y++) { | 70 | | const pixel_type* JXL_RESTRICT r = ch.Row(y); | 71 | | for (size_t x = 0; x < ch.w; x++) { | 72 | | pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0); | 73 | | pixel_type_w top = (y ? *(r + x - onerow) : left); | 74 | | pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left); | 75 | | size_t max_diff = | 76 | | std::max({left, top, topleft}) - std::min({left, top, topleft}); | 77 | | size_t ctx = 0; | 78 | | for (uint32_t c : cutoffs) { | 79 | | ctx += (max_diff < c) ?
1 : 0; | 80 | | } | 81 | | pixel_type res = r[x] - ClampedGradient(top, left, topleft); | 82 | | uint32_t token; | 83 | | uint32_t nbits; | 84 | | uint32_t bits; | 85 | | config.Encode(PackSigned(res), &token, &nbits, &bits); | 86 | | histo[ctx].Add(token); | 87 | | extra_bits += nbits; | 88 | | } | 89 | | } | 90 | | for (auto& h : histo) { | 91 | | float f_cost = h.ShannonEntropy(); | 92 | | size_t i_cost = f_cost; | 93 | | histo_cost += i_cost; | 94 | | histo_cost_frac += f_cost - i_cost; | 95 | | h.Clear(); | 96 | | } | 97 | | } | 98 | | #else | 99 | 11 | JxlMemoryManager* memory_manager = img.memory_manager(); | 100 | 11 | const auto& ctx_map = estimate_cost_detail::ContextMap(); | 101 | 11 | const HWY_FULL(int32_t) di; | 102 | 11 | const HWY_FULL(uint32_t) du; | 103 | 11 | const HWY_FULL(float) df; | 104 | 11 | const auto kOne = Set(du, 1); | 105 | 11 | const auto kSplit = Set(du, 16); | 106 | 11 | const auto kExpOffset2 = Set(du, 129); // 127 + 2 | 107 | 11 | const auto kTokenBias = Set(du, 8); | 108 | 11 | const auto kTokenMul = Set(du, 4); | 109 | 11 | const auto kMsbMask = Set(du, 3); | 110 | 11 | const auto kMaxDiffCap = Set(du, estimate_cost_detail::kLastThreshold - 1); | 111 | 11 | const auto kLanes = Set(du, Lanes(du)); | 112 | 11 | const auto kIota = Iota(du, 0); | 113 | 11 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 114 | 11 | constexpr size_t kLargeShiftVal = 10; | 115 | 11 | const auto kLargeShift = Set(du, kLargeShiftVal); | 116 | | | 117 | 11 | size_t max_w = 0; | 118 | 11 | for (const Channel& ch : img.channel) { | 119 | 11 | if (ch.h == 0) continue; | 120 | 11 | max_w = std::max(max_w, ch.w); | 121 | 11 | } | 122 | 11 | max_w = RoundUpTo(max_w, Lanes(du)); | 123 | 11 | max_w = std::max(max_w, 2 * Lanes(du)); | 124 | | | 125 | 11 | JXL_ASSIGN_OR_RETURN( | 126 | 11 | AlignedMemory buffer, | 127 | 11 | AlignedMemory::Create(memory_manager, max_w * 2 * sizeof(uint32_t))); | 128 | 11 | uint32_t* max_diff_row = 
buffer.address<uint32_t>(); | 129 | 11 | uint32_t* token_row = max_diff_row + max_w; | 130 | 11 | int32_t* primer = buffer.address<int32_t>(); | 131 | 11 | int32_t* top_primer = primer + max_w; | 132 | | | 133 | 11 | HybridUintConfig config; | 134 | | | 135 | 11 | Histogram histo[estimate_cost_detail::kLastCtx + 1] = {}; | 136 | 11 | auto extra_bits_lanes = Zero(du); | 137 | 11 | for (const Channel& ch : img.channel) { | 138 | 11 | if (ch.h == 0 || ch.w == 0) continue; | 139 | 187 | for (auto& h : histo) { | 140 | 187 | h.EnsureCapacity(32 * 4); | 141 | 187 | } | 142 | 11 | const pixel_type* JXL_RESTRICT r = ch.Row(0); | 143 | 11 | const pixel_type* JXL_RESTRICT last = primer; | 144 | 11 | primer[0] = 0; | 145 | 11 | StoreU(Load(di, r), di, primer + 1); | 146 | 11 | auto pos = kIota; | 147 | 11 | const auto last_pos = Set(du, ch.w); | 148 | 122 | for (size_t x = 0; x < ch.w; x += Lanes(di)) { | 149 | 111 | const auto left = LoadU(di, last); | 150 | 111 | const auto central = Load(di, r + x); | 151 | 111 | const auto ures = BitCast(du, Sub(central, left)); | 152 | 111 | const auto packed = | 153 | 111 | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne)); | 154 | 111 | const auto is_large = Gt(packed, kLargeThreshold); | 155 | 111 | const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed); | 156 | 111 | const auto not_literal = Ge(packed, kSplit); | 157 | 111 | const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed); | 158 | 111 | const auto v = BitCast(du, ConvertTo(df, packed_fixed)); | 159 | 111 | const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2); | 160 | 111 | const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw); | 161 | 111 | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)), | 162 | 111 | And(ShiftRight<21>(v), kMsbMask)); | 163 | 111 | const auto tail_mask = Lt(pos, last_pos); | 164 | 111 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 165 | 111 | const auto token_fixed = 
IfThenElse(not_literal, token, packed); | 166 | 111 | extra_bits_lanes = | 167 | 111 | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed)); | 168 | 111 | Store(token_fixed, du, token_row + x); | 169 | 111 | pos = Add(pos, kLanes); | 170 | 111 | last = r + x + Lanes(di) - 1; | 171 | 111 | } | 172 | 867 | for (size_t x = 0; x < ch.w; x++) { | 173 | 856 | histo[0].FastAdd(token_row[x]); | 174 | 856 | } | 175 | 515 | for (size_t y = 1; y < ch.h; y++) { | 176 | 504 | r = ch.Row(y); | 177 | 504 | const pixel_type* JXL_RESTRICT t = ch.Row(y - 1); | 178 | 504 | last = primer; | 179 | 504 | primer[0] = t[0]; | 180 | 504 | StoreU(Load(di, r), di, primer + 1); | 181 | 504 | top_primer[0] = t[0]; | 182 | 504 | StoreU(Load(di, t), di, top_primer + 1); | 183 | 504 | const pixel_type* JXL_RESTRICT top_last = top_primer; | 184 | 504 | pos = kIota; | 185 | 12.7k | for (size_t x = 0; x < ch.w; x += Lanes(di)) { | 186 | 12.2k | const auto left = LoadU(di, last); | 187 | 12.2k | const auto central = Load(di, r + x); | 188 | 12.2k | const auto topleft = LoadU(di, top_last); | 189 | 12.2k | const auto top = Load(di, t + x); | 190 | 12.2k | const auto l_ge_t = Ge(left, top); | 191 | 12.2k | const auto m = IfThenElse(l_ge_t, top, left); | 192 | 12.2k | const auto M = IfThenElse(l_ge_t, left, top); | 193 | 12.2k | const auto maxx = Max(topleft, M); | 194 | 12.2k | const auto minn = Min(topleft, m); | 195 | 12.2k | const auto max_diff = BitCast(du, Sub(maxx, minn)); | 196 | 12.2k | Store(Min(max_diff, kMaxDiffCap), du, max_diff_row + x); | 197 | 12.2k | const auto overshoot = Lt(topleft, m); | 198 | 12.2k | const auto undershoot = Gt(topleft, M); | 199 | 12.2k | const auto grad = | 200 | 12.2k | BitCast(di, Sub(Add(BitCast(du, top), BitCast(du, left)), | 201 | 12.2k | BitCast(du, topleft))); | 202 | 12.2k | const auto prediction = | 203 | 12.2k | IfThenElse(undershoot, m, IfThenElse(overshoot, M, grad)); | 204 | 12.2k | const auto ures = BitCast(du, Sub(central, prediction)); | 205 | 
12.2k | const auto packed = | 206 | 12.2k | Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne)); | 207 | 12.2k | const auto is_large = Gt(packed, kLargeThreshold); | 208 | 12.2k | const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed); | 209 | 12.2k | const auto not_literal = Ge(packed, kSplit); | 210 | 12.2k | const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed); | 211 | 12.2k | const auto v = BitCast(du, ConvertTo(df, packed_fixed)); | 212 | 12.2k | const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2); | 213 | 12.2k | const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw); | 214 | 12.2k | const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)), | 215 | 12.2k | And(ShiftRight<21>(v), kMsbMask)); | 216 | 12.2k | const auto tail_mask = Lt(pos, last_pos); | 217 | 12.2k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 218 | 12.2k | const auto token_fixed = IfThenElse(not_literal, token, packed); | 219 | 12.2k | extra_bits_lanes = | 220 | 12.2k | Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed)); | 221 | 12.2k | Store(token_fixed, du, token_row + x); | 222 | 12.2k | pos = Add(pos, kLanes); | 223 | 12.2k | last = r + x + Lanes(di) - 1; | 224 | 12.2k | top_last = t + x + Lanes(di) - 1; | 225 | 12.2k | } | 226 | 96.0k | for (size_t x = 0; x < ch.w; x++) { | 227 | 95.5k | size_t ctx = ctx_map[max_diff_row[x]]; | 228 | 95.5k | histo[ctx].FastAdd(token_row[x]); | 229 | 95.5k | } | 230 | 504 | } | 231 | 187 | for (auto& h : histo) { | 232 | 187 | h.Condition(); | 233 | 187 | float f_cost = h.ShannonEntropy(); | 234 | 187 | size_t i_cost = f_cost; | 235 | 187 | histo_cost += i_cost; | 236 | 187 | histo_cost_frac += f_cost - i_cost; | 237 | 187 | h.Clear(); | 238 | 187 | } | 239 | 11 | } | 240 | 11 | extra_bits = GetLane(SumOfLanes(du, extra_bits_lanes)); | 241 | 11 | #endif | 242 | 11 | size_t total_cost = | 243 | 11 | extra_bits + histo_cost + static_cast<size_t>(histo_cost_frac); | 244 | 11 | return 
total_cost; | 245 | 11 | } |
Unexecuted instantiation: jxl::N_SSE2::EstimateCost(jxl::Image const&) |
246 | | |
247 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
248 | | } // namespace HWY_NAMESPACE |
249 | | } // namespace jxl |
250 | | HWY_AFTER_NAMESPACE(); |
251 | | |
252 | | #if HWY_ONCE |
253 | | namespace jxl { |
254 | | |
255 | | HWY_EXPORT(EstimateCost); |
256 | | |
// Entry point in namespace jxl: forwards `img` to the per-target EstimateCost
// registered by HWY_EXPORT, selected through HWY_DYNAMIC_DISPATCH, and returns
// its result unchanged.
257 | 11 | StatusOr<float> EstimateCost(const Image& img) { |
258 | 11 | return HWY_DYNAMIC_DISPATCH(EstimateCost)(img); |
259 | 11 | } |
260 | | |
261 | | namespace estimate_cost_detail { |
262 | | /* |
263 | | cutoffs = [0, 1, 3, 5, 7, 11, 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500] |
264 | | ctx_map = [[c for c,v in enumerate(cutoffs) if v <= i][-1] for i in range(501)] |
265 | | */ |
// Returns the lookup table used by the vector path of EstimateCost to turn a
// capped neighbour spread (max_diff, at most kLastThreshold - 1) into a
// histogram context index. The 501 listed entries are non-decreasing and range
// from 0 to 16; entry i is the index of the largest value in
// {0, 1, 3, 5, 7, 11, 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500} that
// does not exceed i. The table is a function-local static returned by
// reference.
266 | 11 | const std::array<uint8_t, kLastThreshold>& ContextMap() { |
267 | 11 | static const std::array<uint8_t, kLastThreshold> kCtxMap = { |
268 | 11 | 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, |
269 | 11 | 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, |
270 | 11 | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, |
271 | 11 | 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
272 | 11 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
273 | 11 | 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, |
274 | 11 | 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, |
275 | 11 | 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
276 | 11 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
277 | 11 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
278 | 11 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, |
279 | 11 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
280 | 11 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
281 | 11 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
282 | 11 | 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
283 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
284 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
285 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
286 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
287 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
288 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
289 | 11 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, |
290 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
291 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
292 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
293 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
294 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
295 | 11 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16}; |
296 | 11 | return kCtxMap; |
297 | 11 | } |
298 | | } // namespace estimate_cost_detail |
299 | | |
300 | | } // namespace jxl |
301 | | #endif |