Coverage Report

Created: 2026-06-30 07:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_modular_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_modular_simd.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <array>
12
#include <cstddef>
13
#include <cstdint>
14
15
#include "lib/jxl/base/common.h"
16
#include "lib/jxl/base/compiler_specific.h"
17
#include "lib/jxl/base/status.h"
18
#include "lib/jxl/dec_ans.h"
19
#include "lib/jxl/enc_ans_params.h"
20
#include "lib/jxl/memory_manager_internal.h"
21
#include "lib/jxl/modular/modular_image.h"
22
23
#undef HWY_TARGET_INCLUDE
24
#define HWY_TARGET_INCLUDE "lib/jxl/enc_modular_simd.cc"
25
#include <hwy/foreach_target.h>
26
#include <hwy/highway.h>
27
28
#if HWY_TARGET == HWY_SCALAR
29
#include "lib/jxl/modular/encoding/context_predict.h"
30
#include "lib/jxl/pack_signed.h"
31
#endif
32
33
HWY_BEFORE_NAMESPACE();
34
namespace jxl {
35
namespace HWY_NAMESPACE {
36
37
// These templates are not found via ADL.
38
using hwy::HWY_NAMESPACE::Add;
39
using hwy::HWY_NAMESPACE::And;
40
using hwy::HWY_NAMESPACE::Ge;
41
using hwy::HWY_NAMESPACE::GetLane;
42
using hwy::HWY_NAMESPACE::Gt;
43
using hwy::HWY_NAMESPACE::IfThenElse;
44
using hwy::HWY_NAMESPACE::IfThenElseZero;
45
using hwy::HWY_NAMESPACE::Iota;
46
using hwy::HWY_NAMESPACE::Load;
47
using hwy::HWY_NAMESPACE::LoadU;
48
using hwy::HWY_NAMESPACE::Lt;
49
using hwy::HWY_NAMESPACE::Max;
50
using hwy::HWY_NAMESPACE::Min;
51
using hwy::HWY_NAMESPACE::Mul;
52
using hwy::HWY_NAMESPACE::Not;
53
using hwy::HWY_NAMESPACE::Set;
54
using hwy::HWY_NAMESPACE::ShiftLeft;
55
using hwy::HWY_NAMESPACE::ShiftRight;
56
using hwy::HWY_NAMESPACE::Store;
57
using hwy::HWY_NAMESPACE::StoreU;
58
using hwy::HWY_NAMESPACE::Sub;
59
using hwy::HWY_NAMESPACE::Xor;
60
using hwy::HWY_NAMESPACE::Zero;
61
62
3.00k
// Estimates the coded size, in bits, of all channels of `img`.
//
// Every sample is predicted from its left / top / top-left neighbours with the
// clamped gradient predictor. The sign-packed residual is split into a token
// plus a number of raw extra bits, and the token is added to a histogram
// selected by the local "max difference" (max - min of the three neighbours).
// For each channel the Shannon entropy of every histogram is accumulated and
// the histograms are cleared. The result is
//   extra bits + integer parts of the entropies + truncated sum of their
//   fractional parts.
//
// The HWY_SCALAR build uses HybridUintConfig / ClampedGradient / PackSigned
// directly; all other targets run a vectorised version of the same steps.
//
// Returns the estimate converted to float. In the SIMD branch a failure to
// allocate the scratch buffer is propagated through JXL_ASSIGN_OR_RETURN.
StatusOr<float> EstimateCost(const Image& img) {
  size_t histo_cost = 0;
  float histo_cost_frac = 0.0f;
  size_t extra_bits = 0;

#if HWY_TARGET == HWY_SCALAR
  HybridUintConfig config;
  uint32_t cutoffs[] = {0,  1,  3,  5,   7,   11,  15,  23, 31,
                        47, 63, 95, 127, 191, 255, 392, 500};
  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
  Histogram histo[nc] = {};
  for (const Channel& ch : img.channel) {
    const ptrdiff_t onerow = ch.plane.PixelsPerRow();
    for (size_t y = 0; y < ch.h; y++) {
      const pixel_type* JXL_RESTRICT r = ch.Row(y);
      for (size_t x = 0; x < ch.w; x++) {
        // Missing neighbours fall back to `left`; the very first sample of a
        // channel uses 0, the first sample of later rows uses the one above.
        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
        pixel_type_w top = (y ? *(r + x - onerow) : left);
        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
        size_t max_diff =
            std::max({left, top, topleft}) - std::min({left, top, topleft});
        // Context = number of cutoffs strictly greater than max_diff.
        size_t ctx = 0;
        for (uint32_t c : cutoffs) {
          ctx += (max_diff < c) ? 1 : 0;
        }
        pixel_type res = r[x] - ClampedGradient(top, left, topleft);
        uint32_t token;
        uint32_t nbits;
        uint32_t bits;
        config.Encode(PackSigned(res), &token, &nbits, &bits);
        histo[ctx].Add(token);
        extra_bits += nbits;
      }
    }
    // Per-channel entropy; integer and fractional parts are summed separately.
    for (auto& h : histo) {
      const float f_cost = h.ShannonEntropy();
      const size_t i_cost = static_cast<size_t>(f_cost);
      histo_cost += i_cost;
      histo_cost_frac += f_cost - static_cast<float>(i_cost);
      h.Clear();
    }
  }
#else
  JxlMemoryManager* memory_manager = img.memory_manager();
  const auto& ctx_map = estimate_cost_detail::ContextMap();
  const HWY_FULL(int32_t) di;
  const HWY_FULL(uint32_t) du;
  const HWY_FULL(float) df;
  // Tokenisation constants. NOTE(review): these presumably mirror the default
  // HybridUintConfig used by the scalar branch - confirm when changing either.
  const auto kOne = Set(du, 1);
  const auto kSplit = Set(du, 16);  // packed values below this are literals
  const auto kExpOffset2 = Set(du, 129);  // 127 + 2
  const auto kTokenBias = Set(du, 8);
  const auto kTokenMul = Set(du, 4);
  const auto kMsbMask = Set(du, 3);
  // Largest valid index into ctx_map.
  const auto kMaxDiffCap = Set(du, estimate_cost_detail::kLastThreshold - 1);
  const auto kLanes = Set(du, Lanes(du));
  const auto kIota = Iota(du, 0);
  // Packed values above kLargeThreshold are shifted right by kLargeShiftVal
  // before the int->float conversion and the shift is added back to the
  // exponent afterwards (presumably to keep the conversion exact).
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
  constexpr size_t kLargeShiftVal = 10;
  const auto kLargeShift = Set(du, kLargeShiftVal);

  // Scratch rows are sized for the widest non-empty channel, rounded up to
  // whole vectors. At least two vectors are required because the primers
  // below store Lanes() values starting at index 1.
  size_t max_w = 0;
  for (const Channel& ch : img.channel) {
    if (ch.h == 0) continue;
    max_w = std::max(max_w, ch.w);
  }
  max_w = RoundUpTo(max_w, Lanes(du));
  max_w = std::max(max_w, 2 * Lanes(du));

  JXL_ASSIGN_OR_RETURN(
      AlignedMemory buffer,
      AlignedMemory::Create(memory_manager, max_w * 2 * sizeof(uint32_t)));
  uint32_t* max_diff_row = buffer.address<uint32_t>();
  uint32_t* token_row = max_diff_row + max_w;
  // The primers deliberately share storage with max_diff_row / token_row.
  // They are written at the start of a row (after the previous row's
  // histogram pass has consumed both rows) and are only read in the x == 0
  // iteration, before that iteration stores to max_diff_row / token_row.
  int32_t* primer = buffer.address<int32_t>();
  int32_t* top_primer = primer + max_w;

  Histogram histo[estimate_cost_detail::kLastCtx + 1] = {};
  auto extra_bits_lanes = Zero(du);
  for (const Channel& ch : img.channel) {
    if (ch.h == 0 || ch.w == 0) continue;
    for (auto& h : histo) {
      h.EnsureCapacity(32 * 4);
    }

    // ---- Row 0: only the left neighbour exists; everything uses histo[0].
    const pixel_type* JXL_RESTRICT r = ch.Row(0);
    const pixel_type* JXL_RESTRICT last = primer;
    // primer = {0, r[0], r[1], ...}: "left" values for the first vector.
    primer[0] = 0;
    StoreU(Load(di, r), di, primer + 1);
    auto pos = kIota;
    const auto last_pos = Set(du, ch.w);
    for (size_t x = 0; x < ch.w; x += Lanes(di)) {
      const auto left = LoadU(di, last);
      const auto central = Load(di, r + x);
      const auto ures = BitCast(du, Sub(central, left));
      // Sign packing: (u << 1) ^ (((~u) >> 31) - 1).
      const auto packed =
          Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
      const auto is_large = Gt(packed, kLargeThreshold);
      const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed);
      const auto not_literal = Ge(packed, kSplit);
      const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed);
      // Float bit pattern: exponent field in bits 23+, mantissa below.
      const auto v = BitCast(du, ConvertTo(df, packed_fixed));
      const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2);
      const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw);
      // token = 8 + 4 * eb + (two most significant mantissa bits).
      const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
                             And(ShiftRight<21>(v), kMsbMask));
      // Lanes at or beyond ch.w must not contribute extra bits.
      const auto tail_mask = Lt(pos, last_pos);
      const auto eb_fixed = IfThenElseZero(not_literal, eb);
      const auto token_fixed = IfThenElse(not_literal, token, packed);
      extra_bits_lanes =
          Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
      Store(token_fixed, du, token_row + x);
      pos = Add(pos, kLanes);
      // From the second vector on, "left" is read straight from the row.
      last = r + x + Lanes(di) - 1;
    }
    // Flush the 32-bit lane counters into the wide total once per row so they
    // cannot wrap around on large images.
    extra_bits += GetLane(SumOfLanes(du, extra_bits_lanes));
    extra_bits_lanes = Zero(du);
    for (size_t x = 0; x < ch.w; x++) {
      histo[0].FastAdd(token_row[x]);
    }

    // ---- Remaining rows: clamped gradient of left / top / top-left.
    for (size_t y = 1; y < ch.h; y++) {
      r = ch.Row(y);
      const pixel_type* JXL_RESTRICT t = ch.Row(y - 1);
      last = primer;
      // For x == 0 both "left" and "top-left" are the sample above.
      primer[0] = t[0];
      StoreU(Load(di, r), di, primer + 1);
      top_primer[0] = t[0];
      StoreU(Load(di, t), di, top_primer + 1);
      const pixel_type* JXL_RESTRICT top_last = top_primer;
      pos = kIota;
      for (size_t x = 0; x < ch.w; x += Lanes(di)) {
        const auto left = LoadU(di, last);
        const auto central = Load(di, r + x);
        const auto topleft = LoadU(di, top_last);
        const auto top = Load(di, t + x);
        const auto l_ge_t = Ge(left, top);
        const auto m = IfThenElse(l_ge_t, top, left);  // min(left, top)
        const auto M = IfThenElse(l_ge_t, left, top);  // max(left, top)
        const auto maxx = Max(topleft, M);
        const auto minn = Min(topleft, m);
        // Difference is taken modulo 2^32 and then capped as unsigned.
        const auto max_diff = BitCast(du, Sub(maxx, minn));
        Store(Min(max_diff, kMaxDiffCap), du, max_diff_row + x);
        const auto overshoot = Lt(topleft, m);
        const auto undershoot = Gt(topleft, M);
        // top + left - topleft, computed on unsigned lanes.
        const auto grad =
            BitCast(di, Sub(Add(BitCast(du, top), BitCast(du, left)),
                            BitCast(du, topleft)));
        const auto prediction =
            IfThenElse(undershoot, m, IfThenElse(overshoot, M, grad));
        const auto ures = BitCast(du, Sub(central, prediction));
        const auto packed =
            Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
        const auto is_large = Gt(packed, kLargeThreshold);
        const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed);
        const auto not_literal = Ge(packed, kSplit);
        const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed);
        const auto v = BitCast(du, ConvertTo(df, packed_fixed));
        const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2);
        const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw);
        const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
                               And(ShiftRight<21>(v), kMsbMask));
        const auto tail_mask = Lt(pos, last_pos);
        const auto eb_fixed = IfThenElseZero(not_literal, eb);
        const auto token_fixed = IfThenElse(not_literal, token, packed);
        extra_bits_lanes =
            Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
        Store(token_fixed, du, token_row + x);
        pos = Add(pos, kLanes);
        last = r + x + Lanes(di) - 1;
        top_last = t + x + Lanes(di) - 1;
      }
      // Same per-row flush as for row 0 (avoids 32-bit wrap-around).
      extra_bits += GetLane(SumOfLanes(du, extra_bits_lanes));
      extra_bits_lanes = Zero(du);
      for (size_t x = 0; x < ch.w; x++) {
        size_t ctx = ctx_map[max_diff_row[x]];
        histo[ctx].FastAdd(token_row[x]);
      }
    }
    // Per-channel entropy; integer and fractional parts are summed separately.
    for (auto& h : histo) {
      h.Condition();
      const float f_cost = h.ShannonEntropy();
      const size_t i_cost = static_cast<size_t>(f_cost);
      histo_cost += i_cost;
      histo_cost_frac += f_cost - static_cast<float>(i_cost);
      h.Clear();
    }
  }
#endif
  const size_t total_cost =
      extra_bits + histo_cost + static_cast<size_t>(histo_cost_frac);
  return static_cast<float>(total_cost);
}
Unexecuted instantiation: jxl::N_SSE4::EstimateCost(jxl::Image const&)
jxl::N_AVX2::EstimateCost(jxl::Image const&)
Line
Count
Source
62
3.00k
// Estimates the coded size, in bits, of all channels of `img`.
//
// Every sample is predicted from its left / top / top-left neighbours with the
// clamped gradient predictor. The sign-packed residual is split into a token
// plus a number of raw extra bits, and the token is added to a histogram
// selected by the local "max difference" (max - min of the three neighbours).
// For each channel the Shannon entropy of every histogram is accumulated and
// the histograms are cleared. The result is
//   extra bits + integer parts of the entropies + truncated sum of their
//   fractional parts.
//
// The HWY_SCALAR build uses HybridUintConfig / ClampedGradient / PackSigned
// directly; all other targets run a vectorised version of the same steps.
//
// Returns the estimate converted to float. In the SIMD branch a failure to
// allocate the scratch buffer is propagated through JXL_ASSIGN_OR_RETURN.
StatusOr<float> EstimateCost(const Image& img) {
  size_t histo_cost = 0;
  float histo_cost_frac = 0.0f;
  size_t extra_bits = 0;

#if HWY_TARGET == HWY_SCALAR
  HybridUintConfig config;
  uint32_t cutoffs[] = {0,  1,  3,  5,   7,   11,  15,  23, 31,
                        47, 63, 95, 127, 191, 255, 392, 500};
  constexpr size_t nc = sizeof(cutoffs) / sizeof(*cutoffs) + 1;
  Histogram histo[nc] = {};
  for (const Channel& ch : img.channel) {
    const ptrdiff_t onerow = ch.plane.PixelsPerRow();
    for (size_t y = 0; y < ch.h; y++) {
      const pixel_type* JXL_RESTRICT r = ch.Row(y);
      for (size_t x = 0; x < ch.w; x++) {
        // Missing neighbours fall back to `left`; the very first sample of a
        // channel uses 0, the first sample of later rows uses the one above.
        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
        pixel_type_w top = (y ? *(r + x - onerow) : left);
        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
        size_t max_diff =
            std::max({left, top, topleft}) - std::min({left, top, topleft});
        // Context = number of cutoffs strictly greater than max_diff.
        size_t ctx = 0;
        for (uint32_t c : cutoffs) {
          ctx += (max_diff < c) ? 1 : 0;
        }
        pixel_type res = r[x] - ClampedGradient(top, left, topleft);
        uint32_t token;
        uint32_t nbits;
        uint32_t bits;
        config.Encode(PackSigned(res), &token, &nbits, &bits);
        histo[ctx].Add(token);
        extra_bits += nbits;
      }
    }
    // Per-channel entropy; integer and fractional parts are summed separately.
    for (auto& h : histo) {
      const float f_cost = h.ShannonEntropy();
      const size_t i_cost = static_cast<size_t>(f_cost);
      histo_cost += i_cost;
      histo_cost_frac += f_cost - static_cast<float>(i_cost);
      h.Clear();
    }
  }
#else
  JxlMemoryManager* memory_manager = img.memory_manager();
  const auto& ctx_map = estimate_cost_detail::ContextMap();
  const HWY_FULL(int32_t) di;
  const HWY_FULL(uint32_t) du;
  const HWY_FULL(float) df;
  // Tokenisation constants. NOTE(review): these presumably mirror the default
  // HybridUintConfig used by the scalar branch - confirm when changing either.
  const auto kOne = Set(du, 1);
  const auto kSplit = Set(du, 16);  // packed values below this are literals
  const auto kExpOffset2 = Set(du, 129);  // 127 + 2
  const auto kTokenBias = Set(du, 8);
  const auto kTokenMul = Set(du, 4);
  const auto kMsbMask = Set(du, 3);
  // Largest valid index into ctx_map.
  const auto kMaxDiffCap = Set(du, estimate_cost_detail::kLastThreshold - 1);
  const auto kLanes = Set(du, Lanes(du));
  const auto kIota = Iota(du, 0);
  // Packed values above kLargeThreshold are shifted right by kLargeShiftVal
  // before the int->float conversion and the shift is added back to the
  // exponent afterwards (presumably to keep the conversion exact).
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
  constexpr size_t kLargeShiftVal = 10;
  const auto kLargeShift = Set(du, kLargeShiftVal);

  // Scratch rows are sized for the widest non-empty channel, rounded up to
  // whole vectors. At least two vectors are required because the primers
  // below store Lanes() values starting at index 1.
  size_t max_w = 0;
  for (const Channel& ch : img.channel) {
    if (ch.h == 0) continue;
    max_w = std::max(max_w, ch.w);
  }
  max_w = RoundUpTo(max_w, Lanes(du));
  max_w = std::max(max_w, 2 * Lanes(du));

  JXL_ASSIGN_OR_RETURN(
      AlignedMemory buffer,
      AlignedMemory::Create(memory_manager, max_w * 2 * sizeof(uint32_t)));
  uint32_t* max_diff_row = buffer.address<uint32_t>();
  uint32_t* token_row = max_diff_row + max_w;
  // The primers deliberately share storage with max_diff_row / token_row.
  // They are written at the start of a row (after the previous row's
  // histogram pass has consumed both rows) and are only read in the x == 0
  // iteration, before that iteration stores to max_diff_row / token_row.
  int32_t* primer = buffer.address<int32_t>();
  int32_t* top_primer = primer + max_w;

  Histogram histo[estimate_cost_detail::kLastCtx + 1] = {};
  auto extra_bits_lanes = Zero(du);
  for (const Channel& ch : img.channel) {
    if (ch.h == 0 || ch.w == 0) continue;
    for (auto& h : histo) {
      h.EnsureCapacity(32 * 4);
    }

    // ---- Row 0: only the left neighbour exists; everything uses histo[0].
    const pixel_type* JXL_RESTRICT r = ch.Row(0);
    const pixel_type* JXL_RESTRICT last = primer;
    // primer = {0, r[0], r[1], ...}: "left" values for the first vector.
    primer[0] = 0;
    StoreU(Load(di, r), di, primer + 1);
    auto pos = kIota;
    const auto last_pos = Set(du, ch.w);
    for (size_t x = 0; x < ch.w; x += Lanes(di)) {
      const auto left = LoadU(di, last);
      const auto central = Load(di, r + x);
      const auto ures = BitCast(du, Sub(central, left));
      // Sign packing: (u << 1) ^ (((~u) >> 31) - 1).
      const auto packed =
          Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
      const auto is_large = Gt(packed, kLargeThreshold);
      const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed);
      const auto not_literal = Ge(packed, kSplit);
      const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed);
      // Float bit pattern: exponent field in bits 23+, mantissa below.
      const auto v = BitCast(du, ConvertTo(df, packed_fixed));
      const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2);
      const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw);
      // token = 8 + 4 * eb + (two most significant mantissa bits).
      const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
                             And(ShiftRight<21>(v), kMsbMask));
      // Lanes at or beyond ch.w must not contribute extra bits.
      const auto tail_mask = Lt(pos, last_pos);
      const auto eb_fixed = IfThenElseZero(not_literal, eb);
      const auto token_fixed = IfThenElse(not_literal, token, packed);
      extra_bits_lanes =
          Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
      Store(token_fixed, du, token_row + x);
      pos = Add(pos, kLanes);
      // From the second vector on, "left" is read straight from the row.
      last = r + x + Lanes(di) - 1;
    }
    // Flush the 32-bit lane counters into the wide total once per row so they
    // cannot wrap around on large images.
    extra_bits += GetLane(SumOfLanes(du, extra_bits_lanes));
    extra_bits_lanes = Zero(du);
    for (size_t x = 0; x < ch.w; x++) {
      histo[0].FastAdd(token_row[x]);
    }

    // ---- Remaining rows: clamped gradient of left / top / top-left.
    for (size_t y = 1; y < ch.h; y++) {
      r = ch.Row(y);
      const pixel_type* JXL_RESTRICT t = ch.Row(y - 1);
      last = primer;
      // For x == 0 both "left" and "top-left" are the sample above.
      primer[0] = t[0];
      StoreU(Load(di, r), di, primer + 1);
      top_primer[0] = t[0];
      StoreU(Load(di, t), di, top_primer + 1);
      const pixel_type* JXL_RESTRICT top_last = top_primer;
      pos = kIota;
      for (size_t x = 0; x < ch.w; x += Lanes(di)) {
        const auto left = LoadU(di, last);
        const auto central = Load(di, r + x);
        const auto topleft = LoadU(di, top_last);
        const auto top = Load(di, t + x);
        const auto l_ge_t = Ge(left, top);
        const auto m = IfThenElse(l_ge_t, top, left);  // min(left, top)
        const auto M = IfThenElse(l_ge_t, left, top);  // max(left, top)
        const auto maxx = Max(topleft, M);
        const auto minn = Min(topleft, m);
        // Difference is taken modulo 2^32 and then capped as unsigned.
        const auto max_diff = BitCast(du, Sub(maxx, minn));
        Store(Min(max_diff, kMaxDiffCap), du, max_diff_row + x);
        const auto overshoot = Lt(topleft, m);
        const auto undershoot = Gt(topleft, M);
        // top + left - topleft, computed on unsigned lanes.
        const auto grad =
            BitCast(di, Sub(Add(BitCast(du, top), BitCast(du, left)),
                            BitCast(du, topleft)));
        const auto prediction =
            IfThenElse(undershoot, m, IfThenElse(overshoot, M, grad));
        const auto ures = BitCast(du, Sub(central, prediction));
        const auto packed =
            Xor(ShiftLeft<1>(ures), Sub(ShiftRight<31>(Not(ures)), kOne));
        const auto is_large = Gt(packed, kLargeThreshold);
        const auto packed_shifted = ShiftRight<kLargeShiftVal>(packed);
        const auto not_literal = Ge(packed, kSplit);
        const auto packed_fixed = IfThenElse(is_large, packed_shifted, packed);
        const auto v = BitCast(du, ConvertTo(df, packed_fixed));
        const auto eb_raw = Sub(ShiftRight<23>(v), kExpOffset2);
        const auto eb = IfThenElse(is_large, Add(eb_raw, kLargeShift), eb_raw);
        const auto token = Add(Add(kTokenBias, Mul(eb, kTokenMul)),
                               And(ShiftRight<21>(v), kMsbMask));
        const auto tail_mask = Lt(pos, last_pos);
        const auto eb_fixed = IfThenElseZero(not_literal, eb);
        const auto token_fixed = IfThenElse(not_literal, token, packed);
        extra_bits_lanes =
            Add(extra_bits_lanes, IfThenElseZero(tail_mask, eb_fixed));
        Store(token_fixed, du, token_row + x);
        pos = Add(pos, kLanes);
        last = r + x + Lanes(di) - 1;
        top_last = t + x + Lanes(di) - 1;
      }
      // Same per-row flush as for row 0 (avoids 32-bit wrap-around).
      extra_bits += GetLane(SumOfLanes(du, extra_bits_lanes));
      extra_bits_lanes = Zero(du);
      for (size_t x = 0; x < ch.w; x++) {
        size_t ctx = ctx_map[max_diff_row[x]];
        histo[ctx].FastAdd(token_row[x]);
      }
    }
    // Per-channel entropy; integer and fractional parts are summed separately.
    for (auto& h : histo) {
      h.Condition();
      const float f_cost = h.ShannonEntropy();
      const size_t i_cost = static_cast<size_t>(f_cost);
      histo_cost += i_cost;
      histo_cost_frac += f_cost - static_cast<float>(i_cost);
      h.Clear();
    }
  }
#endif
  const size_t total_cost =
      extra_bits + histo_cost + static_cast<size_t>(histo_cost_frac);
  return static_cast<float>(total_cost);
}
Unexecuted instantiation: jxl::N_SSE2::EstimateCost(jxl::Image const&)
252
253
// NOLINTNEXTLINE(google-readability-namespace-comments)
254
}  // namespace HWY_NAMESPACE
255
}  // namespace jxl
256
HWY_AFTER_NAMESPACE();
257
258
#if HWY_ONCE
259
namespace jxl {
260
261
HWY_EXPORT(EstimateCost);
262
263
3.00k
// Public entry point: forwards `img` to the per-target EstimateCost
// implementation chosen through Highway's dynamic dispatch and returns its
// result unchanged.
StatusOr<float> EstimateCost(const Image& img) {
  return HWY_DYNAMIC_DISPATCH(EstimateCost)(img);
}
266
267
namespace estimate_cost_detail {
268
/*
269
cutoffs = [0, 1, 3, 5, 7, 11, 15, 23, 31, 47, 63, 95, 127, 191, 255, 392, 500]
270
ctx_map = [[c for c,v in enumerate(cutoffs) if v <= i][-1] for i in range(501)]
271
*/
272
3.00k
const std::array<uint8_t, kLastThreshold>& ContextMap() {
273
3.00k
  static const std::array<uint8_t, kLastThreshold> kCtxMap = {
274
3.00k
      0,  1,  1,  2,  2,  3,  3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,
275
3.00k
      6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
276
3.00k
      8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,
277
3.00k
      9,  9,  9,  9,  9,  9,  9,  9,  9,  10, 10, 10, 10, 10, 10, 10, 10, 10,
278
3.00k
      10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
279
3.00k
      10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
280
3.00k
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
281
3.00k
      11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
282
3.00k
      12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
283
3.00k
      12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
284
3.00k
      12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
285
3.00k
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
286
3.00k
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
287
3.00k
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
288
3.00k
      13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
289
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
290
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
291
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
292
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
293
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
294
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
295
3.00k
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
296
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
297
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
298
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
299
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
300
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
301
3.00k
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16};
302
3.00k
  return kCtxMap;
303
3.00k
}
304
}  // namespace estimate_cost_detail
305
306
}  // namespace jxl
307
#endif