Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/compiler_specific.h"
11
#include "lib/jxl/base/status.h"
12
#include "lib/jxl/dec_ans.h"
13
#include "lib/jxl/memory_manager_internal.h"
14
15
#undef HWY_TARGET_INCLUDE
16
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
17
#include <hwy/foreach_target.h>
18
#include <hwy/highway.h>
19
20
HWY_BEFORE_NAMESPACE();
21
namespace jxl {
22
namespace HWY_NAMESPACE {
23
24
// These templates are not found via ADL.
25
using hwy::HWY_NAMESPACE::Add;
26
using hwy::HWY_NAMESPACE::And;
27
using hwy::HWY_NAMESPACE::Ge;
28
using hwy::HWY_NAMESPACE::GetLane;
29
using hwy::HWY_NAMESPACE::Gt;
30
using hwy::HWY_NAMESPACE::IfThenElse;
31
using hwy::HWY_NAMESPACE::IfThenElseZero;
32
using hwy::HWY_NAMESPACE::Iota;
33
using hwy::HWY_NAMESPACE::LoadU;
34
using hwy::HWY_NAMESPACE::Lt;
35
using hwy::HWY_NAMESPACE::Mul;
36
using hwy::HWY_NAMESPACE::Or;
37
using hwy::HWY_NAMESPACE::Set;
38
using hwy::HWY_NAMESPACE::ShiftRight;
39
using hwy::HWY_NAMESPACE::Store;
40
using hwy::HWY_NAMESPACE::Sub;
41
using hwy::HWY_NAMESPACE::Zero;
42
43
// Vectorized (Highway) pass that maps every input value to a token, writes the
// tokens to out, and returns the total number of extra bits needed by all
// non-literal values.
//
// Template parameters (compile-time constants):
//   E - values below (1 << E) are literals: the token is the value itself and
//       no extra bits are counted.
//   M - number of mantissa bits directly below the leading one bit that are
//       copied into the token.
//   L - number of least-significant bits of the value copied into the token.
// NOTE(review): E/M/L look like the split-exponent / msb-in-token /
// lsb-in-token fields of a hybrid-uint configuration - confirm with callers.
//
// Parameters:
//   values - input array holding len uint32 values.
//   len    - number of valid entries in values.
//   out    - output array, receives one token per input value.
// Returns: sum over all valid non-literal values v of (n - M - L), where n is
// the unbiased float exponent of v, i.e. floor(log2(v)).
//
// NOTE: the tail (len not a multiple of Lanes(du)) is processed with one
// full-width LoadU and one full-width Store, so both arrays are accessed up to
// last_full + Lanes(du) elements. Only the returned sum is masked by len.
// NOTE(review): Store is the aligned Highway store, so out is presumably
// vector-aligned and padded by the caller - confirm.
template <size_t E, size_t M, size_t L>
44
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
45
854k
                               uint32_t* JXL_RESTRICT out) {
46
854k
  const HWY_FULL(uint32_t) du;
47
854k
  const HWY_FULL(float) df;
48
854k
  const auto kZero = Zero(du);
49
854k
  // Literal / non-literal boundary.
  const auto kSplit = Set(du, 1 << E);
50
854k
  // IEEE-754 single precision exponent bias.
  const auto kExpOffset = Set(du, 127);
51
854k
  // Bias plus the M + L bits that live in the token; subtracting it from the
  // biased exponent yields the extra-bit count directly.
  const auto kEBOffset = Set(du, 127 + M + L);
52
854k
  // With kMulN below, kBase + n * kMulN == (1 << E) + ((n - E) << (M + L)).
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
854k
  const auto kMulN = Set(du, 1 << (M + L));
54
854k
  // Selects the low L bits of the value.
  const auto kMaskL = Set(du, (1 << L) - 1);
55
854k
  // Selects M bits located directly above the low L bits.
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
854k
  // Values above this threshold are shifted right by kLargeShiftVal before the
  // int -> float conversion and the exponent is corrected afterwards.
  // NOTE(review): presumably this keeps the conversion exact - confirm.
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
854k
  constexpr size_t kLargeShiftVal = 10;
58
854k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
854k
  // Per-lane running sum of extra bits; reduced across lanes at the end.
  auto extra_bits = kZero;
61
854k
  // Number of elements covered by whole vectors.
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
54.8M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
54.0M
    const auto val = LoadU(du, values + i);
64
54.0M
    const auto is_large = Gt(val, kLargeThreshold);
65
54.0M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
54.0M
    const auto not_literal = Ge(val, kSplit);
67
54.0M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
54.0M
    // Reinterpret the float bits as integers to read exponent and mantissa.
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
54.0M
    const auto l = And(val, kMaskL);
70
54.0M
    // Biased exponent (the sign bit is zero for these non-negative inputs).
    const auto exp = ShiftRight<23>(b);
71
54.0M
    // Undo the pre-shift applied to large values.
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
54.0M
    const auto n = Sub(exp_fixed, kExpOffset);
73
54.0M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
54.0M
    // Move the top M + L mantissa bits down to bit 0; kMaskM then keeps the
    // top M of them at bit position L.
    const auto m = ShiftRight<23 - M - L>(b);
75
54.0M
    const auto a = Add(kBase, Mul(n, kMulN));
76
54.0M
    const auto d = And(m, kMaskM);
77
54.0M
    // Literals contribute no extra bits.
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
54.0M
    const auto c = Or(a, l);
79
54.0M
    extra_bits = Add(extra_bits, eb_fixed);
80
54.0M
    const auto t = Or(c, d);
81
54.0M
    // Literals are passed through unchanged.
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
54.0M
    Store(t_fixed, du, out + i);
83
54.0M
  }
84
854k
  // Tail: same computation on one more full vector starting at last_full.
  if (last_full < len) {
85
735k
    const auto stop = Set(du, len);
86
735k
    // Element index of each lane; lanes with index >= len are excluded from
    // the extra-bit sum via the take mask.
    const auto fence = Iota(du, last_full);
87
735k
    const auto take = Lt(fence, stop);
88
735k
    // Full-width load: reads past values + len.
    const auto val = LoadU(du, values + last_full);
89
735k
    const auto is_large = Gt(val, kLargeThreshold);
90
735k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
735k
    const auto not_literal = Ge(val, kSplit);
92
735k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
735k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
735k
    const auto l = And(val, kMaskL);
95
735k
    const auto exp = ShiftRight<23>(b);
96
735k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
735k
    const auto n = Sub(exp_fixed, kExpOffset);
98
735k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
735k
    const auto m = ShiftRight<23 - M - L>(b);
100
735k
    const auto a = Add(kBase, Mul(n, kMulN));
101
735k
    const auto d = And(m, kMaskM);
102
735k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
735k
    // Drop contributions from lanes beyond len.
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
735k
    const auto c = Or(a, l);
105
735k
    extra_bits = Add(extra_bits, eb_masked);
106
735k
    const auto t = Or(c, d);
107
735k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
735k
    // Full-width store: also writes the lanes beyond len (not masked by take).
    Store(t_fixed, du, out + last_full);
109
735k
  }
110
854k
  // Horizontal reduction of the per-lane sums.
  return GetLane(SumOfLanes(du, extra_bits));
111
854k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
99.8k
                               uint32_t* JXL_RESTRICT out) {
46
99.8k
  const HWY_FULL(uint32_t) du;
47
99.8k
  const HWY_FULL(float) df;
48
99.8k
  const auto kZero = Zero(du);
49
99.8k
  const auto kSplit = Set(du, 1 << E);
50
99.8k
  const auto kExpOffset = Set(du, 127);
51
99.8k
  const auto kEBOffset = Set(du, 127 + M + L);
52
99.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
99.8k
  const auto kMulN = Set(du, 1 << (M + L));
54
99.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
99.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
99.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
99.8k
  constexpr size_t kLargeShiftVal = 10;
58
99.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
99.8k
  auto extra_bits = kZero;
61
99.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
8.30M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
8.20M
    const auto val = LoadU(du, values + i);
64
8.20M
    const auto is_large = Gt(val, kLargeThreshold);
65
8.20M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
8.20M
    const auto not_literal = Ge(val, kSplit);
67
8.20M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
8.20M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
8.20M
    const auto l = And(val, kMaskL);
70
8.20M
    const auto exp = ShiftRight<23>(b);
71
8.20M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
8.20M
    const auto n = Sub(exp_fixed, kExpOffset);
73
8.20M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
8.20M
    const auto m = ShiftRight<23 - M - L>(b);
75
8.20M
    const auto a = Add(kBase, Mul(n, kMulN));
76
8.20M
    const auto d = And(m, kMaskM);
77
8.20M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
8.20M
    const auto c = Or(a, l);
79
8.20M
    extra_bits = Add(extra_bits, eb_fixed);
80
8.20M
    const auto t = Or(c, d);
81
8.20M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
8.20M
    Store(t_fixed, du, out + i);
83
8.20M
  }
84
99.8k
  if (last_full < len) {
85
85.8k
    const auto stop = Set(du, len);
86
85.8k
    const auto fence = Iota(du, last_full);
87
85.8k
    const auto take = Lt(fence, stop);
88
85.8k
    const auto val = LoadU(du, values + last_full);
89
85.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
85.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
85.8k
    const auto not_literal = Ge(val, kSplit);
92
85.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
85.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
85.8k
    const auto l = And(val, kMaskL);
95
85.8k
    const auto exp = ShiftRight<23>(b);
96
85.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
85.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
85.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
85.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
85.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
85.8k
    const auto d = And(m, kMaskM);
102
85.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
85.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
85.8k
    const auto c = Or(a, l);
105
85.8k
    extra_bits = Add(extra_bits, eb_masked);
106
85.8k
    const auto t = Or(c, d);
107
85.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
85.8k
    Store(t_fixed, du, out + last_full);
109
85.8k
  }
110
99.8k
  return GetLane(SumOfLanes(du, extra_bits));
111
99.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
96.8k
                               uint32_t* JXL_RESTRICT out) {
46
96.8k
  const HWY_FULL(uint32_t) du;
47
96.8k
  const HWY_FULL(float) df;
48
96.8k
  const auto kZero = Zero(du);
49
96.8k
  const auto kSplit = Set(du, 1 << E);
50
96.8k
  const auto kExpOffset = Set(du, 127);
51
96.8k
  const auto kEBOffset = Set(du, 127 + M + L);
52
96.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
96.8k
  const auto kMulN = Set(du, 1 << (M + L));
54
96.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
96.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
96.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
96.8k
  constexpr size_t kLargeShiftVal = 10;
58
96.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
96.8k
  auto extra_bits = kZero;
61
96.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
8.29M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
8.20M
    const auto val = LoadU(du, values + i);
64
8.20M
    const auto is_large = Gt(val, kLargeThreshold);
65
8.20M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
8.20M
    const auto not_literal = Ge(val, kSplit);
67
8.20M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
8.20M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
8.20M
    const auto l = And(val, kMaskL);
70
8.20M
    const auto exp = ShiftRight<23>(b);
71
8.20M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
8.20M
    const auto n = Sub(exp_fixed, kExpOffset);
73
8.20M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
8.20M
    const auto m = ShiftRight<23 - M - L>(b);
75
8.20M
    const auto a = Add(kBase, Mul(n, kMulN));
76
8.20M
    const auto d = And(m, kMaskM);
77
8.20M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
8.20M
    const auto c = Or(a, l);
79
8.20M
    extra_bits = Add(extra_bits, eb_fixed);
80
8.20M
    const auto t = Or(c, d);
81
8.20M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
8.20M
    Store(t_fixed, du, out + i);
83
8.20M
  }
84
96.8k
  if (last_full < len) {
85
83.0k
    const auto stop = Set(du, len);
86
83.0k
    const auto fence = Iota(du, last_full);
87
83.0k
    const auto take = Lt(fence, stop);
88
83.0k
    const auto val = LoadU(du, values + last_full);
89
83.0k
    const auto is_large = Gt(val, kLargeThreshold);
90
83.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
83.0k
    const auto not_literal = Ge(val, kSplit);
92
83.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
83.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
83.0k
    const auto l = And(val, kMaskL);
95
83.0k
    const auto exp = ShiftRight<23>(b);
96
83.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
83.0k
    const auto n = Sub(exp_fixed, kExpOffset);
98
83.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
83.0k
    const auto m = ShiftRight<23 - M - L>(b);
100
83.0k
    const auto a = Add(kBase, Mul(n, kMulN));
101
83.0k
    const auto d = And(m, kMaskM);
102
83.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
83.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
83.0k
    const auto c = Or(a, l);
105
83.0k
    extra_bits = Add(extra_bits, eb_masked);
106
83.0k
    const auto t = Or(c, d);
107
83.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
83.0k
    Store(t_fixed, du, out + last_full);
109
83.0k
  }
110
96.8k
  return GetLane(SumOfLanes(du, extra_bits));
111
96.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
96.8k
                               uint32_t* JXL_RESTRICT out) {
46
96.8k
  const HWY_FULL(uint32_t) du;
47
96.8k
  const HWY_FULL(float) df;
48
96.8k
  const auto kZero = Zero(du);
49
96.8k
  const auto kSplit = Set(du, 1 << E);
50
96.8k
  const auto kExpOffset = Set(du, 127);
51
96.8k
  const auto kEBOffset = Set(du, 127 + M + L);
52
96.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
96.8k
  const auto kMulN = Set(du, 1 << (M + L));
54
96.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
96.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
96.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
96.8k
  constexpr size_t kLargeShiftVal = 10;
58
96.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
96.8k
  auto extra_bits = kZero;
61
96.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
8.29M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
8.20M
    const auto val = LoadU(du, values + i);
64
8.20M
    const auto is_large = Gt(val, kLargeThreshold);
65
8.20M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
8.20M
    const auto not_literal = Ge(val, kSplit);
67
8.20M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
8.20M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
8.20M
    const auto l = And(val, kMaskL);
70
8.20M
    const auto exp = ShiftRight<23>(b);
71
8.20M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
8.20M
    const auto n = Sub(exp_fixed, kExpOffset);
73
8.20M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
8.20M
    const auto m = ShiftRight<23 - M - L>(b);
75
8.20M
    const auto a = Add(kBase, Mul(n, kMulN));
76
8.20M
    const auto d = And(m, kMaskM);
77
8.20M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
8.20M
    const auto c = Or(a, l);
79
8.20M
    extra_bits = Add(extra_bits, eb_fixed);
80
8.20M
    const auto t = Or(c, d);
81
8.20M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
8.20M
    Store(t_fixed, du, out + i);
83
8.20M
  }
84
96.8k
  if (last_full < len) {
85
83.0k
    const auto stop = Set(du, len);
86
83.0k
    const auto fence = Iota(du, last_full);
87
83.0k
    const auto take = Lt(fence, stop);
88
83.0k
    const auto val = LoadU(du, values + last_full);
89
83.0k
    const auto is_large = Gt(val, kLargeThreshold);
90
83.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
83.0k
    const auto not_literal = Ge(val, kSplit);
92
83.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
83.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
83.0k
    const auto l = And(val, kMaskL);
95
83.0k
    const auto exp = ShiftRight<23>(b);
96
83.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
83.0k
    const auto n = Sub(exp_fixed, kExpOffset);
98
83.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
83.0k
    const auto m = ShiftRight<23 - M - L>(b);
100
83.0k
    const auto a = Add(kBase, Mul(n, kMulN));
101
83.0k
    const auto d = And(m, kMaskM);
102
83.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
83.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
83.0k
    const auto c = Or(a, l);
105
83.0k
    extra_bits = Add(extra_bits, eb_masked);
106
83.0k
    const auto t = Or(c, d);
107
83.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
83.0k
    Store(t_fixed, du, out + last_full);
109
83.0k
  }
110
96.8k
  return GetLane(SumOfLanes(du, extra_bits));
111
96.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
96.8k
                               uint32_t* JXL_RESTRICT out) {
46
96.8k
  const HWY_FULL(uint32_t) du;
47
96.8k
  const HWY_FULL(float) df;
48
96.8k
  const auto kZero = Zero(du);
49
96.8k
  const auto kSplit = Set(du, 1 << E);
50
96.8k
  const auto kExpOffset = Set(du, 127);
51
96.8k
  const auto kEBOffset = Set(du, 127 + M + L);
52
96.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
96.8k
  const auto kMulN = Set(du, 1 << (M + L));
54
96.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
96.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
96.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
96.8k
  constexpr size_t kLargeShiftVal = 10;
58
96.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
96.8k
  auto extra_bits = kZero;
61
96.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
8.29M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
8.20M
    const auto val = LoadU(du, values + i);
64
8.20M
    const auto is_large = Gt(val, kLargeThreshold);
65
8.20M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
8.20M
    const auto not_literal = Ge(val, kSplit);
67
8.20M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
8.20M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
8.20M
    const auto l = And(val, kMaskL);
70
8.20M
    const auto exp = ShiftRight<23>(b);
71
8.20M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
8.20M
    const auto n = Sub(exp_fixed, kExpOffset);
73
8.20M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
8.20M
    const auto m = ShiftRight<23 - M - L>(b);
75
8.20M
    const auto a = Add(kBase, Mul(n, kMulN));
76
8.20M
    const auto d = And(m, kMaskM);
77
8.20M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
8.20M
    const auto c = Or(a, l);
79
8.20M
    extra_bits = Add(extra_bits, eb_fixed);
80
8.20M
    const auto t = Or(c, d);
81
8.20M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
8.20M
    Store(t_fixed, du, out + i);
83
8.20M
  }
84
96.8k
  if (last_full < len) {
85
83.0k
    const auto stop = Set(du, len);
86
83.0k
    const auto fence = Iota(du, last_full);
87
83.0k
    const auto take = Lt(fence, stop);
88
83.0k
    const auto val = LoadU(du, values + last_full);
89
83.0k
    const auto is_large = Gt(val, kLargeThreshold);
90
83.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
83.0k
    const auto not_literal = Ge(val, kSplit);
92
83.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
83.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
83.0k
    const auto l = And(val, kMaskL);
95
83.0k
    const auto exp = ShiftRight<23>(b);
96
83.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
83.0k
    const auto n = Sub(exp_fixed, kExpOffset);
98
83.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
83.0k
    const auto m = ShiftRight<23 - M - L>(b);
100
83.0k
    const auto a = Add(kBase, Mul(n, kMulN));
101
83.0k
    const auto d = And(m, kMaskM);
102
83.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
83.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
83.0k
    const auto c = Or(a, l);
105
83.0k
    extra_bits = Add(extra_bits, eb_masked);
106
83.0k
    const auto t = Or(c, d);
107
83.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
83.0k
    Store(t_fixed, du, out + last_full);
109
83.0k
  }
110
96.8k
  return GetLane(SumOfLanes(du, extra_bits));
111
96.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.6k
                               uint32_t* JXL_RESTRICT out) {
46
20.6k
  const HWY_FULL(uint32_t) du;
47
20.6k
  const HWY_FULL(float) df;
48
20.6k
  const auto kZero = Zero(du);
49
20.6k
  const auto kSplit = Set(du, 1 << E);
50
20.6k
  const auto kExpOffset = Set(du, 127);
51
20.6k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.6k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.6k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.6k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.6k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.6k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.6k
  constexpr size_t kLargeShiftVal = 10;
58
20.6k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.6k
  auto extra_bits = kZero;
61
20.6k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.05M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.6k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.6k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.6k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.6k
                               uint32_t* JXL_RESTRICT out) {
46
20.6k
  const HWY_FULL(uint32_t) du;
47
20.6k
  const HWY_FULL(float) df;
48
20.6k
  const auto kZero = Zero(du);
49
20.6k
  const auto kSplit = Set(du, 1 << E);
50
20.6k
  const auto kExpOffset = Set(du, 127);
51
20.6k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.6k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.6k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.6k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.6k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.6k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.6k
  constexpr size_t kLargeShiftVal = 10;
58
20.6k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.6k
  auto extra_bits = kZero;
61
20.6k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.05M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.6k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.6k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.6k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
17.6k
                               uint32_t* JXL_RESTRICT out) {
46
17.6k
  const HWY_FULL(uint32_t) du;
47
17.6k
  const HWY_FULL(float) df;
48
17.6k
  const auto kZero = Zero(du);
49
17.6k
  const auto kSplit = Set(du, 1 << E);
50
17.6k
  const auto kExpOffset = Set(du, 127);
51
17.6k
  const auto kEBOffset = Set(du, 127 + M + L);
52
17.6k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
17.6k
  const auto kMulN = Set(du, 1 << (M + L));
54
17.6k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
17.6k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
17.6k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
17.6k
  constexpr size_t kLargeShiftVal = 10;
58
17.6k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
17.6k
  auto extra_bits = kZero;
61
17.6k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
572k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
555k
    const auto val = LoadU(du, values + i);
64
555k
    const auto is_large = Gt(val, kLargeThreshold);
65
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
555k
    const auto not_literal = Ge(val, kSplit);
67
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
555k
    const auto l = And(val, kMaskL);
70
555k
    const auto exp = ShiftRight<23>(b);
71
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
555k
    const auto n = Sub(exp_fixed, kExpOffset);
73
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
555k
    const auto m = ShiftRight<23 - M - L>(b);
75
555k
    const auto a = Add(kBase, Mul(n, kMulN));
76
555k
    const auto d = And(m, kMaskM);
77
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
555k
    const auto c = Or(a, l);
79
555k
    extra_bits = Add(extra_bits, eb_fixed);
80
555k
    const auto t = Or(c, d);
81
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
555k
    Store(t_fixed, du, out + i);
83
555k
  }
84
17.6k
  if (last_full < len) {
85
15.2k
    const auto stop = Set(du, len);
86
15.2k
    const auto fence = Iota(du, last_full);
87
15.2k
    const auto take = Lt(fence, stop);
88
15.2k
    const auto val = LoadU(du, values + last_full);
89
15.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
15.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
15.2k
    const auto not_literal = Ge(val, kSplit);
92
15.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
15.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
15.2k
    const auto l = And(val, kMaskL);
95
15.2k
    const auto exp = ShiftRight<23>(b);
96
15.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
15.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
15.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
15.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
15.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
15.2k
    const auto d = And(m, kMaskM);
102
15.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
15.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
15.2k
    const auto c = Or(a, l);
105
15.2k
    extra_bits = Add(extra_bits, eb_masked);
106
15.2k
    const auto t = Or(c, d);
107
15.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
15.2k
    Store(t_fixed, du, out + last_full);
109
15.2k
  }
110
17.6k
  return GetLane(SumOfLanes(du, extra_bits));
111
17.6k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
17.6k
                               uint32_t* JXL_RESTRICT out) {
46
17.6k
  const HWY_FULL(uint32_t) du;
47
17.6k
  const HWY_FULL(float) df;
48
17.6k
  const auto kZero = Zero(du);
49
17.6k
  const auto kSplit = Set(du, 1 << E);
50
17.6k
  const auto kExpOffset = Set(du, 127);
51
17.6k
  const auto kEBOffset = Set(du, 127 + M + L);
52
17.6k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
17.6k
  const auto kMulN = Set(du, 1 << (M + L));
54
17.6k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
17.6k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
17.6k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
17.6k
  constexpr size_t kLargeShiftVal = 10;
58
17.6k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
17.6k
  auto extra_bits = kZero;
61
17.6k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
572k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
555k
    const auto val = LoadU(du, values + i);
64
555k
    const auto is_large = Gt(val, kLargeThreshold);
65
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
555k
    const auto not_literal = Ge(val, kSplit);
67
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
555k
    const auto l = And(val, kMaskL);
70
555k
    const auto exp = ShiftRight<23>(b);
71
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
555k
    const auto n = Sub(exp_fixed, kExpOffset);
73
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
555k
    const auto m = ShiftRight<23 - M - L>(b);
75
555k
    const auto a = Add(kBase, Mul(n, kMulN));
76
555k
    const auto d = And(m, kMaskM);
77
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
555k
    const auto c = Or(a, l);
79
555k
    extra_bits = Add(extra_bits, eb_fixed);
80
555k
    const auto t = Or(c, d);
81
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
555k
    Store(t_fixed, du, out + i);
83
555k
  }
84
17.6k
  if (last_full < len) {
85
15.2k
    const auto stop = Set(du, len);
86
15.2k
    const auto fence = Iota(du, last_full);
87
15.2k
    const auto take = Lt(fence, stop);
88
15.2k
    const auto val = LoadU(du, values + last_full);
89
15.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
15.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
15.2k
    const auto not_literal = Ge(val, kSplit);
92
15.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
15.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
15.2k
    const auto l = And(val, kMaskL);
95
15.2k
    const auto exp = ShiftRight<23>(b);
96
15.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
15.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
15.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
15.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
15.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
15.2k
    const auto d = And(m, kMaskM);
102
15.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
15.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
15.2k
    const auto c = Or(a, l);
105
15.2k
    extra_bits = Add(extra_bits, eb_masked);
106
15.2k
    const auto t = Or(c, d);
107
15.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
15.2k
    Store(t_fixed, du, out + last_full);
109
15.2k
  }
110
17.6k
  return GetLane(SumOfLanes(du, extra_bits));
111
17.6k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
20.7k
                               uint32_t* JXL_RESTRICT out) {
46
20.7k
  const HWY_FULL(uint32_t) du;
47
20.7k
  const HWY_FULL(float) df;
48
20.7k
  const auto kZero = Zero(du);
49
20.7k
  const auto kSplit = Set(du, 1 << E);
50
20.7k
  const auto kExpOffset = Set(du, 127);
51
20.7k
  const auto kEBOffset = Set(du, 127 + M + L);
52
20.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
20.7k
  const auto kMulN = Set(du, 1 << (M + L));
54
20.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
20.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
20.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
20.7k
  constexpr size_t kLargeShiftVal = 10;
58
20.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
20.7k
  auto extra_bits = kZero;
61
20.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
1.06M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
1.03M
    const auto val = LoadU(du, values + i);
64
1.03M
    const auto is_large = Gt(val, kLargeThreshold);
65
1.03M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
1.03M
    const auto not_literal = Ge(val, kSplit);
67
1.03M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
1.03M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
1.03M
    const auto l = And(val, kMaskL);
70
1.03M
    const auto exp = ShiftRight<23>(b);
71
1.03M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
1.03M
    const auto n = Sub(exp_fixed, kExpOffset);
73
1.03M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
1.03M
    const auto m = ShiftRight<23 - M - L>(b);
75
1.03M
    const auto a = Add(kBase, Mul(n, kMulN));
76
1.03M
    const auto d = And(m, kMaskM);
77
1.03M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
1.03M
    const auto c = Or(a, l);
79
1.03M
    extra_bits = Add(extra_bits, eb_fixed);
80
1.03M
    const auto t = Or(c, d);
81
1.03M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
1.03M
    Store(t_fixed, du, out + i);
83
1.03M
  }
84
20.7k
  if (last_full < len) {
85
17.8k
    const auto stop = Set(du, len);
86
17.8k
    const auto fence = Iota(du, last_full);
87
17.8k
    const auto take = Lt(fence, stop);
88
17.8k
    const auto val = LoadU(du, values + last_full);
89
17.8k
    const auto is_large = Gt(val, kLargeThreshold);
90
17.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
17.8k
    const auto not_literal = Ge(val, kSplit);
92
17.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
17.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
17.8k
    const auto l = And(val, kMaskL);
95
17.8k
    const auto exp = ShiftRight<23>(b);
96
17.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
17.8k
    const auto n = Sub(exp_fixed, kExpOffset);
98
17.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
17.8k
    const auto m = ShiftRight<23 - M - L>(b);
100
17.8k
    const auto a = Add(kBase, Mul(n, kMulN));
101
17.8k
    const auto d = And(m, kMaskM);
102
17.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
17.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
17.8k
    const auto c = Or(a, l);
105
17.8k
    extra_bits = Add(extra_bits, eb_masked);
106
17.8k
    const auto t = Or(c, d);
107
17.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
17.8k
    Store(t_fixed, du, out + last_full);
109
17.8k
  }
110
20.7k
  return GetLane(SumOfLanes(du, extra_bits));
111
20.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
15.4k
                               uint32_t* JXL_RESTRICT out) {
46
15.4k
  const HWY_FULL(uint32_t) du;
47
15.4k
  const HWY_FULL(float) df;
48
15.4k
  const auto kZero = Zero(du);
49
15.4k
  const auto kSplit = Set(du, 1 << E);
50
15.4k
  const auto kExpOffset = Set(du, 127);
51
15.4k
  const auto kEBOffset = Set(du, 127 + M + L);
52
15.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
15.4k
  const auto kMulN = Set(du, 1 << (M + L));
54
15.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
15.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
15.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
15.4k
  constexpr size_t kLargeShiftVal = 10;
58
15.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
15.4k
  auto extra_bits = kZero;
61
15.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
500k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
485k
    const auto val = LoadU(du, values + i);
64
485k
    const auto is_large = Gt(val, kLargeThreshold);
65
485k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
485k
    const auto not_literal = Ge(val, kSplit);
67
485k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
485k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
485k
    const auto l = And(val, kMaskL);
70
485k
    const auto exp = ShiftRight<23>(b);
71
485k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
485k
    const auto n = Sub(exp_fixed, kExpOffset);
73
485k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
485k
    const auto m = ShiftRight<23 - M - L>(b);
75
485k
    const auto a = Add(kBase, Mul(n, kMulN));
76
485k
    const auto d = And(m, kMaskM);
77
485k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
485k
    const auto c = Or(a, l);
79
485k
    extra_bits = Add(extra_bits, eb_fixed);
80
485k
    const auto t = Or(c, d);
81
485k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
485k
    Store(t_fixed, du, out + i);
83
485k
  }
84
15.4k
  if (last_full < len) {
85
13.2k
    const auto stop = Set(du, len);
86
13.2k
    const auto fence = Iota(du, last_full);
87
13.2k
    const auto take = Lt(fence, stop);
88
13.2k
    const auto val = LoadU(du, values + last_full);
89
13.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
13.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
13.2k
    const auto not_literal = Ge(val, kSplit);
92
13.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
13.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
13.2k
    const auto l = And(val, kMaskL);
95
13.2k
    const auto exp = ShiftRight<23>(b);
96
13.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
13.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
13.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
13.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
13.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
13.2k
    const auto d = And(m, kMaskM);
102
13.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
13.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
13.2k
    const auto c = Or(a, l);
105
13.2k
    extra_bits = Add(extra_bits, eb_masked);
106
13.2k
    const auto t = Or(c, d);
107
13.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
13.2k
    Store(t_fixed, du, out + last_full);
109
13.2k
  }
110
15.4k
  return GetLane(SumOfLanes(du, extra_bits));
111
15.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
15.4k
                               uint32_t* JXL_RESTRICT out) {
46
15.4k
  const HWY_FULL(uint32_t) du;
47
15.4k
  const HWY_FULL(float) df;
48
15.4k
  const auto kZero = Zero(du);
49
15.4k
  const auto kSplit = Set(du, 1 << E);
50
15.4k
  const auto kExpOffset = Set(du, 127);
51
15.4k
  const auto kEBOffset = Set(du, 127 + M + L);
52
15.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
15.4k
  const auto kMulN = Set(du, 1 << (M + L));
54
15.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
15.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
15.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
15.4k
  constexpr size_t kLargeShiftVal = 10;
58
15.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
15.4k
  auto extra_bits = kZero;
61
15.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
500k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
485k
    const auto val = LoadU(du, values + i);
64
485k
    const auto is_large = Gt(val, kLargeThreshold);
65
485k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
485k
    const auto not_literal = Ge(val, kSplit);
67
485k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
485k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
485k
    const auto l = And(val, kMaskL);
70
485k
    const auto exp = ShiftRight<23>(b);
71
485k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
485k
    const auto n = Sub(exp_fixed, kExpOffset);
73
485k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
485k
    const auto m = ShiftRight<23 - M - L>(b);
75
485k
    const auto a = Add(kBase, Mul(n, kMulN));
76
485k
    const auto d = And(m, kMaskM);
77
485k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
485k
    const auto c = Or(a, l);
79
485k
    extra_bits = Add(extra_bits, eb_fixed);
80
485k
    const auto t = Or(c, d);
81
485k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
485k
    Store(t_fixed, du, out + i);
83
485k
  }
84
15.4k
  if (last_full < len) {
85
13.2k
    const auto stop = Set(du, len);
86
13.2k
    const auto fence = Iota(du, last_full);
87
13.2k
    const auto take = Lt(fence, stop);
88
13.2k
    const auto val = LoadU(du, values + last_full);
89
13.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
13.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
13.2k
    const auto not_literal = Ge(val, kSplit);
92
13.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
13.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
13.2k
    const auto l = And(val, kMaskL);
95
13.2k
    const auto exp = ShiftRight<23>(b);
96
13.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
13.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
13.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
13.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
13.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
13.2k
    const auto d = And(m, kMaskM);
102
13.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
13.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
13.2k
    const auto c = Or(a, l);
105
13.2k
    extra_bits = Add(extra_bits, eb_masked);
106
13.2k
    const auto t = Or(c, d);
107
13.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
13.2k
    Store(t_fixed, du, out + last_full);
109
13.2k
  }
110
15.4k
  return GetLane(SumOfLanes(du, extra_bits));
111
15.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
15.4k
                               uint32_t* JXL_RESTRICT out) {
46
15.4k
  const HWY_FULL(uint32_t) du;
47
15.4k
  const HWY_FULL(float) df;
48
15.4k
  const auto kZero = Zero(du);
49
15.4k
  const auto kSplit = Set(du, 1 << E);
50
15.4k
  const auto kExpOffset = Set(du, 127);
51
15.4k
  const auto kEBOffset = Set(du, 127 + M + L);
52
15.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
15.4k
  const auto kMulN = Set(du, 1 << (M + L));
54
15.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
15.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
15.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
15.4k
  constexpr size_t kLargeShiftVal = 10;
58
15.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
15.4k
  auto extra_bits = kZero;
61
15.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
500k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
485k
    const auto val = LoadU(du, values + i);
64
485k
    const auto is_large = Gt(val, kLargeThreshold);
65
485k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
485k
    const auto not_literal = Ge(val, kSplit);
67
485k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
485k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
485k
    const auto l = And(val, kMaskL);
70
485k
    const auto exp = ShiftRight<23>(b);
71
485k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
485k
    const auto n = Sub(exp_fixed, kExpOffset);
73
485k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
485k
    const auto m = ShiftRight<23 - M - L>(b);
75
485k
    const auto a = Add(kBase, Mul(n, kMulN));
76
485k
    const auto d = And(m, kMaskM);
77
485k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
485k
    const auto c = Or(a, l);
79
485k
    extra_bits = Add(extra_bits, eb_fixed);
80
485k
    const auto t = Or(c, d);
81
485k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
485k
    Store(t_fixed, du, out + i);
83
485k
  }
84
15.4k
  if (last_full < len) {
85
13.2k
    const auto stop = Set(du, len);
86
13.2k
    const auto fence = Iota(du, last_full);
87
13.2k
    const auto take = Lt(fence, stop);
88
13.2k
    const auto val = LoadU(du, values + last_full);
89
13.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
13.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
13.2k
    const auto not_literal = Ge(val, kSplit);
92
13.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
13.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
13.2k
    const auto l = And(val, kMaskL);
95
13.2k
    const auto exp = ShiftRight<23>(b);
96
13.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
13.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
13.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
13.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
13.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
13.2k
    const auto d = And(m, kMaskM);
102
13.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
13.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
13.2k
    const auto c = Or(a, l);
105
13.2k
    extra_bits = Add(extra_bits, eb_masked);
106
13.2k
    const auto t = Or(c, d);
107
13.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
13.2k
    Store(t_fixed, du, out + last_full);
109
13.2k
  }
110
15.4k
  return GetLane(SumOfLanes(du, extra_bits));
111
15.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
15.4k
                               uint32_t* JXL_RESTRICT out) {
46
15.4k
  const HWY_FULL(uint32_t) du;
47
15.4k
  const HWY_FULL(float) df;
48
15.4k
  const auto kZero = Zero(du);
49
15.4k
  const auto kSplit = Set(du, 1 << E);
50
15.4k
  const auto kExpOffset = Set(du, 127);
51
15.4k
  const auto kEBOffset = Set(du, 127 + M + L);
52
15.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
15.4k
  const auto kMulN = Set(du, 1 << (M + L));
54
15.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
15.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
15.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
15.4k
  constexpr size_t kLargeShiftVal = 10;
58
15.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
15.4k
  auto extra_bits = kZero;
61
15.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
500k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
485k
    const auto val = LoadU(du, values + i);
64
485k
    const auto is_large = Gt(val, kLargeThreshold);
65
485k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
485k
    const auto not_literal = Ge(val, kSplit);
67
485k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
485k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
485k
    const auto l = And(val, kMaskL);
70
485k
    const auto exp = ShiftRight<23>(b);
71
485k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
485k
    const auto n = Sub(exp_fixed, kExpOffset);
73
485k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
485k
    const auto m = ShiftRight<23 - M - L>(b);
75
485k
    const auto a = Add(kBase, Mul(n, kMulN));
76
485k
    const auto d = And(m, kMaskM);
77
485k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
485k
    const auto c = Or(a, l);
79
485k
    extra_bits = Add(extra_bits, eb_fixed);
80
485k
    const auto t = Or(c, d);
81
485k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
485k
    Store(t_fixed, du, out + i);
83
485k
  }
84
15.4k
  if (last_full < len) {
85
13.2k
    const auto stop = Set(du, len);
86
13.2k
    const auto fence = Iota(du, last_full);
87
13.2k
    const auto take = Lt(fence, stop);
88
13.2k
    const auto val = LoadU(du, values + last_full);
89
13.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
13.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
13.2k
    const auto not_literal = Ge(val, kSplit);
92
13.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
13.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
13.2k
    const auto l = And(val, kMaskL);
95
13.2k
    const auto exp = ShiftRight<23>(b);
96
13.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
13.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
13.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
13.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
13.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
13.2k
    const auto d = And(m, kMaskM);
102
13.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
13.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
13.2k
    const auto c = Or(a, l);
105
13.2k
    extra_bits = Add(extra_bits, eb_masked);
106
13.2k
    const auto t = Or(c, d);
107
13.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
13.2k
    Store(t_fixed, du, out + last_full);
109
13.2k
  }
110
15.4k
  return GetLane(SumOfLanes(du, extra_bits));
111
15.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
15.4k
                               uint32_t* JXL_RESTRICT out) {
46
15.4k
  const HWY_FULL(uint32_t) du;
47
15.4k
  const HWY_FULL(float) df;
48
15.4k
  const auto kZero = Zero(du);
49
15.4k
  const auto kSplit = Set(du, 1 << E);
50
15.4k
  const auto kExpOffset = Set(du, 127);
51
15.4k
  const auto kEBOffset = Set(du, 127 + M + L);
52
15.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
15.4k
  const auto kMulN = Set(du, 1 << (M + L));
54
15.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
15.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
15.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
15.4k
  constexpr size_t kLargeShiftVal = 10;
58
15.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
15.4k
  auto extra_bits = kZero;
61
15.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
500k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
485k
    const auto val = LoadU(du, values + i);
64
485k
    const auto is_large = Gt(val, kLargeThreshold);
65
485k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
485k
    const auto not_literal = Ge(val, kSplit);
67
485k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
485k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
485k
    const auto l = And(val, kMaskL);
70
485k
    const auto exp = ShiftRight<23>(b);
71
485k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
485k
    const auto n = Sub(exp_fixed, kExpOffset);
73
485k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
485k
    const auto m = ShiftRight<23 - M - L>(b);
75
485k
    const auto a = Add(kBase, Mul(n, kMulN));
76
485k
    const auto d = And(m, kMaskM);
77
485k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
485k
    const auto c = Or(a, l);
79
485k
    extra_bits = Add(extra_bits, eb_fixed);
80
485k
    const auto t = Or(c, d);
81
485k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
485k
    Store(t_fixed, du, out + i);
83
485k
  }
84
15.4k
  if (last_full < len) {
85
13.2k
    const auto stop = Set(du, len);
86
13.2k
    const auto fence = Iota(du, last_full);
87
13.2k
    const auto take = Lt(fence, stop);
88
13.2k
    const auto val = LoadU(du, values + last_full);
89
13.2k
    const auto is_large = Gt(val, kLargeThreshold);
90
13.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
13.2k
    const auto not_literal = Ge(val, kSplit);
92
13.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
13.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
13.2k
    const auto l = And(val, kMaskL);
95
13.2k
    const auto exp = ShiftRight<23>(b);
96
13.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
13.2k
    const auto n = Sub(exp_fixed, kExpOffset);
98
13.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
13.2k
    const auto m = ShiftRight<23 - M - L>(b);
100
13.2k
    const auto a = Add(kBase, Mul(n, kMulN));
101
13.2k
    const auto d = And(m, kMaskM);
102
13.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
13.2k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
13.2k
    const auto c = Or(a, l);
105
13.2k
    extra_bits = Add(extra_bits, eb_masked);
106
13.2k
    const auto t = Or(c, d);
107
13.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
13.2k
    Store(t_fixed, du, out + last_full);
109
13.2k
  }
110
15.4k
  return GetLane(SumOfLanes(du, extra_bits));
111
15.4k
}
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
112
113
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
114
854k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
115
854k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
116
#if HWY_TARGET == HWY_SCALAR
117
  uint32_t extra_bits = 0;
118
  for (size_t i = 0; i < len; ++i) {
119
    uint32_t v = values[i];
120
    uint32_t tok, nbits, bits;
121
    cfg.Encode(v, &tok, &nbits, &bits);
122
    extra_bits += nbits;
123
    out[i] = tok;
124
  }
125
  return extra_bits;
126
#else
127
854k
  if (cfg.split_exponent == 0) {
128
99.8k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
129
755k
  } else if (cfg.split_exponent == 2) {
130
96.8k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
131
96.8k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
132
658k
  } else if (cfg.split_exponent == 3) {
133
82.8k
    if (cfg.msb_in_token == 1) {
134
41.4k
      if (cfg.lsb_in_token == 0) {
135
20.7k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
136
20.7k
      } else {
137
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 2);
138
20.7k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
139
20.7k
      }
140
41.4k
    } else {
141
41.4k
      JXL_DASSERT(cfg.msb_in_token == 2);
142
41.4k
      if (cfg.lsb_in_token == 0) {
143
20.7k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
144
20.7k
      } else {
145
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 1);
146
20.7k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
147
20.7k
      }
148
41.4k
    }
149
575k
  } else if (cfg.split_exponent == 4) {
150
276k
    if (cfg.msb_in_token == 1) {
151
138k
      if (cfg.lsb_in_token == 0) {
152
20.7k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
153
117k
      } else if (cfg.lsb_in_token == 2) {
154
96.8k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
155
96.8k
      } else {
156
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 3);
157
20.7k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
158
20.7k
      }
159
138k
    } else {
160
138k
      JXL_DASSERT(cfg.msb_in_token == 2);
161
138k
      if (cfg.lsb_in_token == 0) {
162
96.8k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
163
96.8k
      } else if (cfg.lsb_in_token == 1) {
164
20.7k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
165
20.7k
      } else {
166
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 2);
167
20.7k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
168
20.7k
      }
169
138k
    }
170
298k
  } else if (cfg.split_exponent == 5) {
171
144k
    if (cfg.msb_in_token == 1) {
172
62.1k
      if (cfg.lsb_in_token == 0) {
173
20.7k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
174
41.4k
      } else if (cfg.lsb_in_token == 2) {
175
20.7k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
176
20.7k
      } else {
177
20.6k
        JXL_DASSERT(cfg.lsb_in_token == 4);
178
20.6k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
179
20.6k
      }
180
82.8k
    } else {
181
82.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
182
82.8k
      if (cfg.lsb_in_token == 0) {
183
20.7k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
184
62.1k
      } else if (cfg.lsb_in_token == 1) {
185
20.7k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
186
41.4k
      } else if (cfg.lsb_in_token == 2) {
187
20.7k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
188
20.7k
      } else {
189
20.6k
        JXL_DASSERT(cfg.lsb_in_token == 3);
190
20.6k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
191
20.6k
      }
192
82.8k
    }
193
153k
  } else if (cfg.split_exponent == 6) {
194
56.0k
    if (cfg.msb_in_token == 0) {
195
20.7k
      JXL_DASSERT(cfg.lsb_in_token == 0);
196
20.7k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
197
35.3k
    } else if (cfg.msb_in_token == 1) {
198
17.6k
      JXL_DASSERT(cfg.lsb_in_token == 5);
199
17.6k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
200
17.6k
    } else {
201
17.6k
      JXL_DASSERT(cfg.msb_in_token == 2);
202
17.6k
      JXL_DASSERT(cfg.lsb_in_token == 4);
203
17.6k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
204
17.6k
    }
205
97.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
206
97.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
207
97.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
208
97.7k
    if (cfg.split_exponent == 7) {
209
20.7k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
210
77.0k
    } else if (cfg.split_exponent == 8) {
211
15.4k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
212
61.6k
    } else if (cfg.split_exponent == 9) {
213
15.4k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
214
46.2k
    } else if (cfg.split_exponent == 10) {
215
15.4k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
216
30.8k
    } else if (cfg.split_exponent == 11) {
217
15.4k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
218
15.4k
    } else {
219
15.4k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
220
15.4k
    }
221
97.7k
  } else {
222
0
    JXL_DASSERT(false);
223
0
  }
224
0
  return ~0;
225
854k
#endif
226
854k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
114
854k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
115
854k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
116
#if HWY_TARGET == HWY_SCALAR
117
  uint32_t extra_bits = 0;
118
  for (size_t i = 0; i < len; ++i) {
119
    uint32_t v = values[i];
120
    uint32_t tok, nbits, bits;
121
    cfg.Encode(v, &tok, &nbits, &bits);
122
    extra_bits += nbits;
123
    out[i] = tok;
124
  }
125
  return extra_bits;
126
#else
127
854k
  if (cfg.split_exponent == 0) {
128
99.8k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
129
755k
  } else if (cfg.split_exponent == 2) {
130
96.8k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
131
96.8k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
132
658k
  } else if (cfg.split_exponent == 3) {
133
82.8k
    if (cfg.msb_in_token == 1) {
134
41.4k
      if (cfg.lsb_in_token == 0) {
135
20.7k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
136
20.7k
      } else {
137
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 2);
138
20.7k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
139
20.7k
      }
140
41.4k
    } else {
141
41.4k
      JXL_DASSERT(cfg.msb_in_token == 2);
142
41.4k
      if (cfg.lsb_in_token == 0) {
143
20.7k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
144
20.7k
      } else {
145
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 1);
146
20.7k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
147
20.7k
      }
148
41.4k
    }
149
575k
  } else if (cfg.split_exponent == 4) {
150
276k
    if (cfg.msb_in_token == 1) {
151
138k
      if (cfg.lsb_in_token == 0) {
152
20.7k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
153
117k
      } else if (cfg.lsb_in_token == 2) {
154
96.8k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
155
96.8k
      } else {
156
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 3);
157
20.7k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
158
20.7k
      }
159
138k
    } else {
160
138k
      JXL_DASSERT(cfg.msb_in_token == 2);
161
138k
      if (cfg.lsb_in_token == 0) {
162
96.8k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
163
96.8k
      } else if (cfg.lsb_in_token == 1) {
164
20.7k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
165
20.7k
      } else {
166
20.7k
        JXL_DASSERT(cfg.lsb_in_token == 2);
167
20.7k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
168
20.7k
      }
169
138k
    }
170
298k
  } else if (cfg.split_exponent == 5) {
171
144k
    if (cfg.msb_in_token == 1) {
172
62.1k
      if (cfg.lsb_in_token == 0) {
173
20.7k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
174
41.4k
      } else if (cfg.lsb_in_token == 2) {
175
20.7k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
176
20.7k
      } else {
177
20.6k
        JXL_DASSERT(cfg.lsb_in_token == 4);
178
20.6k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
179
20.6k
      }
180
82.8k
    } else {
181
82.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
182
82.8k
      if (cfg.lsb_in_token == 0) {
183
20.7k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
184
62.1k
      } else if (cfg.lsb_in_token == 1) {
185
20.7k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
186
41.4k
      } else if (cfg.lsb_in_token == 2) {
187
20.7k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
188
20.7k
      } else {
189
20.6k
        JXL_DASSERT(cfg.lsb_in_token == 3);
190
20.6k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
191
20.6k
      }
192
82.8k
    }
193
153k
  } else if (cfg.split_exponent == 6) {
194
56.0k
    if (cfg.msb_in_token == 0) {
195
20.7k
      JXL_DASSERT(cfg.lsb_in_token == 0);
196
20.7k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
197
35.3k
    } else if (cfg.msb_in_token == 1) {
198
17.6k
      JXL_DASSERT(cfg.lsb_in_token == 5);
199
17.6k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
200
17.6k
    } else {
201
17.6k
      JXL_DASSERT(cfg.msb_in_token == 2);
202
17.6k
      JXL_DASSERT(cfg.lsb_in_token == 4);
203
17.6k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
204
17.6k
    }
205
97.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
206
97.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
207
97.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
208
97.7k
    if (cfg.split_exponent == 7) {
209
20.7k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
210
77.0k
    } else if (cfg.split_exponent == 8) {
211
15.4k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
212
61.6k
    } else if (cfg.split_exponent == 9) {
213
15.4k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
214
46.2k
    } else if (cfg.split_exponent == 10) {
215
15.4k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
216
30.8k
    } else if (cfg.split_exponent == 11) {
217
15.4k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
218
15.4k
    } else {
219
15.4k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
220
15.4k
    }
221
97.7k
  } else {
222
0
    JXL_DASSERT(false);
223
0
  }
224
0
  return ~0;
225
854k
#endif
226
854k
}
Unexecuted instantiation: jxl::N_AVX3::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
227
228
// NOLINTNEXTLINE(google-readability-namespace-comments)
229
}  // namespace HWY_NAMESPACE
230
}  // namespace jxl
231
HWY_AFTER_NAMESPACE();
232
233
#if HWY_ONCE
234
namespace jxl {
235
236
HWY_EXPORT(EstimateTokenCost);
237
238
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
239
854k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
240
854k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
241
854k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
242
854k
}
243
244
}  // namespace jxl
245
#endif