Coverage Report

Created: 2025-12-31 07:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::Gt;
29
using hwy::HWY_NAMESPACE::IfThenElse;
30
using hwy::HWY_NAMESPACE::IfThenElseZero;
31
using hwy::HWY_NAMESPACE::Iota;
32
using hwy::HWY_NAMESPACE::LoadU;
33
using hwy::HWY_NAMESPACE::Lt;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::Or;
36
using hwy::HWY_NAMESPACE::Set;
37
using hwy::HWY_NAMESPACE::ShiftRight;
38
using hwy::HWY_NAMESPACE::Store;
39
using hwy::HWY_NAMESPACE::Sub;
40
using hwy::HWY_NAMESPACE::Zero;
41
42
template <size_t E, size_t M, size_t L>
43
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
44
834k
                               uint32_t* JXL_RESTRICT out) {
45
834k
  const HWY_FULL(uint32_t) du;
46
834k
  const HWY_FULL(float) df;
47
834k
  const auto kZero = Zero(du);
48
834k
  const auto kSplit = Set(du, 1 << E);
49
834k
  const auto kExpOffset = Set(du, 127);
50
834k
  const auto kEBOffset = Set(du, 127 + M + L);
51
834k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
834k
  const auto kMulN = Set(du, 1 << (M + L));
53
834k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
834k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
834k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
834k
  constexpr size_t kLargeShiftVal = 10;
57
834k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
834k
  auto extra_bits = kZero;
60
834k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
56.4M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
55.6M
    const auto val = LoadU(du, values + i);
63
55.6M
    const auto is_large = Gt(val, kLargeThreshold);
64
55.6M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
55.6M
    const auto not_literal = Ge(val, kSplit);
66
55.6M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
55.6M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
55.6M
    const auto l = And(val, kMaskL);
69
55.6M
    const auto exp = ShiftRight<23>(b);
70
55.6M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
55.6M
    const auto n = Sub(exp_fixed, kExpOffset);
72
55.6M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
55.6M
    const auto m = ShiftRight<23 - M - L>(b);
74
55.6M
    const auto a = Add(kBase, Mul(n, kMulN));
75
55.6M
    const auto d = And(m, kMaskM);
76
55.6M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
55.6M
    const auto c = Or(a, l);
78
55.6M
    extra_bits = Add(extra_bits, eb_fixed);
79
55.6M
    const auto t = Or(c, d);
80
55.6M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
55.6M
    Store(t_fixed, du, out + i);
82
55.6M
  }
83
834k
  if (last_full < len) {
84
717k
    const auto stop = Set(du, len);
85
717k
    const auto fence = Iota(du, last_full);
86
717k
    const auto take = Lt(fence, stop);
87
717k
    const auto val = LoadU(du, values + last_full);
88
717k
    const auto is_large = Gt(val, kLargeThreshold);
89
717k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
717k
    const auto not_literal = Ge(val, kSplit);
91
717k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
717k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
717k
    const auto l = And(val, kMaskL);
94
717k
    const auto exp = ShiftRight<23>(b);
95
717k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
717k
    const auto n = Sub(exp_fixed, kExpOffset);
97
717k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
717k
    const auto m = ShiftRight<23 - M - L>(b);
99
717k
    const auto a = Add(kBase, Mul(n, kMulN));
100
717k
    const auto d = And(m, kMaskM);
101
717k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
717k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
717k
    const auto c = Or(a, l);
104
717k
    extra_bits = Add(extra_bits, eb_masked);
105
717k
    const auto t = Or(c, d);
106
717k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
717k
    Store(t_fixed, du, out + last_full);
108
717k
  }
109
834k
  return GetLane(SumOfLanes(du, extra_bits));
110
834k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
100k
                               uint32_t* JXL_RESTRICT out) {
45
100k
  const HWY_FULL(uint32_t) du;
46
100k
  const HWY_FULL(float) df;
47
100k
  const auto kZero = Zero(du);
48
100k
  const auto kSplit = Set(du, 1 << E);
49
100k
  const auto kExpOffset = Set(du, 127);
50
100k
  const auto kEBOffset = Set(du, 127 + M + L);
51
100k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
100k
  const auto kMulN = Set(du, 1 << (M + L));
53
100k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
100k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
100k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
100k
  constexpr size_t kLargeShiftVal = 10;
57
100k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
100k
  auto extra_bits = kZero;
60
100k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
8.83M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
8.73M
    const auto val = LoadU(du, values + i);
63
8.73M
    const auto is_large = Gt(val, kLargeThreshold);
64
8.73M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
8.73M
    const auto not_literal = Ge(val, kSplit);
66
8.73M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
8.73M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
8.73M
    const auto l = And(val, kMaskL);
69
8.73M
    const auto exp = ShiftRight<23>(b);
70
8.73M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
8.73M
    const auto n = Sub(exp_fixed, kExpOffset);
72
8.73M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
8.73M
    const auto m = ShiftRight<23 - M - L>(b);
74
8.73M
    const auto a = Add(kBase, Mul(n, kMulN));
75
8.73M
    const auto d = And(m, kMaskM);
76
8.73M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
8.73M
    const auto c = Or(a, l);
78
8.73M
    extra_bits = Add(extra_bits, eb_fixed);
79
8.73M
    const auto t = Or(c, d);
80
8.73M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
8.73M
    Store(t_fixed, du, out + i);
82
8.73M
  }
83
100k
  if (last_full < len) {
84
87.0k
    const auto stop = Set(du, len);
85
87.0k
    const auto fence = Iota(du, last_full);
86
87.0k
    const auto take = Lt(fence, stop);
87
87.0k
    const auto val = LoadU(du, values + last_full);
88
87.0k
    const auto is_large = Gt(val, kLargeThreshold);
89
87.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
87.0k
    const auto not_literal = Ge(val, kSplit);
91
87.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
87.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
87.0k
    const auto l = And(val, kMaskL);
94
87.0k
    const auto exp = ShiftRight<23>(b);
95
87.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
87.0k
    const auto n = Sub(exp_fixed, kExpOffset);
97
87.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
87.0k
    const auto m = ShiftRight<23 - M - L>(b);
99
87.0k
    const auto a = Add(kBase, Mul(n, kMulN));
100
87.0k
    const auto d = And(m, kMaskM);
101
87.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
87.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
87.0k
    const auto c = Or(a, l);
104
87.0k
    extra_bits = Add(extra_bits, eb_masked);
105
87.0k
    const auto t = Or(c, d);
106
87.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
87.0k
    Store(t_fixed, du, out + last_full);
108
87.0k
  }
109
100k
  return GetLane(SumOfLanes(du, extra_bits));
110
100k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
97.9k
                               uint32_t* JXL_RESTRICT out) {
45
97.9k
  const HWY_FULL(uint32_t) du;
46
97.9k
  const HWY_FULL(float) df;
47
97.9k
  const auto kZero = Zero(du);
48
97.9k
  const auto kSplit = Set(du, 1 << E);
49
97.9k
  const auto kExpOffset = Set(du, 127);
50
97.9k
  const auto kEBOffset = Set(du, 127 + M + L);
51
97.9k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
97.9k
  const auto kMulN = Set(du, 1 << (M + L));
53
97.9k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
97.9k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
97.9k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
97.9k
  constexpr size_t kLargeShiftVal = 10;
57
97.9k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
97.9k
  auto extra_bits = kZero;
60
97.9k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
8.83M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
8.73M
    const auto val = LoadU(du, values + i);
63
8.73M
    const auto is_large = Gt(val, kLargeThreshold);
64
8.73M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
8.73M
    const auto not_literal = Ge(val, kSplit);
66
8.73M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
8.73M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
8.73M
    const auto l = And(val, kMaskL);
69
8.73M
    const auto exp = ShiftRight<23>(b);
70
8.73M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
8.73M
    const auto n = Sub(exp_fixed, kExpOffset);
72
8.73M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
8.73M
    const auto m = ShiftRight<23 - M - L>(b);
74
8.73M
    const auto a = Add(kBase, Mul(n, kMulN));
75
8.73M
    const auto d = And(m, kMaskM);
76
8.73M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
8.73M
    const auto c = Or(a, l);
78
8.73M
    extra_bits = Add(extra_bits, eb_fixed);
79
8.73M
    const auto t = Or(c, d);
80
8.73M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
8.73M
    Store(t_fixed, du, out + i);
82
8.73M
  }
83
97.9k
  if (last_full < len) {
84
84.5k
    const auto stop = Set(du, len);
85
84.5k
    const auto fence = Iota(du, last_full);
86
84.5k
    const auto take = Lt(fence, stop);
87
84.5k
    const auto val = LoadU(du, values + last_full);
88
84.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
84.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
84.5k
    const auto not_literal = Ge(val, kSplit);
91
84.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
84.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
84.5k
    const auto l = And(val, kMaskL);
94
84.5k
    const auto exp = ShiftRight<23>(b);
95
84.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
84.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
84.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
84.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
84.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
84.5k
    const auto d = And(m, kMaskM);
101
84.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
84.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
84.5k
    const auto c = Or(a, l);
104
84.5k
    extra_bits = Add(extra_bits, eb_masked);
105
84.5k
    const auto t = Or(c, d);
106
84.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
84.5k
    Store(t_fixed, du, out + last_full);
108
84.5k
  }
109
97.9k
  return GetLane(SumOfLanes(du, extra_bits));
110
97.9k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
97.9k
                               uint32_t* JXL_RESTRICT out) {
45
97.9k
  const HWY_FULL(uint32_t) du;
46
97.9k
  const HWY_FULL(float) df;
47
97.9k
  const auto kZero = Zero(du);
48
97.9k
  const auto kSplit = Set(du, 1 << E);
49
97.9k
  const auto kExpOffset = Set(du, 127);
50
97.9k
  const auto kEBOffset = Set(du, 127 + M + L);
51
97.9k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
97.9k
  const auto kMulN = Set(du, 1 << (M + L));
53
97.9k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
97.9k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
97.9k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
97.9k
  constexpr size_t kLargeShiftVal = 10;
57
97.9k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
97.9k
  auto extra_bits = kZero;
60
97.9k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
8.83M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
8.73M
    const auto val = LoadU(du, values + i);
63
8.73M
    const auto is_large = Gt(val, kLargeThreshold);
64
8.73M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
8.73M
    const auto not_literal = Ge(val, kSplit);
66
8.73M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
8.73M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
8.73M
    const auto l = And(val, kMaskL);
69
8.73M
    const auto exp = ShiftRight<23>(b);
70
8.73M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
8.73M
    const auto n = Sub(exp_fixed, kExpOffset);
72
8.73M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
8.73M
    const auto m = ShiftRight<23 - M - L>(b);
74
8.73M
    const auto a = Add(kBase, Mul(n, kMulN));
75
8.73M
    const auto d = And(m, kMaskM);
76
8.73M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
8.73M
    const auto c = Or(a, l);
78
8.73M
    extra_bits = Add(extra_bits, eb_fixed);
79
8.73M
    const auto t = Or(c, d);
80
8.73M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
8.73M
    Store(t_fixed, du, out + i);
82
8.73M
  }
83
97.9k
  if (last_full < len) {
84
84.5k
    const auto stop = Set(du, len);
85
84.5k
    const auto fence = Iota(du, last_full);
86
84.5k
    const auto take = Lt(fence, stop);
87
84.5k
    const auto val = LoadU(du, values + last_full);
88
84.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
84.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
84.5k
    const auto not_literal = Ge(val, kSplit);
91
84.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
84.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
84.5k
    const auto l = And(val, kMaskL);
94
84.5k
    const auto exp = ShiftRight<23>(b);
95
84.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
84.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
84.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
84.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
84.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
84.5k
    const auto d = And(m, kMaskM);
101
84.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
84.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
84.5k
    const auto c = Or(a, l);
104
84.5k
    extra_bits = Add(extra_bits, eb_masked);
105
84.5k
    const auto t = Or(c, d);
106
84.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
84.5k
    Store(t_fixed, du, out + last_full);
108
84.5k
  }
109
97.9k
  return GetLane(SumOfLanes(du, extra_bits));
110
97.9k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
97.9k
                               uint32_t* JXL_RESTRICT out) {
45
97.9k
  const HWY_FULL(uint32_t) du;
46
97.9k
  const HWY_FULL(float) df;
47
97.9k
  const auto kZero = Zero(du);
48
97.9k
  const auto kSplit = Set(du, 1 << E);
49
97.9k
  const auto kExpOffset = Set(du, 127);
50
97.9k
  const auto kEBOffset = Set(du, 127 + M + L);
51
97.9k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
97.9k
  const auto kMulN = Set(du, 1 << (M + L));
53
97.9k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
97.9k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
97.9k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
97.9k
  constexpr size_t kLargeShiftVal = 10;
57
97.9k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
97.9k
  auto extra_bits = kZero;
60
97.9k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
8.83M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
8.73M
    const auto val = LoadU(du, values + i);
63
8.73M
    const auto is_large = Gt(val, kLargeThreshold);
64
8.73M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
8.73M
    const auto not_literal = Ge(val, kSplit);
66
8.73M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
8.73M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
8.73M
    const auto l = And(val, kMaskL);
69
8.73M
    const auto exp = ShiftRight<23>(b);
70
8.73M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
8.73M
    const auto n = Sub(exp_fixed, kExpOffset);
72
8.73M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
8.73M
    const auto m = ShiftRight<23 - M - L>(b);
74
8.73M
    const auto a = Add(kBase, Mul(n, kMulN));
75
8.73M
    const auto d = And(m, kMaskM);
76
8.73M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
8.73M
    const auto c = Or(a, l);
78
8.73M
    extra_bits = Add(extra_bits, eb_fixed);
79
8.73M
    const auto t = Or(c, d);
80
8.73M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
8.73M
    Store(t_fixed, du, out + i);
82
8.73M
  }
83
97.9k
  if (last_full < len) {
84
84.5k
    const auto stop = Set(du, len);
85
84.5k
    const auto fence = Iota(du, last_full);
86
84.5k
    const auto take = Lt(fence, stop);
87
84.5k
    const auto val = LoadU(du, values + last_full);
88
84.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
84.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
84.5k
    const auto not_literal = Ge(val, kSplit);
91
84.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
84.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
84.5k
    const auto l = And(val, kMaskL);
94
84.5k
    const auto exp = ShiftRight<23>(b);
95
84.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
84.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
84.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
84.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
84.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
84.5k
    const auto d = And(m, kMaskM);
101
84.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
84.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
84.5k
    const auto c = Or(a, l);
104
84.5k
    extra_bits = Add(extra_bits, eb_masked);
105
84.5k
    const auto t = Or(c, d);
106
84.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
84.5k
    Store(t_fixed, du, out + last_full);
108
84.5k
  }
109
97.9k
  return GetLane(SumOfLanes(du, extra_bits));
110
97.9k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.4k
                               uint32_t* JXL_RESTRICT out) {
45
19.4k
  const HWY_FULL(uint32_t) du;
46
19.4k
  const HWY_FULL(float) df;
47
19.4k
  const auto kZero = Zero(du);
48
19.4k
  const auto kSplit = Set(du, 1 << E);
49
19.4k
  const auto kExpOffset = Set(du, 127);
50
19.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.4k
  constexpr size_t kLargeShiftVal = 10;
57
19.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.4k
  auto extra_bits = kZero;
60
19.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.4k
  if (last_full < len) {
84
16.6k
    const auto stop = Set(du, len);
85
16.6k
    const auto fence = Iota(du, last_full);
86
16.6k
    const auto take = Lt(fence, stop);
87
16.6k
    const auto val = LoadU(du, values + last_full);
88
16.6k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.6k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.6k
    const auto not_literal = Ge(val, kSplit);
91
16.6k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.6k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.6k
    const auto l = And(val, kMaskL);
94
16.6k
    const auto exp = ShiftRight<23>(b);
95
16.6k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.6k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.6k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.6k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.6k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.6k
    const auto d = And(m, kMaskM);
101
16.6k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.6k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.6k
    const auto c = Or(a, l);
104
16.6k
    extra_bits = Add(extra_bits, eb_masked);
105
16.6k
    const auto t = Or(c, d);
106
16.6k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.6k
    Store(t_fixed, du, out + last_full);
108
16.6k
  }
109
19.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.4k
                               uint32_t* JXL_RESTRICT out) {
45
19.4k
  const HWY_FULL(uint32_t) du;
46
19.4k
  const HWY_FULL(float) df;
47
19.4k
  const auto kZero = Zero(du);
48
19.4k
  const auto kSplit = Set(du, 1 << E);
49
19.4k
  const auto kExpOffset = Set(du, 127);
50
19.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.4k
  constexpr size_t kLargeShiftVal = 10;
57
19.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.4k
  auto extra_bits = kZero;
60
19.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.4k
  if (last_full < len) {
84
16.6k
    const auto stop = Set(du, len);
85
16.6k
    const auto fence = Iota(du, last_full);
86
16.6k
    const auto take = Lt(fence, stop);
87
16.6k
    const auto val = LoadU(du, values + last_full);
88
16.6k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.6k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.6k
    const auto not_literal = Ge(val, kSplit);
91
16.6k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.6k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.6k
    const auto l = And(val, kMaskL);
94
16.6k
    const auto exp = ShiftRight<23>(b);
95
16.6k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.6k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.6k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.6k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.6k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.6k
    const auto d = And(m, kMaskM);
101
16.6k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.6k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.6k
    const auto c = Or(a, l);
104
16.6k
    extra_bits = Add(extra_bits, eb_masked);
105
16.6k
    const auto t = Or(c, d);
106
16.6k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.6k
    Store(t_fixed, du, out + last_full);
108
16.6k
  }
109
19.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
17.1k
                               uint32_t* JXL_RESTRICT out) {
45
17.1k
  const HWY_FULL(uint32_t) du;
46
17.1k
  const HWY_FULL(float) df;
47
17.1k
  const auto kZero = Zero(du);
48
17.1k
  const auto kSplit = Set(du, 1 << E);
49
17.1k
  const auto kExpOffset = Set(du, 127);
50
17.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
17.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
17.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
17.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
17.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
17.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
17.1k
  constexpr size_t kLargeShiftVal = 10;
57
17.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
17.1k
  auto extra_bits = kZero;
60
17.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
579k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
562k
    const auto val = LoadU(du, values + i);
63
562k
    const auto is_large = Gt(val, kLargeThreshold);
64
562k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
562k
    const auto not_literal = Ge(val, kSplit);
66
562k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
562k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
562k
    const auto l = And(val, kMaskL);
69
562k
    const auto exp = ShiftRight<23>(b);
70
562k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
562k
    const auto n = Sub(exp_fixed, kExpOffset);
72
562k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
562k
    const auto m = ShiftRight<23 - M - L>(b);
74
562k
    const auto a = Add(kBase, Mul(n, kMulN));
75
562k
    const auto d = And(m, kMaskM);
76
562k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
562k
    const auto c = Or(a, l);
78
562k
    extra_bits = Add(extra_bits, eb_fixed);
79
562k
    const auto t = Or(c, d);
80
562k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
562k
    Store(t_fixed, du, out + i);
82
562k
  }
83
17.1k
  if (last_full < len) {
84
14.6k
    const auto stop = Set(du, len);
85
14.6k
    const auto fence = Iota(du, last_full);
86
14.6k
    const auto take = Lt(fence, stop);
87
14.6k
    const auto val = LoadU(du, values + last_full);
88
14.6k
    const auto is_large = Gt(val, kLargeThreshold);
89
14.6k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
14.6k
    const auto not_literal = Ge(val, kSplit);
91
14.6k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
14.6k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
14.6k
    const auto l = And(val, kMaskL);
94
14.6k
    const auto exp = ShiftRight<23>(b);
95
14.6k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
14.6k
    const auto n = Sub(exp_fixed, kExpOffset);
97
14.6k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
14.6k
    const auto m = ShiftRight<23 - M - L>(b);
99
14.6k
    const auto a = Add(kBase, Mul(n, kMulN));
100
14.6k
    const auto d = And(m, kMaskM);
101
14.6k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
14.6k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
14.6k
    const auto c = Or(a, l);
104
14.6k
    extra_bits = Add(extra_bits, eb_masked);
105
14.6k
    const auto t = Or(c, d);
106
14.6k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
14.6k
    Store(t_fixed, du, out + last_full);
108
14.6k
  }
109
17.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
17.1k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
17.1k
                               uint32_t* JXL_RESTRICT out) {
45
17.1k
  const HWY_FULL(uint32_t) du;
46
17.1k
  const HWY_FULL(float) df;
47
17.1k
  const auto kZero = Zero(du);
48
17.1k
  const auto kSplit = Set(du, 1 << E);
49
17.1k
  const auto kExpOffset = Set(du, 127);
50
17.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
17.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
17.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
17.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
17.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
17.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
17.1k
  constexpr size_t kLargeShiftVal = 10;
57
17.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
17.1k
  auto extra_bits = kZero;
60
17.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
579k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
562k
    const auto val = LoadU(du, values + i);
63
562k
    const auto is_large = Gt(val, kLargeThreshold);
64
562k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
562k
    const auto not_literal = Ge(val, kSplit);
66
562k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
562k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
562k
    const auto l = And(val, kMaskL);
69
562k
    const auto exp = ShiftRight<23>(b);
70
562k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
562k
    const auto n = Sub(exp_fixed, kExpOffset);
72
562k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
562k
    const auto m = ShiftRight<23 - M - L>(b);
74
562k
    const auto a = Add(kBase, Mul(n, kMulN));
75
562k
    const auto d = And(m, kMaskM);
76
562k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
562k
    const auto c = Or(a, l);
78
562k
    extra_bits = Add(extra_bits, eb_fixed);
79
562k
    const auto t = Or(c, d);
80
562k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
562k
    Store(t_fixed, du, out + i);
82
562k
  }
83
17.1k
  if (last_full < len) {
84
14.6k
    const auto stop = Set(du, len);
85
14.6k
    const auto fence = Iota(du, last_full);
86
14.6k
    const auto take = Lt(fence, stop);
87
14.6k
    const auto val = LoadU(du, values + last_full);
88
14.6k
    const auto is_large = Gt(val, kLargeThreshold);
89
14.6k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
14.6k
    const auto not_literal = Ge(val, kSplit);
91
14.6k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
14.6k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
14.6k
    const auto l = And(val, kMaskL);
94
14.6k
    const auto exp = ShiftRight<23>(b);
95
14.6k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
14.6k
    const auto n = Sub(exp_fixed, kExpOffset);
97
14.6k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
14.6k
    const auto m = ShiftRight<23 - M - L>(b);
99
14.6k
    const auto a = Add(kBase, Mul(n, kMulN));
100
14.6k
    const auto d = And(m, kMaskM);
101
14.6k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
14.6k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
14.6k
    const auto c = Or(a, l);
104
14.6k
    extra_bits = Add(extra_bits, eb_masked);
105
14.6k
    const auto t = Or(c, d);
106
14.6k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
14.6k
    Store(t_fixed, du, out + last_full);
108
14.6k
  }
109
17.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
17.1k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
19.5k
                               uint32_t* JXL_RESTRICT out) {
45
19.5k
  const HWY_FULL(uint32_t) du;
46
19.5k
  const HWY_FULL(float) df;
47
19.5k
  const auto kZero = Zero(du);
48
19.5k
  const auto kSplit = Set(du, 1 << E);
49
19.5k
  const auto kExpOffset = Set(du, 127);
50
19.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
19.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
19.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
19.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
19.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
19.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
19.5k
  constexpr size_t kLargeShiftVal = 10;
57
19.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
19.5k
  auto extra_bits = kZero;
60
19.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
1.02M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
1.00M
    const auto val = LoadU(du, values + i);
63
1.00M
    const auto is_large = Gt(val, kLargeThreshold);
64
1.00M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
1.00M
    const auto not_literal = Ge(val, kSplit);
66
1.00M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
1.00M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
1.00M
    const auto l = And(val, kMaskL);
69
1.00M
    const auto exp = ShiftRight<23>(b);
70
1.00M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
1.00M
    const auto n = Sub(exp_fixed, kExpOffset);
72
1.00M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
1.00M
    const auto m = ShiftRight<23 - M - L>(b);
74
1.00M
    const auto a = Add(kBase, Mul(n, kMulN));
75
1.00M
    const auto d = And(m, kMaskM);
76
1.00M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
1.00M
    const auto c = Or(a, l);
78
1.00M
    extra_bits = Add(extra_bits, eb_fixed);
79
1.00M
    const auto t = Or(c, d);
80
1.00M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
1.00M
    Store(t_fixed, du, out + i);
82
1.00M
  }
83
19.5k
  if (last_full < len) {
84
16.7k
    const auto stop = Set(du, len);
85
16.7k
    const auto fence = Iota(du, last_full);
86
16.7k
    const auto take = Lt(fence, stop);
87
16.7k
    const auto val = LoadU(du, values + last_full);
88
16.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
16.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
16.7k
    const auto not_literal = Ge(val, kSplit);
91
16.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
16.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
16.7k
    const auto l = And(val, kMaskL);
94
16.7k
    const auto exp = ShiftRight<23>(b);
95
16.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
16.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
16.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
16.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
16.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
16.7k
    const auto d = And(m, kMaskM);
101
16.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
16.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
16.7k
    const auto c = Or(a, l);
104
16.7k
    extra_bits = Add(extra_bits, eb_masked);
105
16.7k
    const auto t = Or(c, d);
106
16.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
16.7k
    Store(t_fixed, du, out + last_full);
108
16.7k
  }
109
19.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
19.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
14.8k
                               uint32_t* JXL_RESTRICT out) {
45
14.8k
  const HWY_FULL(uint32_t) du;
46
14.8k
  const HWY_FULL(float) df;
47
14.8k
  const auto kZero = Zero(du);
48
14.8k
  const auto kSplit = Set(du, 1 << E);
49
14.8k
  const auto kExpOffset = Set(du, 127);
50
14.8k
  const auto kEBOffset = Set(du, 127 + M + L);
51
14.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
14.8k
  const auto kMulN = Set(du, 1 << (M + L));
53
14.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
14.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
14.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
14.8k
  constexpr size_t kLargeShiftVal = 10;
57
14.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
14.8k
  auto extra_bits = kZero;
60
14.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
493k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
478k
    const auto val = LoadU(du, values + i);
63
478k
    const auto is_large = Gt(val, kLargeThreshold);
64
478k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
478k
    const auto not_literal = Ge(val, kSplit);
66
478k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
478k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
478k
    const auto l = And(val, kMaskL);
69
478k
    const auto exp = ShiftRight<23>(b);
70
478k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
478k
    const auto n = Sub(exp_fixed, kExpOffset);
72
478k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
478k
    const auto m = ShiftRight<23 - M - L>(b);
74
478k
    const auto a = Add(kBase, Mul(n, kMulN));
75
478k
    const auto d = And(m, kMaskM);
76
478k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
478k
    const auto c = Or(a, l);
78
478k
    extra_bits = Add(extra_bits, eb_fixed);
79
478k
    const auto t = Or(c, d);
80
478k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
478k
    Store(t_fixed, du, out + i);
82
478k
  }
83
14.8k
  if (last_full < len) {
84
12.7k
    const auto stop = Set(du, len);
85
12.7k
    const auto fence = Iota(du, last_full);
86
12.7k
    const auto take = Lt(fence, stop);
87
12.7k
    const auto val = LoadU(du, values + last_full);
88
12.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
12.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
12.7k
    const auto not_literal = Ge(val, kSplit);
91
12.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
12.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
12.7k
    const auto l = And(val, kMaskL);
94
12.7k
    const auto exp = ShiftRight<23>(b);
95
12.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
12.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
12.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
12.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
12.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
12.7k
    const auto d = And(m, kMaskM);
101
12.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
12.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
12.7k
    const auto c = Or(a, l);
104
12.7k
    extra_bits = Add(extra_bits, eb_masked);
105
12.7k
    const auto t = Or(c, d);
106
12.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
12.7k
    Store(t_fixed, du, out + last_full);
108
12.7k
  }
109
14.8k
  return GetLane(SumOfLanes(du, extra_bits));
110
14.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
14.8k
                               uint32_t* JXL_RESTRICT out) {
45
14.8k
  const HWY_FULL(uint32_t) du;
46
14.8k
  const HWY_FULL(float) df;
47
14.8k
  const auto kZero = Zero(du);
48
14.8k
  const auto kSplit = Set(du, 1 << E);
49
14.8k
  const auto kExpOffset = Set(du, 127);
50
14.8k
  const auto kEBOffset = Set(du, 127 + M + L);
51
14.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
14.8k
  const auto kMulN = Set(du, 1 << (M + L));
53
14.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
14.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
14.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
14.8k
  constexpr size_t kLargeShiftVal = 10;
57
14.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
14.8k
  auto extra_bits = kZero;
60
14.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
493k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
478k
    const auto val = LoadU(du, values + i);
63
478k
    const auto is_large = Gt(val, kLargeThreshold);
64
478k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
478k
    const auto not_literal = Ge(val, kSplit);
66
478k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
478k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
478k
    const auto l = And(val, kMaskL);
69
478k
    const auto exp = ShiftRight<23>(b);
70
478k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
478k
    const auto n = Sub(exp_fixed, kExpOffset);
72
478k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
478k
    const auto m = ShiftRight<23 - M - L>(b);
74
478k
    const auto a = Add(kBase, Mul(n, kMulN));
75
478k
    const auto d = And(m, kMaskM);
76
478k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
478k
    const auto c = Or(a, l);
78
478k
    extra_bits = Add(extra_bits, eb_fixed);
79
478k
    const auto t = Or(c, d);
80
478k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
478k
    Store(t_fixed, du, out + i);
82
478k
  }
83
14.8k
  if (last_full < len) {
84
12.7k
    const auto stop = Set(du, len);
85
12.7k
    const auto fence = Iota(du, last_full);
86
12.7k
    const auto take = Lt(fence, stop);
87
12.7k
    const auto val = LoadU(du, values + last_full);
88
12.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
12.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
12.7k
    const auto not_literal = Ge(val, kSplit);
91
12.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
12.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
12.7k
    const auto l = And(val, kMaskL);
94
12.7k
    const auto exp = ShiftRight<23>(b);
95
12.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
12.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
12.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
12.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
12.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
12.7k
    const auto d = And(m, kMaskM);
101
12.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
12.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
12.7k
    const auto c = Or(a, l);
104
12.7k
    extra_bits = Add(extra_bits, eb_masked);
105
12.7k
    const auto t = Or(c, d);
106
12.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
12.7k
    Store(t_fixed, du, out + last_full);
108
12.7k
  }
109
14.8k
  return GetLane(SumOfLanes(du, extra_bits));
110
14.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
14.8k
                               uint32_t* JXL_RESTRICT out) {
45
14.8k
  const HWY_FULL(uint32_t) du;
46
14.8k
  const HWY_FULL(float) df;
47
14.8k
  const auto kZero = Zero(du);
48
14.8k
  const auto kSplit = Set(du, 1 << E);
49
14.8k
  const auto kExpOffset = Set(du, 127);
50
14.8k
  const auto kEBOffset = Set(du, 127 + M + L);
51
14.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
14.8k
  const auto kMulN = Set(du, 1 << (M + L));
53
14.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
14.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
14.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
14.8k
  constexpr size_t kLargeShiftVal = 10;
57
14.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
14.8k
  auto extra_bits = kZero;
60
14.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
493k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
478k
    const auto val = LoadU(du, values + i);
63
478k
    const auto is_large = Gt(val, kLargeThreshold);
64
478k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
478k
    const auto not_literal = Ge(val, kSplit);
66
478k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
478k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
478k
    const auto l = And(val, kMaskL);
69
478k
    const auto exp = ShiftRight<23>(b);
70
478k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
478k
    const auto n = Sub(exp_fixed, kExpOffset);
72
478k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
478k
    const auto m = ShiftRight<23 - M - L>(b);
74
478k
    const auto a = Add(kBase, Mul(n, kMulN));
75
478k
    const auto d = And(m, kMaskM);
76
478k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
478k
    const auto c = Or(a, l);
78
478k
    extra_bits = Add(extra_bits, eb_fixed);
79
478k
    const auto t = Or(c, d);
80
478k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
478k
    Store(t_fixed, du, out + i);
82
478k
  }
83
14.8k
  if (last_full < len) {
84
12.7k
    const auto stop = Set(du, len);
85
12.7k
    const auto fence = Iota(du, last_full);
86
12.7k
    const auto take = Lt(fence, stop);
87
12.7k
    const auto val = LoadU(du, values + last_full);
88
12.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
12.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
12.7k
    const auto not_literal = Ge(val, kSplit);
91
12.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
12.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
12.7k
    const auto l = And(val, kMaskL);
94
12.7k
    const auto exp = ShiftRight<23>(b);
95
12.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
12.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
12.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
12.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
12.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
12.7k
    const auto d = And(m, kMaskM);
101
12.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
12.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
12.7k
    const auto c = Or(a, l);
104
12.7k
    extra_bits = Add(extra_bits, eb_masked);
105
12.7k
    const auto t = Or(c, d);
106
12.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
12.7k
    Store(t_fixed, du, out + last_full);
108
12.7k
  }
109
14.8k
  return GetLane(SumOfLanes(du, extra_bits));
110
14.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
14.8k
                               uint32_t* JXL_RESTRICT out) {
45
14.8k
  const HWY_FULL(uint32_t) du;
46
14.8k
  const HWY_FULL(float) df;
47
14.8k
  const auto kZero = Zero(du);
48
14.8k
  const auto kSplit = Set(du, 1 << E);
49
14.8k
  const auto kExpOffset = Set(du, 127);
50
14.8k
  const auto kEBOffset = Set(du, 127 + M + L);
51
14.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
14.8k
  const auto kMulN = Set(du, 1 << (M + L));
53
14.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
14.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
14.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
14.8k
  constexpr size_t kLargeShiftVal = 10;
57
14.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
14.8k
  auto extra_bits = kZero;
60
14.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
493k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
478k
    const auto val = LoadU(du, values + i);
63
478k
    const auto is_large = Gt(val, kLargeThreshold);
64
478k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
478k
    const auto not_literal = Ge(val, kSplit);
66
478k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
478k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
478k
    const auto l = And(val, kMaskL);
69
478k
    const auto exp = ShiftRight<23>(b);
70
478k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
478k
    const auto n = Sub(exp_fixed, kExpOffset);
72
478k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
478k
    const auto m = ShiftRight<23 - M - L>(b);
74
478k
    const auto a = Add(kBase, Mul(n, kMulN));
75
478k
    const auto d = And(m, kMaskM);
76
478k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
478k
    const auto c = Or(a, l);
78
478k
    extra_bits = Add(extra_bits, eb_fixed);
79
478k
    const auto t = Or(c, d);
80
478k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
478k
    Store(t_fixed, du, out + i);
82
478k
  }
83
14.8k
  if (last_full < len) {
84
12.7k
    const auto stop = Set(du, len);
85
12.7k
    const auto fence = Iota(du, last_full);
86
12.7k
    const auto take = Lt(fence, stop);
87
12.7k
    const auto val = LoadU(du, values + last_full);
88
12.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
12.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
12.7k
    const auto not_literal = Ge(val, kSplit);
91
12.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
12.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
12.7k
    const auto l = And(val, kMaskL);
94
12.7k
    const auto exp = ShiftRight<23>(b);
95
12.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
12.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
12.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
12.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
12.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
12.7k
    const auto d = And(m, kMaskM);
101
12.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
12.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
12.7k
    const auto c = Or(a, l);
104
12.7k
    extra_bits = Add(extra_bits, eb_masked);
105
12.7k
    const auto t = Or(c, d);
106
12.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
12.7k
    Store(t_fixed, du, out + last_full);
108
12.7k
  }
109
14.8k
  return GetLane(SumOfLanes(du, extra_bits));
110
14.8k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
14.8k
                               uint32_t* JXL_RESTRICT out) {
45
14.8k
  const HWY_FULL(uint32_t) du;
46
14.8k
  const HWY_FULL(float) df;
47
14.8k
  const auto kZero = Zero(du);
48
14.8k
  const auto kSplit = Set(du, 1 << E);
49
14.8k
  const auto kExpOffset = Set(du, 127);
50
14.8k
  const auto kEBOffset = Set(du, 127 + M + L);
51
14.8k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
14.8k
  const auto kMulN = Set(du, 1 << (M + L));
53
14.8k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
14.8k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
14.8k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
14.8k
  constexpr size_t kLargeShiftVal = 10;
57
14.8k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
14.8k
  auto extra_bits = kZero;
60
14.8k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
493k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
478k
    const auto val = LoadU(du, values + i);
63
478k
    const auto is_large = Gt(val, kLargeThreshold);
64
478k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
478k
    const auto not_literal = Ge(val, kSplit);
66
478k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
478k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
478k
    const auto l = And(val, kMaskL);
69
478k
    const auto exp = ShiftRight<23>(b);
70
478k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
478k
    const auto n = Sub(exp_fixed, kExpOffset);
72
478k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
478k
    const auto m = ShiftRight<23 - M - L>(b);
74
478k
    const auto a = Add(kBase, Mul(n, kMulN));
75
478k
    const auto d = And(m, kMaskM);
76
478k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
478k
    const auto c = Or(a, l);
78
478k
    extra_bits = Add(extra_bits, eb_fixed);
79
478k
    const auto t = Or(c, d);
80
478k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
478k
    Store(t_fixed, du, out + i);
82
478k
  }
83
14.8k
  if (last_full < len) {
84
12.7k
    const auto stop = Set(du, len);
85
12.7k
    const auto fence = Iota(du, last_full);
86
12.7k
    const auto take = Lt(fence, stop);
87
12.7k
    const auto val = LoadU(du, values + last_full);
88
12.7k
    const auto is_large = Gt(val, kLargeThreshold);
89
12.7k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
12.7k
    const auto not_literal = Ge(val, kSplit);
91
12.7k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
12.7k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
12.7k
    const auto l = And(val, kMaskL);
94
12.7k
    const auto exp = ShiftRight<23>(b);
95
12.7k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
12.7k
    const auto n = Sub(exp_fixed, kExpOffset);
97
12.7k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
12.7k
    const auto m = ShiftRight<23 - M - L>(b);
99
12.7k
    const auto a = Add(kBase, Mul(n, kMulN));
100
12.7k
    const auto d = And(m, kMaskM);
101
12.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
12.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
12.7k
    const auto c = Or(a, l);
104
12.7k
    extra_bits = Add(extra_bits, eb_masked);
105
12.7k
    const auto t = Or(c, d);
106
12.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
12.7k
    Store(t_fixed, du, out + last_full);
108
12.7k
  }
109
14.8k
  return GetLane(SumOfLanes(du, extra_bits));
110
14.8k
}
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
111
112
// Per-target implementation of EstimateTokenCost.
//
// Scalar build (HWY_TARGET == HWY_SCALAR): runs cfg.Encode on each of the
// `len` entries of `values`, writes the resulting token to the uint32_t view
// of `tokens` at the same index, and returns the sum of the reported `nbits`.
//
// Other targets: the (split_exponent, msb_in_token, lsb_in_token) triple of
// `cfg` selects a compile-time specialisation EstimateTokenCostImpl<E, M, L>,
// which receives `values`, `len` and the same output pointer; its result is
// returned unchanged. Only the triples enumerated below are handled. Any
// other configuration (split_exponent == 1 or > 12, for instance) ends up in
// JXL_DASSERT(false) followed by `return ~0`.
//
// NOTE(review): `tokens` presumably has room for at least `len` uint32_t
// entries, and the Impl result presumably equals the scalar sum of extra
// bits - confirm against EstimateTokenCostImpl and the callers.
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
113
834k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
834k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);  // `bits` is not used afterwards
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else  // non-scalar targets: choose an <E, M, L> specialisation
126
834k
  if (cfg.split_exponent == 0) {  // msb/lsb are not inspected in this case
127
100k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
734k
  } else if (cfg.split_exponent == 2) {
129
97.9k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
97.9k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
636k
  } else if (cfg.split_exponent == 3) {
132
78.0k
    if (cfg.msb_in_token == 1) {
133
39.0k
      if (cfg.lsb_in_token == 0) {
134
19.5k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
19.5k
      } else {
136
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
19.5k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
19.5k
      }
139
39.0k
    } else {
140
39.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
39.0k
      if (cfg.lsb_in_token == 0) {
142
19.5k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
19.5k
      } else {
144
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
19.5k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
19.5k
      }
147
39.0k
    }
148
558k
  } else if (cfg.split_exponent == 4) {
149
273k
    if (cfg.msb_in_token == 1) {
150
136k
      if (cfg.lsb_in_token == 0) {
151
19.5k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
117k
      } else if (cfg.lsb_in_token == 2) {
153
97.9k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
97.9k
      } else {
155
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
19.5k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
19.5k
      }
158
136k
    } else {
159
136k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
136k
      if (cfg.lsb_in_token == 0) {
161
97.9k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
97.9k
      } else if (cfg.lsb_in_token == 1) {
163
19.5k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
19.5k
      } else {
165
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
19.5k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
19.5k
      }
168
136k
    }
169
284k
  } else if (cfg.split_exponent == 5) {
170
136k
    if (cfg.msb_in_token == 1) {
171
58.5k
      if (cfg.lsb_in_token == 0) {
172
19.5k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
39.0k
      } else if (cfg.lsb_in_token == 2) {
174
19.5k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
19.5k
      } else {
176
19.4k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
19.4k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
19.4k
      }
179
78.0k
    } else {
180
78.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
78.0k
      if (cfg.lsb_in_token == 0) {
182
19.5k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
58.5k
      } else if (cfg.lsb_in_token == 1) {
184
19.5k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
39.0k
      } else if (cfg.lsb_in_token == 2) {
186
19.5k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
19.5k
      } else {
188
19.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
19.4k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
19.4k
      }
191
78.0k
    }
192
147k
  } else if (cfg.split_exponent == 6) {
193
53.8k
    if (cfg.msb_in_token == 0) {
194
19.5k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
19.5k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
34.3k
    } else if (cfg.msb_in_token == 1) {
197
17.1k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
17.1k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
17.1k
    } else {
200
17.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
17.1k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
17.1k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
17.1k
    }
204
93.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {  // 7..12: msb and lsb must both be 0
205
93.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
93.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
93.7k
    if (cfg.split_exponent == 7) {
208
19.5k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
74.2k
    } else if (cfg.split_exponent == 8) {
210
14.8k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
59.3k
    } else if (cfg.split_exponent == 9) {
212
14.8k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
44.5k
    } else if (cfg.split_exponent == 10) {
214
14.8k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
29.6k
    } else if (cfg.split_exponent == 11) {
216
14.8k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
14.8k
    } else {
218
14.8k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
14.8k
    }
220
93.7k
  } else {  // no specialisation exists for this split_exponent
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;  // all bits set after conversion to uint32_t; only the branch above falls through to here
224
834k
#endif
225
834k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
113
834k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
834k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
834k
  if (cfg.split_exponent == 0) {
127
100k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
734k
  } else if (cfg.split_exponent == 2) {
129
97.9k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
97.9k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
636k
  } else if (cfg.split_exponent == 3) {
132
78.0k
    if (cfg.msb_in_token == 1) {
133
39.0k
      if (cfg.lsb_in_token == 0) {
134
19.5k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
19.5k
      } else {
136
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
19.5k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
19.5k
      }
139
39.0k
    } else {
140
39.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
39.0k
      if (cfg.lsb_in_token == 0) {
142
19.5k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
19.5k
      } else {
144
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
19.5k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
19.5k
      }
147
39.0k
    }
148
558k
  } else if (cfg.split_exponent == 4) {
149
273k
    if (cfg.msb_in_token == 1) {
150
136k
      if (cfg.lsb_in_token == 0) {
151
19.5k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
117k
      } else if (cfg.lsb_in_token == 2) {
153
97.9k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
97.9k
      } else {
155
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
19.5k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
19.5k
      }
158
136k
    } else {
159
136k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
136k
      if (cfg.lsb_in_token == 0) {
161
97.9k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
97.9k
      } else if (cfg.lsb_in_token == 1) {
163
19.5k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
19.5k
      } else {
165
19.5k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
19.5k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
19.5k
      }
168
136k
    }
169
284k
  } else if (cfg.split_exponent == 5) {
170
136k
    if (cfg.msb_in_token == 1) {
171
58.5k
      if (cfg.lsb_in_token == 0) {
172
19.5k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
39.0k
      } else if (cfg.lsb_in_token == 2) {
174
19.5k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
19.5k
      } else {
176
19.4k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
19.4k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
19.4k
      }
179
78.0k
    } else {
180
78.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
78.0k
      if (cfg.lsb_in_token == 0) {
182
19.5k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
58.5k
      } else if (cfg.lsb_in_token == 1) {
184
19.5k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
39.0k
      } else if (cfg.lsb_in_token == 2) {
186
19.5k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
19.5k
      } else {
188
19.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
19.4k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
19.4k
      }
191
78.0k
    }
192
147k
  } else if (cfg.split_exponent == 6) {
193
53.8k
    if (cfg.msb_in_token == 0) {
194
19.5k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
19.5k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
34.3k
    } else if (cfg.msb_in_token == 1) {
197
17.1k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
17.1k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
17.1k
    } else {
200
17.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
17.1k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
17.1k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
17.1k
    }
204
93.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
93.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
93.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
93.7k
    if (cfg.split_exponent == 7) {
208
19.5k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
74.2k
    } else if (cfg.split_exponent == 8) {
210
14.8k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
59.3k
    } else if (cfg.split_exponent == 9) {
212
14.8k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
44.5k
    } else if (cfg.split_exponent == 10) {
214
14.8k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
29.6k
    } else if (cfg.split_exponent == 11) {
216
14.8k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
14.8k
    } else {
218
14.8k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
14.8k
    }
220
93.7k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
834k
#endif
225
834k
}
Unexecuted instantiation: jxl::N_AVX3::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
226
227
// NOLINTNEXTLINE(google-readability-namespace-comments)
228
}  // namespace HWY_NAMESPACE
229
}  // namespace jxl
230
HWY_AFTER_NAMESPACE();
231
232
#if HWY_ONCE
233
namespace jxl {
234
235
HWY_EXPORT(EstimateTokenCost);
236
237
// Target-independent entry point, compiled once (inside #if HWY_ONCE).
// Asserts that lsb_in_token + msb_in_token does not exceed split_exponent,
// then forwards all four arguments unchanged through HWY_DYNAMIC_DISPATCH to
// the per-target EstimateTokenCost registered with HWY_EXPORT and returns its
// result.
// NOTE(review): JXL_DASSERT is presumably a debug-only check, so release
// builds would rely on callers passing a valid config - confirm in
// lib/jxl/base/status.h.
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
238
834k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
239
834k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
240
834k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
241
834k
}
242
243
}  // namespace jxl
244
#endif