Coverage Report

Created: 2025-08-11 08:01

/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::Gt;
29
using hwy::HWY_NAMESPACE::IfThenElse;
30
using hwy::HWY_NAMESPACE::IfThenElseZero;
31
using hwy::HWY_NAMESPACE::Iota;
32
using hwy::HWY_NAMESPACE::LoadU;
33
using hwy::HWY_NAMESPACE::Lt;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::Or;
36
using hwy::HWY_NAMESPACE::Set;
37
using hwy::HWY_NAMESPACE::ShiftRight;
38
using hwy::HWY_NAMESPACE::Store;
39
using hwy::HWY_NAMESPACE::Sub;
40
using hwy::HWY_NAMESPACE::Zero;
41
42
template <size_t E, size_t M, size_t L>
43
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
44
449k
                               uint32_t* JXL_RESTRICT out) {
45
449k
  const HWY_FULL(uint32_t) du;
46
449k
  const HWY_FULL(float) df;
47
449k
  const auto kZero = Zero(du);
48
449k
  const auto kSplit = Set(du, 1 << E);
49
449k
  const auto kExpOffset = Set(du, 127);
50
449k
  const auto kEBOffset = Set(du, 127 + M + L);
51
449k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
449k
  const auto kMulN = Set(du, 1 << (M + L));
53
449k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
449k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
449k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
449k
  constexpr size_t kLargeShiftVal = 10;
57
449k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
449k
  auto extra_bits = kZero;
60
449k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
38.7M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
38.3M
    const auto val = LoadU(du, values + i);
63
38.3M
    const auto is_large = Gt(val, kLargeThreshold);
64
38.3M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
38.3M
    const auto not_literal = Ge(val, kSplit);
66
38.3M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
38.3M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
38.3M
    const auto l = And(val, kMaskL);
69
38.3M
    const auto exp = ShiftRight<23>(b);
70
38.3M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
38.3M
    const auto n = Sub(exp_fixed, kExpOffset);
72
38.3M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
38.3M
    const auto m = ShiftRight<23 - M - L>(b);
74
38.3M
    const auto a = Add(kBase, Mul(n, kMulN));
75
38.3M
    const auto d = And(m, kMaskM);
76
38.3M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
38.3M
    const auto c = Or(a, l);
78
38.3M
    extra_bits = Add(extra_bits, eb_fixed);
79
38.3M
    const auto t = Or(c, d);
80
38.3M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
38.3M
    Store(t_fixed, du, out + i);
82
38.3M
  }
83
449k
  if (last_full < len) {
84
385k
    const auto stop = Set(du, len);
85
385k
    const auto fence = Iota(du, last_full);
86
385k
    const auto take = Lt(fence, stop);
87
385k
    const auto val = LoadU(du, values + last_full);
88
385k
    const auto is_large = Gt(val, kLargeThreshold);
89
385k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
385k
    const auto not_literal = Ge(val, kSplit);
91
385k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
385k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
385k
    const auto l = And(val, kMaskL);
94
385k
    const auto exp = ShiftRight<23>(b);
95
385k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
385k
    const auto n = Sub(exp_fixed, kExpOffset);
97
385k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
385k
    const auto m = ShiftRight<23 - M - L>(b);
99
385k
    const auto a = Add(kBase, Mul(n, kMulN));
100
385k
    const auto d = And(m, kMaskM);
101
385k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
385k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
385k
    const auto c = Or(a, l);
104
385k
    extra_bits = Add(extra_bits, eb_masked);
105
385k
    const auto t = Or(c, d);
106
385k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
385k
    Store(t_fixed, du, out + last_full);
108
385k
  }
109
449k
  return GetLane(SumOfLanes(du, extra_bits));
110
449k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
63.4k
                               uint32_t* JXL_RESTRICT out) {
45
63.4k
  const HWY_FULL(uint32_t) du;
46
63.4k
  const HWY_FULL(float) df;
47
63.4k
  const auto kZero = Zero(du);
48
63.4k
  const auto kSplit = Set(du, 1 << E);
49
63.4k
  const auto kExpOffset = Set(du, 127);
50
63.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
63.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
63.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
63.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
63.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
63.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
63.4k
  constexpr size_t kLargeShiftVal = 10;
57
63.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
63.4k
  auto extra_bits = kZero;
60
63.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
6.22M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
6.16M
    const auto val = LoadU(du, values + i);
63
6.16M
    const auto is_large = Gt(val, kLargeThreshold);
64
6.16M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
6.16M
    const auto not_literal = Ge(val, kSplit);
66
6.16M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
6.16M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
6.16M
    const auto l = And(val, kMaskL);
69
6.16M
    const auto exp = ShiftRight<23>(b);
70
6.16M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
6.16M
    const auto n = Sub(exp_fixed, kExpOffset);
72
6.16M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
6.16M
    const auto m = ShiftRight<23 - M - L>(b);
74
6.16M
    const auto a = Add(kBase, Mul(n, kMulN));
75
6.16M
    const auto d = And(m, kMaskM);
76
6.16M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
6.16M
    const auto c = Or(a, l);
78
6.16M
    extra_bits = Add(extra_bits, eb_fixed);
79
6.16M
    const auto t = Or(c, d);
80
6.16M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
6.16M
    Store(t_fixed, du, out + i);
82
6.16M
  }
83
63.4k
  if (last_full < len) {
84
54.8k
    const auto stop = Set(du, len);
85
54.8k
    const auto fence = Iota(du, last_full);
86
54.8k
    const auto take = Lt(fence, stop);
87
54.8k
    const auto val = LoadU(du, values + last_full);
88
54.8k
    const auto is_large = Gt(val, kLargeThreshold);
89
54.8k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
54.8k
    const auto not_literal = Ge(val, kSplit);
91
54.8k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
54.8k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
54.8k
    const auto l = And(val, kMaskL);
94
54.8k
    const auto exp = ShiftRight<23>(b);
95
54.8k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
54.8k
    const auto n = Sub(exp_fixed, kExpOffset);
97
54.8k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
54.8k
    const auto m = ShiftRight<23 - M - L>(b);
99
54.8k
    const auto a = Add(kBase, Mul(n, kMulN));
100
54.8k
    const auto d = And(m, kMaskM);
101
54.8k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
54.8k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
54.8k
    const auto c = Or(a, l);
104
54.8k
    extra_bits = Add(extra_bits, eb_masked);
105
54.8k
    const auto t = Or(c, d);
106
54.8k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
54.8k
    Store(t_fixed, du, out + last_full);
108
54.8k
  }
109
63.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
63.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
62.7k
                               uint32_t* JXL_RESTRICT out) {
45
62.7k
  const HWY_FULL(uint32_t) du;
46
62.7k
  const HWY_FULL(float) df;
47
62.7k
  const auto kZero = Zero(du);
48
62.7k
  const auto kSplit = Set(du, 1 << E);
49
62.7k
  const auto kExpOffset = Set(du, 127);
50
62.7k
  const auto kEBOffset = Set(du, 127 + M + L);
51
62.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
62.7k
  const auto kMulN = Set(du, 1 << (M + L));
53
62.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
62.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
62.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
62.7k
  constexpr size_t kLargeShiftVal = 10;
57
62.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
62.7k
  auto extra_bits = kZero;
60
62.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
6.22M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
6.15M
    const auto val = LoadU(du, values + i);
63
6.15M
    const auto is_large = Gt(val, kLargeThreshold);
64
6.15M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
6.15M
    const auto not_literal = Ge(val, kSplit);
66
6.15M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
6.15M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
6.15M
    const auto l = And(val, kMaskL);
69
6.15M
    const auto exp = ShiftRight<23>(b);
70
6.15M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
6.15M
    const auto n = Sub(exp_fixed, kExpOffset);
72
6.15M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
6.15M
    const auto m = ShiftRight<23 - M - L>(b);
74
6.15M
    const auto a = Add(kBase, Mul(n, kMulN));
75
6.15M
    const auto d = And(m, kMaskM);
76
6.15M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
6.15M
    const auto c = Or(a, l);
78
6.15M
    extra_bits = Add(extra_bits, eb_fixed);
79
6.15M
    const auto t = Or(c, d);
80
6.15M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
6.15M
    Store(t_fixed, du, out + i);
82
6.15M
  }
83
62.7k
  if (last_full < len) {
84
54.1k
    const auto stop = Set(du, len);
85
54.1k
    const auto fence = Iota(du, last_full);
86
54.1k
    const auto take = Lt(fence, stop);
87
54.1k
    const auto val = LoadU(du, values + last_full);
88
54.1k
    const auto is_large = Gt(val, kLargeThreshold);
89
54.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
54.1k
    const auto not_literal = Ge(val, kSplit);
91
54.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
54.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
54.1k
    const auto l = And(val, kMaskL);
94
54.1k
    const auto exp = ShiftRight<23>(b);
95
54.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
54.1k
    const auto n = Sub(exp_fixed, kExpOffset);
97
54.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
54.1k
    const auto m = ShiftRight<23 - M - L>(b);
99
54.1k
    const auto a = Add(kBase, Mul(n, kMulN));
100
54.1k
    const auto d = And(m, kMaskM);
101
54.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
54.1k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
54.1k
    const auto c = Or(a, l);
104
54.1k
    extra_bits = Add(extra_bits, eb_masked);
105
54.1k
    const auto t = Or(c, d);
106
54.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
54.1k
    Store(t_fixed, du, out + last_full);
108
54.1k
  }
109
62.7k
  return GetLane(SumOfLanes(du, extra_bits));
110
62.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
62.7k
                               uint32_t* JXL_RESTRICT out) {
45
62.7k
  const HWY_FULL(uint32_t) du;
46
62.7k
  const HWY_FULL(float) df;
47
62.7k
  const auto kZero = Zero(du);
48
62.7k
  const auto kSplit = Set(du, 1 << E);
49
62.7k
  const auto kExpOffset = Set(du, 127);
50
62.7k
  const auto kEBOffset = Set(du, 127 + M + L);
51
62.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
62.7k
  const auto kMulN = Set(du, 1 << (M + L));
53
62.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
62.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
62.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
62.7k
  constexpr size_t kLargeShiftVal = 10;
57
62.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
62.7k
  auto extra_bits = kZero;
60
62.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
6.22M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
6.15M
    const auto val = LoadU(du, values + i);
63
6.15M
    const auto is_large = Gt(val, kLargeThreshold);
64
6.15M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
6.15M
    const auto not_literal = Ge(val, kSplit);
66
6.15M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
6.15M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
6.15M
    const auto l = And(val, kMaskL);
69
6.15M
    const auto exp = ShiftRight<23>(b);
70
6.15M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
6.15M
    const auto n = Sub(exp_fixed, kExpOffset);
72
6.15M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
6.15M
    const auto m = ShiftRight<23 - M - L>(b);
74
6.15M
    const auto a = Add(kBase, Mul(n, kMulN));
75
6.15M
    const auto d = And(m, kMaskM);
76
6.15M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
6.15M
    const auto c = Or(a, l);
78
6.15M
    extra_bits = Add(extra_bits, eb_fixed);
79
6.15M
    const auto t = Or(c, d);
80
6.15M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
6.15M
    Store(t_fixed, du, out + i);
82
6.15M
  }
83
62.7k
  if (last_full < len) {
84
54.1k
    const auto stop = Set(du, len);
85
54.1k
    const auto fence = Iota(du, last_full);
86
54.1k
    const auto take = Lt(fence, stop);
87
54.1k
    const auto val = LoadU(du, values + last_full);
88
54.1k
    const auto is_large = Gt(val, kLargeThreshold);
89
54.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
54.1k
    const auto not_literal = Ge(val, kSplit);
91
54.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
54.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
54.1k
    const auto l = And(val, kMaskL);
94
54.1k
    const auto exp = ShiftRight<23>(b);
95
54.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
54.1k
    const auto n = Sub(exp_fixed, kExpOffset);
97
54.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
54.1k
    const auto m = ShiftRight<23 - M - L>(b);
99
54.1k
    const auto a = Add(kBase, Mul(n, kMulN));
100
54.1k
    const auto d = And(m, kMaskM);
101
54.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
54.1k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
54.1k
    const auto c = Or(a, l);
104
54.1k
    extra_bits = Add(extra_bits, eb_masked);
105
54.1k
    const auto t = Or(c, d);
106
54.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
54.1k
    Store(t_fixed, du, out + last_full);
108
54.1k
  }
109
62.7k
  return GetLane(SumOfLanes(du, extra_bits));
110
62.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
62.7k
                               uint32_t* JXL_RESTRICT out) {
45
62.7k
  const HWY_FULL(uint32_t) du;
46
62.7k
  const HWY_FULL(float) df;
47
62.7k
  const auto kZero = Zero(du);
48
62.7k
  const auto kSplit = Set(du, 1 << E);
49
62.7k
  const auto kExpOffset = Set(du, 127);
50
62.7k
  const auto kEBOffset = Set(du, 127 + M + L);
51
62.7k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
62.7k
  const auto kMulN = Set(du, 1 << (M + L));
53
62.7k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
62.7k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
62.7k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
62.7k
  constexpr size_t kLargeShiftVal = 10;
57
62.7k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
62.7k
  auto extra_bits = kZero;
60
62.7k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
6.22M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
6.15M
    const auto val = LoadU(du, values + i);
63
6.15M
    const auto is_large = Gt(val, kLargeThreshold);
64
6.15M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
6.15M
    const auto not_literal = Ge(val, kSplit);
66
6.15M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
6.15M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
6.15M
    const auto l = And(val, kMaskL);
69
6.15M
    const auto exp = ShiftRight<23>(b);
70
6.15M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
6.15M
    const auto n = Sub(exp_fixed, kExpOffset);
72
6.15M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
6.15M
    const auto m = ShiftRight<23 - M - L>(b);
74
6.15M
    const auto a = Add(kBase, Mul(n, kMulN));
75
6.15M
    const auto d = And(m, kMaskM);
76
6.15M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
6.15M
    const auto c = Or(a, l);
78
6.15M
    extra_bits = Add(extra_bits, eb_fixed);
79
6.15M
    const auto t = Or(c, d);
80
6.15M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
6.15M
    Store(t_fixed, du, out + i);
82
6.15M
  }
83
62.7k
  if (last_full < len) {
84
54.1k
    const auto stop = Set(du, len);
85
54.1k
    const auto fence = Iota(du, last_full);
86
54.1k
    const auto take = Lt(fence, stop);
87
54.1k
    const auto val = LoadU(du, values + last_full);
88
54.1k
    const auto is_large = Gt(val, kLargeThreshold);
89
54.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
54.1k
    const auto not_literal = Ge(val, kSplit);
91
54.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
54.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
54.1k
    const auto l = And(val, kMaskL);
94
54.1k
    const auto exp = ShiftRight<23>(b);
95
54.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
54.1k
    const auto n = Sub(exp_fixed, kExpOffset);
97
54.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
54.1k
    const auto m = ShiftRight<23 - M - L>(b);
99
54.1k
    const auto a = Add(kBase, Mul(n, kMulN));
100
54.1k
    const auto d = And(m, kMaskM);
101
54.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
54.1k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
54.1k
    const auto c = Or(a, l);
104
54.1k
    extra_bits = Add(extra_bits, eb_masked);
105
54.1k
    const auto t = Or(c, d);
106
54.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
54.1k
    Store(t_fixed, du, out + last_full);
108
54.1k
  }
109
62.7k
  return GetLane(SumOfLanes(du, extra_bits));
110
62.7k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.73k
                               uint32_t* JXL_RESTRICT out) {
45
8.73k
  const HWY_FULL(uint32_t) du;
46
8.73k
  const HWY_FULL(float) df;
47
8.73k
  const auto kZero = Zero(du);
48
8.73k
  const auto kSplit = Set(du, 1 << E);
49
8.73k
  const auto kExpOffset = Set(du, 127);
50
8.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.73k
  constexpr size_t kLargeShiftVal = 10;
57
8.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.73k
  auto extra_bits = kZero;
60
8.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.73k
  if (last_full < len) {
84
7.37k
    const auto stop = Set(du, len);
85
7.37k
    const auto fence = Iota(du, last_full);
86
7.37k
    const auto take = Lt(fence, stop);
87
7.37k
    const auto val = LoadU(du, values + last_full);
88
7.37k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.37k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.37k
    const auto not_literal = Ge(val, kSplit);
91
7.37k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.37k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.37k
    const auto l = And(val, kMaskL);
94
7.37k
    const auto exp = ShiftRight<23>(b);
95
7.37k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.37k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.37k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.37k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.37k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.37k
    const auto d = And(m, kMaskM);
101
7.37k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.37k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.37k
    const auto c = Or(a, l);
104
7.37k
    extra_bits = Add(extra_bits, eb_masked);
105
7.37k
    const auto t = Or(c, d);
106
7.37k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.37k
    Store(t_fixed, du, out + last_full);
108
7.37k
  }
109
8.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.73k
                               uint32_t* JXL_RESTRICT out) {
45
8.73k
  const HWY_FULL(uint32_t) du;
46
8.73k
  const HWY_FULL(float) df;
47
8.73k
  const auto kZero = Zero(du);
48
8.73k
  const auto kSplit = Set(du, 1 << E);
49
8.73k
  const auto kExpOffset = Set(du, 127);
50
8.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.73k
  constexpr size_t kLargeShiftVal = 10;
57
8.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.73k
  auto extra_bits = kZero;
60
8.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.73k
  if (last_full < len) {
84
7.37k
    const auto stop = Set(du, len);
85
7.37k
    const auto fence = Iota(du, last_full);
86
7.37k
    const auto take = Lt(fence, stop);
87
7.37k
    const auto val = LoadU(du, values + last_full);
88
7.37k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.37k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.37k
    const auto not_literal = Ge(val, kSplit);
91
7.37k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.37k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.37k
    const auto l = And(val, kMaskL);
94
7.37k
    const auto exp = ShiftRight<23>(b);
95
7.37k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.37k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.37k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.37k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.37k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.37k
    const auto d = And(m, kMaskM);
101
7.37k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.37k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.37k
    const auto c = Or(a, l);
104
7.37k
    extra_bits = Add(extra_bits, eb_masked);
105
7.37k
    const auto t = Or(c, d);
106
7.37k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.37k
    Store(t_fixed, du, out + last_full);
108
7.37k
  }
109
8.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
7.90k
                               uint32_t* JXL_RESTRICT out) {
45
7.90k
  const HWY_FULL(uint32_t) du;
46
7.90k
  const HWY_FULL(float) df;
47
7.90k
  const auto kZero = Zero(du);
48
7.90k
  const auto kSplit = Set(du, 1 << E);
49
7.90k
  const auto kExpOffset = Set(du, 127);
50
7.90k
  const auto kEBOffset = Set(du, 127 + M + L);
51
7.90k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
7.90k
  const auto kMulN = Set(du, 1 << (M + L));
53
7.90k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
7.90k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
7.90k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
7.90k
  constexpr size_t kLargeShiftVal = 10;
57
7.90k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
7.90k
  auto extra_bits = kZero;
60
7.90k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
351k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
343k
    const auto val = LoadU(du, values + i);
63
343k
    const auto is_large = Gt(val, kLargeThreshold);
64
343k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
343k
    const auto not_literal = Ge(val, kSplit);
66
343k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
343k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
343k
    const auto l = And(val, kMaskL);
69
343k
    const auto exp = ShiftRight<23>(b);
70
343k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
343k
    const auto n = Sub(exp_fixed, kExpOffset);
72
343k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
343k
    const auto m = ShiftRight<23 - M - L>(b);
74
343k
    const auto a = Add(kBase, Mul(n, kMulN));
75
343k
    const auto d = And(m, kMaskM);
76
343k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
343k
    const auto c = Or(a, l);
78
343k
    extra_bits = Add(extra_bits, eb_fixed);
79
343k
    const auto t = Or(c, d);
80
343k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
343k
    Store(t_fixed, du, out + i);
82
343k
  }
83
7.90k
  if (last_full < len) {
84
6.69k
    const auto stop = Set(du, len);
85
6.69k
    const auto fence = Iota(du, last_full);
86
6.69k
    const auto take = Lt(fence, stop);
87
6.69k
    const auto val = LoadU(du, values + last_full);
88
6.69k
    const auto is_large = Gt(val, kLargeThreshold);
89
6.69k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
6.69k
    const auto not_literal = Ge(val, kSplit);
91
6.69k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
6.69k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
6.69k
    const auto l = And(val, kMaskL);
94
6.69k
    const auto exp = ShiftRight<23>(b);
95
6.69k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
6.69k
    const auto n = Sub(exp_fixed, kExpOffset);
97
6.69k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
6.69k
    const auto m = ShiftRight<23 - M - L>(b);
99
6.69k
    const auto a = Add(kBase, Mul(n, kMulN));
100
6.69k
    const auto d = And(m, kMaskM);
101
6.69k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
6.69k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
6.69k
    const auto c = Or(a, l);
104
6.69k
    extra_bits = Add(extra_bits, eb_masked);
105
6.69k
    const auto t = Or(c, d);
106
6.69k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
6.69k
    Store(t_fixed, du, out + last_full);
108
6.69k
  }
109
7.90k
  return GetLane(SumOfLanes(du, extra_bits));
110
7.90k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
7.90k
                               uint32_t* JXL_RESTRICT out) {
45
7.90k
  const HWY_FULL(uint32_t) du;
46
7.90k
  const HWY_FULL(float) df;
47
7.90k
  const auto kZero = Zero(du);
48
7.90k
  const auto kSplit = Set(du, 1 << E);
49
7.90k
  const auto kExpOffset = Set(du, 127);
50
7.90k
  const auto kEBOffset = Set(du, 127 + M + L);
51
7.90k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
7.90k
  const auto kMulN = Set(du, 1 << (M + L));
53
7.90k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
7.90k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
7.90k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
7.90k
  constexpr size_t kLargeShiftVal = 10;
57
7.90k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
7.90k
  auto extra_bits = kZero;
60
7.90k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
351k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
343k
    const auto val = LoadU(du, values + i);
63
343k
    const auto is_large = Gt(val, kLargeThreshold);
64
343k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
343k
    const auto not_literal = Ge(val, kSplit);
66
343k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
343k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
343k
    const auto l = And(val, kMaskL);
69
343k
    const auto exp = ShiftRight<23>(b);
70
343k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
343k
    const auto n = Sub(exp_fixed, kExpOffset);
72
343k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
343k
    const auto m = ShiftRight<23 - M - L>(b);
74
343k
    const auto a = Add(kBase, Mul(n, kMulN));
75
343k
    const auto d = And(m, kMaskM);
76
343k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
343k
    const auto c = Or(a, l);
78
343k
    extra_bits = Add(extra_bits, eb_fixed);
79
343k
    const auto t = Or(c, d);
80
343k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
343k
    Store(t_fixed, du, out + i);
82
343k
  }
83
7.90k
  if (last_full < len) {
84
6.69k
    const auto stop = Set(du, len);
85
6.69k
    const auto fence = Iota(du, last_full);
86
6.69k
    const auto take = Lt(fence, stop);
87
6.69k
    const auto val = LoadU(du, values + last_full);
88
6.69k
    const auto is_large = Gt(val, kLargeThreshold);
89
6.69k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
6.69k
    const auto not_literal = Ge(val, kSplit);
91
6.69k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
6.69k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
6.69k
    const auto l = And(val, kMaskL);
94
6.69k
    const auto exp = ShiftRight<23>(b);
95
6.69k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
6.69k
    const auto n = Sub(exp_fixed, kExpOffset);
97
6.69k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
6.69k
    const auto m = ShiftRight<23 - M - L>(b);
99
6.69k
    const auto a = Add(kBase, Mul(n, kMulN));
100
6.69k
    const auto d = And(m, kMaskM);
101
6.69k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
6.69k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
6.69k
    const auto c = Or(a, l);
104
6.69k
    extra_bits = Add(extra_bits, eb_masked);
105
6.69k
    const auto t = Or(c, d);
106
6.69k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
6.69k
    Store(t_fixed, du, out + last_full);
108
6.69k
  }
109
7.90k
  return GetLane(SumOfLanes(du, extra_bits));
110
7.90k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
8.75k
                               uint32_t* JXL_RESTRICT out) {
45
8.75k
  const HWY_FULL(uint32_t) du;
46
8.75k
  const HWY_FULL(float) df;
47
8.75k
  const auto kZero = Zero(du);
48
8.75k
  const auto kSplit = Set(du, 1 << E);
49
8.75k
  const auto kExpOffset = Set(du, 127);
50
8.75k
  const auto kEBOffset = Set(du, 127 + M + L);
51
8.75k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
8.75k
  const auto kMulN = Set(du, 1 << (M + L));
53
8.75k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
8.75k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
8.75k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
8.75k
  constexpr size_t kLargeShiftVal = 10;
57
8.75k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
8.75k
  auto extra_bits = kZero;
60
8.75k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
684k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
675k
    const auto val = LoadU(du, values + i);
63
675k
    const auto is_large = Gt(val, kLargeThreshold);
64
675k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
675k
    const auto not_literal = Ge(val, kSplit);
66
675k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
675k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
675k
    const auto l = And(val, kMaskL);
69
675k
    const auto exp = ShiftRight<23>(b);
70
675k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
675k
    const auto n = Sub(exp_fixed, kExpOffset);
72
675k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
675k
    const auto m = ShiftRight<23 - M - L>(b);
74
675k
    const auto a = Add(kBase, Mul(n, kMulN));
75
675k
    const auto d = And(m, kMaskM);
76
675k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
675k
    const auto c = Or(a, l);
78
675k
    extra_bits = Add(extra_bits, eb_fixed);
79
675k
    const auto t = Or(c, d);
80
675k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
675k
    Store(t_fixed, du, out + i);
82
675k
  }
83
8.75k
  if (last_full < len) {
84
7.39k
    const auto stop = Set(du, len);
85
7.39k
    const auto fence = Iota(du, last_full);
86
7.39k
    const auto take = Lt(fence, stop);
87
7.39k
    const auto val = LoadU(du, values + last_full);
88
7.39k
    const auto is_large = Gt(val, kLargeThreshold);
89
7.39k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
7.39k
    const auto not_literal = Ge(val, kSplit);
91
7.39k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
7.39k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
7.39k
    const auto l = And(val, kMaskL);
94
7.39k
    const auto exp = ShiftRight<23>(b);
95
7.39k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
7.39k
    const auto n = Sub(exp_fixed, kExpOffset);
97
7.39k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
7.39k
    const auto m = ShiftRight<23 - M - L>(b);
99
7.39k
    const auto a = Add(kBase, Mul(n, kMulN));
100
7.39k
    const auto d = And(m, kMaskM);
101
7.39k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
7.39k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
7.39k
    const auto c = Or(a, l);
104
7.39k
    extra_bits = Add(extra_bits, eb_masked);
105
7.39k
    const auto t = Or(c, d);
106
7.39k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
7.39k
    Store(t_fixed, du, out + last_full);
108
7.39k
  }
109
8.75k
  return GetLane(SumOfLanes(du, extra_bits));
110
8.75k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.73k
                               uint32_t* JXL_RESTRICT out) {
45
6.73k
  const HWY_FULL(uint32_t) du;
46
6.73k
  const HWY_FULL(float) df;
47
6.73k
  const auto kZero = Zero(du);
48
6.73k
  const auto kSplit = Set(du, 1 << E);
49
6.73k
  const auto kExpOffset = Set(du, 127);
50
6.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.73k
  constexpr size_t kLargeShiftVal = 10;
57
6.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.73k
  auto extra_bits = kZero;
60
6.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
307k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
301k
    const auto val = LoadU(du, values + i);
63
301k
    const auto is_large = Gt(val, kLargeThreshold);
64
301k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
301k
    const auto not_literal = Ge(val, kSplit);
66
301k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
301k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
301k
    const auto l = And(val, kMaskL);
69
301k
    const auto exp = ShiftRight<23>(b);
70
301k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
301k
    const auto n = Sub(exp_fixed, kExpOffset);
72
301k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
301k
    const auto m = ShiftRight<23 - M - L>(b);
74
301k
    const auto a = Add(kBase, Mul(n, kMulN));
75
301k
    const auto d = And(m, kMaskM);
76
301k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
301k
    const auto c = Or(a, l);
78
301k
    extra_bits = Add(extra_bits, eb_fixed);
79
301k
    const auto t = Or(c, d);
80
301k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
301k
    Store(t_fixed, du, out + i);
82
301k
  }
83
6.73k
  if (last_full < len) {
84
5.73k
    const auto stop = Set(du, len);
85
5.73k
    const auto fence = Iota(du, last_full);
86
5.73k
    const auto take = Lt(fence, stop);
87
5.73k
    const auto val = LoadU(du, values + last_full);
88
5.73k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.73k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.73k
    const auto not_literal = Ge(val, kSplit);
91
5.73k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.73k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.73k
    const auto l = And(val, kMaskL);
94
5.73k
    const auto exp = ShiftRight<23>(b);
95
5.73k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.73k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.73k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.73k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.73k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.73k
    const auto d = And(m, kMaskM);
101
5.73k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.73k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.73k
    const auto c = Or(a, l);
104
5.73k
    extra_bits = Add(extra_bits, eb_masked);
105
5.73k
    const auto t = Or(c, d);
106
5.73k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.73k
    Store(t_fixed, du, out + last_full);
108
5.73k
  }
109
6.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.73k
                               uint32_t* JXL_RESTRICT out) {
45
6.73k
  const HWY_FULL(uint32_t) du;
46
6.73k
  const HWY_FULL(float) df;
47
6.73k
  const auto kZero = Zero(du);
48
6.73k
  const auto kSplit = Set(du, 1 << E);
49
6.73k
  const auto kExpOffset = Set(du, 127);
50
6.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.73k
  constexpr size_t kLargeShiftVal = 10;
57
6.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.73k
  auto extra_bits = kZero;
60
6.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
307k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
301k
    const auto val = LoadU(du, values + i);
63
301k
    const auto is_large = Gt(val, kLargeThreshold);
64
301k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
301k
    const auto not_literal = Ge(val, kSplit);
66
301k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
301k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
301k
    const auto l = And(val, kMaskL);
69
301k
    const auto exp = ShiftRight<23>(b);
70
301k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
301k
    const auto n = Sub(exp_fixed, kExpOffset);
72
301k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
301k
    const auto m = ShiftRight<23 - M - L>(b);
74
301k
    const auto a = Add(kBase, Mul(n, kMulN));
75
301k
    const auto d = And(m, kMaskM);
76
301k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
301k
    const auto c = Or(a, l);
78
301k
    extra_bits = Add(extra_bits, eb_fixed);
79
301k
    const auto t = Or(c, d);
80
301k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
301k
    Store(t_fixed, du, out + i);
82
301k
  }
83
6.73k
  if (last_full < len) {
84
5.73k
    const auto stop = Set(du, len);
85
5.73k
    const auto fence = Iota(du, last_full);
86
5.73k
    const auto take = Lt(fence, stop);
87
5.73k
    const auto val = LoadU(du, values + last_full);
88
5.73k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.73k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.73k
    const auto not_literal = Ge(val, kSplit);
91
5.73k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.73k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.73k
    const auto l = And(val, kMaskL);
94
5.73k
    const auto exp = ShiftRight<23>(b);
95
5.73k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.73k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.73k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.73k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.73k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.73k
    const auto d = And(m, kMaskM);
101
5.73k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.73k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.73k
    const auto c = Or(a, l);
104
5.73k
    extra_bits = Add(extra_bits, eb_masked);
105
5.73k
    const auto t = Or(c, d);
106
5.73k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.73k
    Store(t_fixed, du, out + last_full);
108
5.73k
  }
109
6.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.73k
                               uint32_t* JXL_RESTRICT out) {
45
6.73k
  const HWY_FULL(uint32_t) du;
46
6.73k
  const HWY_FULL(float) df;
47
6.73k
  const auto kZero = Zero(du);
48
6.73k
  const auto kSplit = Set(du, 1 << E);
49
6.73k
  const auto kExpOffset = Set(du, 127);
50
6.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.73k
  constexpr size_t kLargeShiftVal = 10;
57
6.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.73k
  auto extra_bits = kZero;
60
6.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
307k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
301k
    const auto val = LoadU(du, values + i);
63
301k
    const auto is_large = Gt(val, kLargeThreshold);
64
301k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
301k
    const auto not_literal = Ge(val, kSplit);
66
301k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
301k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
301k
    const auto l = And(val, kMaskL);
69
301k
    const auto exp = ShiftRight<23>(b);
70
301k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
301k
    const auto n = Sub(exp_fixed, kExpOffset);
72
301k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
301k
    const auto m = ShiftRight<23 - M - L>(b);
74
301k
    const auto a = Add(kBase, Mul(n, kMulN));
75
301k
    const auto d = And(m, kMaskM);
76
301k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
301k
    const auto c = Or(a, l);
78
301k
    extra_bits = Add(extra_bits, eb_fixed);
79
301k
    const auto t = Or(c, d);
80
301k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
301k
    Store(t_fixed, du, out + i);
82
301k
  }
83
6.73k
  if (last_full < len) {
84
5.73k
    const auto stop = Set(du, len);
85
5.73k
    const auto fence = Iota(du, last_full);
86
5.73k
    const auto take = Lt(fence, stop);
87
5.73k
    const auto val = LoadU(du, values + last_full);
88
5.73k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.73k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.73k
    const auto not_literal = Ge(val, kSplit);
91
5.73k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.73k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.73k
    const auto l = And(val, kMaskL);
94
5.73k
    const auto exp = ShiftRight<23>(b);
95
5.73k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.73k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.73k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.73k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.73k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.73k
    const auto d = And(m, kMaskM);
101
5.73k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.73k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.73k
    const auto c = Or(a, l);
104
5.73k
    extra_bits = Add(extra_bits, eb_masked);
105
5.73k
    const auto t = Or(c, d);
106
5.73k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.73k
    Store(t_fixed, du, out + last_full);
108
5.73k
  }
109
6.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.73k
                               uint32_t* JXL_RESTRICT out) {
45
6.73k
  const HWY_FULL(uint32_t) du;
46
6.73k
  const HWY_FULL(float) df;
47
6.73k
  const auto kZero = Zero(du);
48
6.73k
  const auto kSplit = Set(du, 1 << E);
49
6.73k
  const auto kExpOffset = Set(du, 127);
50
6.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.73k
  constexpr size_t kLargeShiftVal = 10;
57
6.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.73k
  auto extra_bits = kZero;
60
6.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
307k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
301k
    const auto val = LoadU(du, values + i);
63
301k
    const auto is_large = Gt(val, kLargeThreshold);
64
301k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
301k
    const auto not_literal = Ge(val, kSplit);
66
301k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
301k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
301k
    const auto l = And(val, kMaskL);
69
301k
    const auto exp = ShiftRight<23>(b);
70
301k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
301k
    const auto n = Sub(exp_fixed, kExpOffset);
72
301k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
301k
    const auto m = ShiftRight<23 - M - L>(b);
74
301k
    const auto a = Add(kBase, Mul(n, kMulN));
75
301k
    const auto d = And(m, kMaskM);
76
301k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
301k
    const auto c = Or(a, l);
78
301k
    extra_bits = Add(extra_bits, eb_fixed);
79
301k
    const auto t = Or(c, d);
80
301k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
301k
    Store(t_fixed, du, out + i);
82
301k
  }
83
6.73k
  if (last_full < len) {
84
5.73k
    const auto stop = Set(du, len);
85
5.73k
    const auto fence = Iota(du, last_full);
86
5.73k
    const auto take = Lt(fence, stop);
87
5.73k
    const auto val = LoadU(du, values + last_full);
88
5.73k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.73k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.73k
    const auto not_literal = Ge(val, kSplit);
91
5.73k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.73k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.73k
    const auto l = And(val, kMaskL);
94
5.73k
    const auto exp = ShiftRight<23>(b);
95
5.73k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.73k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.73k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.73k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.73k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.73k
    const auto d = And(m, kMaskM);
101
5.73k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.73k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.73k
    const auto c = Or(a, l);
104
5.73k
    extra_bits = Add(extra_bits, eb_masked);
105
5.73k
    const auto t = Or(c, d);
106
5.73k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.73k
    Store(t_fixed, du, out + last_full);
108
5.73k
  }
109
6.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.73k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.73k
                               uint32_t* JXL_RESTRICT out) {
45
6.73k
  const HWY_FULL(uint32_t) du;
46
6.73k
  const HWY_FULL(float) df;
47
6.73k
  const auto kZero = Zero(du);
48
6.73k
  const auto kSplit = Set(du, 1 << E);
49
6.73k
  const auto kExpOffset = Set(du, 127);
50
6.73k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.73k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.73k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.73k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.73k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.73k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.73k
  constexpr size_t kLargeShiftVal = 10;
57
6.73k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.73k
  auto extra_bits = kZero;
60
6.73k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
307k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
301k
    const auto val = LoadU(du, values + i);
63
301k
    const auto is_large = Gt(val, kLargeThreshold);
64
301k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
301k
    const auto not_literal = Ge(val, kSplit);
66
301k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
301k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
301k
    const auto l = And(val, kMaskL);
69
301k
    const auto exp = ShiftRight<23>(b);
70
301k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
301k
    const auto n = Sub(exp_fixed, kExpOffset);
72
301k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
301k
    const auto m = ShiftRight<23 - M - L>(b);
74
301k
    const auto a = Add(kBase, Mul(n, kMulN));
75
301k
    const auto d = And(m, kMaskM);
76
301k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
301k
    const auto c = Or(a, l);
78
301k
    extra_bits = Add(extra_bits, eb_fixed);
79
301k
    const auto t = Or(c, d);
80
301k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
301k
    Store(t_fixed, du, out + i);
82
301k
  }
83
6.73k
  if (last_full < len) {
84
5.73k
    const auto stop = Set(du, len);
85
5.73k
    const auto fence = Iota(du, last_full);
86
5.73k
    const auto take = Lt(fence, stop);
87
5.73k
    const auto val = LoadU(du, values + last_full);
88
5.73k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.73k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.73k
    const auto not_literal = Ge(val, kSplit);
91
5.73k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.73k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.73k
    const auto l = And(val, kMaskL);
94
5.73k
    const auto exp = ShiftRight<23>(b);
95
5.73k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.73k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.73k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.73k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.73k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.73k
    const auto d = And(m, kMaskM);
101
5.73k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.73k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.73k
    const auto c = Or(a, l);
104
5.73k
    extra_bits = Add(extra_bits, eb_masked);
105
5.73k
    const auto t = Or(c, d);
106
5.73k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.73k
    Store(t_fixed, du, out + last_full);
108
5.73k
  }
109
6.73k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.73k
}
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
111
112
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
113
449k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
449k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
449k
  if (cfg.split_exponent == 0) {
127
63.4k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
386k
  } else if (cfg.split_exponent == 2) {
129
62.7k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
62.7k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
323k
  } else if (cfg.split_exponent == 3) {
132
35.0k
    if (cfg.msb_in_token == 1) {
133
17.5k
      if (cfg.lsb_in_token == 0) {
134
8.75k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
8.75k
      } else {
136
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
8.75k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
8.75k
      }
139
17.5k
    } else {
140
17.5k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
17.5k
      if (cfg.lsb_in_token == 0) {
142
8.75k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
8.75k
      } else {
144
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
8.75k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
8.75k
      }
147
17.5k
    }
148
288k
  } else if (cfg.split_exponent == 4) {
149
160k
    if (cfg.msb_in_token == 1) {
150
80.2k
      if (cfg.lsb_in_token == 0) {
151
8.75k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
71.4k
      } else if (cfg.lsb_in_token == 2) {
153
62.7k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
62.7k
      } else {
155
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
8.75k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
8.75k
      }
158
80.2k
    } else {
159
80.2k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
80.2k
      if (cfg.lsb_in_token == 0) {
161
62.7k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
62.7k
      } else if (cfg.lsb_in_token == 1) {
163
8.75k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
8.75k
      } else {
165
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
8.75k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
8.75k
      }
168
80.2k
    }
169
160k
  } else if (cfg.split_exponent == 5) {
170
61.2k
    if (cfg.msb_in_token == 1) {
171
26.2k
      if (cfg.lsb_in_token == 0) {
172
8.75k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
17.4k
      } else if (cfg.lsb_in_token == 2) {
174
8.75k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
8.75k
      } else {
176
8.73k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
8.73k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
8.73k
      }
179
35.0k
    } else {
180
35.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
35.0k
      if (cfg.lsb_in_token == 0) {
182
8.75k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
26.2k
      } else if (cfg.lsb_in_token == 1) {
184
8.75k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
17.4k
      } else if (cfg.lsb_in_token == 2) {
186
8.75k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
8.75k
      } else {
188
8.73k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
8.73k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
8.73k
      }
191
35.0k
    }
192
66.9k
  } else if (cfg.split_exponent == 6) {
193
24.5k
    if (cfg.msb_in_token == 0) {
194
8.75k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
8.75k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
15.8k
    } else if (cfg.msb_in_token == 1) {
197
7.90k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
7.90k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
7.90k
    } else {
200
7.90k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
7.90k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
7.90k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
7.90k
    }
204
42.4k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
42.4k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
42.4k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
42.4k
    if (cfg.split_exponent == 7) {
208
8.75k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
33.6k
    } else if (cfg.split_exponent == 8) {
210
6.73k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
26.9k
    } else if (cfg.split_exponent == 9) {
212
6.73k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
20.2k
    } else if (cfg.split_exponent == 10) {
214
6.73k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
13.4k
    } else if (cfg.split_exponent == 11) {
216
6.73k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
6.73k
    } else {
218
6.73k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
6.73k
    }
220
42.4k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
449k
#endif
225
449k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
113
449k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
449k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
449k
  if (cfg.split_exponent == 0) {
127
63.4k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
386k
  } else if (cfg.split_exponent == 2) {
129
62.7k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
62.7k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
323k
  } else if (cfg.split_exponent == 3) {
132
35.0k
    if (cfg.msb_in_token == 1) {
133
17.5k
      if (cfg.lsb_in_token == 0) {
134
8.75k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
8.75k
      } else {
136
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
8.75k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
8.75k
      }
139
17.5k
    } else {
140
17.5k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
17.5k
      if (cfg.lsb_in_token == 0) {
142
8.75k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
8.75k
      } else {
144
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
8.75k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
8.75k
      }
147
17.5k
    }
148
288k
  } else if (cfg.split_exponent == 4) {
149
160k
    if (cfg.msb_in_token == 1) {
150
80.2k
      if (cfg.lsb_in_token == 0) {
151
8.75k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
71.4k
      } else if (cfg.lsb_in_token == 2) {
153
62.7k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
62.7k
      } else {
155
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
8.75k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
8.75k
      }
158
80.2k
    } else {
159
80.2k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
80.2k
      if (cfg.lsb_in_token == 0) {
161
62.7k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
62.7k
      } else if (cfg.lsb_in_token == 1) {
163
8.75k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
8.75k
      } else {
165
8.75k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
8.75k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
8.75k
      }
168
80.2k
    }
169
160k
  } else if (cfg.split_exponent == 5) {
170
61.2k
    if (cfg.msb_in_token == 1) {
171
26.2k
      if (cfg.lsb_in_token == 0) {
172
8.75k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
17.4k
      } else if (cfg.lsb_in_token == 2) {
174
8.75k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
8.75k
      } else {
176
8.73k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
8.73k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
8.73k
      }
179
35.0k
    } else {
180
35.0k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
35.0k
      if (cfg.lsb_in_token == 0) {
182
8.75k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
26.2k
      } else if (cfg.lsb_in_token == 1) {
184
8.75k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
17.4k
      } else if (cfg.lsb_in_token == 2) {
186
8.75k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
8.75k
      } else {
188
8.73k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
8.73k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
8.73k
      }
191
35.0k
    }
192
66.9k
  } else if (cfg.split_exponent == 6) {
193
24.5k
    if (cfg.msb_in_token == 0) {
194
8.75k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
8.75k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
15.8k
    } else if (cfg.msb_in_token == 1) {
197
7.90k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
7.90k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
7.90k
    } else {
200
7.90k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
7.90k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
7.90k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
7.90k
    }
204
42.4k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
42.4k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
42.4k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
42.4k
    if (cfg.split_exponent == 7) {
208
8.75k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
33.6k
    } else if (cfg.split_exponent == 8) {
210
6.73k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
26.9k
    } else if (cfg.split_exponent == 9) {
212
6.73k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
20.2k
    } else if (cfg.split_exponent == 10) {
214
6.73k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
13.4k
    } else if (cfg.split_exponent == 11) {
216
6.73k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
6.73k
    } else {
218
6.73k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
6.73k
    }
220
42.4k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
449k
#endif
225
449k
}
Unexecuted instantiation: jxl::N_AVX3::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
226
227
// NOLINTNEXTLINE(google-readability-namespace-comments)
228
}  // namespace HWY_NAMESPACE
229
}  // namespace jxl
230
HWY_AFTER_NAMESPACE();
231
232
#if HWY_ONCE
233
namespace jxl {
234
235
HWY_EXPORT(EstimateTokenCost);
236
237
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
238
449k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
239
449k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
240
449k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
241
449k
}
242
243
}  // namespace jxl
244
#endif