Coverage Report

Created: 2025-11-16 07:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::Gt;
29
using hwy::HWY_NAMESPACE::IfThenElse;
30
using hwy::HWY_NAMESPACE::IfThenElseZero;
31
using hwy::HWY_NAMESPACE::Iota;
32
using hwy::HWY_NAMESPACE::LoadU;
33
using hwy::HWY_NAMESPACE::Lt;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::Or;
36
using hwy::HWY_NAMESPACE::Set;
37
using hwy::HWY_NAMESPACE::ShiftRight;
38
using hwy::HWY_NAMESPACE::Store;
39
using hwy::HWY_NAMESPACE::Sub;
40
using hwy::HWY_NAMESPACE::Zero;
41
42
template <size_t E, size_t M, size_t L>
43
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
44
625k
                               uint32_t* JXL_RESTRICT out) {
45
625k
  const HWY_FULL(uint32_t) du;
46
625k
  const HWY_FULL(float) df;
47
625k
  const auto kZero = Zero(du);
48
625k
  const auto kSplit = Set(du, 1 << E);
49
625k
  const auto kExpOffset = Set(du, 127);
50
625k
  const auto kEBOffset = Set(du, 127 + M + L);
51
625k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
625k
  const auto kMulN = Set(du, 1 << (M + L));
53
625k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
625k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
625k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
625k
  constexpr size_t kLargeShiftVal = 10;
57
625k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
625k
  auto extra_bits = kZero;
60
625k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
47.0M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
46.4M
    const auto val = LoadU(du, values + i);
63
46.4M
    const auto is_large = Gt(val, kLargeThreshold);
64
46.4M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
46.4M
    const auto not_literal = Ge(val, kSplit);
66
46.4M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
46.4M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
46.4M
    const auto l = And(val, kMaskL);
69
46.4M
    const auto exp = ShiftRight<23>(b);
70
46.4M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
46.4M
    const auto n = Sub(exp_fixed, kExpOffset);
72
46.4M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
46.4M
    const auto m = ShiftRight<23 - M - L>(b);
74
46.4M
    const auto a = Add(kBase, Mul(n, kMulN));
75
46.4M
    const auto d = And(m, kMaskM);
76
46.4M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
46.4M
    const auto c = Or(a, l);
78
46.4M
    extra_bits = Add(extra_bits, eb_fixed);
79
46.4M
    const auto t = Or(c, d);
80
46.4M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
46.4M
    Store(t_fixed, du, out + i);
82
46.4M
  }
83
625k
  if (last_full < len) {
84
538k
    const auto stop = Set(du, len);
85
538k
    const auto fence = Iota(du, last_full);
86
538k
    const auto take = Lt(fence, stop);
87
538k
    const auto val = LoadU(du, values + last_full);
88
538k
    const auto is_large = Gt(val, kLargeThreshold);
89
538k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
538k
    const auto not_literal = Ge(val, kSplit);
91
538k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
538k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
538k
    const auto l = And(val, kMaskL);
94
538k
    const auto exp = ShiftRight<23>(b);
95
538k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
538k
    const auto n = Sub(exp_fixed, kExpOffset);
97
538k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
538k
    const auto m = ShiftRight<23 - M - L>(b);
99
538k
    const auto a = Add(kBase, Mul(n, kMulN));
100
538k
    const auto d = And(m, kMaskM);
101
538k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
538k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
538k
    const auto c = Or(a, l);
104
538k
    extra_bits = Add(extra_bits, eb_masked);
105
538k
    const auto t = Or(c, d);
106
538k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
538k
    Store(t_fixed, du, out + last_full);
108
538k
  }
109
625k
  return GetLane(SumOfLanes(du, extra_bits));
110
625k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
81.1k
                               uint32_t* JXL_RESTRICT out) {
45
81.1k
  const HWY_FULL(uint32_t) du;
46
81.1k
  const HWY_FULL(float) df;
47
81.1k
  const auto kZero = Zero(du);
48
81.1k
  const auto kSplit = Set(du, 1 << E);
49
81.1k
  const auto kExpOffset = Set(du, 127);
50
81.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
81.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
81.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
81.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
81.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
81.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
81.1k
  constexpr size_t kLargeShiftVal = 10;
57
81.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
81.1k
  auto extra_bits = kZero;
60
81.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
7.56M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
7.48M
    const auto val = LoadU(du, values + i);
63
7.48M
    const auto is_large = Gt(val, kLargeThreshold);
64
7.48M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
7.48M
    const auto not_literal = Ge(val, kSplit);
66
7.48M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
7.48M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
7.48M
    const auto l = And(val, kMaskL);
69
7.48M
    const auto exp = ShiftRight<23>(b);
70
7.48M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
7.48M
    const auto n = Sub(exp_fixed, kExpOffset);
72
7.48M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
7.48M
    const auto m = ShiftRight<23 - M - L>(b);
74
7.48M
    const auto a = Add(kBase, Mul(n, kMulN));
75
7.48M
    const auto d = And(m, kMaskM);
76
7.48M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
7.48M
    const auto c = Or(a, l);
78
7.48M
    extra_bits = Add(extra_bits, eb_fixed);
79
7.48M
    const auto t = Or(c, d);
80
7.48M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
7.48M
    Store(t_fixed, du, out + i);
82
7.48M
  }
83
81.1k
  if (last_full < len) {
84
70.1k
    const auto stop = Set(du, len);
85
70.1k
    const auto fence = Iota(du, last_full);
86
70.1k
    const auto take = Lt(fence, stop);
87
70.1k
    const auto val = LoadU(du, values + last_full);
88
70.1k
    const auto is_large = Gt(val, kLargeThreshold);
89
70.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
70.1k
    const auto not_literal = Ge(val, kSplit);
91
70.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
70.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
70.1k
    const auto l = And(val, kMaskL);
94
70.1k
    const auto exp = ShiftRight<23>(b);
95
70.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
70.1k
    const auto n = Sub(exp_fixed, kExpOffset);
97
70.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
70.1k
    const auto m = ShiftRight<23 - M - L>(b);
99
70.1k
    const auto a = Add(kBase, Mul(n, kMulN));
100
70.1k
    const auto d = And(m, kMaskM);
101
70.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
70.1k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
70.1k
    const auto c = Or(a, l);
104
70.1k
    extra_bits = Add(extra_bits, eb_masked);
105
70.1k
    const auto t = Or(c, d);
106
70.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
70.1k
    Store(t_fixed, du, out + last_full);
108
70.1k
  }
109
81.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
81.1k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
79.2k
                               uint32_t* JXL_RESTRICT out) {
45
79.2k
  const HWY_FULL(uint32_t) du;
46
79.2k
  const HWY_FULL(float) df;
47
79.2k
  const auto kZero = Zero(du);
48
79.2k
  const auto kSplit = Set(du, 1 << E);
49
79.2k
  const auto kExpOffset = Set(du, 127);
50
79.2k
  const auto kEBOffset = Set(du, 127 + M + L);
51
79.2k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
79.2k
  const auto kMulN = Set(du, 1 << (M + L));
53
79.2k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
79.2k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
79.2k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
79.2k
  constexpr size_t kLargeShiftVal = 10;
57
79.2k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
79.2k
  auto extra_bits = kZero;
60
79.2k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
7.56M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
7.48M
    const auto val = LoadU(du, values + i);
63
7.48M
    const auto is_large = Gt(val, kLargeThreshold);
64
7.48M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
7.48M
    const auto not_literal = Ge(val, kSplit);
66
7.48M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
7.48M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
7.48M
    const auto l = And(val, kMaskL);
69
7.48M
    const auto exp = ShiftRight<23>(b);
70
7.48M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
7.48M
    const auto n = Sub(exp_fixed, kExpOffset);
72
7.48M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
7.48M
    const auto m = ShiftRight<23 - M - L>(b);
74
7.48M
    const auto a = Add(kBase, Mul(n, kMulN));
75
7.48M
    const auto d = And(m, kMaskM);
76
7.48M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
7.48M
    const auto c = Or(a, l);
78
7.48M
    extra_bits = Add(extra_bits, eb_fixed);
79
7.48M
    const auto t = Or(c, d);
80
7.48M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
7.48M
    Store(t_fixed, du, out + i);
82
7.48M
  }
83
79.2k
  if (last_full < len) {
84
68.3k
    const auto stop = Set(du, len);
85
68.3k
    const auto fence = Iota(du, last_full);
86
68.3k
    const auto take = Lt(fence, stop);
87
68.3k
    const auto val = LoadU(du, values + last_full);
88
68.3k
    const auto is_large = Gt(val, kLargeThreshold);
89
68.3k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
68.3k
    const auto not_literal = Ge(val, kSplit);
91
68.3k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
68.3k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
68.3k
    const auto l = And(val, kMaskL);
94
68.3k
    const auto exp = ShiftRight<23>(b);
95
68.3k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
68.3k
    const auto n = Sub(exp_fixed, kExpOffset);
97
68.3k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
68.3k
    const auto m = ShiftRight<23 - M - L>(b);
99
68.3k
    const auto a = Add(kBase, Mul(n, kMulN));
100
68.3k
    const auto d = And(m, kMaskM);
101
68.3k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
68.3k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
68.3k
    const auto c = Or(a, l);
104
68.3k
    extra_bits = Add(extra_bits, eb_masked);
105
68.3k
    const auto t = Or(c, d);
106
68.3k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
68.3k
    Store(t_fixed, du, out + last_full);
108
68.3k
  }
109
79.2k
  return GetLane(SumOfLanes(du, extra_bits));
110
79.2k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
79.2k
                               uint32_t* JXL_RESTRICT out) {
45
79.2k
  const HWY_FULL(uint32_t) du;
46
79.2k
  const HWY_FULL(float) df;
47
79.2k
  const auto kZero = Zero(du);
48
79.2k
  const auto kSplit = Set(du, 1 << E);
49
79.2k
  const auto kExpOffset = Set(du, 127);
50
79.2k
  const auto kEBOffset = Set(du, 127 + M + L);
51
79.2k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
79.2k
  const auto kMulN = Set(du, 1 << (M + L));
53
79.2k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
79.2k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
79.2k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
79.2k
  constexpr size_t kLargeShiftVal = 10;
57
79.2k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
79.2k
  auto extra_bits = kZero;
60
79.2k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
7.56M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
7.48M
    const auto val = LoadU(du, values + i);
63
7.48M
    const auto is_large = Gt(val, kLargeThreshold);
64
7.48M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
7.48M
    const auto not_literal = Ge(val, kSplit);
66
7.48M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
7.48M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
7.48M
    const auto l = And(val, kMaskL);
69
7.48M
    const auto exp = ShiftRight<23>(b);
70
7.48M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
7.48M
    const auto n = Sub(exp_fixed, kExpOffset);
72
7.48M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
7.48M
    const auto m = ShiftRight<23 - M - L>(b);
74
7.48M
    const auto a = Add(kBase, Mul(n, kMulN));
75
7.48M
    const auto d = And(m, kMaskM);
76
7.48M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
7.48M
    const auto c = Or(a, l);
78
7.48M
    extra_bits = Add(extra_bits, eb_fixed);
79
7.48M
    const auto t = Or(c, d);
80
7.48M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
7.48M
    Store(t_fixed, du, out + i);
82
7.48M
  }
83
79.2k
  if (last_full < len) {
84
68.3k
    const auto stop = Set(du, len);
85
68.3k
    const auto fence = Iota(du, last_full);
86
68.3k
    const auto take = Lt(fence, stop);
87
68.3k
    const auto val = LoadU(du, values + last_full);
88
68.3k
    const auto is_large = Gt(val, kLargeThreshold);
89
68.3k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
68.3k
    const auto not_literal = Ge(val, kSplit);
91
68.3k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
68.3k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
68.3k
    const auto l = And(val, kMaskL);
94
68.3k
    const auto exp = ShiftRight<23>(b);
95
68.3k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
68.3k
    const auto n = Sub(exp_fixed, kExpOffset);
97
68.3k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
68.3k
    const auto m = ShiftRight<23 - M - L>(b);
99
68.3k
    const auto a = Add(kBase, Mul(n, kMulN));
100
68.3k
    const auto d = And(m, kMaskM);
101
68.3k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
68.3k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
68.3k
    const auto c = Or(a, l);
104
68.3k
    extra_bits = Add(extra_bits, eb_masked);
105
68.3k
    const auto t = Or(c, d);
106
68.3k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
68.3k
    Store(t_fixed, du, out + last_full);
108
68.3k
  }
109
79.2k
  return GetLane(SumOfLanes(du, extra_bits));
110
79.2k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
79.2k
                               uint32_t* JXL_RESTRICT out) {
45
79.2k
  const HWY_FULL(uint32_t) du;
46
79.2k
  const HWY_FULL(float) df;
47
79.2k
  const auto kZero = Zero(du);
48
79.2k
  const auto kSplit = Set(du, 1 << E);
49
79.2k
  const auto kExpOffset = Set(du, 127);
50
79.2k
  const auto kEBOffset = Set(du, 127 + M + L);
51
79.2k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
79.2k
  const auto kMulN = Set(du, 1 << (M + L));
53
79.2k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
79.2k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
79.2k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
79.2k
  constexpr size_t kLargeShiftVal = 10;
57
79.2k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
79.2k
  auto extra_bits = kZero;
60
79.2k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
7.56M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
7.48M
    const auto val = LoadU(du, values + i);
63
7.48M
    const auto is_large = Gt(val, kLargeThreshold);
64
7.48M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
7.48M
    const auto not_literal = Ge(val, kSplit);
66
7.48M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
7.48M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
7.48M
    const auto l = And(val, kMaskL);
69
7.48M
    const auto exp = ShiftRight<23>(b);
70
7.48M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
7.48M
    const auto n = Sub(exp_fixed, kExpOffset);
72
7.48M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
7.48M
    const auto m = ShiftRight<23 - M - L>(b);
74
7.48M
    const auto a = Add(kBase, Mul(n, kMulN));
75
7.48M
    const auto d = And(m, kMaskM);
76
7.48M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
7.48M
    const auto c = Or(a, l);
78
7.48M
    extra_bits = Add(extra_bits, eb_fixed);
79
7.48M
    const auto t = Or(c, d);
80
7.48M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
7.48M
    Store(t_fixed, du, out + i);
82
7.48M
  }
83
79.2k
  if (last_full < len) {
84
68.3k
    const auto stop = Set(du, len);
85
68.3k
    const auto fence = Iota(du, last_full);
86
68.3k
    const auto take = Lt(fence, stop);
87
68.3k
    const auto val = LoadU(du, values + last_full);
88
68.3k
    const auto is_large = Gt(val, kLargeThreshold);
89
68.3k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
68.3k
    const auto not_literal = Ge(val, kSplit);
91
68.3k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
68.3k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
68.3k
    const auto l = And(val, kMaskL);
94
68.3k
    const auto exp = ShiftRight<23>(b);
95
68.3k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
68.3k
    const auto n = Sub(exp_fixed, kExpOffset);
97
68.3k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
68.3k
    const auto m = ShiftRight<23 - M - L>(b);
99
68.3k
    const auto a = Add(kBase, Mul(n, kMulN));
100
68.3k
    const auto d = And(m, kMaskM);
101
68.3k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
68.3k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
68.3k
    const auto c = Or(a, l);
104
68.3k
    extra_bits = Add(extra_bits, eb_masked);
105
68.3k
    const auto t = Or(c, d);
106
68.3k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
68.3k
    Store(t_fixed, du, out + last_full);
108
68.3k
  }
109
79.2k
  return GetLane(SumOfLanes(du, extra_bits));
110
79.2k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
12.1k
                               uint32_t* JXL_RESTRICT out) {
45
12.1k
  const HWY_FULL(uint32_t) du;
46
12.1k
  const HWY_FULL(float) df;
47
12.1k
  const auto kZero = Zero(du);
48
12.1k
  const auto kSplit = Set(du, 1 << E);
49
12.1k
  const auto kExpOffset = Set(du, 127);
50
12.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
12.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
12.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
12.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
12.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
12.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
12.1k
  constexpr size_t kLargeShiftVal = 10;
57
12.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
12.1k
  auto extra_bits = kZero;
60
12.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
459k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
447k
    const auto val = LoadU(du, values + i);
63
447k
    const auto is_large = Gt(val, kLargeThreshold);
64
447k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
447k
    const auto not_literal = Ge(val, kSplit);
66
447k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
447k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
447k
    const auto l = And(val, kMaskL);
69
447k
    const auto exp = ShiftRight<23>(b);
70
447k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
447k
    const auto n = Sub(exp_fixed, kExpOffset);
72
447k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
447k
    const auto m = ShiftRight<23 - M - L>(b);
74
447k
    const auto a = Add(kBase, Mul(n, kMulN));
75
447k
    const auto d = And(m, kMaskM);
76
447k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
447k
    const auto c = Or(a, l);
78
447k
    extra_bits = Add(extra_bits, eb_fixed);
79
447k
    const auto t = Or(c, d);
80
447k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
447k
    Store(t_fixed, du, out + i);
82
447k
  }
83
12.1k
  if (last_full < len) {
84
10.4k
    const auto stop = Set(du, len);
85
10.4k
    const auto fence = Iota(du, last_full);
86
10.4k
    const auto take = Lt(fence, stop);
87
10.4k
    const auto val = LoadU(du, values + last_full);
88
10.4k
    const auto is_large = Gt(val, kLargeThreshold);
89
10.4k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
10.4k
    const auto not_literal = Ge(val, kSplit);
91
10.4k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
10.4k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
10.4k
    const auto l = And(val, kMaskL);
94
10.4k
    const auto exp = ShiftRight<23>(b);
95
10.4k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
10.4k
    const auto n = Sub(exp_fixed, kExpOffset);
97
10.4k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
10.4k
    const auto m = ShiftRight<23 - M - L>(b);
99
10.4k
    const auto a = Add(kBase, Mul(n, kMulN));
100
10.4k
    const auto d = And(m, kMaskM);
101
10.4k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
10.4k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
10.4k
    const auto c = Or(a, l);
104
10.4k
    extra_bits = Add(extra_bits, eb_masked);
105
10.4k
    const auto t = Or(c, d);
106
10.4k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
10.4k
    Store(t_fixed, du, out + last_full);
108
10.4k
  }
109
12.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
12.1k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
12.1k
                               uint32_t* JXL_RESTRICT out) {
45
12.1k
  const HWY_FULL(uint32_t) du;
46
12.1k
  const HWY_FULL(float) df;
47
12.1k
  const auto kZero = Zero(du);
48
12.1k
  const auto kSplit = Set(du, 1 << E);
49
12.1k
  const auto kExpOffset = Set(du, 127);
50
12.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
12.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
12.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
12.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
12.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
12.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
12.1k
  constexpr size_t kLargeShiftVal = 10;
57
12.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
12.1k
  auto extra_bits = kZero;
60
12.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
459k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
447k
    const auto val = LoadU(du, values + i);
63
447k
    const auto is_large = Gt(val, kLargeThreshold);
64
447k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
447k
    const auto not_literal = Ge(val, kSplit);
66
447k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
447k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
447k
    const auto l = And(val, kMaskL);
69
447k
    const auto exp = ShiftRight<23>(b);
70
447k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
447k
    const auto n = Sub(exp_fixed, kExpOffset);
72
447k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
447k
    const auto m = ShiftRight<23 - M - L>(b);
74
447k
    const auto a = Add(kBase, Mul(n, kMulN));
75
447k
    const auto d = And(m, kMaskM);
76
447k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
447k
    const auto c = Or(a, l);
78
447k
    extra_bits = Add(extra_bits, eb_fixed);
79
447k
    const auto t = Or(c, d);
80
447k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
447k
    Store(t_fixed, du, out + i);
82
447k
  }
83
12.1k
  if (last_full < len) {
84
10.4k
    const auto stop = Set(du, len);
85
10.4k
    const auto fence = Iota(du, last_full);
86
10.4k
    const auto take = Lt(fence, stop);
87
10.4k
    const auto val = LoadU(du, values + last_full);
88
10.4k
    const auto is_large = Gt(val, kLargeThreshold);
89
10.4k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
10.4k
    const auto not_literal = Ge(val, kSplit);
91
10.4k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
10.4k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
10.4k
    const auto l = And(val, kMaskL);
94
10.4k
    const auto exp = ShiftRight<23>(b);
95
10.4k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
10.4k
    const auto n = Sub(exp_fixed, kExpOffset);
97
10.4k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
10.4k
    const auto m = ShiftRight<23 - M - L>(b);
99
10.4k
    const auto a = Add(kBase, Mul(n, kMulN));
100
10.4k
    const auto d = And(m, kMaskM);
101
10.4k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
10.4k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
10.4k
    const auto c = Or(a, l);
104
10.4k
    extra_bits = Add(extra_bits, eb_masked);
105
10.4k
    const auto t = Or(c, d);
106
10.4k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
10.4k
    Store(t_fixed, du, out + last_full);
108
10.4k
  }
109
12.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
12.1k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
13.4k
                               uint32_t* JXL_RESTRICT out) {
45
13.4k
  const HWY_FULL(uint32_t) du;
46
13.4k
  const HWY_FULL(float) df;
47
13.4k
  const auto kZero = Zero(du);
48
13.4k
  const auto kSplit = Set(du, 1 << E);
49
13.4k
  const auto kExpOffset = Set(du, 127);
50
13.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
13.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
13.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
13.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
13.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
13.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
13.4k
  constexpr size_t kLargeShiftVal = 10;
57
13.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
13.4k
  auto extra_bits = kZero;
60
13.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
820k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
806k
    const auto val = LoadU(du, values + i);
63
806k
    const auto is_large = Gt(val, kLargeThreshold);
64
806k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
806k
    const auto not_literal = Ge(val, kSplit);
66
806k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
806k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
806k
    const auto l = And(val, kMaskL);
69
806k
    const auto exp = ShiftRight<23>(b);
70
806k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
806k
    const auto n = Sub(exp_fixed, kExpOffset);
72
806k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
806k
    const auto m = ShiftRight<23 - M - L>(b);
74
806k
    const auto a = Add(kBase, Mul(n, kMulN));
75
806k
    const auto d = And(m, kMaskM);
76
806k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
806k
    const auto c = Or(a, l);
78
806k
    extra_bits = Add(extra_bits, eb_fixed);
79
806k
    const auto t = Or(c, d);
80
806k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
806k
    Store(t_fixed, du, out + i);
82
806k
  }
83
13.4k
  if (last_full < len) {
84
11.5k
    const auto stop = Set(du, len);
85
11.5k
    const auto fence = Iota(du, last_full);
86
11.5k
    const auto take = Lt(fence, stop);
87
11.5k
    const auto val = LoadU(du, values + last_full);
88
11.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
11.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
11.5k
    const auto not_literal = Ge(val, kSplit);
91
11.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
11.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
11.5k
    const auto l = And(val, kMaskL);
94
11.5k
    const auto exp = ShiftRight<23>(b);
95
11.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
11.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
11.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
11.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
11.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
11.5k
    const auto d = And(m, kMaskM);
101
11.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
11.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
11.5k
    const auto c = Or(a, l);
104
11.5k
    extra_bits = Add(extra_bits, eb_masked);
105
11.5k
    const auto t = Or(c, d);
106
11.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
11.5k
    Store(t_fixed, du, out + last_full);
108
11.5k
  }
109
13.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
13.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
10.5k
                               uint32_t* JXL_RESTRICT out) {
45
10.5k
  const HWY_FULL(uint32_t) du;
46
10.5k
  const HWY_FULL(float) df;
47
10.5k
  const auto kZero = Zero(du);
48
10.5k
  const auto kSplit = Set(du, 1 << E);
49
10.5k
  const auto kExpOffset = Set(du, 127);
50
10.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
10.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
10.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
10.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
10.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
10.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
10.5k
  constexpr size_t kLargeShiftVal = 10;
57
10.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
10.5k
  auto extra_bits = kZero;
60
10.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
389k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
379k
    const auto val = LoadU(du, values + i);
63
379k
    const auto is_large = Gt(val, kLargeThreshold);
64
379k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
379k
    const auto not_literal = Ge(val, kSplit);
66
379k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
379k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
379k
    const auto l = And(val, kMaskL);
69
379k
    const auto exp = ShiftRight<23>(b);
70
379k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
379k
    const auto n = Sub(exp_fixed, kExpOffset);
72
379k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
379k
    const auto m = ShiftRight<23 - M - L>(b);
74
379k
    const auto a = Add(kBase, Mul(n, kMulN));
75
379k
    const auto d = And(m, kMaskM);
76
379k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
379k
    const auto c = Or(a, l);
78
379k
    extra_bits = Add(extra_bits, eb_fixed);
79
379k
    const auto t = Or(c, d);
80
379k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
379k
    Store(t_fixed, du, out + i);
82
379k
  }
83
10.5k
  if (last_full < len) {
84
9.14k
    const auto stop = Set(du, len);
85
9.14k
    const auto fence = Iota(du, last_full);
86
9.14k
    const auto take = Lt(fence, stop);
87
9.14k
    const auto val = LoadU(du, values + last_full);
88
9.14k
    const auto is_large = Gt(val, kLargeThreshold);
89
9.14k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
9.14k
    const auto not_literal = Ge(val, kSplit);
91
9.14k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
9.14k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
9.14k
    const auto l = And(val, kMaskL);
94
9.14k
    const auto exp = ShiftRight<23>(b);
95
9.14k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
9.14k
    const auto n = Sub(exp_fixed, kExpOffset);
97
9.14k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
9.14k
    const auto m = ShiftRight<23 - M - L>(b);
99
9.14k
    const auto a = Add(kBase, Mul(n, kMulN));
100
9.14k
    const auto d = And(m, kMaskM);
101
9.14k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
9.14k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
9.14k
    const auto c = Or(a, l);
104
9.14k
    extra_bits = Add(extra_bits, eb_masked);
105
9.14k
    const auto t = Or(c, d);
106
9.14k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
9.14k
    Store(t_fixed, du, out + last_full);
108
9.14k
  }
109
10.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
10.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
10.5k
                               uint32_t* JXL_RESTRICT out) {
45
10.5k
  const HWY_FULL(uint32_t) du;
46
10.5k
  const HWY_FULL(float) df;
47
10.5k
  const auto kZero = Zero(du);
48
10.5k
  const auto kSplit = Set(du, 1 << E);
49
10.5k
  const auto kExpOffset = Set(du, 127);
50
10.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
10.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
10.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
10.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
10.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
10.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
10.5k
  constexpr size_t kLargeShiftVal = 10;
57
10.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
10.5k
  auto extra_bits = kZero;
60
10.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
389k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
379k
    const auto val = LoadU(du, values + i);
63
379k
    const auto is_large = Gt(val, kLargeThreshold);
64
379k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
379k
    const auto not_literal = Ge(val, kSplit);
66
379k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
379k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
379k
    const auto l = And(val, kMaskL);
69
379k
    const auto exp = ShiftRight<23>(b);
70
379k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
379k
    const auto n = Sub(exp_fixed, kExpOffset);
72
379k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
379k
    const auto m = ShiftRight<23 - M - L>(b);
74
379k
    const auto a = Add(kBase, Mul(n, kMulN));
75
379k
    const auto d = And(m, kMaskM);
76
379k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
379k
    const auto c = Or(a, l);
78
379k
    extra_bits = Add(extra_bits, eb_fixed);
79
379k
    const auto t = Or(c, d);
80
379k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
379k
    Store(t_fixed, du, out + i);
82
379k
  }
83
10.5k
  if (last_full < len) {
84
9.14k
    const auto stop = Set(du, len);
85
9.14k
    const auto fence = Iota(du, last_full);
86
9.14k
    const auto take = Lt(fence, stop);
87
9.14k
    const auto val = LoadU(du, values + last_full);
88
9.14k
    const auto is_large = Gt(val, kLargeThreshold);
89
9.14k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
9.14k
    const auto not_literal = Ge(val, kSplit);
91
9.14k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
9.14k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
9.14k
    const auto l = And(val, kMaskL);
94
9.14k
    const auto exp = ShiftRight<23>(b);
95
9.14k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
9.14k
    const auto n = Sub(exp_fixed, kExpOffset);
97
9.14k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
9.14k
    const auto m = ShiftRight<23 - M - L>(b);
99
9.14k
    const auto a = Add(kBase, Mul(n, kMulN));
100
9.14k
    const auto d = And(m, kMaskM);
101
9.14k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
9.14k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
9.14k
    const auto c = Or(a, l);
104
9.14k
    extra_bits = Add(extra_bits, eb_masked);
105
9.14k
    const auto t = Or(c, d);
106
9.14k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
9.14k
    Store(t_fixed, du, out + last_full);
108
9.14k
  }
109
10.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
10.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
10.5k
                               uint32_t* JXL_RESTRICT out) {
45
10.5k
  const HWY_FULL(uint32_t) du;
46
10.5k
  const HWY_FULL(float) df;
47
10.5k
  const auto kZero = Zero(du);
48
10.5k
  const auto kSplit = Set(du, 1 << E);
49
10.5k
  const auto kExpOffset = Set(du, 127);
50
10.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
10.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
10.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
10.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
10.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
10.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
10.5k
  constexpr size_t kLargeShiftVal = 10;
57
10.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
10.5k
  auto extra_bits = kZero;
60
10.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
389k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
379k
    const auto val = LoadU(du, values + i);
63
379k
    const auto is_large = Gt(val, kLargeThreshold);
64
379k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
379k
    const auto not_literal = Ge(val, kSplit);
66
379k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
379k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
379k
    const auto l = And(val, kMaskL);
69
379k
    const auto exp = ShiftRight<23>(b);
70
379k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
379k
    const auto n = Sub(exp_fixed, kExpOffset);
72
379k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
379k
    const auto m = ShiftRight<23 - M - L>(b);
74
379k
    const auto a = Add(kBase, Mul(n, kMulN));
75
379k
    const auto d = And(m, kMaskM);
76
379k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
379k
    const auto c = Or(a, l);
78
379k
    extra_bits = Add(extra_bits, eb_fixed);
79
379k
    const auto t = Or(c, d);
80
379k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
379k
    Store(t_fixed, du, out + i);
82
379k
  }
83
10.5k
  if (last_full < len) {
84
9.14k
    const auto stop = Set(du, len);
85
9.14k
    const auto fence = Iota(du, last_full);
86
9.14k
    const auto take = Lt(fence, stop);
87
9.14k
    const auto val = LoadU(du, values + last_full);
88
9.14k
    const auto is_large = Gt(val, kLargeThreshold);
89
9.14k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
9.14k
    const auto not_literal = Ge(val, kSplit);
91
9.14k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
9.14k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
9.14k
    const auto l = And(val, kMaskL);
94
9.14k
    const auto exp = ShiftRight<23>(b);
95
9.14k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
9.14k
    const auto n = Sub(exp_fixed, kExpOffset);
97
9.14k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
9.14k
    const auto m = ShiftRight<23 - M - L>(b);
99
9.14k
    const auto a = Add(kBase, Mul(n, kMulN));
100
9.14k
    const auto d = And(m, kMaskM);
101
9.14k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
9.14k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
9.14k
    const auto c = Or(a, l);
104
9.14k
    extra_bits = Add(extra_bits, eb_masked);
105
9.14k
    const auto t = Or(c, d);
106
9.14k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
9.14k
    Store(t_fixed, du, out + last_full);
108
9.14k
  }
109
10.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
10.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
10.5k
                               uint32_t* JXL_RESTRICT out) {
45
10.5k
  const HWY_FULL(uint32_t) du;
46
10.5k
  const HWY_FULL(float) df;
47
10.5k
  const auto kZero = Zero(du);
48
10.5k
  const auto kSplit = Set(du, 1 << E);
49
10.5k
  const auto kExpOffset = Set(du, 127);
50
10.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
10.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
10.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
10.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
10.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
10.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
10.5k
  constexpr size_t kLargeShiftVal = 10;
57
10.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
10.5k
  auto extra_bits = kZero;
60
10.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
389k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
379k
    const auto val = LoadU(du, values + i);
63
379k
    const auto is_large = Gt(val, kLargeThreshold);
64
379k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
379k
    const auto not_literal = Ge(val, kSplit);
66
379k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
379k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
379k
    const auto l = And(val, kMaskL);
69
379k
    const auto exp = ShiftRight<23>(b);
70
379k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
379k
    const auto n = Sub(exp_fixed, kExpOffset);
72
379k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
379k
    const auto m = ShiftRight<23 - M - L>(b);
74
379k
    const auto a = Add(kBase, Mul(n, kMulN));
75
379k
    const auto d = And(m, kMaskM);
76
379k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
379k
    const auto c = Or(a, l);
78
379k
    extra_bits = Add(extra_bits, eb_fixed);
79
379k
    const auto t = Or(c, d);
80
379k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
379k
    Store(t_fixed, du, out + i);
82
379k
  }
83
10.5k
  if (last_full < len) {
84
9.14k
    const auto stop = Set(du, len);
85
9.14k
    const auto fence = Iota(du, last_full);
86
9.14k
    const auto take = Lt(fence, stop);
87
9.14k
    const auto val = LoadU(du, values + last_full);
88
9.14k
    const auto is_large = Gt(val, kLargeThreshold);
89
9.14k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
9.14k
    const auto not_literal = Ge(val, kSplit);
91
9.14k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
9.14k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
9.14k
    const auto l = And(val, kMaskL);
94
9.14k
    const auto exp = ShiftRight<23>(b);
95
9.14k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
9.14k
    const auto n = Sub(exp_fixed, kExpOffset);
97
9.14k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
9.14k
    const auto m = ShiftRight<23 - M - L>(b);
99
9.14k
    const auto a = Add(kBase, Mul(n, kMulN));
100
9.14k
    const auto d = And(m, kMaskM);
101
9.14k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
9.14k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
9.14k
    const auto c = Or(a, l);
104
9.14k
    extra_bits = Add(extra_bits, eb_masked);
105
9.14k
    const auto t = Or(c, d);
106
9.14k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
9.14k
    Store(t_fixed, du, out + last_full);
108
9.14k
  }
109
10.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
10.5k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
10.5k
                               uint32_t* JXL_RESTRICT out) {
45
10.5k
  const HWY_FULL(uint32_t) du;
46
10.5k
  const HWY_FULL(float) df;
47
10.5k
  const auto kZero = Zero(du);
48
10.5k
  const auto kSplit = Set(du, 1 << E);
49
10.5k
  const auto kExpOffset = Set(du, 127);
50
10.5k
  const auto kEBOffset = Set(du, 127 + M + L);
51
10.5k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
10.5k
  const auto kMulN = Set(du, 1 << (M + L));
53
10.5k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
10.5k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
10.5k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
10.5k
  constexpr size_t kLargeShiftVal = 10;
57
10.5k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
10.5k
  auto extra_bits = kZero;
60
10.5k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
389k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
379k
    const auto val = LoadU(du, values + i);
63
379k
    const auto is_large = Gt(val, kLargeThreshold);
64
379k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
379k
    const auto not_literal = Ge(val, kSplit);
66
379k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
379k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
379k
    const auto l = And(val, kMaskL);
69
379k
    const auto exp = ShiftRight<23>(b);
70
379k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
379k
    const auto n = Sub(exp_fixed, kExpOffset);
72
379k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
379k
    const auto m = ShiftRight<23 - M - L>(b);
74
379k
    const auto a = Add(kBase, Mul(n, kMulN));
75
379k
    const auto d = And(m, kMaskM);
76
379k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
379k
    const auto c = Or(a, l);
78
379k
    extra_bits = Add(extra_bits, eb_fixed);
79
379k
    const auto t = Or(c, d);
80
379k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
379k
    Store(t_fixed, du, out + i);
82
379k
  }
83
10.5k
  if (last_full < len) {
84
9.14k
    const auto stop = Set(du, len);
85
9.14k
    const auto fence = Iota(du, last_full);
86
9.14k
    const auto take = Lt(fence, stop);
87
9.14k
    const auto val = LoadU(du, values + last_full);
88
9.14k
    const auto is_large = Gt(val, kLargeThreshold);
89
9.14k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
9.14k
    const auto not_literal = Ge(val, kSplit);
91
9.14k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
9.14k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
9.14k
    const auto l = And(val, kMaskL);
94
9.14k
    const auto exp = ShiftRight<23>(b);
95
9.14k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
9.14k
    const auto n = Sub(exp_fixed, kExpOffset);
97
9.14k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
9.14k
    const auto m = ShiftRight<23 - M - L>(b);
99
9.14k
    const auto a = Add(kBase, Mul(n, kMulN));
100
9.14k
    const auto d = And(m, kMaskM);
101
9.14k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
9.14k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
9.14k
    const auto c = Or(a, l);
104
9.14k
    extra_bits = Add(extra_bits, eb_masked);
105
9.14k
    const auto t = Or(c, d);
106
9.14k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
9.14k
    Store(t_fixed, du, out + last_full);
108
9.14k
  }
109
10.5k
  return GetLane(SumOfLanes(du, extra_bits));
110
10.5k
}
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
111
112
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
113
625k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
625k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
625k
  if (cfg.split_exponent == 0) {
127
81.1k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
544k
  } else if (cfg.split_exponent == 2) {
129
79.2k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
79.2k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
464k
  } else if (cfg.split_exponent == 3) {
132
53.9k
    if (cfg.msb_in_token == 1) {
133
26.9k
      if (cfg.lsb_in_token == 0) {
134
13.4k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
13.4k
      } else {
136
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
13.4k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
13.4k
      }
139
26.9k
    } else {
140
26.9k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
26.9k
      if (cfg.lsb_in_token == 0) {
142
13.4k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
13.4k
      } else {
144
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
13.4k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
13.4k
      }
147
26.9k
    }
148
410k
  } else if (cfg.split_exponent == 4) {
149
212k
    if (cfg.msb_in_token == 1) {
150
106k
      if (cfg.lsb_in_token == 0) {
151
13.4k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
92.7k
      } else if (cfg.lsb_in_token == 2) {
153
79.2k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
79.2k
      } else {
155
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
13.4k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
13.4k
      }
158
106k
    } else {
159
106k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
106k
      if (cfg.lsb_in_token == 0) {
161
79.2k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
79.2k
      } else if (cfg.lsb_in_token == 1) {
163
13.4k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
13.4k
      } else {
165
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
13.4k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
13.4k
      }
168
106k
    }
169
212k
  } else if (cfg.split_exponent == 5) {
170
94.3k
    if (cfg.msb_in_token == 1) {
171
40.4k
      if (cfg.lsb_in_token == 0) {
172
13.4k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
26.9k
      } else if (cfg.lsb_in_token == 2) {
174
13.4k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
13.4k
      } else {
176
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
13.4k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
13.4k
      }
179
53.8k
    } else {
180
53.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
53.8k
      if (cfg.lsb_in_token == 0) {
182
13.4k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
40.4k
      } else if (cfg.lsb_in_token == 1) {
184
13.4k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
26.9k
      } else if (cfg.lsb_in_token == 2) {
186
13.4k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
13.4k
      } else {
188
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
13.4k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
13.4k
      }
191
53.8k
    }
192
104k
  } else if (cfg.split_exponent == 6) {
193
37.8k
    if (cfg.msb_in_token == 0) {
194
13.4k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
13.4k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
24.3k
    } else if (cfg.msb_in_token == 1) {
197
12.1k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
12.1k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
12.1k
    } else {
200
12.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
12.1k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
12.1k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
12.1k
    }
204
66.4k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
66.4k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
66.4k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
66.4k
    if (cfg.split_exponent == 7) {
208
13.4k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
52.9k
    } else if (cfg.split_exponent == 8) {
210
10.5k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
42.3k
    } else if (cfg.split_exponent == 9) {
212
10.5k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
31.7k
    } else if (cfg.split_exponent == 10) {
214
10.5k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
21.1k
    } else if (cfg.split_exponent == 11) {
216
10.5k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
10.5k
    } else {
218
10.5k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
10.5k
    }
220
66.4k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
625k
#endif
225
625k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
113
625k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
625k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
625k
  if (cfg.split_exponent == 0) {
127
81.1k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
544k
  } else if (cfg.split_exponent == 2) {
129
79.2k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
79.2k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
464k
  } else if (cfg.split_exponent == 3) {
132
53.9k
    if (cfg.msb_in_token == 1) {
133
26.9k
      if (cfg.lsb_in_token == 0) {
134
13.4k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
13.4k
      } else {
136
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
13.4k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
13.4k
      }
139
26.9k
    } else {
140
26.9k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
26.9k
      if (cfg.lsb_in_token == 0) {
142
13.4k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
13.4k
      } else {
144
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
13.4k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
13.4k
      }
147
26.9k
    }
148
410k
  } else if (cfg.split_exponent == 4) {
149
212k
    if (cfg.msb_in_token == 1) {
150
106k
      if (cfg.lsb_in_token == 0) {
151
13.4k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
92.7k
      } else if (cfg.lsb_in_token == 2) {
153
79.2k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
79.2k
      } else {
155
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
13.4k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
13.4k
      }
158
106k
    } else {
159
106k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
106k
      if (cfg.lsb_in_token == 0) {
161
79.2k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
79.2k
      } else if (cfg.lsb_in_token == 1) {
163
13.4k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
13.4k
      } else {
165
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
13.4k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
13.4k
      }
168
106k
    }
169
212k
  } else if (cfg.split_exponent == 5) {
170
94.3k
    if (cfg.msb_in_token == 1) {
171
40.4k
      if (cfg.lsb_in_token == 0) {
172
13.4k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
26.9k
      } else if (cfg.lsb_in_token == 2) {
174
13.4k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
13.4k
      } else {
176
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
13.4k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
13.4k
      }
179
53.8k
    } else {
180
53.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
53.8k
      if (cfg.lsb_in_token == 0) {
182
13.4k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
40.4k
      } else if (cfg.lsb_in_token == 1) {
184
13.4k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
26.9k
      } else if (cfg.lsb_in_token == 2) {
186
13.4k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
13.4k
      } else {
188
13.4k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
13.4k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
13.4k
      }
191
53.8k
    }
192
104k
  } else if (cfg.split_exponent == 6) {
193
37.8k
    if (cfg.msb_in_token == 0) {
194
13.4k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
13.4k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
24.3k
    } else if (cfg.msb_in_token == 1) {
197
12.1k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
12.1k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
12.1k
    } else {
200
12.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
12.1k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
12.1k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
12.1k
    }
204
66.4k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
66.4k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
66.4k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
66.4k
    if (cfg.split_exponent == 7) {
208
13.4k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
52.9k
    } else if (cfg.split_exponent == 8) {
210
10.5k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
42.3k
    } else if (cfg.split_exponent == 9) {
212
10.5k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
31.7k
    } else if (cfg.split_exponent == 10) {
214
10.5k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
21.1k
    } else if (cfg.split_exponent == 11) {
216
10.5k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
10.5k
    } else {
218
10.5k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
10.5k
    }
220
66.4k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
625k
#endif
225
625k
}
Unexecuted instantiation: jxl::N_AVX3::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
226
227
// NOLINTNEXTLINE(google-readability-namespace-comments)
228
}  // namespace HWY_NAMESPACE
229
}  // namespace jxl
230
HWY_AFTER_NAMESPACE();
231
232
#if HWY_ONCE
233
namespace jxl {
234
235
HWY_EXPORT(EstimateTokenCost);
236
237
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
238
625k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
239
625k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
240
625k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
241
625k
}
242
243
}  // namespace jxl
244
#endif