Coverage Report

Created: 2025-07-23 08:18

/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::IfThenElse;
29
using hwy::HWY_NAMESPACE::IfThenElseZero;
30
using hwy::HWY_NAMESPACE::Iota;
31
using hwy::HWY_NAMESPACE::LoadU;
32
using hwy::HWY_NAMESPACE::Lt;
33
using hwy::HWY_NAMESPACE::Mul;
34
using hwy::HWY_NAMESPACE::Or;
35
using hwy::HWY_NAMESPACE::Set;
36
using hwy::HWY_NAMESPACE::ShiftRight;
37
using hwy::HWY_NAMESPACE::Store;
38
using hwy::HWY_NAMESPACE::Sub;
39
using hwy::HWY_NAMESPACE::Zero;
40
41
template <size_t E, size_t M, size_t L>
42
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
43
404k
                               uint32_t* JXL_RESTRICT out) {
44
404k
  const HWY_FULL(uint32_t) du;
45
404k
  const HWY_FULL(float) df;
46
404k
  const auto kZero = Zero(du);
47
404k
  const auto kSplit = Set(du, 1 << E);
48
404k
  const auto kExpOffset = Set(du, 127);
49
404k
  const auto kEBOffset = Set(du, 127 + M + L);
50
404k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
404k
  const auto kMulN = Set(du, 1 << (M + L));
52
404k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
404k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
404k
  auto extra_bits = kZero;
56
404k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
33.6M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
33.2M
    const auto val = LoadU(du, values + i);
59
33.2M
    const auto not_literal = Ge(val, kSplit);
60
33.2M
    const auto b = BitCast(du, ConvertTo(df, val));
61
33.2M
    const auto l = And(val, kMaskL);
62
33.2M
    const auto exp = ShiftRight<23>(b);
63
33.2M
    const auto n = Sub(exp, kExpOffset);
64
33.2M
    const auto eb = Sub(exp, kEBOffset);
65
33.2M
    const auto m = ShiftRight<23 - M - L>(b);
66
33.2M
    const auto a = Add(kBase, Mul(n, kMulN));
67
33.2M
    const auto d = And(m, kMaskM);
68
33.2M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
33.2M
    const auto c = Or(a, l);
70
33.2M
    extra_bits = Add(extra_bits, eb_fixed);
71
33.2M
    const auto t = Or(c, d);
72
33.2M
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
33.2M
    Store(t_fixed, du, out + i);
74
33.2M
  }
75
404k
  if (last_full < len) {
76
346k
    const auto stop = Set(du, len);
77
346k
    const auto fence = Iota(du, last_full);
78
346k
    const auto take = Lt(fence, stop);
79
346k
    const auto val = LoadU(du, values + last_full);
80
346k
    const auto not_literal = Ge(val, kSplit);
81
346k
    const auto b = BitCast(du, ConvertTo(df, val));
82
346k
    const auto l = And(val, kMaskL);
83
346k
    const auto exp = ShiftRight<23>(b);
84
346k
    const auto n = Sub(exp, kExpOffset);
85
346k
    const auto eb = Sub(exp, kEBOffset);
86
346k
    const auto m = ShiftRight<23 - M - L>(b);
87
346k
    const auto a = Add(kBase, Mul(n, kMulN));
88
346k
    const auto d = And(m, kMaskM);
89
346k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
346k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
346k
    const auto c = Or(a, l);
92
346k
    extra_bits = Add(extra_bits, eb_masked);
93
346k
    const auto t = Or(c, d);
94
346k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
346k
    Store(t_fixed, du, out + last_full);
96
346k
  }
97
404k
  return GetLane(SumOfLanes(du, extra_bits));
98
404k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
57.0k
                               uint32_t* JXL_RESTRICT out) {
44
57.0k
  const HWY_FULL(uint32_t) du;
45
57.0k
  const HWY_FULL(float) df;
46
57.0k
  const auto kZero = Zero(du);
47
57.0k
  const auto kSplit = Set(du, 1 << E);
48
57.0k
  const auto kExpOffset = Set(du, 127);
49
57.0k
  const auto kEBOffset = Set(du, 127 + M + L);
50
57.0k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
57.0k
  const auto kMulN = Set(du, 1 << (M + L));
52
57.0k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
57.0k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
57.0k
  auto extra_bits = kZero;
56
57.0k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
5.49M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
5.43M
    const auto val = LoadU(du, values + i);
59
5.43M
    const auto not_literal = Ge(val, kSplit);
60
5.43M
    const auto b = BitCast(du, ConvertTo(df, val));
61
5.43M
    const auto l = And(val, kMaskL);
62
5.43M
    const auto exp = ShiftRight<23>(b);
63
5.43M
    const auto n = Sub(exp, kExpOffset);
64
5.43M
    const auto eb = Sub(exp, kEBOffset);
65
5.43M
    const auto m = ShiftRight<23 - M - L>(b);
66
5.43M
    const auto a = Add(kBase, Mul(n, kMulN));
67
5.43M
    const auto d = And(m, kMaskM);
68
5.43M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
5.43M
    const auto c = Or(a, l);
70
5.43M
    extra_bits = Add(extra_bits, eb_fixed);
71
5.43M
    const auto t = Or(c, d);
72
5.43M
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
5.43M
    Store(t_fixed, du, out + i);
74
5.43M
  }
75
57.0k
  if (last_full < len) {
76
49.3k
    const auto stop = Set(du, len);
77
49.3k
    const auto fence = Iota(du, last_full);
78
49.3k
    const auto take = Lt(fence, stop);
79
49.3k
    const auto val = LoadU(du, values + last_full);
80
49.3k
    const auto not_literal = Ge(val, kSplit);
81
49.3k
    const auto b = BitCast(du, ConvertTo(df, val));
82
49.3k
    const auto l = And(val, kMaskL);
83
49.3k
    const auto exp = ShiftRight<23>(b);
84
49.3k
    const auto n = Sub(exp, kExpOffset);
85
49.3k
    const auto eb = Sub(exp, kEBOffset);
86
49.3k
    const auto m = ShiftRight<23 - M - L>(b);
87
49.3k
    const auto a = Add(kBase, Mul(n, kMulN));
88
49.3k
    const auto d = And(m, kMaskM);
89
49.3k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
49.3k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
49.3k
    const auto c = Or(a, l);
92
49.3k
    extra_bits = Add(extra_bits, eb_masked);
93
49.3k
    const auto t = Or(c, d);
94
49.3k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
49.3k
    Store(t_fixed, du, out + last_full);
96
49.3k
  }
97
57.0k
  return GetLane(SumOfLanes(du, extra_bits));
98
57.0k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
56.4k
                               uint32_t* JXL_RESTRICT out) {
44
56.4k
  const HWY_FULL(uint32_t) du;
45
56.4k
  const HWY_FULL(float) df;
46
56.4k
  const auto kZero = Zero(du);
47
56.4k
  const auto kSplit = Set(du, 1 << E);
48
56.4k
  const auto kExpOffset = Set(du, 127);
49
56.4k
  const auto kEBOffset = Set(du, 127 + M + L);
50
56.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
56.4k
  const auto kMulN = Set(du, 1 << (M + L));
52
56.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
56.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
56.4k
  auto extra_bits = kZero;
56
56.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
5.48M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
5.43M
    const auto val = LoadU(du, values + i);
59
5.43M
    const auto not_literal = Ge(val, kSplit);
60
5.43M
    const auto b = BitCast(du, ConvertTo(df, val));
61
5.43M
    const auto l = And(val, kMaskL);
62
5.43M
    const auto exp = ShiftRight<23>(b);
63
5.43M
    const auto n = Sub(exp, kExpOffset);
64
5.43M
    const auto eb = Sub(exp, kEBOffset);
65
5.43M
    const auto m = ShiftRight<23 - M - L>(b);
66
5.43M
    const auto a = Add(kBase, Mul(n, kMulN));
67
5.43M
    const auto d = And(m, kMaskM);
68
5.43M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
5.43M
    const auto c = Or(a, l);
70
5.43M
    extra_bits = Add(extra_bits, eb_fixed);
71
5.43M
    const auto t = Or(c, d);
72
5.43M
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
5.43M
    Store(t_fixed, du, out + i);
74
5.43M
  }
75
56.4k
  if (last_full < len) {
76
48.7k
    const auto stop = Set(du, len);
77
48.7k
    const auto fence = Iota(du, last_full);
78
48.7k
    const auto take = Lt(fence, stop);
79
48.7k
    const auto val = LoadU(du, values + last_full);
80
48.7k
    const auto not_literal = Ge(val, kSplit);
81
48.7k
    const auto b = BitCast(du, ConvertTo(df, val));
82
48.7k
    const auto l = And(val, kMaskL);
83
48.7k
    const auto exp = ShiftRight<23>(b);
84
48.7k
    const auto n = Sub(exp, kExpOffset);
85
48.7k
    const auto eb = Sub(exp, kEBOffset);
86
48.7k
    const auto m = ShiftRight<23 - M - L>(b);
87
48.7k
    const auto a = Add(kBase, Mul(n, kMulN));
88
48.7k
    const auto d = And(m, kMaskM);
89
48.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
48.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
48.7k
    const auto c = Or(a, l);
92
48.7k
    extra_bits = Add(extra_bits, eb_masked);
93
48.7k
    const auto t = Or(c, d);
94
48.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
48.7k
    Store(t_fixed, du, out + last_full);
96
48.7k
  }
97
56.4k
  return GetLane(SumOfLanes(du, extra_bits));
98
56.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
56.4k
                               uint32_t* JXL_RESTRICT out) {
44
56.4k
  const HWY_FULL(uint32_t) du;
45
56.4k
  const HWY_FULL(float) df;
46
56.4k
  const auto kZero = Zero(du);
47
56.4k
  const auto kSplit = Set(du, 1 << E);
48
56.4k
  const auto kExpOffset = Set(du, 127);
49
56.4k
  const auto kEBOffset = Set(du, 127 + M + L);
50
56.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
56.4k
  const auto kMulN = Set(du, 1 << (M + L));
52
56.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
56.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
56.4k
  auto extra_bits = kZero;
56
56.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
5.48M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
5.43M
    const auto val = LoadU(du, values + i);
59
5.43M
    const auto not_literal = Ge(val, kSplit);
60
5.43M
    const auto b = BitCast(du, ConvertTo(df, val));
61
5.43M
    const auto l = And(val, kMaskL);
62
5.43M
    const auto exp = ShiftRight<23>(b);
63
5.43M
    const auto n = Sub(exp, kExpOffset);
64
5.43M
    const auto eb = Sub(exp, kEBOffset);
65
5.43M
    const auto m = ShiftRight<23 - M - L>(b);
66
5.43M
    const auto a = Add(kBase, Mul(n, kMulN));
67
5.43M
    const auto d = And(m, kMaskM);
68
5.43M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
5.43M
    const auto c = Or(a, l);
70
5.43M
    extra_bits = Add(extra_bits, eb_fixed);
71
5.43M
    const auto t = Or(c, d);
72
5.43M
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
5.43M
    Store(t_fixed, du, out + i);
74
5.43M
  }
75
56.4k
  if (last_full < len) {
76
48.7k
    const auto stop = Set(du, len);
77
48.7k
    const auto fence = Iota(du, last_full);
78
48.7k
    const auto take = Lt(fence, stop);
79
48.7k
    const auto val = LoadU(du, values + last_full);
80
48.7k
    const auto not_literal = Ge(val, kSplit);
81
48.7k
    const auto b = BitCast(du, ConvertTo(df, val));
82
48.7k
    const auto l = And(val, kMaskL);
83
48.7k
    const auto exp = ShiftRight<23>(b);
84
48.7k
    const auto n = Sub(exp, kExpOffset);
85
48.7k
    const auto eb = Sub(exp, kEBOffset);
86
48.7k
    const auto m = ShiftRight<23 - M - L>(b);
87
48.7k
    const auto a = Add(kBase, Mul(n, kMulN));
88
48.7k
    const auto d = And(m, kMaskM);
89
48.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
48.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
48.7k
    const auto c = Or(a, l);
92
48.7k
    extra_bits = Add(extra_bits, eb_masked);
93
48.7k
    const auto t = Or(c, d);
94
48.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
48.7k
    Store(t_fixed, du, out + last_full);
96
48.7k
  }
97
56.4k
  return GetLane(SumOfLanes(du, extra_bits));
98
56.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
56.4k
                               uint32_t* JXL_RESTRICT out) {
44
56.4k
  const HWY_FULL(uint32_t) du;
45
56.4k
  const HWY_FULL(float) df;
46
56.4k
  const auto kZero = Zero(du);
47
56.4k
  const auto kSplit = Set(du, 1 << E);
48
56.4k
  const auto kExpOffset = Set(du, 127);
49
56.4k
  const auto kEBOffset = Set(du, 127 + M + L);
50
56.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
56.4k
  const auto kMulN = Set(du, 1 << (M + L));
52
56.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
56.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
56.4k
  auto extra_bits = kZero;
56
56.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
5.48M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
5.43M
    const auto val = LoadU(du, values + i);
59
5.43M
    const auto not_literal = Ge(val, kSplit);
60
5.43M
    const auto b = BitCast(du, ConvertTo(df, val));
61
5.43M
    const auto l = And(val, kMaskL);
62
5.43M
    const auto exp = ShiftRight<23>(b);
63
5.43M
    const auto n = Sub(exp, kExpOffset);
64
5.43M
    const auto eb = Sub(exp, kEBOffset);
65
5.43M
    const auto m = ShiftRight<23 - M - L>(b);
66
5.43M
    const auto a = Add(kBase, Mul(n, kMulN));
67
5.43M
    const auto d = And(m, kMaskM);
68
5.43M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
5.43M
    const auto c = Or(a, l);
70
5.43M
    extra_bits = Add(extra_bits, eb_fixed);
71
5.43M
    const auto t = Or(c, d);
72
5.43M
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
5.43M
    Store(t_fixed, du, out + i);
74
5.43M
  }
75
56.4k
  if (last_full < len) {
76
48.7k
    const auto stop = Set(du, len);
77
48.7k
    const auto fence = Iota(du, last_full);
78
48.7k
    const auto take = Lt(fence, stop);
79
48.7k
    const auto val = LoadU(du, values + last_full);
80
48.7k
    const auto not_literal = Ge(val, kSplit);
81
48.7k
    const auto b = BitCast(du, ConvertTo(df, val));
82
48.7k
    const auto l = And(val, kMaskL);
83
48.7k
    const auto exp = ShiftRight<23>(b);
84
48.7k
    const auto n = Sub(exp, kExpOffset);
85
48.7k
    const auto eb = Sub(exp, kEBOffset);
86
48.7k
    const auto m = ShiftRight<23 - M - L>(b);
87
48.7k
    const auto a = Add(kBase, Mul(n, kMulN));
88
48.7k
    const auto d = And(m, kMaskM);
89
48.7k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
48.7k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
48.7k
    const auto c = Or(a, l);
92
48.7k
    extra_bits = Add(extra_bits, eb_masked);
93
48.7k
    const auto t = Or(c, d);
94
48.7k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
48.7k
    Store(t_fixed, du, out + last_full);
96
48.7k
  }
97
56.4k
  return GetLane(SumOfLanes(du, extra_bits));
98
56.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.85k
                               uint32_t* JXL_RESTRICT out) {
44
7.85k
  const HWY_FULL(uint32_t) du;
45
7.85k
  const HWY_FULL(float) df;
46
7.85k
  const auto kZero = Zero(du);
47
7.85k
  const auto kSplit = Set(du, 1 << E);
48
7.85k
  const auto kExpOffset = Set(du, 127);
49
7.85k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.85k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.85k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.85k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.85k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.85k
  auto extra_bits = kZero;
56
7.85k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.85k
  if (last_full < len) {
76
6.62k
    const auto stop = Set(du, len);
77
6.62k
    const auto fence = Iota(du, last_full);
78
6.62k
    const auto take = Lt(fence, stop);
79
6.62k
    const auto val = LoadU(du, values + last_full);
80
6.62k
    const auto not_literal = Ge(val, kSplit);
81
6.62k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.62k
    const auto l = And(val, kMaskL);
83
6.62k
    const auto exp = ShiftRight<23>(b);
84
6.62k
    const auto n = Sub(exp, kExpOffset);
85
6.62k
    const auto eb = Sub(exp, kEBOffset);
86
6.62k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.62k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.62k
    const auto d = And(m, kMaskM);
89
6.62k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.62k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.62k
    const auto c = Or(a, l);
92
6.62k
    extra_bits = Add(extra_bits, eb_masked);
93
6.62k
    const auto t = Or(c, d);
94
6.62k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.62k
    Store(t_fixed, du, out + last_full);
96
6.62k
  }
97
7.85k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.85k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.85k
                               uint32_t* JXL_RESTRICT out) {
44
7.85k
  const HWY_FULL(uint32_t) du;
45
7.85k
  const HWY_FULL(float) df;
46
7.85k
  const auto kZero = Zero(du);
47
7.85k
  const auto kSplit = Set(du, 1 << E);
48
7.85k
  const auto kExpOffset = Set(du, 127);
49
7.85k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.85k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.85k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.85k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.85k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.85k
  auto extra_bits = kZero;
56
7.85k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.85k
  if (last_full < len) {
76
6.62k
    const auto stop = Set(du, len);
77
6.62k
    const auto fence = Iota(du, last_full);
78
6.62k
    const auto take = Lt(fence, stop);
79
6.62k
    const auto val = LoadU(du, values + last_full);
80
6.62k
    const auto not_literal = Ge(val, kSplit);
81
6.62k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.62k
    const auto l = And(val, kMaskL);
83
6.62k
    const auto exp = ShiftRight<23>(b);
84
6.62k
    const auto n = Sub(exp, kExpOffset);
85
6.62k
    const auto eb = Sub(exp, kEBOffset);
86
6.62k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.62k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.62k
    const auto d = And(m, kMaskM);
89
6.62k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.62k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.62k
    const auto c = Or(a, l);
92
6.62k
    extra_bits = Add(extra_bits, eb_masked);
93
6.62k
    const auto t = Or(c, d);
94
6.62k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.62k
    Store(t_fixed, du, out + last_full);
96
6.62k
  }
97
7.85k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.85k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.11k
                               uint32_t* JXL_RESTRICT out) {
44
7.11k
  const HWY_FULL(uint32_t) du;
45
7.11k
  const HWY_FULL(float) df;
46
7.11k
  const auto kZero = Zero(du);
47
7.11k
  const auto kSplit = Set(du, 1 << E);
48
7.11k
  const auto kExpOffset = Set(du, 127);
49
7.11k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.11k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.11k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.11k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.11k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.11k
  auto extra_bits = kZero;
56
7.11k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
302k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
295k
    const auto val = LoadU(du, values + i);
59
295k
    const auto not_literal = Ge(val, kSplit);
60
295k
    const auto b = BitCast(du, ConvertTo(df, val));
61
295k
    const auto l = And(val, kMaskL);
62
295k
    const auto exp = ShiftRight<23>(b);
63
295k
    const auto n = Sub(exp, kExpOffset);
64
295k
    const auto eb = Sub(exp, kEBOffset);
65
295k
    const auto m = ShiftRight<23 - M - L>(b);
66
295k
    const auto a = Add(kBase, Mul(n, kMulN));
67
295k
    const auto d = And(m, kMaskM);
68
295k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
295k
    const auto c = Or(a, l);
70
295k
    extra_bits = Add(extra_bits, eb_fixed);
71
295k
    const auto t = Or(c, d);
72
295k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
295k
    Store(t_fixed, du, out + i);
74
295k
  }
75
7.11k
  if (last_full < len) {
76
6.02k
    const auto stop = Set(du, len);
77
6.02k
    const auto fence = Iota(du, last_full);
78
6.02k
    const auto take = Lt(fence, stop);
79
6.02k
    const auto val = LoadU(du, values + last_full);
80
6.02k
    const auto not_literal = Ge(val, kSplit);
81
6.02k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.02k
    const auto l = And(val, kMaskL);
83
6.02k
    const auto exp = ShiftRight<23>(b);
84
6.02k
    const auto n = Sub(exp, kExpOffset);
85
6.02k
    const auto eb = Sub(exp, kEBOffset);
86
6.02k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.02k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.02k
    const auto d = And(m, kMaskM);
89
6.02k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.02k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.02k
    const auto c = Or(a, l);
92
6.02k
    extra_bits = Add(extra_bits, eb_masked);
93
6.02k
    const auto t = Or(c, d);
94
6.02k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.02k
    Store(t_fixed, du, out + last_full);
96
6.02k
  }
97
7.11k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.11k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.11k
                               uint32_t* JXL_RESTRICT out) {
44
7.11k
  const HWY_FULL(uint32_t) du;
45
7.11k
  const HWY_FULL(float) df;
46
7.11k
  const auto kZero = Zero(du);
47
7.11k
  const auto kSplit = Set(du, 1 << E);
48
7.11k
  const auto kExpOffset = Set(du, 127);
49
7.11k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.11k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.11k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.11k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.11k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.11k
  auto extra_bits = kZero;
56
7.11k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
302k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
295k
    const auto val = LoadU(du, values + i);
59
295k
    const auto not_literal = Ge(val, kSplit);
60
295k
    const auto b = BitCast(du, ConvertTo(df, val));
61
295k
    const auto l = And(val, kMaskL);
62
295k
    const auto exp = ShiftRight<23>(b);
63
295k
    const auto n = Sub(exp, kExpOffset);
64
295k
    const auto eb = Sub(exp, kEBOffset);
65
295k
    const auto m = ShiftRight<23 - M - L>(b);
66
295k
    const auto a = Add(kBase, Mul(n, kMulN));
67
295k
    const auto d = And(m, kMaskM);
68
295k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
295k
    const auto c = Or(a, l);
70
295k
    extra_bits = Add(extra_bits, eb_fixed);
71
295k
    const auto t = Or(c, d);
72
295k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
295k
    Store(t_fixed, du, out + i);
74
295k
  }
75
7.11k
  if (last_full < len) {
76
6.02k
    const auto stop = Set(du, len);
77
6.02k
    const auto fence = Iota(du, last_full);
78
6.02k
    const auto take = Lt(fence, stop);
79
6.02k
    const auto val = LoadU(du, values + last_full);
80
6.02k
    const auto not_literal = Ge(val, kSplit);
81
6.02k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.02k
    const auto l = And(val, kMaskL);
83
6.02k
    const auto exp = ShiftRight<23>(b);
84
6.02k
    const auto n = Sub(exp, kExpOffset);
85
6.02k
    const auto eb = Sub(exp, kEBOffset);
86
6.02k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.02k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.02k
    const auto d = And(m, kMaskM);
89
6.02k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.02k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.02k
    const auto c = Or(a, l);
92
6.02k
    extra_bits = Add(extra_bits, eb_masked);
93
6.02k
    const auto t = Or(c, d);
94
6.02k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.02k
    Store(t_fixed, du, out + last_full);
96
6.02k
  }
97
7.11k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.11k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
7.86k
                               uint32_t* JXL_RESTRICT out) {
44
7.86k
  const HWY_FULL(uint32_t) du;
45
7.86k
  const HWY_FULL(float) df;
46
7.86k
  const auto kZero = Zero(du);
47
7.86k
  const auto kSplit = Set(du, 1 << E);
48
7.86k
  const auto kExpOffset = Set(du, 127);
49
7.86k
  const auto kEBOffset = Set(du, 127 + M + L);
50
7.86k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
7.86k
  const auto kMulN = Set(du, 1 << (M + L));
52
7.86k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
7.86k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
7.86k
  auto extra_bits = kZero;
56
7.86k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
575k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
567k
    const auto val = LoadU(du, values + i);
59
567k
    const auto not_literal = Ge(val, kSplit);
60
567k
    const auto b = BitCast(du, ConvertTo(df, val));
61
567k
    const auto l = And(val, kMaskL);
62
567k
    const auto exp = ShiftRight<23>(b);
63
567k
    const auto n = Sub(exp, kExpOffset);
64
567k
    const auto eb = Sub(exp, kEBOffset);
65
567k
    const auto m = ShiftRight<23 - M - L>(b);
66
567k
    const auto a = Add(kBase, Mul(n, kMulN));
67
567k
    const auto d = And(m, kMaskM);
68
567k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
567k
    const auto c = Or(a, l);
70
567k
    extra_bits = Add(extra_bits, eb_fixed);
71
567k
    const auto t = Or(c, d);
72
567k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
567k
    Store(t_fixed, du, out + i);
74
567k
  }
75
7.86k
  if (last_full < len) {
76
6.63k
    const auto stop = Set(du, len);
77
6.63k
    const auto fence = Iota(du, last_full);
78
6.63k
    const auto take = Lt(fence, stop);
79
6.63k
    const auto val = LoadU(du, values + last_full);
80
6.63k
    const auto not_literal = Ge(val, kSplit);
81
6.63k
    const auto b = BitCast(du, ConvertTo(df, val));
82
6.63k
    const auto l = And(val, kMaskL);
83
6.63k
    const auto exp = ShiftRight<23>(b);
84
6.63k
    const auto n = Sub(exp, kExpOffset);
85
6.63k
    const auto eb = Sub(exp, kEBOffset);
86
6.63k
    const auto m = ShiftRight<23 - M - L>(b);
87
6.63k
    const auto a = Add(kBase, Mul(n, kMulN));
88
6.63k
    const auto d = And(m, kMaskM);
89
6.63k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
6.63k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
6.63k
    const auto c = Or(a, l);
92
6.63k
    extra_bits = Add(extra_bits, eb_masked);
93
6.63k
    const auto t = Or(c, d);
94
6.63k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
6.63k
    Store(t_fixed, du, out + last_full);
96
6.63k
  }
97
7.86k
  return GetLane(SumOfLanes(du, extra_bits));
98
7.86k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
6.03k
                               uint32_t* JXL_RESTRICT out) {
44
6.03k
  const HWY_FULL(uint32_t) du;
45
6.03k
  const HWY_FULL(float) df;
46
6.03k
  const auto kZero = Zero(du);
47
6.03k
  const auto kSplit = Set(du, 1 << E);
48
6.03k
  const auto kExpOffset = Set(du, 127);
49
6.03k
  const auto kEBOffset = Set(du, 127 + M + L);
50
6.03k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
6.03k
  const auto kMulN = Set(du, 1 << (M + L));
52
6.03k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
6.03k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
6.03k
  auto extra_bits = kZero;
56
6.03k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
263k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
257k
    const auto val = LoadU(du, values + i);
59
257k
    const auto not_literal = Ge(val, kSplit);
60
257k
    const auto b = BitCast(du, ConvertTo(df, val));
61
257k
    const auto l = And(val, kMaskL);
62
257k
    const auto exp = ShiftRight<23>(b);
63
257k
    const auto n = Sub(exp, kExpOffset);
64
257k
    const auto eb = Sub(exp, kEBOffset);
65
257k
    const auto m = ShiftRight<23 - M - L>(b);
66
257k
    const auto a = Add(kBase, Mul(n, kMulN));
67
257k
    const auto d = And(m, kMaskM);
68
257k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
257k
    const auto c = Or(a, l);
70
257k
    extra_bits = Add(extra_bits, eb_fixed);
71
257k
    const auto t = Or(c, d);
72
257k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
257k
    Store(t_fixed, du, out + i);
74
257k
  }
75
6.03k
  if (last_full < len) {
76
5.12k
    const auto stop = Set(du, len);
77
5.12k
    const auto fence = Iota(du, last_full);
78
5.12k
    const auto take = Lt(fence, stop);
79
5.12k
    const auto val = LoadU(du, values + last_full);
80
5.12k
    const auto not_literal = Ge(val, kSplit);
81
5.12k
    const auto b = BitCast(du, ConvertTo(df, val));
82
5.12k
    const auto l = And(val, kMaskL);
83
5.12k
    const auto exp = ShiftRight<23>(b);
84
5.12k
    const auto n = Sub(exp, kExpOffset);
85
5.12k
    const auto eb = Sub(exp, kEBOffset);
86
5.12k
    const auto m = ShiftRight<23 - M - L>(b);
87
5.12k
    const auto a = Add(kBase, Mul(n, kMulN));
88
5.12k
    const auto d = And(m, kMaskM);
89
5.12k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
5.12k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
5.12k
    const auto c = Or(a, l);
92
5.12k
    extra_bits = Add(extra_bits, eb_masked);
93
5.12k
    const auto t = Or(c, d);
94
5.12k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
5.12k
    Store(t_fixed, du, out + last_full);
96
5.12k
  }
97
6.03k
  return GetLane(SumOfLanes(du, extra_bits));
98
6.03k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
6.03k
                               uint32_t* JXL_RESTRICT out) {
44
6.03k
  const HWY_FULL(uint32_t) du;
45
6.03k
  const HWY_FULL(float) df;
46
6.03k
  const auto kZero = Zero(du);
47
6.03k
  const auto kSplit = Set(du, 1 << E);
48
6.03k
  const auto kExpOffset = Set(du, 127);
49
6.03k
  const auto kEBOffset = Set(du, 127 + M + L);
50
6.03k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
6.03k
  const auto kMulN = Set(du, 1 << (M + L));
52
6.03k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
6.03k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
6.03k
  auto extra_bits = kZero;
56
6.03k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
263k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
257k
    const auto val = LoadU(du, values + i);
59
257k
    const auto not_literal = Ge(val, kSplit);
60
257k
    const auto b = BitCast(du, ConvertTo(df, val));
61
257k
    const auto l = And(val, kMaskL);
62
257k
    const auto exp = ShiftRight<23>(b);
63
257k
    const auto n = Sub(exp, kExpOffset);
64
257k
    const auto eb = Sub(exp, kEBOffset);
65
257k
    const auto m = ShiftRight<23 - M - L>(b);
66
257k
    const auto a = Add(kBase, Mul(n, kMulN));
67
257k
    const auto d = And(m, kMaskM);
68
257k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
257k
    const auto c = Or(a, l);
70
257k
    extra_bits = Add(extra_bits, eb_fixed);
71
257k
    const auto t = Or(c, d);
72
257k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
257k
    Store(t_fixed, du, out + i);
74
257k
  }
75
6.03k
  if (last_full < len) {
76
5.12k
    const auto stop = Set(du, len);
77
5.12k
    const auto fence = Iota(du, last_full);
78
5.12k
    const auto take = Lt(fence, stop);
79
5.12k
    const auto val = LoadU(du, values + last_full);
80
5.12k
    const auto not_literal = Ge(val, kSplit);
81
5.12k
    const auto b = BitCast(du, ConvertTo(df, val));
82
5.12k
    const auto l = And(val, kMaskL);
83
5.12k
    const auto exp = ShiftRight<23>(b);
84
5.12k
    const auto n = Sub(exp, kExpOffset);
85
5.12k
    const auto eb = Sub(exp, kEBOffset);
86
5.12k
    const auto m = ShiftRight<23 - M - L>(b);
87
5.12k
    const auto a = Add(kBase, Mul(n, kMulN));
88
5.12k
    const auto d = And(m, kMaskM);
89
5.12k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
5.12k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
5.12k
    const auto c = Or(a, l);
92
5.12k
    extra_bits = Add(extra_bits, eb_masked);
93
5.12k
    const auto t = Or(c, d);
94
5.12k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
5.12k
    Store(t_fixed, du, out + last_full);
96
5.12k
  }
97
6.03k
  return GetLane(SumOfLanes(du, extra_bits));
98
6.03k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
6.03k
                               uint32_t* JXL_RESTRICT out) {
44
6.03k
  const HWY_FULL(uint32_t) du;
45
6.03k
  const HWY_FULL(float) df;
46
6.03k
  const auto kZero = Zero(du);
47
6.03k
  const auto kSplit = Set(du, 1 << E);
48
6.03k
  const auto kExpOffset = Set(du, 127);
49
6.03k
  const auto kEBOffset = Set(du, 127 + M + L);
50
6.03k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
6.03k
  const auto kMulN = Set(du, 1 << (M + L));
52
6.03k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
6.03k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
6.03k
  auto extra_bits = kZero;
56
6.03k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
263k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
257k
    const auto val = LoadU(du, values + i);
59
257k
    const auto not_literal = Ge(val, kSplit);
60
257k
    const auto b = BitCast(du, ConvertTo(df, val));
61
257k
    const auto l = And(val, kMaskL);
62
257k
    const auto exp = ShiftRight<23>(b);
63
257k
    const auto n = Sub(exp, kExpOffset);
64
257k
    const auto eb = Sub(exp, kEBOffset);
65
257k
    const auto m = ShiftRight<23 - M - L>(b);
66
257k
    const auto a = Add(kBase, Mul(n, kMulN));
67
257k
    const auto d = And(m, kMaskM);
68
257k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
257k
    const auto c = Or(a, l);
70
257k
    extra_bits = Add(extra_bits, eb_fixed);
71
257k
    const auto t = Or(c, d);
72
257k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
257k
    Store(t_fixed, du, out + i);
74
257k
  }
75
6.03k
  if (last_full < len) {
76
5.12k
    const auto stop = Set(du, len);
77
5.12k
    const auto fence = Iota(du, last_full);
78
5.12k
    const auto take = Lt(fence, stop);
79
5.12k
    const auto val = LoadU(du, values + last_full);
80
5.12k
    const auto not_literal = Ge(val, kSplit);
81
5.12k
    const auto b = BitCast(du, ConvertTo(df, val));
82
5.12k
    const auto l = And(val, kMaskL);
83
5.12k
    const auto exp = ShiftRight<23>(b);
84
5.12k
    const auto n = Sub(exp, kExpOffset);
85
5.12k
    const auto eb = Sub(exp, kEBOffset);
86
5.12k
    const auto m = ShiftRight<23 - M - L>(b);
87
5.12k
    const auto a = Add(kBase, Mul(n, kMulN));
88
5.12k
    const auto d = And(m, kMaskM);
89
5.12k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
5.12k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
5.12k
    const auto c = Or(a, l);
92
5.12k
    extra_bits = Add(extra_bits, eb_masked);
93
5.12k
    const auto t = Or(c, d);
94
5.12k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
5.12k
    Store(t_fixed, du, out + last_full);
96
5.12k
  }
97
6.03k
  return GetLane(SumOfLanes(du, extra_bits));
98
6.03k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
6.03k
                               uint32_t* JXL_RESTRICT out) {
44
6.03k
  const HWY_FULL(uint32_t) du;
45
6.03k
  const HWY_FULL(float) df;
46
6.03k
  const auto kZero = Zero(du);
47
6.03k
  const auto kSplit = Set(du, 1 << E);
48
6.03k
  const auto kExpOffset = Set(du, 127);
49
6.03k
  const auto kEBOffset = Set(du, 127 + M + L);
50
6.03k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
6.03k
  const auto kMulN = Set(du, 1 << (M + L));
52
6.03k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
6.03k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
6.03k
  auto extra_bits = kZero;
56
6.03k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
263k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
257k
    const auto val = LoadU(du, values + i);
59
257k
    const auto not_literal = Ge(val, kSplit);
60
257k
    const auto b = BitCast(du, ConvertTo(df, val));
61
257k
    const auto l = And(val, kMaskL);
62
257k
    const auto exp = ShiftRight<23>(b);
63
257k
    const auto n = Sub(exp, kExpOffset);
64
257k
    const auto eb = Sub(exp, kEBOffset);
65
257k
    const auto m = ShiftRight<23 - M - L>(b);
66
257k
    const auto a = Add(kBase, Mul(n, kMulN));
67
257k
    const auto d = And(m, kMaskM);
68
257k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
257k
    const auto c = Or(a, l);
70
257k
    extra_bits = Add(extra_bits, eb_fixed);
71
257k
    const auto t = Or(c, d);
72
257k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
257k
    Store(t_fixed, du, out + i);
74
257k
  }
75
6.03k
  if (last_full < len) {
76
5.12k
    const auto stop = Set(du, len);
77
5.12k
    const auto fence = Iota(du, last_full);
78
5.12k
    const auto take = Lt(fence, stop);
79
5.12k
    const auto val = LoadU(du, values + last_full);
80
5.12k
    const auto not_literal = Ge(val, kSplit);
81
5.12k
    const auto b = BitCast(du, ConvertTo(df, val));
82
5.12k
    const auto l = And(val, kMaskL);
83
5.12k
    const auto exp = ShiftRight<23>(b);
84
5.12k
    const auto n = Sub(exp, kExpOffset);
85
5.12k
    const auto eb = Sub(exp, kEBOffset);
86
5.12k
    const auto m = ShiftRight<23 - M - L>(b);
87
5.12k
    const auto a = Add(kBase, Mul(n, kMulN));
88
5.12k
    const auto d = And(m, kMaskM);
89
5.12k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
5.12k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
5.12k
    const auto c = Or(a, l);
92
5.12k
    extra_bits = Add(extra_bits, eb_masked);
93
5.12k
    const auto t = Or(c, d);
94
5.12k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
5.12k
    Store(t_fixed, du, out + last_full);
96
5.12k
  }
97
6.03k
  return GetLane(SumOfLanes(du, extra_bits));
98
6.03k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
43
6.03k
                               uint32_t* JXL_RESTRICT out) {
44
6.03k
  const HWY_FULL(uint32_t) du;
45
6.03k
  const HWY_FULL(float) df;
46
6.03k
  const auto kZero = Zero(du);
47
6.03k
  const auto kSplit = Set(du, 1 << E);
48
6.03k
  const auto kExpOffset = Set(du, 127);
49
6.03k
  const auto kEBOffset = Set(du, 127 + M + L);
50
6.03k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
51
6.03k
  const auto kMulN = Set(du, 1 << (M + L));
52
6.03k
  const auto kMaskL = Set(du, (1 << L) - 1);
53
6.03k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
54
55
6.03k
  auto extra_bits = kZero;
56
6.03k
  size_t last_full = Lanes(du) * (len / Lanes(du));
57
263k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
58
257k
    const auto val = LoadU(du, values + i);
59
257k
    const auto not_literal = Ge(val, kSplit);
60
257k
    const auto b = BitCast(du, ConvertTo(df, val));
61
257k
    const auto l = And(val, kMaskL);
62
257k
    const auto exp = ShiftRight<23>(b);
63
257k
    const auto n = Sub(exp, kExpOffset);
64
257k
    const auto eb = Sub(exp, kEBOffset);
65
257k
    const auto m = ShiftRight<23 - M - L>(b);
66
257k
    const auto a = Add(kBase, Mul(n, kMulN));
67
257k
    const auto d = And(m, kMaskM);
68
257k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
69
257k
    const auto c = Or(a, l);
70
257k
    extra_bits = Add(extra_bits, eb_fixed);
71
257k
    const auto t = Or(c, d);
72
257k
    const auto t_fixed = IfThenElse(not_literal, t, val);
73
257k
    Store(t_fixed, du, out + i);
74
257k
  }
75
6.03k
  if (last_full < len) {
76
5.12k
    const auto stop = Set(du, len);
77
5.12k
    const auto fence = Iota(du, last_full);
78
5.12k
    const auto take = Lt(fence, stop);
79
5.12k
    const auto val = LoadU(du, values + last_full);
80
5.12k
    const auto not_literal = Ge(val, kSplit);
81
5.12k
    const auto b = BitCast(du, ConvertTo(df, val));
82
5.12k
    const auto l = And(val, kMaskL);
83
5.12k
    const auto exp = ShiftRight<23>(b);
84
5.12k
    const auto n = Sub(exp, kExpOffset);
85
5.12k
    const auto eb = Sub(exp, kEBOffset);
86
5.12k
    const auto m = ShiftRight<23 - M - L>(b);
87
5.12k
    const auto a = Add(kBase, Mul(n, kMulN));
88
5.12k
    const auto d = And(m, kMaskM);
89
5.12k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
90
5.12k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
91
5.12k
    const auto c = Or(a, l);
92
5.12k
    extra_bits = Add(extra_bits, eb_masked);
93
5.12k
    const auto t = Or(c, d);
94
5.12k
    const auto t_fixed = IfThenElse(not_literal, t, val);
95
5.12k
    Store(t_fixed, du, out + last_full);
96
5.12k
  }
97
6.03k
  return GetLane(SumOfLanes(du, extra_bits));
98
6.03k
}
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
99
100
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
101
404k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
102
404k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
103
#if HWY_TARGET == HWY_SCALAR
104
  uint32_t extra_bits = 0;
105
  for (size_t i = 0; i < len; ++i) {
106
    uint32_t v = values[i];
107
    uint32_t tok, nbits, bits;
108
    cfg.Encode(v, &tok, &nbits, &bits);
109
    extra_bits += nbits;
110
    out[i] = tok;
111
  }
112
  return extra_bits;
113
#else
114
404k
  if (cfg.split_exponent == 0) {
115
57.0k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
116
347k
  } else if (cfg.split_exponent == 2) {
117
56.4k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
118
56.4k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
119
290k
  } else if (cfg.split_exponent == 3) {
120
31.4k
    if (cfg.msb_in_token == 1) {
121
15.7k
      if (cfg.lsb_in_token == 0) {
122
7.86k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
123
7.86k
      } else {
124
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 2);
125
7.86k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
126
7.86k
      }
127
15.7k
    } else {
128
15.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
129
15.7k
      if (cfg.lsb_in_token == 0) {
130
7.86k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
131
7.86k
      } else {
132
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 1);
133
7.86k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
134
7.86k
      }
135
15.7k
    }
136
259k
  } else if (cfg.split_exponent == 4) {
137
144k
    if (cfg.msb_in_token == 1) {
138
72.1k
      if (cfg.lsb_in_token == 0) {
139
7.86k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
140
64.3k
      } else if (cfg.lsb_in_token == 2) {
141
56.4k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
142
56.4k
      } else {
143
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 3);
144
7.86k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
145
7.86k
      }
146
72.1k
    } else {
147
72.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
148
72.1k
      if (cfg.lsb_in_token == 0) {
149
56.4k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
150
56.4k
      } else if (cfg.lsb_in_token == 1) {
151
7.86k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
152
7.86k
      } else {
153
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 2);
154
7.86k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
155
7.86k
      }
156
72.1k
    }
157
144k
  } else if (cfg.split_exponent == 5) {
158
55.0k
    if (cfg.msb_in_token == 1) {
159
23.5k
      if (cfg.lsb_in_token == 0) {
160
7.86k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
161
15.7k
      } else if (cfg.lsb_in_token == 2) {
162
7.86k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
163
7.86k
      } else {
164
7.85k
        JXL_DASSERT(cfg.lsb_in_token == 4);
165
7.85k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
166
7.85k
      }
167
31.4k
    } else {
168
31.4k
      JXL_DASSERT(cfg.msb_in_token == 2);
169
31.4k
      if (cfg.lsb_in_token == 0) {
170
7.86k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
171
23.5k
      } else if (cfg.lsb_in_token == 1) {
172
7.86k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
173
15.7k
      } else if (cfg.lsb_in_token == 2) {
174
7.86k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
175
7.86k
      } else {
176
7.85k
        JXL_DASSERT(cfg.lsb_in_token == 3);
177
7.85k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
178
7.85k
      }
179
31.4k
    }
180
60.1k
  } else if (cfg.split_exponent == 6) {
181
22.0k
    if (cfg.msb_in_token == 0) {
182
7.86k
      JXL_DASSERT(cfg.lsb_in_token == 0);
183
7.86k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
184
14.2k
    } else if (cfg.msb_in_token == 1) {
185
7.11k
      JXL_DASSERT(cfg.lsb_in_token == 5);
186
7.11k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
187
7.11k
    } else {
188
7.11k
      JXL_DASSERT(cfg.msb_in_token == 2);
189
7.11k
      JXL_DASSERT(cfg.lsb_in_token == 4);
190
7.11k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
191
7.11k
    }
192
38.0k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
193
38.0k
    JXL_DASSERT(cfg.msb_in_token == 0);
194
38.0k
    JXL_DASSERT(cfg.lsb_in_token == 0);
195
38.0k
    if (cfg.split_exponent == 7) {
196
7.86k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
197
30.1k
    } else if (cfg.split_exponent == 8) {
198
6.03k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
199
24.1k
    } else if (cfg.split_exponent == 9) {
200
6.03k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
201
18.1k
    } else if (cfg.split_exponent == 10) {
202
6.03k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
203
12.0k
    } else if (cfg.split_exponent == 11) {
204
6.03k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
205
6.03k
    } else {
206
6.03k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
207
6.03k
    }
208
38.0k
  } else {
209
0
    JXL_DASSERT(false);
210
0
  }
211
0
  return ~0;
212
404k
#endif
213
404k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
101
404k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
102
404k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
103
#if HWY_TARGET == HWY_SCALAR
104
  uint32_t extra_bits = 0;
105
  for (size_t i = 0; i < len; ++i) {
106
    uint32_t v = values[i];
107
    uint32_t tok, nbits, bits;
108
    cfg.Encode(v, &tok, &nbits, &bits);
109
    extra_bits += nbits;
110
    out[i] = tok;
111
  }
112
  return extra_bits;
113
#else
114
404k
  if (cfg.split_exponent == 0) {
115
57.0k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
116
347k
  } else if (cfg.split_exponent == 2) {
117
56.4k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
118
56.4k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
119
290k
  } else if (cfg.split_exponent == 3) {
120
31.4k
    if (cfg.msb_in_token == 1) {
121
15.7k
      if (cfg.lsb_in_token == 0) {
122
7.86k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
123
7.86k
      } else {
124
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 2);
125
7.86k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
126
7.86k
      }
127
15.7k
    } else {
128
15.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
129
15.7k
      if (cfg.lsb_in_token == 0) {
130
7.86k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
131
7.86k
      } else {
132
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 1);
133
7.86k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
134
7.86k
      }
135
15.7k
    }
136
259k
  } else if (cfg.split_exponent == 4) {
137
144k
    if (cfg.msb_in_token == 1) {
138
72.1k
      if (cfg.lsb_in_token == 0) {
139
7.86k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
140
64.3k
      } else if (cfg.lsb_in_token == 2) {
141
56.4k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
142
56.4k
      } else {
143
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 3);
144
7.86k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
145
7.86k
      }
146
72.1k
    } else {
147
72.1k
      JXL_DASSERT(cfg.msb_in_token == 2);
148
72.1k
      if (cfg.lsb_in_token == 0) {
149
56.4k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
150
56.4k
      } else if (cfg.lsb_in_token == 1) {
151
7.86k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
152
7.86k
      } else {
153
7.86k
        JXL_DASSERT(cfg.lsb_in_token == 2);
154
7.86k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
155
7.86k
      }
156
72.1k
    }
157
144k
  } else if (cfg.split_exponent == 5) {
158
55.0k
    if (cfg.msb_in_token == 1) {
159
23.5k
      if (cfg.lsb_in_token == 0) {
160
7.86k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
161
15.7k
      } else if (cfg.lsb_in_token == 2) {
162
7.86k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
163
7.86k
      } else {
164
7.85k
        JXL_DASSERT(cfg.lsb_in_token == 4);
165
7.85k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
166
7.85k
      }
167
31.4k
    } else {
168
31.4k
      JXL_DASSERT(cfg.msb_in_token == 2);
169
31.4k
      if (cfg.lsb_in_token == 0) {
170
7.86k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
171
23.5k
      } else if (cfg.lsb_in_token == 1) {
172
7.86k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
173
15.7k
      } else if (cfg.lsb_in_token == 2) {
174
7.86k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
175
7.86k
      } else {
176
7.85k
        JXL_DASSERT(cfg.lsb_in_token == 3);
177
7.85k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
178
7.85k
      }
179
31.4k
    }
180
60.1k
  } else if (cfg.split_exponent == 6) {
181
22.0k
    if (cfg.msb_in_token == 0) {
182
7.86k
      JXL_DASSERT(cfg.lsb_in_token == 0);
183
7.86k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
184
14.2k
    } else if (cfg.msb_in_token == 1) {
185
7.11k
      JXL_DASSERT(cfg.lsb_in_token == 5);
186
7.11k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
187
7.11k
    } else {
188
7.11k
      JXL_DASSERT(cfg.msb_in_token == 2);
189
7.11k
      JXL_DASSERT(cfg.lsb_in_token == 4);
190
7.11k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
191
7.11k
    }
192
38.0k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
193
38.0k
    JXL_DASSERT(cfg.msb_in_token == 0);
194
38.0k
    JXL_DASSERT(cfg.lsb_in_token == 0);
195
38.0k
    if (cfg.split_exponent == 7) {
196
7.86k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
197
30.1k
    } else if (cfg.split_exponent == 8) {
198
6.03k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
199
24.1k
    } else if (cfg.split_exponent == 9) {
200
6.03k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
201
18.1k
    } else if (cfg.split_exponent == 10) {
202
6.03k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
203
12.0k
    } else if (cfg.split_exponent == 11) {
204
6.03k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
205
6.03k
    } else {
206
6.03k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
207
6.03k
    }
208
38.0k
  } else {
209
0
    JXL_DASSERT(false);
210
0
  }
211
0
  return ~0;
212
404k
#endif
213
404k
}
Unexecuted instantiation: jxl::N_AVX3::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_AVX3_SPR::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
214
215
// NOLINTNEXTLINE(google-readability-namespace-comments)
216
}  // namespace HWY_NAMESPACE
217
}  // namespace jxl
218
HWY_AFTER_NAMESPACE();
219
220
#if HWY_ONCE
221
namespace jxl {
222
223
HWY_EXPORT(EstimateTokenCost);
224
225
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
226
404k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
227
404k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
228
404k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
229
404k
}
230
231
}  // namespace jxl
232
#endif