Coverage Report

Created: 2026-06-30 07:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/compiler_specific.h"
11
#include "lib/jxl/base/status.h"
12
#include "lib/jxl/dec_ans.h"
13
#include "lib/jxl/memory_manager_internal.h"
14
15
#undef HWY_TARGET_INCLUDE
16
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
17
#include <hwy/foreach_target.h>
18
#include <hwy/highway.h>
19
20
HWY_BEFORE_NAMESPACE();
21
namespace jxl {
22
namespace HWY_NAMESPACE {
23
24
// These templates are not found via ADL.
25
using hwy::HWY_NAMESPACE::Add;
26
using hwy::HWY_NAMESPACE::And;
27
using hwy::HWY_NAMESPACE::Ge;
28
using hwy::HWY_NAMESPACE::GetLane;
29
using hwy::HWY_NAMESPACE::Gt;
30
using hwy::HWY_NAMESPACE::IfThenElse;
31
using hwy::HWY_NAMESPACE::IfThenElseZero;
32
using hwy::HWY_NAMESPACE::Iota;
33
using hwy::HWY_NAMESPACE::LoadU;
34
using hwy::HWY_NAMESPACE::Lt;
35
using hwy::HWY_NAMESPACE::Mul;
36
using hwy::HWY_NAMESPACE::Or;
37
using hwy::HWY_NAMESPACE::Set;
38
using hwy::HWY_NAMESPACE::ShiftRight;
39
using hwy::HWY_NAMESPACE::Store;
40
using hwy::HWY_NAMESPACE::Sub;
41
using hwy::HWY_NAMESPACE::Zero;
42
43
template <size_t E, size_t M, size_t L>
44
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
45
476k
                               uint32_t* JXL_RESTRICT out) {
46
476k
  const HWY_FULL(uint32_t) du;
47
476k
  const HWY_FULL(float) df;
48
476k
  const auto kZero = Zero(du);
49
476k
  const auto kSplit = Set(du, 1 << E);
50
476k
  const auto kExpOffset = Set(du, 127);
51
476k
  const auto kEBOffset = Set(du, 127 + M + L);
52
476k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
476k
  const auto kMulN = Set(du, 1 << (M + L));
54
476k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
476k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
476k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
476k
  constexpr size_t kLargeShiftVal = 10;
58
476k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
476k
  auto extra_bits = kZero;
61
476k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
60.4M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
59.9M
    const auto val = LoadU(du, values + i);
64
59.9M
    const auto is_large = Gt(val, kLargeThreshold);
65
59.9M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
59.9M
    const auto not_literal = Ge(val, kSplit);
67
59.9M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
59.9M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
59.9M
    const auto l = And(val, kMaskL);
70
59.9M
    const auto exp = ShiftRight<23>(b);
71
59.9M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
59.9M
    const auto n = Sub(exp_fixed, kExpOffset);
73
59.9M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
59.9M
    const auto m = ShiftRight<23 - M - L>(b);
75
59.9M
    const auto a = Add(kBase, Mul(n, kMulN));
76
59.9M
    const auto d = And(m, kMaskM);
77
59.9M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
59.9M
    const auto c = Or(a, l);
79
59.9M
    extra_bits = Add(extra_bits, eb_fixed);
80
59.9M
    const auto t = Or(c, d);
81
59.9M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
59.9M
    Store(t_fixed, du, out + i);
83
59.9M
  }
84
476k
  if (last_full < len) {
85
410k
    const auto stop = Set(du, len);
86
410k
    const auto fence = Iota(du, last_full);
87
410k
    const auto take = Lt(fence, stop);
88
410k
    const auto val = LoadU(du, values + last_full);
89
410k
    const auto is_large = Gt(val, kLargeThreshold);
90
410k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
410k
    const auto not_literal = Ge(val, kSplit);
92
410k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
410k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
410k
    const auto l = And(val, kMaskL);
95
410k
    const auto exp = ShiftRight<23>(b);
96
410k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
410k
    const auto n = Sub(exp_fixed, kExpOffset);
98
410k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
410k
    const auto m = ShiftRight<23 - M - L>(b);
100
410k
    const auto a = Add(kBase, Mul(n, kMulN));
101
410k
    const auto d = And(m, kMaskM);
102
410k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
410k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
410k
    const auto c = Or(a, l);
105
410k
    extra_bits = Add(extra_bits, eb_masked);
106
410k
    const auto t = Or(c, d);
107
410k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
410k
    Store(t_fixed, du, out + last_full);
109
410k
  }
110
476k
  return GetLane(SumOfLanes(du, extra_bits));
111
476k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
80.6k
                               uint32_t* JXL_RESTRICT out) {
46
80.6k
  const HWY_FULL(uint32_t) du;
47
80.6k
  const HWY_FULL(float) df;
48
80.6k
  const auto kZero = Zero(du);
49
80.6k
  const auto kSplit = Set(du, 1 << E);
50
80.6k
  const auto kExpOffset = Set(du, 127);
51
80.6k
  const auto kEBOffset = Set(du, 127 + M + L);
52
80.6k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
80.6k
  const auto kMulN = Set(du, 1 << (M + L));
54
80.6k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
80.6k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
80.6k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
80.6k
  constexpr size_t kLargeShiftVal = 10;
58
80.6k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
80.6k
  auto extra_bits = kZero;
61
80.6k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
11.2M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
11.2M
    const auto val = LoadU(du, values + i);
64
11.2M
    const auto is_large = Gt(val, kLargeThreshold);
65
11.2M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
11.2M
    const auto not_literal = Ge(val, kSplit);
67
11.2M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
11.2M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
11.2M
    const auto l = And(val, kMaskL);
70
11.2M
    const auto exp = ShiftRight<23>(b);
71
11.2M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
11.2M
    const auto n = Sub(exp_fixed, kExpOffset);
73
11.2M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
11.2M
    const auto m = ShiftRight<23 - M - L>(b);
75
11.2M
    const auto a = Add(kBase, Mul(n, kMulN));
76
11.2M
    const auto d = And(m, kMaskM);
77
11.2M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
11.2M
    const auto c = Or(a, l);
79
11.2M
    extra_bits = Add(extra_bits, eb_fixed);
80
11.2M
    const auto t = Or(c, d);
81
11.2M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
11.2M
    Store(t_fixed, du, out + i);
83
11.2M
  }
84
80.6k
  if (last_full < len) {
85
69.9k
    const auto stop = Set(du, len);
86
69.9k
    const auto fence = Iota(du, last_full);
87
69.9k
    const auto take = Lt(fence, stop);
88
69.9k
    const auto val = LoadU(du, values + last_full);
89
69.9k
    const auto is_large = Gt(val, kLargeThreshold);
90
69.9k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
69.9k
    const auto not_literal = Ge(val, kSplit);
92
69.9k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
69.9k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
69.9k
    const auto l = And(val, kMaskL);
95
69.9k
    const auto exp = ShiftRight<23>(b);
96
69.9k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
69.9k
    const auto n = Sub(exp_fixed, kExpOffset);
98
69.9k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
69.9k
    const auto m = ShiftRight<23 - M - L>(b);
100
69.9k
    const auto a = Add(kBase, Mul(n, kMulN));
101
69.9k
    const auto d = And(m, kMaskM);
102
69.9k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
69.9k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
69.9k
    const auto c = Or(a, l);
105
69.9k
    extra_bits = Add(extra_bits, eb_masked);
106
69.9k
    const auto t = Or(c, d);
107
69.9k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
69.9k
    Store(t_fixed, du, out + last_full);
109
69.9k
  }
110
80.6k
  return GetLane(SumOfLanes(du, extra_bits));
111
80.6k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
80.0k
                               uint32_t* JXL_RESTRICT out) {
46
80.0k
  const HWY_FULL(uint32_t) du;
47
80.0k
  const HWY_FULL(float) df;
48
80.0k
  const auto kZero = Zero(du);
49
80.0k
  const auto kSplit = Set(du, 1 << E);
50
80.0k
  const auto kExpOffset = Set(du, 127);
51
80.0k
  const auto kEBOffset = Set(du, 127 + M + L);
52
80.0k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
80.0k
  const auto kMulN = Set(du, 1 << (M + L));
54
80.0k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
80.0k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
80.0k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
80.0k
  constexpr size_t kLargeShiftVal = 10;
58
80.0k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
80.0k
  auto extra_bits = kZero;
61
80.0k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
11.2M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
11.2M
    const auto val = LoadU(du, values + i);
64
11.2M
    const auto is_large = Gt(val, kLargeThreshold);
65
11.2M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
11.2M
    const auto not_literal = Ge(val, kSplit);
67
11.2M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
11.2M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
11.2M
    const auto l = And(val, kMaskL);
70
11.2M
    const auto exp = ShiftRight<23>(b);
71
11.2M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
11.2M
    const auto n = Sub(exp_fixed, kExpOffset);
73
11.2M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
11.2M
    const auto m = ShiftRight<23 - M - L>(b);
75
11.2M
    const auto a = Add(kBase, Mul(n, kMulN));
76
11.2M
    const auto d = And(m, kMaskM);
77
11.2M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
11.2M
    const auto c = Or(a, l);
79
11.2M
    extra_bits = Add(extra_bits, eb_fixed);
80
11.2M
    const auto t = Or(c, d);
81
11.2M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
11.2M
    Store(t_fixed, du, out + i);
83
11.2M
  }
84
80.0k
  if (last_full < len) {
85
69.4k
    const auto stop = Set(du, len);
86
69.4k
    const auto fence = Iota(du, last_full);
87
69.4k
    const auto take = Lt(fence, stop);
88
69.4k
    const auto val = LoadU(du, values + last_full);
89
69.4k
    const auto is_large = Gt(val, kLargeThreshold);
90
69.4k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
69.4k
    const auto not_literal = Ge(val, kSplit);
92
69.4k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
69.4k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
69.4k
    const auto l = And(val, kMaskL);
95
69.4k
    const auto exp = ShiftRight<23>(b);
96
69.4k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
69.4k
    const auto n = Sub(exp_fixed, kExpOffset);
98
69.4k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
69.4k
    const auto m = ShiftRight<23 - M - L>(b);
100
69.4k
    const auto a = Add(kBase, Mul(n, kMulN));
101
69.4k
    const auto d = And(m, kMaskM);
102
69.4k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
69.4k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
69.4k
    const auto c = Or(a, l);
105
69.4k
    extra_bits = Add(extra_bits, eb_masked);
106
69.4k
    const auto t = Or(c, d);
107
69.4k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
69.4k
    Store(t_fixed, du, out + last_full);
109
69.4k
  }
110
80.0k
  return GetLane(SumOfLanes(du, extra_bits));
111
80.0k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
80.0k
                               uint32_t* JXL_RESTRICT out) {
46
80.0k
  const HWY_FULL(uint32_t) du;
47
80.0k
  const HWY_FULL(float) df;
48
80.0k
  const auto kZero = Zero(du);
49
80.0k
  const auto kSplit = Set(du, 1 << E);
50
80.0k
  const auto kExpOffset = Set(du, 127);
51
80.0k
  const auto kEBOffset = Set(du, 127 + M + L);
52
80.0k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
80.0k
  const auto kMulN = Set(du, 1 << (M + L));
54
80.0k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
80.0k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
80.0k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
80.0k
  constexpr size_t kLargeShiftVal = 10;
58
80.0k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
80.0k
  auto extra_bits = kZero;
61
80.0k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
11.2M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
11.2M
    const auto val = LoadU(du, values + i);
64
11.2M
    const auto is_large = Gt(val, kLargeThreshold);
65
11.2M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
11.2M
    const auto not_literal = Ge(val, kSplit);
67
11.2M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
11.2M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
11.2M
    const auto l = And(val, kMaskL);
70
11.2M
    const auto exp = ShiftRight<23>(b);
71
11.2M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
11.2M
    const auto n = Sub(exp_fixed, kExpOffset);
73
11.2M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
11.2M
    const auto m = ShiftRight<23 - M - L>(b);
75
11.2M
    const auto a = Add(kBase, Mul(n, kMulN));
76
11.2M
    const auto d = And(m, kMaskM);
77
11.2M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
11.2M
    const auto c = Or(a, l);
79
11.2M
    extra_bits = Add(extra_bits, eb_fixed);
80
11.2M
    const auto t = Or(c, d);
81
11.2M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
11.2M
    Store(t_fixed, du, out + i);
83
11.2M
  }
84
80.0k
  if (last_full < len) {
85
69.4k
    const auto stop = Set(du, len);
86
69.4k
    const auto fence = Iota(du, last_full);
87
69.4k
    const auto take = Lt(fence, stop);
88
69.4k
    const auto val = LoadU(du, values + last_full);
89
69.4k
    const auto is_large = Gt(val, kLargeThreshold);
90
69.4k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
69.4k
    const auto not_literal = Ge(val, kSplit);
92
69.4k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
69.4k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
69.4k
    const auto l = And(val, kMaskL);
95
69.4k
    const auto exp = ShiftRight<23>(b);
96
69.4k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
69.4k
    const auto n = Sub(exp_fixed, kExpOffset);
98
69.4k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
69.4k
    const auto m = ShiftRight<23 - M - L>(b);
100
69.4k
    const auto a = Add(kBase, Mul(n, kMulN));
101
69.4k
    const auto d = And(m, kMaskM);
102
69.4k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
69.4k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
69.4k
    const auto c = Or(a, l);
105
69.4k
    extra_bits = Add(extra_bits, eb_masked);
106
69.4k
    const auto t = Or(c, d);
107
69.4k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
69.4k
    Store(t_fixed, du, out + last_full);
109
69.4k
  }
110
80.0k
  return GetLane(SumOfLanes(du, extra_bits));
111
80.0k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
80.0k
                               uint32_t* JXL_RESTRICT out) {
46
80.0k
  const HWY_FULL(uint32_t) du;
47
80.0k
  const HWY_FULL(float) df;
48
80.0k
  const auto kZero = Zero(du);
49
80.0k
  const auto kSplit = Set(du, 1 << E);
50
80.0k
  const auto kExpOffset = Set(du, 127);
51
80.0k
  const auto kEBOffset = Set(du, 127 + M + L);
52
80.0k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
80.0k
  const auto kMulN = Set(du, 1 << (M + L));
54
80.0k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
80.0k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
80.0k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
80.0k
  constexpr size_t kLargeShiftVal = 10;
58
80.0k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
80.0k
  auto extra_bits = kZero;
61
80.0k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
11.2M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
11.2M
    const auto val = LoadU(du, values + i);
64
11.2M
    const auto is_large = Gt(val, kLargeThreshold);
65
11.2M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
11.2M
    const auto not_literal = Ge(val, kSplit);
67
11.2M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
11.2M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
11.2M
    const auto l = And(val, kMaskL);
70
11.2M
    const auto exp = ShiftRight<23>(b);
71
11.2M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
11.2M
    const auto n = Sub(exp_fixed, kExpOffset);
73
11.2M
    const auto eb = Sub(exp_fixed, kEBOffset);
74
11.2M
    const auto m = ShiftRight<23 - M - L>(b);
75
11.2M
    const auto a = Add(kBase, Mul(n, kMulN));
76
11.2M
    const auto d = And(m, kMaskM);
77
11.2M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
11.2M
    const auto c = Or(a, l);
79
11.2M
    extra_bits = Add(extra_bits, eb_fixed);
80
11.2M
    const auto t = Or(c, d);
81
11.2M
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
11.2M
    Store(t_fixed, du, out + i);
83
11.2M
  }
84
80.0k
  if (last_full < len) {
85
69.4k
    const auto stop = Set(du, len);
86
69.4k
    const auto fence = Iota(du, last_full);
87
69.4k
    const auto take = Lt(fence, stop);
88
69.4k
    const auto val = LoadU(du, values + last_full);
89
69.4k
    const auto is_large = Gt(val, kLargeThreshold);
90
69.4k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
69.4k
    const auto not_literal = Ge(val, kSplit);
92
69.4k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
69.4k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
69.4k
    const auto l = And(val, kMaskL);
95
69.4k
    const auto exp = ShiftRight<23>(b);
96
69.4k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
69.4k
    const auto n = Sub(exp_fixed, kExpOffset);
98
69.4k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
69.4k
    const auto m = ShiftRight<23 - M - L>(b);
100
69.4k
    const auto a = Add(kBase, Mul(n, kMulN));
101
69.4k
    const auto d = And(m, kMaskM);
102
69.4k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
69.4k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
69.4k
    const auto c = Or(a, l);
105
69.4k
    extra_bits = Add(extra_bits, eb_masked);
106
69.4k
    const auto t = Or(c, d);
107
69.4k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
69.4k
    Store(t_fixed, du, out + last_full);
109
69.4k
  }
110
80.0k
  return GetLane(SumOfLanes(du, extra_bits));
111
80.0k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.92k
                               uint32_t* JXL_RESTRICT out) {
46
6.92k
  const HWY_FULL(uint32_t) du;
47
6.92k
  const HWY_FULL(float) df;
48
6.92k
  const auto kZero = Zero(du);
49
6.92k
  const auto kSplit = Set(du, 1 << E);
50
6.92k
  const auto kExpOffset = Set(du, 127);
51
6.92k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.92k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.92k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.92k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.92k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.92k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.92k
  constexpr size_t kLargeShiftVal = 10;
58
6.92k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.92k
  auto extra_bits = kZero;
61
6.92k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.92k
  if (last_full < len) {
85
5.87k
    const auto stop = Set(du, len);
86
5.87k
    const auto fence = Iota(du, last_full);
87
5.87k
    const auto take = Lt(fence, stop);
88
5.87k
    const auto val = LoadU(du, values + last_full);
89
5.87k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.87k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.87k
    const auto not_literal = Ge(val, kSplit);
92
5.87k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.87k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.87k
    const auto l = And(val, kMaskL);
95
5.87k
    const auto exp = ShiftRight<23>(b);
96
5.87k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.87k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.87k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.87k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.87k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.87k
    const auto d = And(m, kMaskM);
102
5.87k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.87k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.87k
    const auto c = Or(a, l);
105
5.87k
    extra_bits = Add(extra_bits, eb_masked);
106
5.87k
    const auto t = Or(c, d);
107
5.87k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.87k
    Store(t_fixed, du, out + last_full);
109
5.87k
  }
110
6.92k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.92k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.92k
                               uint32_t* JXL_RESTRICT out) {
46
6.92k
  const HWY_FULL(uint32_t) du;
47
6.92k
  const HWY_FULL(float) df;
48
6.92k
  const auto kZero = Zero(du);
49
6.92k
  const auto kSplit = Set(du, 1 << E);
50
6.92k
  const auto kExpOffset = Set(du, 127);
51
6.92k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.92k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.92k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.92k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.92k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.92k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.92k
  constexpr size_t kLargeShiftVal = 10;
58
6.92k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.92k
  auto extra_bits = kZero;
61
6.92k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.92k
  if (last_full < len) {
85
5.87k
    const auto stop = Set(du, len);
86
5.87k
    const auto fence = Iota(du, last_full);
87
5.87k
    const auto take = Lt(fence, stop);
88
5.87k
    const auto val = LoadU(du, values + last_full);
89
5.87k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.87k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.87k
    const auto not_literal = Ge(val, kSplit);
92
5.87k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.87k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.87k
    const auto l = And(val, kMaskL);
95
5.87k
    const auto exp = ShiftRight<23>(b);
96
5.87k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.87k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.87k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.87k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.87k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.87k
    const auto d = And(m, kMaskM);
102
5.87k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.87k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.87k
    const auto c = Or(a, l);
105
5.87k
    extra_bits = Add(extra_bits, eb_masked);
106
5.87k
    const auto t = Or(c, d);
107
5.87k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.87k
    Store(t_fixed, du, out + last_full);
109
5.87k
  }
110
6.92k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.92k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.98k
                               uint32_t* JXL_RESTRICT out) {
46
5.98k
  const HWY_FULL(uint32_t) du;
47
5.98k
  const HWY_FULL(float) df;
48
5.98k
  const auto kZero = Zero(du);
49
5.98k
  const auto kSplit = Set(du, 1 << E);
50
5.98k
  const auto kExpOffset = Set(du, 127);
51
5.98k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.98k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.98k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.98k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.98k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.98k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.98k
  constexpr size_t kLargeShiftVal = 10;
58
5.98k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.98k
  auto extra_bits = kZero;
61
5.98k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
376k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
370k
    const auto val = LoadU(du, values + i);
64
370k
    const auto is_large = Gt(val, kLargeThreshold);
65
370k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
370k
    const auto not_literal = Ge(val, kSplit);
67
370k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
370k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
370k
    const auto l = And(val, kMaskL);
70
370k
    const auto exp = ShiftRight<23>(b);
71
370k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
370k
    const auto n = Sub(exp_fixed, kExpOffset);
73
370k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
370k
    const auto m = ShiftRight<23 - M - L>(b);
75
370k
    const auto a = Add(kBase, Mul(n, kMulN));
76
370k
    const auto d = And(m, kMaskM);
77
370k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
370k
    const auto c = Or(a, l);
79
370k
    extra_bits = Add(extra_bits, eb_fixed);
80
370k
    const auto t = Or(c, d);
81
370k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
370k
    Store(t_fixed, du, out + i);
83
370k
  }
84
5.98k
  if (last_full < len) {
85
5.11k
    const auto stop = Set(du, len);
86
5.11k
    const auto fence = Iota(du, last_full);
87
5.11k
    const auto take = Lt(fence, stop);
88
5.11k
    const auto val = LoadU(du, values + last_full);
89
5.11k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.11k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.11k
    const auto not_literal = Ge(val, kSplit);
92
5.11k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.11k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.11k
    const auto l = And(val, kMaskL);
95
5.11k
    const auto exp = ShiftRight<23>(b);
96
5.11k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.11k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.11k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.11k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.11k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.11k
    const auto d = And(m, kMaskM);
102
5.11k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.11k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.11k
    const auto c = Or(a, l);
105
5.11k
    extra_bits = Add(extra_bits, eb_masked);
106
5.11k
    const auto t = Or(c, d);
107
5.11k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.11k
    Store(t_fixed, du, out + last_full);
109
5.11k
  }
110
5.98k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.98k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.98k
                               uint32_t* JXL_RESTRICT out) {
46
5.98k
  const HWY_FULL(uint32_t) du;
47
5.98k
  const HWY_FULL(float) df;
48
5.98k
  const auto kZero = Zero(du);
49
5.98k
  const auto kSplit = Set(du, 1 << E);
50
5.98k
  const auto kExpOffset = Set(du, 127);
51
5.98k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.98k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.98k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.98k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.98k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.98k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.98k
  constexpr size_t kLargeShiftVal = 10;
58
5.98k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.98k
  auto extra_bits = kZero;
61
5.98k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
376k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
370k
    const auto val = LoadU(du, values + i);
64
370k
    const auto is_large = Gt(val, kLargeThreshold);
65
370k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
370k
    const auto not_literal = Ge(val, kSplit);
67
370k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
370k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
370k
    const auto l = And(val, kMaskL);
70
370k
    const auto exp = ShiftRight<23>(b);
71
370k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
370k
    const auto n = Sub(exp_fixed, kExpOffset);
73
370k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
370k
    const auto m = ShiftRight<23 - M - L>(b);
75
370k
    const auto a = Add(kBase, Mul(n, kMulN));
76
370k
    const auto d = And(m, kMaskM);
77
370k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
370k
    const auto c = Or(a, l);
79
370k
    extra_bits = Add(extra_bits, eb_fixed);
80
370k
    const auto t = Or(c, d);
81
370k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
370k
    Store(t_fixed, du, out + i);
83
370k
  }
84
5.98k
  if (last_full < len) {
85
5.11k
    const auto stop = Set(du, len);
86
5.11k
    const auto fence = Iota(du, last_full);
87
5.11k
    const auto take = Lt(fence, stop);
88
5.11k
    const auto val = LoadU(du, values + last_full);
89
5.11k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.11k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.11k
    const auto not_literal = Ge(val, kSplit);
92
5.11k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.11k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.11k
    const auto l = And(val, kMaskL);
95
5.11k
    const auto exp = ShiftRight<23>(b);
96
5.11k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.11k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.11k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.11k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.11k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.11k
    const auto d = And(m, kMaskM);
102
5.11k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.11k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.11k
    const auto c = Or(a, l);
105
5.11k
    extra_bits = Add(extra_bits, eb_masked);
106
5.11k
    const auto t = Or(c, d);
107
5.11k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.11k
    Store(t_fixed, du, out + last_full);
109
5.11k
  }
110
5.98k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.98k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
6.93k
                               uint32_t* JXL_RESTRICT out) {
46
6.93k
  const HWY_FULL(uint32_t) du;
47
6.93k
  const HWY_FULL(float) df;
48
6.93k
  const auto kZero = Zero(du);
49
6.93k
  const auto kSplit = Set(du, 1 << E);
50
6.93k
  const auto kExpOffset = Set(du, 127);
51
6.93k
  const auto kEBOffset = Set(du, 127 + M + L);
52
6.93k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
6.93k
  const auto kMulN = Set(du, 1 << (M + L));
54
6.93k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
6.93k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
6.93k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
6.93k
  constexpr size_t kLargeShiftVal = 10;
58
6.93k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
6.93k
  auto extra_bits = kZero;
61
6.93k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
753k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
746k
    const auto val = LoadU(du, values + i);
64
746k
    const auto is_large = Gt(val, kLargeThreshold);
65
746k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
746k
    const auto not_literal = Ge(val, kSplit);
67
746k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
746k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
746k
    const auto l = And(val, kMaskL);
70
746k
    const auto exp = ShiftRight<23>(b);
71
746k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
746k
    const auto n = Sub(exp_fixed, kExpOffset);
73
746k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
746k
    const auto m = ShiftRight<23 - M - L>(b);
75
746k
    const auto a = Add(kBase, Mul(n, kMulN));
76
746k
    const auto d = And(m, kMaskM);
77
746k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
746k
    const auto c = Or(a, l);
79
746k
    extra_bits = Add(extra_bits, eb_fixed);
80
746k
    const auto t = Or(c, d);
81
746k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
746k
    Store(t_fixed, du, out + i);
83
746k
  }
84
6.93k
  if (last_full < len) {
85
5.88k
    const auto stop = Set(du, len);
86
5.88k
    const auto fence = Iota(du, last_full);
87
5.88k
    const auto take = Lt(fence, stop);
88
5.88k
    const auto val = LoadU(du, values + last_full);
89
5.88k
    const auto is_large = Gt(val, kLargeThreshold);
90
5.88k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
5.88k
    const auto not_literal = Ge(val, kSplit);
92
5.88k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
5.88k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
5.88k
    const auto l = And(val, kMaskL);
95
5.88k
    const auto exp = ShiftRight<23>(b);
96
5.88k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
5.88k
    const auto n = Sub(exp_fixed, kExpOffset);
98
5.88k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
5.88k
    const auto m = ShiftRight<23 - M - L>(b);
100
5.88k
    const auto a = Add(kBase, Mul(n, kMulN));
101
5.88k
    const auto d = And(m, kMaskM);
102
5.88k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
5.88k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
5.88k
    const auto c = Or(a, l);
105
5.88k
    extra_bits = Add(extra_bits, eb_masked);
106
5.88k
    const auto t = Or(c, d);
107
5.88k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
5.88k
    Store(t_fixed, du, out + last_full);
109
5.88k
  }
110
6.93k
  return GetLane(SumOfLanes(du, extra_bits));
111
6.93k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.16k
                               uint32_t* JXL_RESTRICT out) {
46
5.16k
  const HWY_FULL(uint32_t) du;
47
5.16k
  const HWY_FULL(float) df;
48
5.16k
  const auto kZero = Zero(du);
49
5.16k
  const auto kSplit = Set(du, 1 << E);
50
5.16k
  const auto kExpOffset = Set(du, 127);
51
5.16k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.16k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.16k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.16k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.16k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.16k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.16k
  constexpr size_t kLargeShiftVal = 10;
58
5.16k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.16k
  auto extra_bits = kZero;
61
5.16k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
345k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
339k
    const auto val = LoadU(du, values + i);
64
339k
    const auto is_large = Gt(val, kLargeThreshold);
65
339k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
339k
    const auto not_literal = Ge(val, kSplit);
67
339k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
339k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
339k
    const auto l = And(val, kMaskL);
70
339k
    const auto exp = ShiftRight<23>(b);
71
339k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
339k
    const auto n = Sub(exp_fixed, kExpOffset);
73
339k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
339k
    const auto m = ShiftRight<23 - M - L>(b);
75
339k
    const auto a = Add(kBase, Mul(n, kMulN));
76
339k
    const auto d = And(m, kMaskM);
77
339k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
339k
    const auto c = Or(a, l);
79
339k
    extra_bits = Add(extra_bits, eb_fixed);
80
339k
    const auto t = Or(c, d);
81
339k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
339k
    Store(t_fixed, du, out + i);
83
339k
  }
84
5.16k
  if (last_full < len) {
85
4.35k
    const auto stop = Set(du, len);
86
4.35k
    const auto fence = Iota(du, last_full);
87
4.35k
    const auto take = Lt(fence, stop);
88
4.35k
    const auto val = LoadU(du, values + last_full);
89
4.35k
    const auto is_large = Gt(val, kLargeThreshold);
90
4.35k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
4.35k
    const auto not_literal = Ge(val, kSplit);
92
4.35k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
4.35k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
4.35k
    const auto l = And(val, kMaskL);
95
4.35k
    const auto exp = ShiftRight<23>(b);
96
4.35k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
4.35k
    const auto n = Sub(exp_fixed, kExpOffset);
98
4.35k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
4.35k
    const auto m = ShiftRight<23 - M - L>(b);
100
4.35k
    const auto a = Add(kBase, Mul(n, kMulN));
101
4.35k
    const auto d = And(m, kMaskM);
102
4.35k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
4.35k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
4.35k
    const auto c = Or(a, l);
105
4.35k
    extra_bits = Add(extra_bits, eb_masked);
106
4.35k
    const auto t = Or(c, d);
107
4.35k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
4.35k
    Store(t_fixed, du, out + last_full);
109
4.35k
  }
110
5.16k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.16k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.16k
                               uint32_t* JXL_RESTRICT out) {
46
5.16k
  const HWY_FULL(uint32_t) du;
47
5.16k
  const HWY_FULL(float) df;
48
5.16k
  const auto kZero = Zero(du);
49
5.16k
  const auto kSplit = Set(du, 1 << E);
50
5.16k
  const auto kExpOffset = Set(du, 127);
51
5.16k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.16k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.16k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.16k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.16k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.16k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.16k
  constexpr size_t kLargeShiftVal = 10;
58
5.16k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.16k
  auto extra_bits = kZero;
61
5.16k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
345k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
339k
    const auto val = LoadU(du, values + i);
64
339k
    const auto is_large = Gt(val, kLargeThreshold);
65
339k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
339k
    const auto not_literal = Ge(val, kSplit);
67
339k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
339k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
339k
    const auto l = And(val, kMaskL);
70
339k
    const auto exp = ShiftRight<23>(b);
71
339k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
339k
    const auto n = Sub(exp_fixed, kExpOffset);
73
339k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
339k
    const auto m = ShiftRight<23 - M - L>(b);
75
339k
    const auto a = Add(kBase, Mul(n, kMulN));
76
339k
    const auto d = And(m, kMaskM);
77
339k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
339k
    const auto c = Or(a, l);
79
339k
    extra_bits = Add(extra_bits, eb_fixed);
80
339k
    const auto t = Or(c, d);
81
339k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
339k
    Store(t_fixed, du, out + i);
83
339k
  }
84
5.16k
  if (last_full < len) {
85
4.35k
    const auto stop = Set(du, len);
86
4.35k
    const auto fence = Iota(du, last_full);
87
4.35k
    const auto take = Lt(fence, stop);
88
4.35k
    const auto val = LoadU(du, values + last_full);
89
4.35k
    const auto is_large = Gt(val, kLargeThreshold);
90
4.35k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
4.35k
    const auto not_literal = Ge(val, kSplit);
92
4.35k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
4.35k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
4.35k
    const auto l = And(val, kMaskL);
95
4.35k
    const auto exp = ShiftRight<23>(b);
96
4.35k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
4.35k
    const auto n = Sub(exp_fixed, kExpOffset);
98
4.35k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
4.35k
    const auto m = ShiftRight<23 - M - L>(b);
100
4.35k
    const auto a = Add(kBase, Mul(n, kMulN));
101
4.35k
    const auto d = And(m, kMaskM);
102
4.35k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
4.35k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
4.35k
    const auto c = Or(a, l);
105
4.35k
    extra_bits = Add(extra_bits, eb_masked);
106
4.35k
    const auto t = Or(c, d);
107
4.35k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
4.35k
    Store(t_fixed, du, out + last_full);
109
4.35k
  }
110
5.16k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.16k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.16k
                               uint32_t* JXL_RESTRICT out) {
46
5.16k
  const HWY_FULL(uint32_t) du;
47
5.16k
  const HWY_FULL(float) df;
48
5.16k
  const auto kZero = Zero(du);
49
5.16k
  const auto kSplit = Set(du, 1 << E);
50
5.16k
  const auto kExpOffset = Set(du, 127);
51
5.16k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.16k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.16k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.16k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.16k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.16k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.16k
  constexpr size_t kLargeShiftVal = 10;
58
5.16k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.16k
  auto extra_bits = kZero;
61
5.16k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
345k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
339k
    const auto val = LoadU(du, values + i);
64
339k
    const auto is_large = Gt(val, kLargeThreshold);
65
339k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
339k
    const auto not_literal = Ge(val, kSplit);
67
339k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
339k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
339k
    const auto l = And(val, kMaskL);
70
339k
    const auto exp = ShiftRight<23>(b);
71
339k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
339k
    const auto n = Sub(exp_fixed, kExpOffset);
73
339k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
339k
    const auto m = ShiftRight<23 - M - L>(b);
75
339k
    const auto a = Add(kBase, Mul(n, kMulN));
76
339k
    const auto d = And(m, kMaskM);
77
339k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
339k
    const auto c = Or(a, l);
79
339k
    extra_bits = Add(extra_bits, eb_fixed);
80
339k
    const auto t = Or(c, d);
81
339k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
339k
    Store(t_fixed, du, out + i);
83
339k
  }
84
5.16k
  if (last_full < len) {
85
4.35k
    const auto stop = Set(du, len);
86
4.35k
    const auto fence = Iota(du, last_full);
87
4.35k
    const auto take = Lt(fence, stop);
88
4.35k
    const auto val = LoadU(du, values + last_full);
89
4.35k
    const auto is_large = Gt(val, kLargeThreshold);
90
4.35k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
4.35k
    const auto not_literal = Ge(val, kSplit);
92
4.35k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
4.35k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
4.35k
    const auto l = And(val, kMaskL);
95
4.35k
    const auto exp = ShiftRight<23>(b);
96
4.35k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
4.35k
    const auto n = Sub(exp_fixed, kExpOffset);
98
4.35k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
4.35k
    const auto m = ShiftRight<23 - M - L>(b);
100
4.35k
    const auto a = Add(kBase, Mul(n, kMulN));
101
4.35k
    const auto d = And(m, kMaskM);
102
4.35k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
4.35k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
4.35k
    const auto c = Or(a, l);
105
4.35k
    extra_bits = Add(extra_bits, eb_masked);
106
4.35k
    const auto t = Or(c, d);
107
4.35k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
4.35k
    Store(t_fixed, du, out + last_full);
109
4.35k
  }
110
5.16k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.16k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.16k
                               uint32_t* JXL_RESTRICT out) {
46
5.16k
  const HWY_FULL(uint32_t) du;
47
5.16k
  const HWY_FULL(float) df;
48
5.16k
  const auto kZero = Zero(du);
49
5.16k
  const auto kSplit = Set(du, 1 << E);
50
5.16k
  const auto kExpOffset = Set(du, 127);
51
5.16k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.16k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.16k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.16k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.16k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.16k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.16k
  constexpr size_t kLargeShiftVal = 10;
58
5.16k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.16k
  auto extra_bits = kZero;
61
5.16k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
345k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
339k
    const auto val = LoadU(du, values + i);
64
339k
    const auto is_large = Gt(val, kLargeThreshold);
65
339k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
339k
    const auto not_literal = Ge(val, kSplit);
67
339k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
339k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
339k
    const auto l = And(val, kMaskL);
70
339k
    const auto exp = ShiftRight<23>(b);
71
339k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
339k
    const auto n = Sub(exp_fixed, kExpOffset);
73
339k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
339k
    const auto m = ShiftRight<23 - M - L>(b);
75
339k
    const auto a = Add(kBase, Mul(n, kMulN));
76
339k
    const auto d = And(m, kMaskM);
77
339k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
339k
    const auto c = Or(a, l);
79
339k
    extra_bits = Add(extra_bits, eb_fixed);
80
339k
    const auto t = Or(c, d);
81
339k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
339k
    Store(t_fixed, du, out + i);
83
339k
  }
84
5.16k
  if (last_full < len) {
85
4.35k
    const auto stop = Set(du, len);
86
4.35k
    const auto fence = Iota(du, last_full);
87
4.35k
    const auto take = Lt(fence, stop);
88
4.35k
    const auto val = LoadU(du, values + last_full);
89
4.35k
    const auto is_large = Gt(val, kLargeThreshold);
90
4.35k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
4.35k
    const auto not_literal = Ge(val, kSplit);
92
4.35k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
4.35k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
4.35k
    const auto l = And(val, kMaskL);
95
4.35k
    const auto exp = ShiftRight<23>(b);
96
4.35k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
4.35k
    const auto n = Sub(exp_fixed, kExpOffset);
98
4.35k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
4.35k
    const auto m = ShiftRight<23 - M - L>(b);
100
4.35k
    const auto a = Add(kBase, Mul(n, kMulN));
101
4.35k
    const auto d = And(m, kMaskM);
102
4.35k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
4.35k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
4.35k
    const auto c = Or(a, l);
105
4.35k
    extra_bits = Add(extra_bits, eb_masked);
106
4.35k
    const auto t = Or(c, d);
107
4.35k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
4.35k
    Store(t_fixed, du, out + last_full);
109
4.35k
  }
110
5.16k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.16k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
45
5.16k
                               uint32_t* JXL_RESTRICT out) {
46
5.16k
  const HWY_FULL(uint32_t) du;
47
5.16k
  const HWY_FULL(float) df;
48
5.16k
  const auto kZero = Zero(du);
49
5.16k
  const auto kSplit = Set(du, 1 << E);
50
5.16k
  const auto kExpOffset = Set(du, 127);
51
5.16k
  const auto kEBOffset = Set(du, 127 + M + L);
52
5.16k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
53
5.16k
  const auto kMulN = Set(du, 1 << (M + L));
54
5.16k
  const auto kMaskL = Set(du, (1 << L) - 1);
55
5.16k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
56
5.16k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
57
5.16k
  constexpr size_t kLargeShiftVal = 10;
58
5.16k
  const auto kLargeShift = Set(du, kLargeShiftVal);
59
60
5.16k
  auto extra_bits = kZero;
61
5.16k
  size_t last_full = Lanes(du) * (len / Lanes(du));
62
345k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
63
339k
    const auto val = LoadU(du, values + i);
64
339k
    const auto is_large = Gt(val, kLargeThreshold);
65
339k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
66
339k
    const auto not_literal = Ge(val, kSplit);
67
339k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
68
339k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
69
339k
    const auto l = And(val, kMaskL);
70
339k
    const auto exp = ShiftRight<23>(b);
71
339k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
72
339k
    const auto n = Sub(exp_fixed, kExpOffset);
73
339k
    const auto eb = Sub(exp_fixed, kEBOffset);
74
339k
    const auto m = ShiftRight<23 - M - L>(b);
75
339k
    const auto a = Add(kBase, Mul(n, kMulN));
76
339k
    const auto d = And(m, kMaskM);
77
339k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
78
339k
    const auto c = Or(a, l);
79
339k
    extra_bits = Add(extra_bits, eb_fixed);
80
339k
    const auto t = Or(c, d);
81
339k
    const auto t_fixed = IfThenElse(not_literal, t, val);
82
339k
    Store(t_fixed, du, out + i);
83
339k
  }
84
5.16k
  if (last_full < len) {
85
4.35k
    const auto stop = Set(du, len);
86
4.35k
    const auto fence = Iota(du, last_full);
87
4.35k
    const auto take = Lt(fence, stop);
88
4.35k
    const auto val = LoadU(du, values + last_full);
89
4.35k
    const auto is_large = Gt(val, kLargeThreshold);
90
4.35k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
91
4.35k
    const auto not_literal = Ge(val, kSplit);
92
4.35k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
93
4.35k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
94
4.35k
    const auto l = And(val, kMaskL);
95
4.35k
    const auto exp = ShiftRight<23>(b);
96
4.35k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
97
4.35k
    const auto n = Sub(exp_fixed, kExpOffset);
98
4.35k
    const auto eb = Sub(exp_fixed, kEBOffset);
99
4.35k
    const auto m = ShiftRight<23 - M - L>(b);
100
4.35k
    const auto a = Add(kBase, Mul(n, kMulN));
101
4.35k
    const auto d = And(m, kMaskM);
102
4.35k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
103
4.35k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
104
4.35k
    const auto c = Or(a, l);
105
4.35k
    extra_bits = Add(extra_bits, eb_masked);
106
4.35k
    const auto t = Or(c, d);
107
4.35k
    const auto t_fixed = IfThenElse(not_literal, t, val);
108
4.35k
    Store(t_fixed, du, out + last_full);
109
4.35k
  }
110
5.16k
  return GetLane(SumOfLanes(du, extra_bits));
111
5.16k
}
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
112
113
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
114
476k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
115
476k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
116
#if HWY_TARGET == HWY_SCALAR
117
  uint32_t extra_bits = 0;
118
  for (size_t i = 0; i < len; ++i) {
119
    uint32_t v = values[i];
120
    uint32_t tok, nbits, bits;
121
    cfg.Encode(v, &tok, &nbits, &bits);
122
    extra_bits += nbits;
123
    out[i] = tok;
124
  }
125
  return extra_bits;
126
#else
127
476k
  if (cfg.split_exponent == 0) {
128
80.6k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
129
395k
  } else if (cfg.split_exponent == 2) {
130
80.0k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
131
80.0k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
132
315k
  } else if (cfg.split_exponent == 3) {
133
27.7k
    if (cfg.msb_in_token == 1) {
134
13.8k
      if (cfg.lsb_in_token == 0) {
135
6.93k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
136
6.93k
      } else {
137
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 2);
138
6.93k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
139
6.93k
      }
140
13.8k
    } else {
141
13.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
142
13.8k
      if (cfg.lsb_in_token == 0) {
143
6.93k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
144
6.93k
      } else {
145
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 1);
146
6.93k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
147
6.93k
      }
148
13.8k
    }
149
288k
  } else if (cfg.split_exponent == 4) {
150
187k
    if (cfg.msb_in_token == 1) {
151
93.9k
      if (cfg.lsb_in_token == 0) {
152
6.93k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
153
87.0k
      } else if (cfg.lsb_in_token == 2) {
154
80.0k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
155
80.0k
      } else {
156
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 3);
157
6.93k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
158
6.93k
      }
159
93.9k
    } else {
160
93.9k
      JXL_DASSERT(cfg.msb_in_token == 2);
161
93.9k
      if (cfg.lsb_in_token == 0) {
162
80.0k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
163
80.0k
      } else if (cfg.lsb_in_token == 1) {
164
6.93k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
165
6.93k
      } else {
166
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 2);
167
6.93k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
168
6.93k
      }
169
93.9k
    }
170
187k
  } else if (cfg.split_exponent == 5) {
171
48.5k
    if (cfg.msb_in_token == 1) {
172
20.8k
      if (cfg.lsb_in_token == 0) {
173
6.93k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
174
13.8k
      } else if (cfg.lsb_in_token == 2) {
175
6.93k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
176
6.93k
      } else {
177
6.92k
        JXL_DASSERT(cfg.lsb_in_token == 4);
178
6.92k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
179
6.92k
      }
180
27.7k
    } else {
181
27.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
182
27.7k
      if (cfg.lsb_in_token == 0) {
183
6.93k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
184
20.8k
      } else if (cfg.lsb_in_token == 1) {
185
6.93k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
186
13.8k
      } else if (cfg.lsb_in_token == 2) {
187
6.93k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
188
6.93k
      } else {
189
6.92k
        JXL_DASSERT(cfg.lsb_in_token == 3);
190
6.92k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
191
6.92k
      }
192
27.7k
    }
193
51.6k
  } else if (cfg.split_exponent == 6) {
194
18.9k
    if (cfg.msb_in_token == 0) {
195
6.93k
      JXL_DASSERT(cfg.lsb_in_token == 0);
196
6.93k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
197
11.9k
    } else if (cfg.msb_in_token == 1) {
198
5.98k
      JXL_DASSERT(cfg.lsb_in_token == 5);
199
5.98k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
200
5.98k
    } else {
201
5.98k
      JXL_DASSERT(cfg.msb_in_token == 2);
202
5.98k
      JXL_DASSERT(cfg.lsb_in_token == 4);
203
5.98k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
204
5.98k
    }
205
32.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
206
32.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
207
32.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
208
32.7k
    if (cfg.split_exponent == 7) {
209
6.93k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
210
25.8k
    } else if (cfg.split_exponent == 8) {
211
5.16k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
212
20.6k
    } else if (cfg.split_exponent == 9) {
213
5.16k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
214
15.4k
    } else if (cfg.split_exponent == 10) {
215
5.16k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
216
10.3k
    } else if (cfg.split_exponent == 11) {
217
5.16k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
218
5.16k
    } else {
219
5.16k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
220
5.16k
    }
221
32.7k
  } else {
222
0
    JXL_DASSERT(false);
223
0
  }
224
0
  return ~0;
225
476k
#endif
226
476k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
114
476k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
115
476k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
116
#if HWY_TARGET == HWY_SCALAR
117
  uint32_t extra_bits = 0;
118
  for (size_t i = 0; i < len; ++i) {
119
    uint32_t v = values[i];
120
    uint32_t tok, nbits, bits;
121
    cfg.Encode(v, &tok, &nbits, &bits);
122
    extra_bits += nbits;
123
    out[i] = tok;
124
  }
125
  return extra_bits;
126
#else
127
476k
  if (cfg.split_exponent == 0) {
128
80.6k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
129
395k
  } else if (cfg.split_exponent == 2) {
130
80.0k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
131
80.0k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
132
315k
  } else if (cfg.split_exponent == 3) {
133
27.7k
    if (cfg.msb_in_token == 1) {
134
13.8k
      if (cfg.lsb_in_token == 0) {
135
6.93k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
136
6.93k
      } else {
137
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 2);
138
6.93k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
139
6.93k
      }
140
13.8k
    } else {
141
13.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
142
13.8k
      if (cfg.lsb_in_token == 0) {
143
6.93k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
144
6.93k
      } else {
145
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 1);
146
6.93k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
147
6.93k
      }
148
13.8k
    }
149
288k
  } else if (cfg.split_exponent == 4) {
150
187k
    if (cfg.msb_in_token == 1) {
151
93.9k
      if (cfg.lsb_in_token == 0) {
152
6.93k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
153
87.0k
      } else if (cfg.lsb_in_token == 2) {
154
80.0k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
155
80.0k
      } else {
156
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 3);
157
6.93k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
158
6.93k
      }
159
93.9k
    } else {
160
93.9k
      JXL_DASSERT(cfg.msb_in_token == 2);
161
93.9k
      if (cfg.lsb_in_token == 0) {
162
80.0k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
163
80.0k
      } else if (cfg.lsb_in_token == 1) {
164
6.93k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
165
6.93k
      } else {
166
6.93k
        JXL_DASSERT(cfg.lsb_in_token == 2);
167
6.93k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
168
6.93k
      }
169
93.9k
    }
170
187k
  } else if (cfg.split_exponent == 5) {
171
48.5k
    if (cfg.msb_in_token == 1) {
172
20.8k
      if (cfg.lsb_in_token == 0) {
173
6.93k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
174
13.8k
      } else if (cfg.lsb_in_token == 2) {
175
6.93k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
176
6.93k
      } else {
177
6.92k
        JXL_DASSERT(cfg.lsb_in_token == 4);
178
6.92k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
179
6.92k
      }
180
27.7k
    } else {
181
27.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
182
27.7k
      if (cfg.lsb_in_token == 0) {
183
6.93k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
184
20.8k
      } else if (cfg.lsb_in_token == 1) {
185
6.93k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
186
13.8k
      } else if (cfg.lsb_in_token == 2) {
187
6.93k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
188
6.93k
      } else {
189
6.92k
        JXL_DASSERT(cfg.lsb_in_token == 3);
190
6.92k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
191
6.92k
      }
192
27.7k
    }
193
51.6k
  } else if (cfg.split_exponent == 6) {
194
18.9k
    if (cfg.msb_in_token == 0) {
195
6.93k
      JXL_DASSERT(cfg.lsb_in_token == 0);
196
6.93k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
197
11.9k
    } else if (cfg.msb_in_token == 1) {
198
5.98k
      JXL_DASSERT(cfg.lsb_in_token == 5);
199
5.98k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
200
5.98k
    } else {
201
5.98k
      JXL_DASSERT(cfg.msb_in_token == 2);
202
5.98k
      JXL_DASSERT(cfg.lsb_in_token == 4);
203
5.98k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
204
5.98k
    }
205
32.7k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
206
32.7k
    JXL_DASSERT(cfg.msb_in_token == 0);
207
32.7k
    JXL_DASSERT(cfg.lsb_in_token == 0);
208
32.7k
    if (cfg.split_exponent == 7) {
209
6.93k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
210
25.8k
    } else if (cfg.split_exponent == 8) {
211
5.16k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
212
20.6k
    } else if (cfg.split_exponent == 9) {
213
5.16k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
214
15.4k
    } else if (cfg.split_exponent == 10) {
215
5.16k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
216
10.3k
    } else if (cfg.split_exponent == 11) {
217
5.16k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
218
5.16k
    } else {
219
5.16k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
220
5.16k
    }
221
32.7k
  } else {
222
0
    JXL_DASSERT(false);
223
0
  }
224
0
  return ~0;
225
476k
#endif
226
476k
}
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
227
228
// NOLINTNEXTLINE(google-readability-namespace-comments)
229
}  // namespace HWY_NAMESPACE
230
}  // namespace jxl
231
HWY_AFTER_NAMESPACE();
232
233
#if HWY_ONCE
234
namespace jxl {
235
236
HWY_EXPORT(EstimateTokenCost);
237
238
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
239
476k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
240
476k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
241
476k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
242
476k
}
243
244
}  // namespace jxl
245
#endif