Coverage Report

Created: 2026-03-31 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::Gt;
29
using hwy::HWY_NAMESPACE::IfThenElse;
30
using hwy::HWY_NAMESPACE::IfThenElseZero;
31
using hwy::HWY_NAMESPACE::Iota;
32
using hwy::HWY_NAMESPACE::LoadU;
33
using hwy::HWY_NAMESPACE::Lt;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::Or;
36
using hwy::HWY_NAMESPACE::Set;
37
using hwy::HWY_NAMESPACE::ShiftRight;
38
using hwy::HWY_NAMESPACE::Store;
39
using hwy::HWY_NAMESPACE::Sub;
40
using hwy::HWY_NAMESPACE::Zero;
41
42
template <size_t E, size_t M, size_t L>
43
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
44
460k
                               uint32_t* JXL_RESTRICT out) {
45
460k
  const HWY_FULL(uint32_t) du;
46
460k
  const HWY_FULL(float) df;
47
460k
  const auto kZero = Zero(du);
48
460k
  const auto kSplit = Set(du, 1 << E);
49
460k
  const auto kExpOffset = Set(du, 127);
50
460k
  const auto kEBOffset = Set(du, 127 + M + L);
51
460k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
460k
  const auto kMulN = Set(du, 1 << (M + L));
53
460k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
460k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
460k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
460k
  constexpr size_t kLargeShiftVal = 10;
57
460k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
460k
  auto extra_bits = kZero;
60
460k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
61.1M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
60.7M
    const auto val = LoadU(du, values + i);
63
60.7M
    const auto is_large = Gt(val, kLargeThreshold);
64
60.7M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
60.7M
    const auto not_literal = Ge(val, kSplit);
66
60.7M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
60.7M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
60.7M
    const auto l = And(val, kMaskL);
69
60.7M
    const auto exp = ShiftRight<23>(b);
70
60.7M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
60.7M
    const auto n = Sub(exp_fixed, kExpOffset);
72
60.7M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
60.7M
    const auto m = ShiftRight<23 - M - L>(b);
74
60.7M
    const auto a = Add(kBase, Mul(n, kMulN));
75
60.7M
    const auto d = And(m, kMaskM);
76
60.7M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
60.7M
    const auto c = Or(a, l);
78
60.7M
    extra_bits = Add(extra_bits, eb_fixed);
79
60.7M
    const auto t = Or(c, d);
80
60.7M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
60.7M
    Store(t_fixed, du, out + i);
82
60.7M
  }
83
460k
  if (last_full < len) {
84
395k
    const auto stop = Set(du, len);
85
395k
    const auto fence = Iota(du, last_full);
86
395k
    const auto take = Lt(fence, stop);
87
395k
    const auto val = LoadU(du, values + last_full);
88
395k
    const auto is_large = Gt(val, kLargeThreshold);
89
395k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
395k
    const auto not_literal = Ge(val, kSplit);
91
395k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
395k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
395k
    const auto l = And(val, kMaskL);
94
395k
    const auto exp = ShiftRight<23>(b);
95
395k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
395k
    const auto n = Sub(exp_fixed, kExpOffset);
97
395k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
395k
    const auto m = ShiftRight<23 - M - L>(b);
99
395k
    const auto a = Add(kBase, Mul(n, kMulN));
100
395k
    const auto d = And(m, kMaskM);
101
395k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
395k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
395k
    const auto c = Or(a, l);
104
395k
    extra_bits = Add(extra_bits, eb_masked);
105
395k
    const auto t = Or(c, d);
106
395k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
395k
    Store(t_fixed, du, out + last_full);
108
395k
  }
109
460k
  return GetLane(SumOfLanes(du, extra_bits));
110
460k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
78.0k
                               uint32_t* JXL_RESTRICT out) {
45
78.0k
  const HWY_FULL(uint32_t) du;
46
78.0k
  const HWY_FULL(float) df;
47
78.0k
  const auto kZero = Zero(du);
48
78.0k
  const auto kSplit = Set(du, 1 << E);
49
78.0k
  const auto kExpOffset = Set(du, 127);
50
78.0k
  const auto kEBOffset = Set(du, 127 + M + L);
51
78.0k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
78.0k
  const auto kMulN = Set(du, 1 << (M + L));
53
78.0k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
78.0k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
78.0k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
78.0k
  constexpr size_t kLargeShiftVal = 10;
57
78.0k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
78.0k
  auto extra_bits = kZero;
60
78.0k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
11.5M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
11.4M
    const auto val = LoadU(du, values + i);
63
11.4M
    const auto is_large = Gt(val, kLargeThreshold);
64
11.4M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
11.4M
    const auto not_literal = Ge(val, kSplit);
66
11.4M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
11.4M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
11.4M
    const auto l = And(val, kMaskL);
69
11.4M
    const auto exp = ShiftRight<23>(b);
70
11.4M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
11.4M
    const auto n = Sub(exp_fixed, kExpOffset);
72
11.4M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
11.4M
    const auto m = ShiftRight<23 - M - L>(b);
74
11.4M
    const auto a = Add(kBase, Mul(n, kMulN));
75
11.4M
    const auto d = And(m, kMaskM);
76
11.4M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
11.4M
    const auto c = Or(a, l);
78
11.4M
    extra_bits = Add(extra_bits, eb_fixed);
79
11.4M
    const auto t = Or(c, d);
80
11.4M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
11.4M
    Store(t_fixed, du, out + i);
82
11.4M
  }
83
78.0k
  if (last_full < len) {
84
67.5k
    const auto stop = Set(du, len);
85
67.5k
    const auto fence = Iota(du, last_full);
86
67.5k
    const auto take = Lt(fence, stop);
87
67.5k
    const auto val = LoadU(du, values + last_full);
88
67.5k
    const auto is_large = Gt(val, kLargeThreshold);
89
67.5k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
67.5k
    const auto not_literal = Ge(val, kSplit);
91
67.5k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
67.5k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
67.5k
    const auto l = And(val, kMaskL);
94
67.5k
    const auto exp = ShiftRight<23>(b);
95
67.5k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
67.5k
    const auto n = Sub(exp_fixed, kExpOffset);
97
67.5k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
67.5k
    const auto m = ShiftRight<23 - M - L>(b);
99
67.5k
    const auto a = Add(kBase, Mul(n, kMulN));
100
67.5k
    const auto d = And(m, kMaskM);
101
67.5k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
67.5k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
67.5k
    const auto c = Or(a, l);
104
67.5k
    extra_bits = Add(extra_bits, eb_masked);
105
67.5k
    const auto t = Or(c, d);
106
67.5k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
67.5k
    Store(t_fixed, du, out + last_full);
108
67.5k
  }
109
78.0k
  return GetLane(SumOfLanes(du, extra_bits));
110
78.0k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
77.4k
                               uint32_t* JXL_RESTRICT out) {
45
77.4k
  const HWY_FULL(uint32_t) du;
46
77.4k
  const HWY_FULL(float) df;
47
77.4k
  const auto kZero = Zero(du);
48
77.4k
  const auto kSplit = Set(du, 1 << E);
49
77.4k
  const auto kExpOffset = Set(du, 127);
50
77.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
77.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
77.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
77.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
77.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
77.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
77.4k
  constexpr size_t kLargeShiftVal = 10;
57
77.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
77.4k
  auto extra_bits = kZero;
60
77.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
11.5M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
11.4M
    const auto val = LoadU(du, values + i);
63
11.4M
    const auto is_large = Gt(val, kLargeThreshold);
64
11.4M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
11.4M
    const auto not_literal = Ge(val, kSplit);
66
11.4M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
11.4M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
11.4M
    const auto l = And(val, kMaskL);
69
11.4M
    const auto exp = ShiftRight<23>(b);
70
11.4M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
11.4M
    const auto n = Sub(exp_fixed, kExpOffset);
72
11.4M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
11.4M
    const auto m = ShiftRight<23 - M - L>(b);
74
11.4M
    const auto a = Add(kBase, Mul(n, kMulN));
75
11.4M
    const auto d = And(m, kMaskM);
76
11.4M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
11.4M
    const auto c = Or(a, l);
78
11.4M
    extra_bits = Add(extra_bits, eb_fixed);
79
11.4M
    const auto t = Or(c, d);
80
11.4M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
11.4M
    Store(t_fixed, du, out + i);
82
11.4M
  }
83
77.4k
  if (last_full < len) {
84
67.0k
    const auto stop = Set(du, len);
85
67.0k
    const auto fence = Iota(du, last_full);
86
67.0k
    const auto take = Lt(fence, stop);
87
67.0k
    const auto val = LoadU(du, values + last_full);
88
67.0k
    const auto is_large = Gt(val, kLargeThreshold);
89
67.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
67.0k
    const auto not_literal = Ge(val, kSplit);
91
67.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
67.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
67.0k
    const auto l = And(val, kMaskL);
94
67.0k
    const auto exp = ShiftRight<23>(b);
95
67.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
67.0k
    const auto n = Sub(exp_fixed, kExpOffset);
97
67.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
67.0k
    const auto m = ShiftRight<23 - M - L>(b);
99
67.0k
    const auto a = Add(kBase, Mul(n, kMulN));
100
67.0k
    const auto d = And(m, kMaskM);
101
67.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
67.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
67.0k
    const auto c = Or(a, l);
104
67.0k
    extra_bits = Add(extra_bits, eb_masked);
105
67.0k
    const auto t = Or(c, d);
106
67.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
67.0k
    Store(t_fixed, du, out + last_full);
108
67.0k
  }
109
77.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
77.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
77.4k
                               uint32_t* JXL_RESTRICT out) {
45
77.4k
  const HWY_FULL(uint32_t) du;
46
77.4k
  const HWY_FULL(float) df;
47
77.4k
  const auto kZero = Zero(du);
48
77.4k
  const auto kSplit = Set(du, 1 << E);
49
77.4k
  const auto kExpOffset = Set(du, 127);
50
77.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
77.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
77.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
77.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
77.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
77.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
77.4k
  constexpr size_t kLargeShiftVal = 10;
57
77.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
77.4k
  auto extra_bits = kZero;
60
77.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
11.5M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
11.4M
    const auto val = LoadU(du, values + i);
63
11.4M
    const auto is_large = Gt(val, kLargeThreshold);
64
11.4M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
11.4M
    const auto not_literal = Ge(val, kSplit);
66
11.4M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
11.4M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
11.4M
    const auto l = And(val, kMaskL);
69
11.4M
    const auto exp = ShiftRight<23>(b);
70
11.4M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
11.4M
    const auto n = Sub(exp_fixed, kExpOffset);
72
11.4M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
11.4M
    const auto m = ShiftRight<23 - M - L>(b);
74
11.4M
    const auto a = Add(kBase, Mul(n, kMulN));
75
11.4M
    const auto d = And(m, kMaskM);
76
11.4M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
11.4M
    const auto c = Or(a, l);
78
11.4M
    extra_bits = Add(extra_bits, eb_fixed);
79
11.4M
    const auto t = Or(c, d);
80
11.4M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
11.4M
    Store(t_fixed, du, out + i);
82
11.4M
  }
83
77.4k
  if (last_full < len) {
84
67.0k
    const auto stop = Set(du, len);
85
67.0k
    const auto fence = Iota(du, last_full);
86
67.0k
    const auto take = Lt(fence, stop);
87
67.0k
    const auto val = LoadU(du, values + last_full);
88
67.0k
    const auto is_large = Gt(val, kLargeThreshold);
89
67.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
67.0k
    const auto not_literal = Ge(val, kSplit);
91
67.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
67.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
67.0k
    const auto l = And(val, kMaskL);
94
67.0k
    const auto exp = ShiftRight<23>(b);
95
67.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
67.0k
    const auto n = Sub(exp_fixed, kExpOffset);
97
67.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
67.0k
    const auto m = ShiftRight<23 - M - L>(b);
99
67.0k
    const auto a = Add(kBase, Mul(n, kMulN));
100
67.0k
    const auto d = And(m, kMaskM);
101
67.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
67.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
67.0k
    const auto c = Or(a, l);
104
67.0k
    extra_bits = Add(extra_bits, eb_masked);
105
67.0k
    const auto t = Or(c, d);
106
67.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
67.0k
    Store(t_fixed, du, out + last_full);
108
67.0k
  }
109
77.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
77.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
77.4k
                               uint32_t* JXL_RESTRICT out) {
45
77.4k
  const HWY_FULL(uint32_t) du;
46
77.4k
  const HWY_FULL(float) df;
47
77.4k
  const auto kZero = Zero(du);
48
77.4k
  const auto kSplit = Set(du, 1 << E);
49
77.4k
  const auto kExpOffset = Set(du, 127);
50
77.4k
  const auto kEBOffset = Set(du, 127 + M + L);
51
77.4k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
77.4k
  const auto kMulN = Set(du, 1 << (M + L));
53
77.4k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
77.4k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
77.4k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
77.4k
  constexpr size_t kLargeShiftVal = 10;
57
77.4k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
77.4k
  auto extra_bits = kZero;
60
77.4k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
11.5M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
11.4M
    const auto val = LoadU(du, values + i);
63
11.4M
    const auto is_large = Gt(val, kLargeThreshold);
64
11.4M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
11.4M
    const auto not_literal = Ge(val, kSplit);
66
11.4M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
11.4M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
11.4M
    const auto l = And(val, kMaskL);
69
11.4M
    const auto exp = ShiftRight<23>(b);
70
11.4M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
11.4M
    const auto n = Sub(exp_fixed, kExpOffset);
72
11.4M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
11.4M
    const auto m = ShiftRight<23 - M - L>(b);
74
11.4M
    const auto a = Add(kBase, Mul(n, kMulN));
75
11.4M
    const auto d = And(m, kMaskM);
76
11.4M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
11.4M
    const auto c = Or(a, l);
78
11.4M
    extra_bits = Add(extra_bits, eb_fixed);
79
11.4M
    const auto t = Or(c, d);
80
11.4M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
11.4M
    Store(t_fixed, du, out + i);
82
11.4M
  }
83
77.4k
  if (last_full < len) {
84
67.0k
    const auto stop = Set(du, len);
85
67.0k
    const auto fence = Iota(du, last_full);
86
67.0k
    const auto take = Lt(fence, stop);
87
67.0k
    const auto val = LoadU(du, values + last_full);
88
67.0k
    const auto is_large = Gt(val, kLargeThreshold);
89
67.0k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
67.0k
    const auto not_literal = Ge(val, kSplit);
91
67.0k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
67.0k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
67.0k
    const auto l = And(val, kMaskL);
94
67.0k
    const auto exp = ShiftRight<23>(b);
95
67.0k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
67.0k
    const auto n = Sub(exp_fixed, kExpOffset);
97
67.0k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
67.0k
    const auto m = ShiftRight<23 - M - L>(b);
99
67.0k
    const auto a = Add(kBase, Mul(n, kMulN));
100
67.0k
    const auto d = And(m, kMaskM);
101
67.0k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
67.0k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
67.0k
    const auto c = Or(a, l);
104
67.0k
    extra_bits = Add(extra_bits, eb_masked);
105
67.0k
    const auto t = Or(c, d);
106
67.0k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
67.0k
    Store(t_fixed, du, out + last_full);
108
67.0k
  }
109
77.4k
  return GetLane(SumOfLanes(du, extra_bits));
110
77.4k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.67k
                               uint32_t* JXL_RESTRICT out) {
45
6.67k
  const HWY_FULL(uint32_t) du;
46
6.67k
  const HWY_FULL(float) df;
47
6.67k
  const auto kZero = Zero(du);
48
6.67k
  const auto kSplit = Set(du, 1 << E);
49
6.67k
  const auto kExpOffset = Set(du, 127);
50
6.67k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.67k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.67k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.67k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.67k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.67k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.67k
  constexpr size_t kLargeShiftVal = 10;
57
6.67k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.67k
  auto extra_bits = kZero;
60
6.67k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.67k
  if (last_full < len) {
84
5.64k
    const auto stop = Set(du, len);
85
5.64k
    const auto fence = Iota(du, last_full);
86
5.64k
    const auto take = Lt(fence, stop);
87
5.64k
    const auto val = LoadU(du, values + last_full);
88
5.64k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.64k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.64k
    const auto not_literal = Ge(val, kSplit);
91
5.64k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.64k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.64k
    const auto l = And(val, kMaskL);
94
5.64k
    const auto exp = ShiftRight<23>(b);
95
5.64k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.64k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.64k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.64k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.64k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.64k
    const auto d = And(m, kMaskM);
101
5.64k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.64k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.64k
    const auto c = Or(a, l);
104
5.64k
    extra_bits = Add(extra_bits, eb_masked);
105
5.64k
    const auto t = Or(c, d);
106
5.64k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.64k
    Store(t_fixed, du, out + last_full);
108
5.64k
  }
109
6.67k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.67k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.67k
                               uint32_t* JXL_RESTRICT out) {
45
6.67k
  const HWY_FULL(uint32_t) du;
46
6.67k
  const HWY_FULL(float) df;
47
6.67k
  const auto kZero = Zero(du);
48
6.67k
  const auto kSplit = Set(du, 1 << E);
49
6.67k
  const auto kExpOffset = Set(du, 127);
50
6.67k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.67k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.67k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.67k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.67k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.67k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.67k
  constexpr size_t kLargeShiftVal = 10;
57
6.67k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.67k
  auto extra_bits = kZero;
60
6.67k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.67k
  if (last_full < len) {
84
5.64k
    const auto stop = Set(du, len);
85
5.64k
    const auto fence = Iota(du, last_full);
86
5.64k
    const auto take = Lt(fence, stop);
87
5.64k
    const auto val = LoadU(du, values + last_full);
88
5.64k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.64k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.64k
    const auto not_literal = Ge(val, kSplit);
91
5.64k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.64k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.64k
    const auto l = And(val, kMaskL);
94
5.64k
    const auto exp = ShiftRight<23>(b);
95
5.64k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.64k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.64k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.64k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.64k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.64k
    const auto d = And(m, kMaskM);
101
5.64k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.64k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.64k
    const auto c = Or(a, l);
104
5.64k
    extra_bits = Add(extra_bits, eb_masked);
105
5.64k
    const auto t = Or(c, d);
106
5.64k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.64k
    Store(t_fixed, du, out + last_full);
108
5.64k
  }
109
6.67k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.67k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
5.77k
                               uint32_t* JXL_RESTRICT out) {
45
5.77k
  const HWY_FULL(uint32_t) du;
46
5.77k
  const HWY_FULL(float) df;
47
5.77k
  const auto kZero = Zero(du);
48
5.77k
  const auto kSplit = Set(du, 1 << E);
49
5.77k
  const auto kExpOffset = Set(du, 127);
50
5.77k
  const auto kEBOffset = Set(du, 127 + M + L);
51
5.77k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
5.77k
  const auto kMulN = Set(du, 1 << (M + L));
53
5.77k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
5.77k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
5.77k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
5.77k
  constexpr size_t kLargeShiftVal = 10;
57
5.77k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
5.77k
  auto extra_bits = kZero;
60
5.77k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
374k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
369k
    const auto val = LoadU(du, values + i);
63
369k
    const auto is_large = Gt(val, kLargeThreshold);
64
369k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
369k
    const auto not_literal = Ge(val, kSplit);
66
369k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
369k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
369k
    const auto l = And(val, kMaskL);
69
369k
    const auto exp = ShiftRight<23>(b);
70
369k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
369k
    const auto n = Sub(exp_fixed, kExpOffset);
72
369k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
369k
    const auto m = ShiftRight<23 - M - L>(b);
74
369k
    const auto a = Add(kBase, Mul(n, kMulN));
75
369k
    const auto d = And(m, kMaskM);
76
369k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
369k
    const auto c = Or(a, l);
78
369k
    extra_bits = Add(extra_bits, eb_fixed);
79
369k
    const auto t = Or(c, d);
80
369k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
369k
    Store(t_fixed, du, out + i);
82
369k
  }
83
5.77k
  if (last_full < len) {
84
4.91k
    const auto stop = Set(du, len);
85
4.91k
    const auto fence = Iota(du, last_full);
86
4.91k
    const auto take = Lt(fence, stop);
87
4.91k
    const auto val = LoadU(du, values + last_full);
88
4.91k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.91k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.91k
    const auto not_literal = Ge(val, kSplit);
91
4.91k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.91k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.91k
    const auto l = And(val, kMaskL);
94
4.91k
    const auto exp = ShiftRight<23>(b);
95
4.91k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.91k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.91k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.91k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.91k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.91k
    const auto d = And(m, kMaskM);
101
4.91k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.91k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.91k
    const auto c = Or(a, l);
104
4.91k
    extra_bits = Add(extra_bits, eb_masked);
105
4.91k
    const auto t = Or(c, d);
106
4.91k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.91k
    Store(t_fixed, du, out + last_full);
108
4.91k
  }
109
5.77k
  return GetLane(SumOfLanes(du, extra_bits));
110
5.77k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
5.77k
                               uint32_t* JXL_RESTRICT out) {
45
5.77k
  const HWY_FULL(uint32_t) du;
46
5.77k
  const HWY_FULL(float) df;
47
5.77k
  const auto kZero = Zero(du);
48
5.77k
  const auto kSplit = Set(du, 1 << E);
49
5.77k
  const auto kExpOffset = Set(du, 127);
50
5.77k
  const auto kEBOffset = Set(du, 127 + M + L);
51
5.77k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
5.77k
  const auto kMulN = Set(du, 1 << (M + L));
53
5.77k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
5.77k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
5.77k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
5.77k
  constexpr size_t kLargeShiftVal = 10;
57
5.77k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
5.77k
  auto extra_bits = kZero;
60
5.77k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
374k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
369k
    const auto val = LoadU(du, values + i);
63
369k
    const auto is_large = Gt(val, kLargeThreshold);
64
369k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
369k
    const auto not_literal = Ge(val, kSplit);
66
369k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
369k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
369k
    const auto l = And(val, kMaskL);
69
369k
    const auto exp = ShiftRight<23>(b);
70
369k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
369k
    const auto n = Sub(exp_fixed, kExpOffset);
72
369k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
369k
    const auto m = ShiftRight<23 - M - L>(b);
74
369k
    const auto a = Add(kBase, Mul(n, kMulN));
75
369k
    const auto d = And(m, kMaskM);
76
369k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
369k
    const auto c = Or(a, l);
78
369k
    extra_bits = Add(extra_bits, eb_fixed);
79
369k
    const auto t = Or(c, d);
80
369k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
369k
    Store(t_fixed, du, out + i);
82
369k
  }
83
5.77k
  if (last_full < len) {
84
4.91k
    const auto stop = Set(du, len);
85
4.91k
    const auto fence = Iota(du, last_full);
86
4.91k
    const auto take = Lt(fence, stop);
87
4.91k
    const auto val = LoadU(du, values + last_full);
88
4.91k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.91k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.91k
    const auto not_literal = Ge(val, kSplit);
91
4.91k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.91k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.91k
    const auto l = And(val, kMaskL);
94
4.91k
    const auto exp = ShiftRight<23>(b);
95
4.91k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.91k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.91k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.91k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.91k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.91k
    const auto d = And(m, kMaskM);
101
4.91k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.91k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.91k
    const auto c = Or(a, l);
104
4.91k
    extra_bits = Add(extra_bits, eb_masked);
105
4.91k
    const auto t = Or(c, d);
106
4.91k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.91k
    Store(t_fixed, du, out + last_full);
108
4.91k
  }
109
5.77k
  return GetLane(SumOfLanes(du, extra_bits));
110
5.77k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
6.68k
                               uint32_t* JXL_RESTRICT out) {
45
6.68k
  const HWY_FULL(uint32_t) du;
46
6.68k
  const HWY_FULL(float) df;
47
6.68k
  const auto kZero = Zero(du);
48
6.68k
  const auto kSplit = Set(du, 1 << E);
49
6.68k
  const auto kExpOffset = Set(du, 127);
50
6.68k
  const auto kEBOffset = Set(du, 127 + M + L);
51
6.68k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
6.68k
  const auto kMulN = Set(du, 1 << (M + L));
53
6.68k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
6.68k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
6.68k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
6.68k
  constexpr size_t kLargeShiftVal = 10;
57
6.68k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
6.68k
  auto extra_bits = kZero;
60
6.68k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
741k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
734k
    const auto val = LoadU(du, values + i);
63
734k
    const auto is_large = Gt(val, kLargeThreshold);
64
734k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
734k
    const auto not_literal = Ge(val, kSplit);
66
734k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
734k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
734k
    const auto l = And(val, kMaskL);
69
734k
    const auto exp = ShiftRight<23>(b);
70
734k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
734k
    const auto n = Sub(exp_fixed, kExpOffset);
72
734k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
734k
    const auto m = ShiftRight<23 - M - L>(b);
74
734k
    const auto a = Add(kBase, Mul(n, kMulN));
75
734k
    const auto d = And(m, kMaskM);
76
734k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
734k
    const auto c = Or(a, l);
78
734k
    extra_bits = Add(extra_bits, eb_fixed);
79
734k
    const auto t = Or(c, d);
80
734k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
734k
    Store(t_fixed, du, out + i);
82
734k
  }
83
6.68k
  if (last_full < len) {
84
5.65k
    const auto stop = Set(du, len);
85
5.65k
    const auto fence = Iota(du, last_full);
86
5.65k
    const auto take = Lt(fence, stop);
87
5.65k
    const auto val = LoadU(du, values + last_full);
88
5.65k
    const auto is_large = Gt(val, kLargeThreshold);
89
5.65k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
5.65k
    const auto not_literal = Ge(val, kSplit);
91
5.65k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
5.65k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
5.65k
    const auto l = And(val, kMaskL);
94
5.65k
    const auto exp = ShiftRight<23>(b);
95
5.65k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
5.65k
    const auto n = Sub(exp_fixed, kExpOffset);
97
5.65k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
5.65k
    const auto m = ShiftRight<23 - M - L>(b);
99
5.65k
    const auto a = Add(kBase, Mul(n, kMulN));
100
5.65k
    const auto d = And(m, kMaskM);
101
5.65k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
5.65k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
5.65k
    const auto c = Or(a, l);
104
5.65k
    extra_bits = Add(extra_bits, eb_masked);
105
5.65k
    const auto t = Or(c, d);
106
5.65k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
5.65k
    Store(t_fixed, du, out + last_full);
108
5.65k
  }
109
6.68k
  return GetLane(SumOfLanes(du, extra_bits));
110
6.68k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.96k
                               uint32_t* JXL_RESTRICT out) {
45
4.96k
  const HWY_FULL(uint32_t) du;
46
4.96k
  const HWY_FULL(float) df;
47
4.96k
  const auto kZero = Zero(du);
48
4.96k
  const auto kSplit = Set(du, 1 << E);
49
4.96k
  const auto kExpOffset = Set(du, 127);
50
4.96k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.96k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.96k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.96k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.96k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.96k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.96k
  constexpr size_t kLargeShiftVal = 10;
57
4.96k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.96k
  auto extra_bits = kZero;
60
4.96k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
342k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
337k
    const auto val = LoadU(du, values + i);
63
337k
    const auto is_large = Gt(val, kLargeThreshold);
64
337k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
337k
    const auto not_literal = Ge(val, kSplit);
66
337k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
337k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
337k
    const auto l = And(val, kMaskL);
69
337k
    const auto exp = ShiftRight<23>(b);
70
337k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
337k
    const auto n = Sub(exp_fixed, kExpOffset);
72
337k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
337k
    const auto m = ShiftRight<23 - M - L>(b);
74
337k
    const auto a = Add(kBase, Mul(n, kMulN));
75
337k
    const auto d = And(m, kMaskM);
76
337k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
337k
    const auto c = Or(a, l);
78
337k
    extra_bits = Add(extra_bits, eb_fixed);
79
337k
    const auto t = Or(c, d);
80
337k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
337k
    Store(t_fixed, du, out + i);
82
337k
  }
83
4.96k
  if (last_full < len) {
84
4.17k
    const auto stop = Set(du, len);
85
4.17k
    const auto fence = Iota(du, last_full);
86
4.17k
    const auto take = Lt(fence, stop);
87
4.17k
    const auto val = LoadU(du, values + last_full);
88
4.17k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.17k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.17k
    const auto not_literal = Ge(val, kSplit);
91
4.17k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.17k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.17k
    const auto l = And(val, kMaskL);
94
4.17k
    const auto exp = ShiftRight<23>(b);
95
4.17k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.17k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.17k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.17k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.17k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.17k
    const auto d = And(m, kMaskM);
101
4.17k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.17k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.17k
    const auto c = Or(a, l);
104
4.17k
    extra_bits = Add(extra_bits, eb_masked);
105
4.17k
    const auto t = Or(c, d);
106
4.17k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.17k
    Store(t_fixed, du, out + last_full);
108
4.17k
  }
109
4.96k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.96k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.96k
                               uint32_t* JXL_RESTRICT out) {
45
4.96k
  const HWY_FULL(uint32_t) du;
46
4.96k
  const HWY_FULL(float) df;
47
4.96k
  const auto kZero = Zero(du);
48
4.96k
  const auto kSplit = Set(du, 1 << E);
49
4.96k
  const auto kExpOffset = Set(du, 127);
50
4.96k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.96k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.96k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.96k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.96k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.96k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.96k
  constexpr size_t kLargeShiftVal = 10;
57
4.96k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.96k
  auto extra_bits = kZero;
60
4.96k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
342k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
337k
    const auto val = LoadU(du, values + i);
63
337k
    const auto is_large = Gt(val, kLargeThreshold);
64
337k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
337k
    const auto not_literal = Ge(val, kSplit);
66
337k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
337k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
337k
    const auto l = And(val, kMaskL);
69
337k
    const auto exp = ShiftRight<23>(b);
70
337k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
337k
    const auto n = Sub(exp_fixed, kExpOffset);
72
337k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
337k
    const auto m = ShiftRight<23 - M - L>(b);
74
337k
    const auto a = Add(kBase, Mul(n, kMulN));
75
337k
    const auto d = And(m, kMaskM);
76
337k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
337k
    const auto c = Or(a, l);
78
337k
    extra_bits = Add(extra_bits, eb_fixed);
79
337k
    const auto t = Or(c, d);
80
337k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
337k
    Store(t_fixed, du, out + i);
82
337k
  }
83
4.96k
  if (last_full < len) {
84
4.17k
    const auto stop = Set(du, len);
85
4.17k
    const auto fence = Iota(du, last_full);
86
4.17k
    const auto take = Lt(fence, stop);
87
4.17k
    const auto val = LoadU(du, values + last_full);
88
4.17k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.17k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.17k
    const auto not_literal = Ge(val, kSplit);
91
4.17k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.17k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.17k
    const auto l = And(val, kMaskL);
94
4.17k
    const auto exp = ShiftRight<23>(b);
95
4.17k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.17k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.17k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.17k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.17k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.17k
    const auto d = And(m, kMaskM);
101
4.17k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.17k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.17k
    const auto c = Or(a, l);
104
4.17k
    extra_bits = Add(extra_bits, eb_masked);
105
4.17k
    const auto t = Or(c, d);
106
4.17k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.17k
    Store(t_fixed, du, out + last_full);
108
4.17k
  }
109
4.96k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.96k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.96k
                               uint32_t* JXL_RESTRICT out) {
45
4.96k
  const HWY_FULL(uint32_t) du;
46
4.96k
  const HWY_FULL(float) df;
47
4.96k
  const auto kZero = Zero(du);
48
4.96k
  const auto kSplit = Set(du, 1 << E);
49
4.96k
  const auto kExpOffset = Set(du, 127);
50
4.96k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.96k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.96k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.96k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.96k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.96k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.96k
  constexpr size_t kLargeShiftVal = 10;
57
4.96k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.96k
  auto extra_bits = kZero;
60
4.96k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
342k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
337k
    const auto val = LoadU(du, values + i);
63
337k
    const auto is_large = Gt(val, kLargeThreshold);
64
337k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
337k
    const auto not_literal = Ge(val, kSplit);
66
337k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
337k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
337k
    const auto l = And(val, kMaskL);
69
337k
    const auto exp = ShiftRight<23>(b);
70
337k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
337k
    const auto n = Sub(exp_fixed, kExpOffset);
72
337k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
337k
    const auto m = ShiftRight<23 - M - L>(b);
74
337k
    const auto a = Add(kBase, Mul(n, kMulN));
75
337k
    const auto d = And(m, kMaskM);
76
337k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
337k
    const auto c = Or(a, l);
78
337k
    extra_bits = Add(extra_bits, eb_fixed);
79
337k
    const auto t = Or(c, d);
80
337k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
337k
    Store(t_fixed, du, out + i);
82
337k
  }
83
4.96k
  if (last_full < len) {
84
4.17k
    const auto stop = Set(du, len);
85
4.17k
    const auto fence = Iota(du, last_full);
86
4.17k
    const auto take = Lt(fence, stop);
87
4.17k
    const auto val = LoadU(du, values + last_full);
88
4.17k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.17k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.17k
    const auto not_literal = Ge(val, kSplit);
91
4.17k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.17k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.17k
    const auto l = And(val, kMaskL);
94
4.17k
    const auto exp = ShiftRight<23>(b);
95
4.17k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.17k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.17k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.17k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.17k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.17k
    const auto d = And(m, kMaskM);
101
4.17k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.17k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.17k
    const auto c = Or(a, l);
104
4.17k
    extra_bits = Add(extra_bits, eb_masked);
105
4.17k
    const auto t = Or(c, d);
106
4.17k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.17k
    Store(t_fixed, du, out + last_full);
108
4.17k
  }
109
4.96k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.96k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.96k
                               uint32_t* JXL_RESTRICT out) {
45
4.96k
  const HWY_FULL(uint32_t) du;
46
4.96k
  const HWY_FULL(float) df;
47
4.96k
  const auto kZero = Zero(du);
48
4.96k
  const auto kSplit = Set(du, 1 << E);
49
4.96k
  const auto kExpOffset = Set(du, 127);
50
4.96k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.96k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.96k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.96k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.96k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.96k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.96k
  constexpr size_t kLargeShiftVal = 10;
57
4.96k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.96k
  auto extra_bits = kZero;
60
4.96k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
342k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
337k
    const auto val = LoadU(du, values + i);
63
337k
    const auto is_large = Gt(val, kLargeThreshold);
64
337k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
337k
    const auto not_literal = Ge(val, kSplit);
66
337k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
337k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
337k
    const auto l = And(val, kMaskL);
69
337k
    const auto exp = ShiftRight<23>(b);
70
337k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
337k
    const auto n = Sub(exp_fixed, kExpOffset);
72
337k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
337k
    const auto m = ShiftRight<23 - M - L>(b);
74
337k
    const auto a = Add(kBase, Mul(n, kMulN));
75
337k
    const auto d = And(m, kMaskM);
76
337k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
337k
    const auto c = Or(a, l);
78
337k
    extra_bits = Add(extra_bits, eb_fixed);
79
337k
    const auto t = Or(c, d);
80
337k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
337k
    Store(t_fixed, du, out + i);
82
337k
  }
83
4.96k
  if (last_full < len) {
84
4.17k
    const auto stop = Set(du, len);
85
4.17k
    const auto fence = Iota(du, last_full);
86
4.17k
    const auto take = Lt(fence, stop);
87
4.17k
    const auto val = LoadU(du, values + last_full);
88
4.17k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.17k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.17k
    const auto not_literal = Ge(val, kSplit);
91
4.17k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.17k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.17k
    const auto l = And(val, kMaskL);
94
4.17k
    const auto exp = ShiftRight<23>(b);
95
4.17k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.17k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.17k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.17k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.17k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.17k
    const auto d = And(m, kMaskM);
101
4.17k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.17k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.17k
    const auto c = Or(a, l);
104
4.17k
    extra_bits = Add(extra_bits, eb_masked);
105
4.17k
    const auto t = Or(c, d);
106
4.17k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.17k
    Store(t_fixed, du, out + last_full);
108
4.17k
  }
109
4.96k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.96k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.96k
                               uint32_t* JXL_RESTRICT out) {
45
4.96k
  const HWY_FULL(uint32_t) du;
46
4.96k
  const HWY_FULL(float) df;
47
4.96k
  const auto kZero = Zero(du);
48
4.96k
  const auto kSplit = Set(du, 1 << E);
49
4.96k
  const auto kExpOffset = Set(du, 127);
50
4.96k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.96k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.96k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.96k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.96k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.96k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.96k
  constexpr size_t kLargeShiftVal = 10;
57
4.96k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.96k
  auto extra_bits = kZero;
60
4.96k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
342k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
337k
    const auto val = LoadU(du, values + i);
63
337k
    const auto is_large = Gt(val, kLargeThreshold);
64
337k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
337k
    const auto not_literal = Ge(val, kSplit);
66
337k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
337k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
337k
    const auto l = And(val, kMaskL);
69
337k
    const auto exp = ShiftRight<23>(b);
70
337k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
337k
    const auto n = Sub(exp_fixed, kExpOffset);
72
337k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
337k
    const auto m = ShiftRight<23 - M - L>(b);
74
337k
    const auto a = Add(kBase, Mul(n, kMulN));
75
337k
    const auto d = And(m, kMaskM);
76
337k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
337k
    const auto c = Or(a, l);
78
337k
    extra_bits = Add(extra_bits, eb_fixed);
79
337k
    const auto t = Or(c, d);
80
337k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
337k
    Store(t_fixed, du, out + i);
82
337k
  }
83
4.96k
  if (last_full < len) {
84
4.17k
    const auto stop = Set(du, len);
85
4.17k
    const auto fence = Iota(du, last_full);
86
4.17k
    const auto take = Lt(fence, stop);
87
4.17k
    const auto val = LoadU(du, values + last_full);
88
4.17k
    const auto is_large = Gt(val, kLargeThreshold);
89
4.17k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
4.17k
    const auto not_literal = Ge(val, kSplit);
91
4.17k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
4.17k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
4.17k
    const auto l = And(val, kMaskL);
94
4.17k
    const auto exp = ShiftRight<23>(b);
95
4.17k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
4.17k
    const auto n = Sub(exp_fixed, kExpOffset);
97
4.17k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
4.17k
    const auto m = ShiftRight<23 - M - L>(b);
99
4.17k
    const auto a = Add(kBase, Mul(n, kMulN));
100
4.17k
    const auto d = And(m, kMaskM);
101
4.17k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
4.17k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
4.17k
    const auto c = Or(a, l);
104
4.17k
    extra_bits = Add(extra_bits, eb_masked);
105
4.17k
    const auto t = Or(c, d);
106
4.17k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
4.17k
    Store(t_fixed, du, out + last_full);
108
4.17k
  }
109
4.96k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.96k
}
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
111
112
// Per-target token cost estimation.
//
// Converts each of the `len` entries of `values` into a token, writes the
// tokens to the uint32_t buffer backing `tokens`, and returns the summed
// number of extra bits over all entries.
//
// `cfg` selects the hybrid-uint layout. In the SIMD build only the
// (split_exponent, msb_in_token, lsb_in_token) combinations listed below have
// a dedicated EstimateTokenCostImpl<E, M, L> instantiation; other
// combinations are checked only through JXL_DASSERT.
//
// NOTE(review): the tail handling in EstimateTokenCostImpl loads and stores a
// whole vector starting at `last_full`, so `values` and the token buffer
// presumably need room past `len` -- confirm with callers.
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
113
460k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
460k
  // Token output buffer, viewed as uint32_t.
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  // Scalar target: encode element by element with cfg.Encode, storing the
  // token and accumulating nbits; the `bits` output is not used.
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
460k
  // SIMD targets: dispatch on the runtime configuration to the matching
  // compile-time instantiation. Template arguments are
  // <split_exponent, msb_in_token, lsb_in_token>.
  if (cfg.split_exponent == 0) {
127
78.0k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
382k
  } else if (cfg.split_exponent == 2) {
129
77.4k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
77.4k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
304k
  } else if (cfg.split_exponent == 3) {
132
26.7k
    if (cfg.msb_in_token == 1) {
133
13.3k
      if (cfg.lsb_in_token == 0) {
134
6.68k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
6.68k
      } else {
136
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
6.68k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
6.68k
      }
139
13.3k
    } else {
140
13.3k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
13.3k
      if (cfg.lsb_in_token == 0) {
142
6.68k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
6.68k
      } else {
144
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
6.68k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
6.68k
      }
147
13.3k
    }
148
278k
  } else if (cfg.split_exponent == 4) {
149
181k
    if (cfg.msb_in_token == 1) {
150
90.8k
      if (cfg.lsb_in_token == 0) {
151
6.68k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
84.1k
      } else if (cfg.lsb_in_token == 2) {
153
77.4k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
77.4k
      } else {
155
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
6.68k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
6.68k
      }
158
90.8k
    } else {
159
90.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
90.8k
      if (cfg.lsb_in_token == 0) {
161
77.4k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
77.4k
      } else if (cfg.lsb_in_token == 1) {
163
6.68k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
6.68k
      } else {
165
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
6.68k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
6.68k
      }
168
90.8k
    }
169
181k
  } else if (cfg.split_exponent == 5) {
170
46.7k
    if (cfg.msb_in_token == 1) {
171
20.0k
      if (cfg.lsb_in_token == 0) {
172
6.68k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
13.3k
      } else if (cfg.lsb_in_token == 2) {
174
6.68k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
6.68k
      } else {
176
6.67k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
6.67k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
6.67k
      }
179
26.7k
    } else {
180
26.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
26.7k
      if (cfg.lsb_in_token == 0) {
182
6.68k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
20.0k
      } else if (cfg.lsb_in_token == 1) {
184
6.68k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
13.3k
      } else if (cfg.lsb_in_token == 2) {
186
6.68k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
6.68k
      } else {
188
6.67k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
6.67k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
6.67k
      }
191
26.7k
    }
192
49.7k
  } else if (cfg.split_exponent == 6) {
193
18.2k
    if (cfg.msb_in_token == 0) {
194
6.68k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
6.68k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
11.5k
    } else if (cfg.msb_in_token == 1) {
197
5.77k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
5.77k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
5.77k
    } else {
200
5.77k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
5.77k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
5.77k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
5.77k
    }
204
31.5k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
31.5k
    // Split exponents 7..12 are only instantiated with no MSB/LSB bits kept
    // in the token.
    JXL_DASSERT(cfg.msb_in_token == 0);
206
31.5k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
31.5k
    if (cfg.split_exponent == 7) {
208
6.68k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
24.8k
    } else if (cfg.split_exponent == 8) {
210
4.96k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
19.8k
    } else if (cfg.split_exponent == 9) {
212
4.96k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
14.9k
    } else if (cfg.split_exponent == 10) {
214
4.96k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
9.93k
    } else if (cfg.split_exponent == 11) {
216
4.96k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
4.96k
    } else {
218
4.96k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
4.96k
    }
220
31.5k
  } else {
221
0
    // No instantiation exists for this split_exponent (1 and anything above
    // 12 end up here).
    JXL_DASSERT(false);
222
0
  }
223
0
  // Only reachable from the unsupported-configuration branch above, and only
  // if JXL_DASSERT did not stop execution (presumably in builds where debug
  // assertions are disabled -- confirm). ~0 converts to an all-ones uint32_t.
  return ~0;
224
460k
#endif
225
460k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
113
460k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
460k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
460k
  if (cfg.split_exponent == 0) {
127
78.0k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
382k
  } else if (cfg.split_exponent == 2) {
129
77.4k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
77.4k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
304k
  } else if (cfg.split_exponent == 3) {
132
26.7k
    if (cfg.msb_in_token == 1) {
133
13.3k
      if (cfg.lsb_in_token == 0) {
134
6.68k
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
6.68k
      } else {
136
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
6.68k
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
6.68k
      }
139
13.3k
    } else {
140
13.3k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
13.3k
      if (cfg.lsb_in_token == 0) {
142
6.68k
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
6.68k
      } else {
144
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
6.68k
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
6.68k
      }
147
13.3k
    }
148
278k
  } else if (cfg.split_exponent == 4) {
149
181k
    if (cfg.msb_in_token == 1) {
150
90.8k
      if (cfg.lsb_in_token == 0) {
151
6.68k
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
84.1k
      } else if (cfg.lsb_in_token == 2) {
153
77.4k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
77.4k
      } else {
155
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
6.68k
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
6.68k
      }
158
90.8k
    } else {
159
90.8k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
90.8k
      if (cfg.lsb_in_token == 0) {
161
77.4k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
77.4k
      } else if (cfg.lsb_in_token == 1) {
163
6.68k
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
6.68k
      } else {
165
6.68k
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
6.68k
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
6.68k
      }
168
90.8k
    }
169
181k
  } else if (cfg.split_exponent == 5) {
170
46.7k
    if (cfg.msb_in_token == 1) {
171
20.0k
      if (cfg.lsb_in_token == 0) {
172
6.68k
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
13.3k
      } else if (cfg.lsb_in_token == 2) {
174
6.68k
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
6.68k
      } else {
176
6.67k
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
6.67k
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
6.67k
      }
179
26.7k
    } else {
180
26.7k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
26.7k
      if (cfg.lsb_in_token == 0) {
182
6.68k
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
20.0k
      } else if (cfg.lsb_in_token == 1) {
184
6.68k
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
13.3k
      } else if (cfg.lsb_in_token == 2) {
186
6.68k
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
6.68k
      } else {
188
6.67k
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
6.67k
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
6.67k
      }
191
26.7k
    }
192
49.7k
  } else if (cfg.split_exponent == 6) {
193
18.2k
    if (cfg.msb_in_token == 0) {
194
6.68k
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
6.68k
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
11.5k
    } else if (cfg.msb_in_token == 1) {
197
5.77k
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
5.77k
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
5.77k
    } else {
200
5.77k
      JXL_DASSERT(cfg.msb_in_token == 2);
201
5.77k
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
5.77k
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
5.77k
    }
204
31.5k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
31.5k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
31.5k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
31.5k
    if (cfg.split_exponent == 7) {
208
6.68k
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
24.8k
    } else if (cfg.split_exponent == 8) {
210
4.96k
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
19.8k
    } else if (cfg.split_exponent == 9) {
212
4.96k
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
14.9k
    } else if (cfg.split_exponent == 10) {
214
4.96k
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
9.93k
    } else if (cfg.split_exponent == 11) {
216
4.96k
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
4.96k
    } else {
218
4.96k
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
4.96k
    }
220
31.5k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
460k
#endif
225
460k
}
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
226
227
// NOLINTNEXTLINE(google-readability-namespace-comments)
228
}  // namespace HWY_NAMESPACE
229
}  // namespace jxl
230
HWY_AFTER_NAMESPACE();
231
232
#if HWY_ONCE
233
namespace jxl {
234
235
HWY_EXPORT(EstimateTokenCost);
236
237
// Entry point in namespace jxl (compiled once under HWY_ONCE).
//
// Debug-checks that the bits kept in the token (lsb_in_token + msb_in_token)
// do not exceed split_exponent, then forwards all arguments unchanged to the
// target-specific EstimateTokenCost chosen by HWY_DYNAMIC_DISPATCH and
// returns its result.
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
238
460k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
239
460k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
240
460k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
241
460k
}
242
243
}  // namespace jxl
244
#endif