Coverage Report

Created: 2025-08-12 07:37

/src/libjxl/lib/jxl/enc_ans_simd.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/enc_ans_simd.h"
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/status.h"
11
#include "lib/jxl/dec_ans.h"
12
#include "lib/jxl/memory_manager_internal.h"
13
14
#undef HWY_TARGET_INCLUDE
15
#define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc"
16
#include <hwy/foreach_target.h>
17
#include <hwy/highway.h>
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Add;
25
using hwy::HWY_NAMESPACE::And;
26
using hwy::HWY_NAMESPACE::Ge;
27
using hwy::HWY_NAMESPACE::GetLane;
28
using hwy::HWY_NAMESPACE::Gt;
29
using hwy::HWY_NAMESPACE::IfThenElse;
30
using hwy::HWY_NAMESPACE::IfThenElseZero;
31
using hwy::HWY_NAMESPACE::Iota;
32
using hwy::HWY_NAMESPACE::LoadU;
33
using hwy::HWY_NAMESPACE::Lt;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::Or;
36
using hwy::HWY_NAMESPACE::Set;
37
using hwy::HWY_NAMESPACE::ShiftRight;
38
using hwy::HWY_NAMESPACE::Store;
39
using hwy::HWY_NAMESPACE::Sub;
40
using hwy::HWY_NAMESPACE::Zero;
41
42
template <size_t E, size_t M, size_t L>
43
uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len,
44
35.1k
                               uint32_t* JXL_RESTRICT out) {
45
35.1k
  const HWY_FULL(uint32_t) du;
46
35.1k
  const HWY_FULL(float) df;
47
35.1k
  const auto kZero = Zero(du);
48
35.1k
  const auto kSplit = Set(du, 1 << E);
49
35.1k
  const auto kExpOffset = Set(du, 127);
50
35.1k
  const auto kEBOffset = Set(du, 127 + M + L);
51
35.1k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
35.1k
  const auto kMulN = Set(du, 1 << (M + L));
53
35.1k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
35.1k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
35.1k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
35.1k
  constexpr size_t kLargeShiftVal = 10;
57
35.1k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
35.1k
  auto extra_bits = kZero;
60
35.1k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
5.88M
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
5.85M
    const auto val = LoadU(du, values + i);
63
5.85M
    const auto is_large = Gt(val, kLargeThreshold);
64
5.85M
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
5.85M
    const auto not_literal = Ge(val, kSplit);
66
5.85M
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
5.85M
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
5.85M
    const auto l = And(val, kMaskL);
69
5.85M
    const auto exp = ShiftRight<23>(b);
70
5.85M
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
5.85M
    const auto n = Sub(exp_fixed, kExpOffset);
72
5.85M
    const auto eb = Sub(exp_fixed, kEBOffset);
73
5.85M
    const auto m = ShiftRight<23 - M - L>(b);
74
5.85M
    const auto a = Add(kBase, Mul(n, kMulN));
75
5.85M
    const auto d = And(m, kMaskM);
76
5.85M
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
5.85M
    const auto c = Or(a, l);
78
5.85M
    extra_bits = Add(extra_bits, eb_fixed);
79
5.85M
    const auto t = Or(c, d);
80
5.85M
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
5.85M
    Store(t_fixed, du, out + i);
82
5.85M
  }
83
35.1k
  if (last_full < len) {
84
29.9k
    const auto stop = Set(du, len);
85
29.9k
    const auto fence = Iota(du, last_full);
86
29.9k
    const auto take = Lt(fence, stop);
87
29.9k
    const auto val = LoadU(du, values + last_full);
88
29.9k
    const auto is_large = Gt(val, kLargeThreshold);
89
29.9k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
29.9k
    const auto not_literal = Ge(val, kSplit);
91
29.9k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
29.9k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
29.9k
    const auto l = And(val, kMaskL);
94
29.9k
    const auto exp = ShiftRight<23>(b);
95
29.9k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
29.9k
    const auto n = Sub(exp_fixed, kExpOffset);
97
29.9k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
29.9k
    const auto m = ShiftRight<23 - M - L>(b);
99
29.9k
    const auto a = Add(kBase, Mul(n, kMulN));
100
29.9k
    const auto d = And(m, kMaskM);
101
29.9k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
29.9k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
29.9k
    const auto c = Or(a, l);
104
29.9k
    extra_bits = Add(extra_bits, eb_masked);
105
29.9k
    const auto t = Or(c, d);
106
29.9k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
29.9k
    Store(t_fixed, du, out + last_full);
108
29.9k
  }
109
35.1k
  return GetLane(SumOfLanes(du, extra_bits));
110
35.1k
}
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.59k
                               uint32_t* JXL_RESTRICT out) {
45
4.59k
  const HWY_FULL(uint32_t) du;
46
4.59k
  const HWY_FULL(float) df;
47
4.59k
  const auto kZero = Zero(du);
48
4.59k
  const auto kSplit = Set(du, 1 << E);
49
4.59k
  const auto kExpOffset = Set(du, 127);
50
4.59k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.59k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.59k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.59k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.59k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.59k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.59k
  constexpr size_t kLargeShiftVal = 10;
57
4.59k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.59k
  auto extra_bits = kZero;
60
4.59k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
560k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
555k
    const auto val = LoadU(du, values + i);
63
555k
    const auto is_large = Gt(val, kLargeThreshold);
64
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
555k
    const auto not_literal = Ge(val, kSplit);
66
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
555k
    const auto l = And(val, kMaskL);
69
555k
    const auto exp = ShiftRight<23>(b);
70
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
555k
    const auto n = Sub(exp_fixed, kExpOffset);
72
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
555k
    const auto m = ShiftRight<23 - M - L>(b);
74
555k
    const auto a = Add(kBase, Mul(n, kMulN));
75
555k
    const auto d = And(m, kMaskM);
76
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
555k
    const auto c = Or(a, l);
78
555k
    extra_bits = Add(extra_bits, eb_fixed);
79
555k
    const auto t = Or(c, d);
80
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
555k
    Store(t_fixed, du, out + i);
82
555k
  }
83
4.59k
  if (last_full < len) {
84
3.95k
    const auto stop = Set(du, len);
85
3.95k
    const auto fence = Iota(du, last_full);
86
3.95k
    const auto take = Lt(fence, stop);
87
3.95k
    const auto val = LoadU(du, values + last_full);
88
3.95k
    const auto is_large = Gt(val, kLargeThreshold);
89
3.95k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
3.95k
    const auto not_literal = Ge(val, kSplit);
91
3.95k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
3.95k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
3.95k
    const auto l = And(val, kMaskL);
94
3.95k
    const auto exp = ShiftRight<23>(b);
95
3.95k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
3.95k
    const auto n = Sub(exp_fixed, kExpOffset);
97
3.95k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
3.95k
    const auto m = ShiftRight<23 - M - L>(b);
99
3.95k
    const auto a = Add(kBase, Mul(n, kMulN));
100
3.95k
    const auto d = And(m, kMaskM);
101
3.95k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
3.95k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
3.95k
    const auto c = Or(a, l);
104
3.95k
    extra_bits = Add(extra_bits, eb_masked);
105
3.95k
    const auto t = Or(c, d);
106
3.95k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
3.95k
    Store(t_fixed, du, out + last_full);
108
3.95k
  }
109
4.59k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.59k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.36k
                               uint32_t* JXL_RESTRICT out) {
45
4.36k
  const HWY_FULL(uint32_t) du;
46
4.36k
  const HWY_FULL(float) df;
47
4.36k
  const auto kZero = Zero(du);
48
4.36k
  const auto kSplit = Set(du, 1 << E);
49
4.36k
  const auto kExpOffset = Set(du, 127);
50
4.36k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.36k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.36k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.36k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.36k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.36k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.36k
  constexpr size_t kLargeShiftVal = 10;
57
4.36k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.36k
  auto extra_bits = kZero;
60
4.36k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
559k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
555k
    const auto val = LoadU(du, values + i);
63
555k
    const auto is_large = Gt(val, kLargeThreshold);
64
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
555k
    const auto not_literal = Ge(val, kSplit);
66
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
555k
    const auto l = And(val, kMaskL);
69
555k
    const auto exp = ShiftRight<23>(b);
70
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
555k
    const auto n = Sub(exp_fixed, kExpOffset);
72
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
555k
    const auto m = ShiftRight<23 - M - L>(b);
74
555k
    const auto a = Add(kBase, Mul(n, kMulN));
75
555k
    const auto d = And(m, kMaskM);
76
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
555k
    const auto c = Or(a, l);
78
555k
    extra_bits = Add(extra_bits, eb_fixed);
79
555k
    const auto t = Or(c, d);
80
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
555k
    Store(t_fixed, du, out + i);
82
555k
  }
83
4.36k
  if (last_full < len) {
84
3.74k
    const auto stop = Set(du, len);
85
3.74k
    const auto fence = Iota(du, last_full);
86
3.74k
    const auto take = Lt(fence, stop);
87
3.74k
    const auto val = LoadU(du, values + last_full);
88
3.74k
    const auto is_large = Gt(val, kLargeThreshold);
89
3.74k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
3.74k
    const auto not_literal = Ge(val, kSplit);
91
3.74k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
3.74k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
3.74k
    const auto l = And(val, kMaskL);
94
3.74k
    const auto exp = ShiftRight<23>(b);
95
3.74k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
3.74k
    const auto n = Sub(exp_fixed, kExpOffset);
97
3.74k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
3.74k
    const auto m = ShiftRight<23 - M - L>(b);
99
3.74k
    const auto a = Add(kBase, Mul(n, kMulN));
100
3.74k
    const auto d = And(m, kMaskM);
101
3.74k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
3.74k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
3.74k
    const auto c = Or(a, l);
104
3.74k
    extra_bits = Add(extra_bits, eb_masked);
105
3.74k
    const auto t = Or(c, d);
106
3.74k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
3.74k
    Store(t_fixed, du, out + last_full);
108
3.74k
  }
109
4.36k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.36k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.36k
                               uint32_t* JXL_RESTRICT out) {
45
4.36k
  const HWY_FULL(uint32_t) du;
46
4.36k
  const HWY_FULL(float) df;
47
4.36k
  const auto kZero = Zero(du);
48
4.36k
  const auto kSplit = Set(du, 1 << E);
49
4.36k
  const auto kExpOffset = Set(du, 127);
50
4.36k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.36k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.36k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.36k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.36k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.36k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.36k
  constexpr size_t kLargeShiftVal = 10;
57
4.36k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.36k
  auto extra_bits = kZero;
60
4.36k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
559k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
555k
    const auto val = LoadU(du, values + i);
63
555k
    const auto is_large = Gt(val, kLargeThreshold);
64
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
555k
    const auto not_literal = Ge(val, kSplit);
66
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
555k
    const auto l = And(val, kMaskL);
69
555k
    const auto exp = ShiftRight<23>(b);
70
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
555k
    const auto n = Sub(exp_fixed, kExpOffset);
72
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
555k
    const auto m = ShiftRight<23 - M - L>(b);
74
555k
    const auto a = Add(kBase, Mul(n, kMulN));
75
555k
    const auto d = And(m, kMaskM);
76
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
555k
    const auto c = Or(a, l);
78
555k
    extra_bits = Add(extra_bits, eb_fixed);
79
555k
    const auto t = Or(c, d);
80
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
555k
    Store(t_fixed, du, out + i);
82
555k
  }
83
4.36k
  if (last_full < len) {
84
3.74k
    const auto stop = Set(du, len);
85
3.74k
    const auto fence = Iota(du, last_full);
86
3.74k
    const auto take = Lt(fence, stop);
87
3.74k
    const auto val = LoadU(du, values + last_full);
88
3.74k
    const auto is_large = Gt(val, kLargeThreshold);
89
3.74k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
3.74k
    const auto not_literal = Ge(val, kSplit);
91
3.74k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
3.74k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
3.74k
    const auto l = And(val, kMaskL);
94
3.74k
    const auto exp = ShiftRight<23>(b);
95
3.74k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
3.74k
    const auto n = Sub(exp_fixed, kExpOffset);
97
3.74k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
3.74k
    const auto m = ShiftRight<23 - M - L>(b);
99
3.74k
    const auto a = Add(kBase, Mul(n, kMulN));
100
3.74k
    const auto d = And(m, kMaskM);
101
3.74k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
3.74k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
3.74k
    const auto c = Or(a, l);
104
3.74k
    extra_bits = Add(extra_bits, eb_masked);
105
3.74k
    const auto t = Or(c, d);
106
3.74k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
3.74k
    Store(t_fixed, du, out + last_full);
108
3.74k
  }
109
4.36k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.36k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
4.36k
                               uint32_t* JXL_RESTRICT out) {
45
4.36k
  const HWY_FULL(uint32_t) du;
46
4.36k
  const HWY_FULL(float) df;
47
4.36k
  const auto kZero = Zero(du);
48
4.36k
  const auto kSplit = Set(du, 1 << E);
49
4.36k
  const auto kExpOffset = Set(du, 127);
50
4.36k
  const auto kEBOffset = Set(du, 127 + M + L);
51
4.36k
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
4.36k
  const auto kMulN = Set(du, 1 << (M + L));
53
4.36k
  const auto kMaskL = Set(du, (1 << L) - 1);
54
4.36k
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
4.36k
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
4.36k
  constexpr size_t kLargeShiftVal = 10;
57
4.36k
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
4.36k
  auto extra_bits = kZero;
60
4.36k
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
559k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
555k
    const auto val = LoadU(du, values + i);
63
555k
    const auto is_large = Gt(val, kLargeThreshold);
64
555k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
555k
    const auto not_literal = Ge(val, kSplit);
66
555k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
555k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
555k
    const auto l = And(val, kMaskL);
69
555k
    const auto exp = ShiftRight<23>(b);
70
555k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
555k
    const auto n = Sub(exp_fixed, kExpOffset);
72
555k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
555k
    const auto m = ShiftRight<23 - M - L>(b);
74
555k
    const auto a = Add(kBase, Mul(n, kMulN));
75
555k
    const auto d = And(m, kMaskM);
76
555k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
555k
    const auto c = Or(a, l);
78
555k
    extra_bits = Add(extra_bits, eb_fixed);
79
555k
    const auto t = Or(c, d);
80
555k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
555k
    Store(t_fixed, du, out + i);
82
555k
  }
83
4.36k
  if (last_full < len) {
84
3.74k
    const auto stop = Set(du, len);
85
3.74k
    const auto fence = Iota(du, last_full);
86
3.74k
    const auto take = Lt(fence, stop);
87
3.74k
    const auto val = LoadU(du, values + last_full);
88
3.74k
    const auto is_large = Gt(val, kLargeThreshold);
89
3.74k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
3.74k
    const auto not_literal = Ge(val, kSplit);
91
3.74k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
3.74k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
3.74k
    const auto l = And(val, kMaskL);
94
3.74k
    const auto exp = ShiftRight<23>(b);
95
3.74k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
3.74k
    const auto n = Sub(exp_fixed, kExpOffset);
97
3.74k
    const auto eb = Sub(exp_fixed, kEBOffset);
98
3.74k
    const auto m = ShiftRight<23 - M - L>(b);
99
3.74k
    const auto a = Add(kBase, Mul(n, kMulN));
100
3.74k
    const auto d = And(m, kMaskM);
101
3.74k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
3.74k
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
3.74k
    const auto c = Or(a, l);
104
3.74k
    extra_bits = Add(extra_bits, eb_masked);
105
3.74k
    const auto t = Or(c, d);
106
3.74k
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
3.74k
    Store(t_fixed, du, out + last_full);
108
3.74k
  }
109
4.36k
  return GetLane(SumOfLanes(du, extra_bits));
110
4.36k
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
802
                               uint32_t* JXL_RESTRICT out) {
45
802
  const HWY_FULL(uint32_t) du;
46
802
  const HWY_FULL(float) df;
47
802
  const auto kZero = Zero(du);
48
802
  const auto kSplit = Set(du, 1 << E);
49
802
  const auto kExpOffset = Set(du, 127);
50
802
  const auto kEBOffset = Set(du, 127 + M + L);
51
802
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
802
  const auto kMulN = Set(du, 1 << (M + L));
53
802
  const auto kMaskL = Set(du, (1 << L) - 1);
54
802
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
802
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
802
  constexpr size_t kLargeShiftVal = 10;
57
802
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
802
  auto extra_bits = kZero;
60
802
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
802
  if (last_full < len) {
84
680
    const auto stop = Set(du, len);
85
680
    const auto fence = Iota(du, last_full);
86
680
    const auto take = Lt(fence, stop);
87
680
    const auto val = LoadU(du, values + last_full);
88
680
    const auto is_large = Gt(val, kLargeThreshold);
89
680
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
680
    const auto not_literal = Ge(val, kSplit);
91
680
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
680
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
680
    const auto l = And(val, kMaskL);
94
680
    const auto exp = ShiftRight<23>(b);
95
680
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
680
    const auto n = Sub(exp_fixed, kExpOffset);
97
680
    const auto eb = Sub(exp_fixed, kEBOffset);
98
680
    const auto m = ShiftRight<23 - M - L>(b);
99
680
    const auto a = Add(kBase, Mul(n, kMulN));
100
680
    const auto d = And(m, kMaskM);
101
680
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
680
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
680
    const auto c = Or(a, l);
104
680
    extra_bits = Add(extra_bits, eb_masked);
105
680
    const auto t = Or(c, d);
106
680
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
680
    Store(t_fixed, du, out + last_full);
108
680
  }
109
802
  return GetLane(SumOfLanes(du, extra_bits));
110
802
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
802
                               uint32_t* JXL_RESTRICT out) {
45
802
  const HWY_FULL(uint32_t) du;
46
802
  const HWY_FULL(float) df;
47
802
  const auto kZero = Zero(du);
48
802
  const auto kSplit = Set(du, 1 << E);
49
802
  const auto kExpOffset = Set(du, 127);
50
802
  const auto kEBOffset = Set(du, 127 + M + L);
51
802
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
802
  const auto kMulN = Set(du, 1 << (M + L));
53
802
  const auto kMaskL = Set(du, (1 << L) - 1);
54
802
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
802
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
802
  constexpr size_t kLargeShiftVal = 10;
57
802
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
802
  auto extra_bits = kZero;
60
802
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
802
  if (last_full < len) {
84
680
    const auto stop = Set(du, len);
85
680
    const auto fence = Iota(du, last_full);
86
680
    const auto take = Lt(fence, stop);
87
680
    const auto val = LoadU(du, values + last_full);
88
680
    const auto is_large = Gt(val, kLargeThreshold);
89
680
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
680
    const auto not_literal = Ge(val, kSplit);
91
680
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
680
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
680
    const auto l = And(val, kMaskL);
94
680
    const auto exp = ShiftRight<23>(b);
95
680
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
680
    const auto n = Sub(exp_fixed, kExpOffset);
97
680
    const auto eb = Sub(exp_fixed, kEBOffset);
98
680
    const auto m = ShiftRight<23 - M - L>(b);
99
680
    const auto a = Add(kBase, Mul(n, kMulN));
100
680
    const auto d = And(m, kMaskM);
101
680
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
680
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
680
    const auto c = Or(a, l);
104
680
    extra_bits = Add(extra_bits, eb_masked);
105
680
    const auto t = Or(c, d);
106
680
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
680
    Store(t_fixed, du, out + last_full);
108
680
  }
109
802
  return GetLane(SumOfLanes(du, extra_bits));
110
802
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
605
                               uint32_t* JXL_RESTRICT out) {
45
605
  const HWY_FULL(uint32_t) du;
46
605
  const HWY_FULL(float) df;
47
605
  const auto kZero = Zero(du);
48
605
  const auto kSplit = Set(du, 1 << E);
49
605
  const auto kExpOffset = Set(du, 127);
50
605
  const auto kEBOffset = Set(du, 127 + M + L);
51
605
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
605
  const auto kMulN = Set(du, 1 << (M + L));
53
605
  const auto kMaskL = Set(du, (1 << L) - 1);
54
605
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
605
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
605
  constexpr size_t kLargeShiftVal = 10;
57
605
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
605
  auto extra_bits = kZero;
60
605
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
71.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
71.1k
    const auto val = LoadU(du, values + i);
63
71.1k
    const auto is_large = Gt(val, kLargeThreshold);
64
71.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
71.1k
    const auto not_literal = Ge(val, kSplit);
66
71.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
71.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
71.1k
    const auto l = And(val, kMaskL);
69
71.1k
    const auto exp = ShiftRight<23>(b);
70
71.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
71.1k
    const auto n = Sub(exp_fixed, kExpOffset);
72
71.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
71.1k
    const auto m = ShiftRight<23 - M - L>(b);
74
71.1k
    const auto a = Add(kBase, Mul(n, kMulN));
75
71.1k
    const auto d = And(m, kMaskM);
76
71.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
71.1k
    const auto c = Or(a, l);
78
71.1k
    extra_bits = Add(extra_bits, eb_fixed);
79
71.1k
    const auto t = Or(c, d);
80
71.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
71.1k
    Store(t_fixed, du, out + i);
82
71.1k
  }
83
605
  if (last_full < len) {
84
505
    const auto stop = Set(du, len);
85
505
    const auto fence = Iota(du, last_full);
86
505
    const auto take = Lt(fence, stop);
87
505
    const auto val = LoadU(du, values + last_full);
88
505
    const auto is_large = Gt(val, kLargeThreshold);
89
505
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
505
    const auto not_literal = Ge(val, kSplit);
91
505
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
505
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
505
    const auto l = And(val, kMaskL);
94
505
    const auto exp = ShiftRight<23>(b);
95
505
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
505
    const auto n = Sub(exp_fixed, kExpOffset);
97
505
    const auto eb = Sub(exp_fixed, kEBOffset);
98
505
    const auto m = ShiftRight<23 - M - L>(b);
99
505
    const auto a = Add(kBase, Mul(n, kMulN));
100
505
    const auto d = And(m, kMaskM);
101
505
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
505
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
505
    const auto c = Or(a, l);
104
505
    extra_bits = Add(extra_bits, eb_masked);
105
505
    const auto t = Or(c, d);
106
505
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
505
    Store(t_fixed, du, out + last_full);
108
505
  }
109
605
  return GetLane(SumOfLanes(du, extra_bits));
110
605
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
605
                               uint32_t* JXL_RESTRICT out) {
45
605
  const HWY_FULL(uint32_t) du;
46
605
  const HWY_FULL(float) df;
47
605
  const auto kZero = Zero(du);
48
605
  const auto kSplit = Set(du, 1 << E);
49
605
  const auto kExpOffset = Set(du, 127);
50
605
  const auto kEBOffset = Set(du, 127 + M + L);
51
605
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
605
  const auto kMulN = Set(du, 1 << (M + L));
53
605
  const auto kMaskL = Set(du, (1 << L) - 1);
54
605
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
605
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
605
  constexpr size_t kLargeShiftVal = 10;
57
605
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
605
  auto extra_bits = kZero;
60
605
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
71.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
71.1k
    const auto val = LoadU(du, values + i);
63
71.1k
    const auto is_large = Gt(val, kLargeThreshold);
64
71.1k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
71.1k
    const auto not_literal = Ge(val, kSplit);
66
71.1k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
71.1k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
71.1k
    const auto l = And(val, kMaskL);
69
71.1k
    const auto exp = ShiftRight<23>(b);
70
71.1k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
71.1k
    const auto n = Sub(exp_fixed, kExpOffset);
72
71.1k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
71.1k
    const auto m = ShiftRight<23 - M - L>(b);
74
71.1k
    const auto a = Add(kBase, Mul(n, kMulN));
75
71.1k
    const auto d = And(m, kMaskM);
76
71.1k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
71.1k
    const auto c = Or(a, l);
78
71.1k
    extra_bits = Add(extra_bits, eb_fixed);
79
71.1k
    const auto t = Or(c, d);
80
71.1k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
71.1k
    Store(t_fixed, du, out + i);
82
71.1k
  }
83
605
  if (last_full < len) {
84
505
    const auto stop = Set(du, len);
85
505
    const auto fence = Iota(du, last_full);
86
505
    const auto take = Lt(fence, stop);
87
505
    const auto val = LoadU(du, values + last_full);
88
505
    const auto is_large = Gt(val, kLargeThreshold);
89
505
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
505
    const auto not_literal = Ge(val, kSplit);
91
505
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
505
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
505
    const auto l = And(val, kMaskL);
94
505
    const auto exp = ShiftRight<23>(b);
95
505
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
505
    const auto n = Sub(exp_fixed, kExpOffset);
97
505
    const auto eb = Sub(exp_fixed, kEBOffset);
98
505
    const auto m = ShiftRight<23 - M - L>(b);
99
505
    const auto a = Add(kBase, Mul(n, kMulN));
100
505
    const auto d = And(m, kMaskM);
101
505
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
505
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
505
    const auto c = Or(a, l);
104
505
    extra_bits = Add(extra_bits, eb_masked);
105
505
    const auto t = Or(c, d);
106
505
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
505
    Store(t_fixed, du, out + last_full);
108
505
  }
109
605
  return GetLane(SumOfLanes(du, extra_bits));
110
605
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
811
                               uint32_t* JXL_RESTRICT out) {
45
811
  const HWY_FULL(uint32_t) du;
46
811
  const HWY_FULL(float) df;
47
811
  const auto kZero = Zero(du);
48
811
  const auto kSplit = Set(du, 1 << E);
49
811
  const auto kExpOffset = Set(du, 127);
50
811
  const auto kEBOffset = Set(du, 127 + M + L);
51
811
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
811
  const auto kMulN = Set(du, 1 << (M + L));
53
811
  const auto kMaskL = Set(du, (1 << L) - 1);
54
811
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
811
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
811
  constexpr size_t kLargeShiftVal = 10;
57
811
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
811
  auto extra_bits = kZero;
60
811
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
186k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
185k
    const auto val = LoadU(du, values + i);
63
185k
    const auto is_large = Gt(val, kLargeThreshold);
64
185k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
185k
    const auto not_literal = Ge(val, kSplit);
66
185k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
185k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
185k
    const auto l = And(val, kMaskL);
69
185k
    const auto exp = ShiftRight<23>(b);
70
185k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
185k
    const auto n = Sub(exp_fixed, kExpOffset);
72
185k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
185k
    const auto m = ShiftRight<23 - M - L>(b);
74
185k
    const auto a = Add(kBase, Mul(n, kMulN));
75
185k
    const auto d = And(m, kMaskM);
76
185k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
185k
    const auto c = Or(a, l);
78
185k
    extra_bits = Add(extra_bits, eb_fixed);
79
185k
    const auto t = Or(c, d);
80
185k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
185k
    Store(t_fixed, du, out + i);
82
185k
  }
83
811
  if (last_full < len) {
84
687
    const auto stop = Set(du, len);
85
687
    const auto fence = Iota(du, last_full);
86
687
    const auto take = Lt(fence, stop);
87
687
    const auto val = LoadU(du, values + last_full);
88
687
    const auto is_large = Gt(val, kLargeThreshold);
89
687
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
687
    const auto not_literal = Ge(val, kSplit);
91
687
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
687
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
687
    const auto l = And(val, kMaskL);
94
687
    const auto exp = ShiftRight<23>(b);
95
687
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
687
    const auto n = Sub(exp_fixed, kExpOffset);
97
687
    const auto eb = Sub(exp_fixed, kEBOffset);
98
687
    const auto m = ShiftRight<23 - M - L>(b);
99
687
    const auto a = Add(kBase, Mul(n, kMulN));
100
687
    const auto d = And(m, kMaskM);
101
687
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
687
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
687
    const auto c = Or(a, l);
104
687
    extra_bits = Add(extra_bits, eb_masked);
105
687
    const auto t = Or(c, d);
106
687
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
687
    Store(t_fixed, du, out + last_full);
108
687
  }
109
811
  return GetLane(SumOfLanes(du, extra_bits));
110
811
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
494
                               uint32_t* JXL_RESTRICT out) {
45
494
  const HWY_FULL(uint32_t) du;
46
494
  const HWY_FULL(float) df;
47
494
  const auto kZero = Zero(du);
48
494
  const auto kSplit = Set(du, 1 << E);
49
494
  const auto kExpOffset = Set(du, 127);
50
494
  const auto kEBOffset = Set(du, 127 + M + L);
51
494
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
494
  const auto kMulN = Set(du, 1 << (M + L));
53
494
  const auto kMaskL = Set(du, (1 << L) - 1);
54
494
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
494
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
494
  constexpr size_t kLargeShiftVal = 10;
57
494
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
494
  auto extra_bits = kZero;
60
494
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
67.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
67.2k
    const auto val = LoadU(du, values + i);
63
67.2k
    const auto is_large = Gt(val, kLargeThreshold);
64
67.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
67.2k
    const auto not_literal = Ge(val, kSplit);
66
67.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
67.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
67.2k
    const auto l = And(val, kMaskL);
69
67.2k
    const auto exp = ShiftRight<23>(b);
70
67.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
67.2k
    const auto n = Sub(exp_fixed, kExpOffset);
72
67.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
67.2k
    const auto m = ShiftRight<23 - M - L>(b);
74
67.2k
    const auto a = Add(kBase, Mul(n, kMulN));
75
67.2k
    const auto d = And(m, kMaskM);
76
67.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
67.2k
    const auto c = Or(a, l);
78
67.2k
    extra_bits = Add(extra_bits, eb_fixed);
79
67.2k
    const auto t = Or(c, d);
80
67.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
67.2k
    Store(t_fixed, du, out + i);
82
67.2k
  }
83
494
  if (last_full < len) {
84
408
    const auto stop = Set(du, len);
85
408
    const auto fence = Iota(du, last_full);
86
408
    const auto take = Lt(fence, stop);
87
408
    const auto val = LoadU(du, values + last_full);
88
408
    const auto is_large = Gt(val, kLargeThreshold);
89
408
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
408
    const auto not_literal = Ge(val, kSplit);
91
408
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
408
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
408
    const auto l = And(val, kMaskL);
94
408
    const auto exp = ShiftRight<23>(b);
95
408
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
408
    const auto n = Sub(exp_fixed, kExpOffset);
97
408
    const auto eb = Sub(exp_fixed, kEBOffset);
98
408
    const auto m = ShiftRight<23 - M - L>(b);
99
408
    const auto a = Add(kBase, Mul(n, kMulN));
100
408
    const auto d = And(m, kMaskM);
101
408
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
408
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
408
    const auto c = Or(a, l);
104
408
    extra_bits = Add(extra_bits, eb_masked);
105
408
    const auto t = Or(c, d);
106
408
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
408
    Store(t_fixed, du, out + last_full);
108
408
  }
109
494
  return GetLane(SumOfLanes(du, extra_bits));
110
494
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
494
                               uint32_t* JXL_RESTRICT out) {
45
494
  const HWY_FULL(uint32_t) du;
46
494
  const HWY_FULL(float) df;
47
494
  const auto kZero = Zero(du);
48
494
  const auto kSplit = Set(du, 1 << E);
49
494
  const auto kExpOffset = Set(du, 127);
50
494
  const auto kEBOffset = Set(du, 127 + M + L);
51
494
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
494
  const auto kMulN = Set(du, 1 << (M + L));
53
494
  const auto kMaskL = Set(du, (1 << L) - 1);
54
494
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
494
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
494
  constexpr size_t kLargeShiftVal = 10;
57
494
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
494
  auto extra_bits = kZero;
60
494
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
67.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
67.2k
    const auto val = LoadU(du, values + i);
63
67.2k
    const auto is_large = Gt(val, kLargeThreshold);
64
67.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
67.2k
    const auto not_literal = Ge(val, kSplit);
66
67.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
67.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
67.2k
    const auto l = And(val, kMaskL);
69
67.2k
    const auto exp = ShiftRight<23>(b);
70
67.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
67.2k
    const auto n = Sub(exp_fixed, kExpOffset);
72
67.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
67.2k
    const auto m = ShiftRight<23 - M - L>(b);
74
67.2k
    const auto a = Add(kBase, Mul(n, kMulN));
75
67.2k
    const auto d = And(m, kMaskM);
76
67.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
67.2k
    const auto c = Or(a, l);
78
67.2k
    extra_bits = Add(extra_bits, eb_fixed);
79
67.2k
    const auto t = Or(c, d);
80
67.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
67.2k
    Store(t_fixed, du, out + i);
82
67.2k
  }
83
494
  if (last_full < len) {
84
408
    const auto stop = Set(du, len);
85
408
    const auto fence = Iota(du, last_full);
86
408
    const auto take = Lt(fence, stop);
87
408
    const auto val = LoadU(du, values + last_full);
88
408
    const auto is_large = Gt(val, kLargeThreshold);
89
408
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
408
    const auto not_literal = Ge(val, kSplit);
91
408
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
408
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
408
    const auto l = And(val, kMaskL);
94
408
    const auto exp = ShiftRight<23>(b);
95
408
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
408
    const auto n = Sub(exp_fixed, kExpOffset);
97
408
    const auto eb = Sub(exp_fixed, kEBOffset);
98
408
    const auto m = ShiftRight<23 - M - L>(b);
99
408
    const auto a = Add(kBase, Mul(n, kMulN));
100
408
    const auto d = And(m, kMaskM);
101
408
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
408
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
408
    const auto c = Or(a, l);
104
408
    extra_bits = Add(extra_bits, eb_masked);
105
408
    const auto t = Or(c, d);
106
408
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
408
    Store(t_fixed, du, out + last_full);
108
408
  }
109
494
  return GetLane(SumOfLanes(du, extra_bits));
110
494
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
494
                               uint32_t* JXL_RESTRICT out) {
45
494
  const HWY_FULL(uint32_t) du;
46
494
  const HWY_FULL(float) df;
47
494
  const auto kZero = Zero(du);
48
494
  const auto kSplit = Set(du, 1 << E);
49
494
  const auto kExpOffset = Set(du, 127);
50
494
  const auto kEBOffset = Set(du, 127 + M + L);
51
494
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
494
  const auto kMulN = Set(du, 1 << (M + L));
53
494
  const auto kMaskL = Set(du, (1 << L) - 1);
54
494
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
494
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
494
  constexpr size_t kLargeShiftVal = 10;
57
494
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
494
  auto extra_bits = kZero;
60
494
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
67.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
67.2k
    const auto val = LoadU(du, values + i);
63
67.2k
    const auto is_large = Gt(val, kLargeThreshold);
64
67.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
67.2k
    const auto not_literal = Ge(val, kSplit);
66
67.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
67.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
67.2k
    const auto l = And(val, kMaskL);
69
67.2k
    const auto exp = ShiftRight<23>(b);
70
67.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
67.2k
    const auto n = Sub(exp_fixed, kExpOffset);
72
67.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
67.2k
    const auto m = ShiftRight<23 - M - L>(b);
74
67.2k
    const auto a = Add(kBase, Mul(n, kMulN));
75
67.2k
    const auto d = And(m, kMaskM);
76
67.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
67.2k
    const auto c = Or(a, l);
78
67.2k
    extra_bits = Add(extra_bits, eb_fixed);
79
67.2k
    const auto t = Or(c, d);
80
67.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
67.2k
    Store(t_fixed, du, out + i);
82
67.2k
  }
83
494
  if (last_full < len) {
84
408
    const auto stop = Set(du, len);
85
408
    const auto fence = Iota(du, last_full);
86
408
    const auto take = Lt(fence, stop);
87
408
    const auto val = LoadU(du, values + last_full);
88
408
    const auto is_large = Gt(val, kLargeThreshold);
89
408
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
408
    const auto not_literal = Ge(val, kSplit);
91
408
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
408
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
408
    const auto l = And(val, kMaskL);
94
408
    const auto exp = ShiftRight<23>(b);
95
408
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
408
    const auto n = Sub(exp_fixed, kExpOffset);
97
408
    const auto eb = Sub(exp_fixed, kEBOffset);
98
408
    const auto m = ShiftRight<23 - M - L>(b);
99
408
    const auto a = Add(kBase, Mul(n, kMulN));
100
408
    const auto d = And(m, kMaskM);
101
408
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
408
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
408
    const auto c = Or(a, l);
104
408
    extra_bits = Add(extra_bits, eb_masked);
105
408
    const auto t = Or(c, d);
106
408
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
408
    Store(t_fixed, du, out + last_full);
108
408
  }
109
494
  return GetLane(SumOfLanes(du, extra_bits));
110
494
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
494
                               uint32_t* JXL_RESTRICT out) {
45
494
  const HWY_FULL(uint32_t) du;
46
494
  const HWY_FULL(float) df;
47
494
  const auto kZero = Zero(du);
48
494
  const auto kSplit = Set(du, 1 << E);
49
494
  const auto kExpOffset = Set(du, 127);
50
494
  const auto kEBOffset = Set(du, 127 + M + L);
51
494
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
494
  const auto kMulN = Set(du, 1 << (M + L));
53
494
  const auto kMaskL = Set(du, (1 << L) - 1);
54
494
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
494
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
494
  constexpr size_t kLargeShiftVal = 10;
57
494
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
494
  auto extra_bits = kZero;
60
494
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
67.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
67.2k
    const auto val = LoadU(du, values + i);
63
67.2k
    const auto is_large = Gt(val, kLargeThreshold);
64
67.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
67.2k
    const auto not_literal = Ge(val, kSplit);
66
67.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
67.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
67.2k
    const auto l = And(val, kMaskL);
69
67.2k
    const auto exp = ShiftRight<23>(b);
70
67.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
67.2k
    const auto n = Sub(exp_fixed, kExpOffset);
72
67.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
67.2k
    const auto m = ShiftRight<23 - M - L>(b);
74
67.2k
    const auto a = Add(kBase, Mul(n, kMulN));
75
67.2k
    const auto d = And(m, kMaskM);
76
67.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
67.2k
    const auto c = Or(a, l);
78
67.2k
    extra_bits = Add(extra_bits, eb_fixed);
79
67.2k
    const auto t = Or(c, d);
80
67.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
67.2k
    Store(t_fixed, du, out + i);
82
67.2k
  }
83
494
  if (last_full < len) {
84
408
    const auto stop = Set(du, len);
85
408
    const auto fence = Iota(du, last_full);
86
408
    const auto take = Lt(fence, stop);
87
408
    const auto val = LoadU(du, values + last_full);
88
408
    const auto is_large = Gt(val, kLargeThreshold);
89
408
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
408
    const auto not_literal = Ge(val, kSplit);
91
408
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
408
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
408
    const auto l = And(val, kMaskL);
94
408
    const auto exp = ShiftRight<23>(b);
95
408
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
408
    const auto n = Sub(exp_fixed, kExpOffset);
97
408
    const auto eb = Sub(exp_fixed, kEBOffset);
98
408
    const auto m = ShiftRight<23 - M - L>(b);
99
408
    const auto a = Add(kBase, Mul(n, kMulN));
100
408
    const auto d = And(m, kMaskM);
101
408
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
408
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
408
    const auto c = Or(a, l);
104
408
    extra_bits = Add(extra_bits, eb_masked);
105
408
    const auto t = Or(c, d);
106
408
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
408
    Store(t_fixed, du, out + last_full);
108
408
  }
109
494
  return GetLane(SumOfLanes(du, extra_bits));
110
494
}
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Line
Count
Source
44
494
                               uint32_t* JXL_RESTRICT out) {
45
494
  const HWY_FULL(uint32_t) du;
46
494
  const HWY_FULL(float) df;
47
494
  const auto kZero = Zero(du);
48
494
  const auto kSplit = Set(du, 1 << E);
49
494
  const auto kExpOffset = Set(du, 127);
50
494
  const auto kEBOffset = Set(du, 127 + M + L);
51
494
  const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L))));
52
494
  const auto kMulN = Set(du, 1 << (M + L));
53
494
  const auto kMaskL = Set(du, (1 << L) - 1);
54
494
  const auto kMaskM = Set(du, ((1 << M) - 1) << L);
55
494
  const auto kLargeThreshold = Set(du, (1 << 22) - 1);
56
494
  constexpr size_t kLargeShiftVal = 10;
57
494
  const auto kLargeShift = Set(du, kLargeShiftVal);
58
59
494
  auto extra_bits = kZero;
60
494
  size_t last_full = Lanes(du) * (len / Lanes(du));
61
67.7k
  for (size_t i = 0; i < last_full; i += Lanes(du)) {
62
67.2k
    const auto val = LoadU(du, values + i);
63
67.2k
    const auto is_large = Gt(val, kLargeThreshold);
64
67.2k
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
65
67.2k
    const auto not_literal = Ge(val, kSplit);
66
67.2k
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
67
67.2k
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
68
67.2k
    const auto l = And(val, kMaskL);
69
67.2k
    const auto exp = ShiftRight<23>(b);
70
67.2k
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
71
67.2k
    const auto n = Sub(exp_fixed, kExpOffset);
72
67.2k
    const auto eb = Sub(exp_fixed, kEBOffset);
73
67.2k
    const auto m = ShiftRight<23 - M - L>(b);
74
67.2k
    const auto a = Add(kBase, Mul(n, kMulN));
75
67.2k
    const auto d = And(m, kMaskM);
76
67.2k
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
77
67.2k
    const auto c = Or(a, l);
78
67.2k
    extra_bits = Add(extra_bits, eb_fixed);
79
67.2k
    const auto t = Or(c, d);
80
67.2k
    const auto t_fixed = IfThenElse(not_literal, t, val);
81
67.2k
    Store(t_fixed, du, out + i);
82
67.2k
  }
83
494
  if (last_full < len) {
84
408
    const auto stop = Set(du, len);
85
408
    const auto fence = Iota(du, last_full);
86
408
    const auto take = Lt(fence, stop);
87
408
    const auto val = LoadU(du, values + last_full);
88
408
    const auto is_large = Gt(val, kLargeThreshold);
89
408
    const auto val_shifted = ShiftRight<kLargeShiftVal>(val);
90
408
    const auto not_literal = Ge(val, kSplit);
91
408
    const auto val_fixed = IfThenElse(is_large, val_shifted, val);
92
408
    const auto b = BitCast(du, ConvertTo(df, val_fixed));
93
408
    const auto l = And(val, kMaskL);
94
408
    const auto exp = ShiftRight<23>(b);
95
408
    const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp);
96
408
    const auto n = Sub(exp_fixed, kExpOffset);
97
408
    const auto eb = Sub(exp_fixed, kEBOffset);
98
408
    const auto m = ShiftRight<23 - M - L>(b);
99
408
    const auto a = Add(kBase, Mul(n, kMulN));
100
408
    const auto d = And(m, kMaskM);
101
408
    const auto eb_fixed = IfThenElseZero(not_literal, eb);
102
408
    const auto eb_masked = IfThenElseZero(take, eb_fixed);
103
408
    const auto c = Or(a, l);
104
408
    extra_bits = Add(extra_bits, eb_masked);
105
408
    const auto t = Or(c, d);
106
408
    const auto t_fixed = IfThenElse(not_literal, t, val);
107
408
    Store(t_fixed, du, out + last_full);
108
408
  }
109
494
  return GetLane(SumOfLanes(du, extra_bits));
110
494
}
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*)
111
112
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
113
35.1k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
35.1k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
35.1k
  if (cfg.split_exponent == 0) {
127
4.59k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
30.5k
  } else if (cfg.split_exponent == 2) {
129
4.36k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
4.36k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
26.1k
  } else if (cfg.split_exponent == 3) {
132
3.24k
    if (cfg.msb_in_token == 1) {
133
1.62k
      if (cfg.lsb_in_token == 0) {
134
811
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
811
      } else {
136
811
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
811
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
811
      }
139
1.62k
    } else {
140
1.62k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
1.62k
      if (cfg.lsb_in_token == 0) {
142
811
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
811
      } else {
144
811
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
811
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
811
      }
147
1.62k
    }
148
22.9k
  } else if (cfg.split_exponent == 4) {
149
11.9k
    if (cfg.msb_in_token == 1) {
150
5.98k
      if (cfg.lsb_in_token == 0) {
151
811
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
5.17k
      } else if (cfg.lsb_in_token == 2) {
153
4.36k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
4.36k
      } else {
155
811
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
811
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
811
      }
158
5.98k
    } else {
159
5.98k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
5.98k
      if (cfg.lsb_in_token == 0) {
161
4.36k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
4.36k
      } else if (cfg.lsb_in_token == 1) {
163
811
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
811
      } else {
165
811
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
811
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
811
      }
168
5.98k
    }
169
11.9k
  } else if (cfg.split_exponent == 5) {
170
5.65k
    if (cfg.msb_in_token == 1) {
171
2.42k
      if (cfg.lsb_in_token == 0) {
172
811
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
1.61k
      } else if (cfg.lsb_in_token == 2) {
174
811
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
811
      } else {
176
802
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
802
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
802
      }
179
3.23k
    } else {
180
3.23k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
3.23k
      if (cfg.lsb_in_token == 0) {
182
811
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
2.42k
      } else if (cfg.lsb_in_token == 1) {
184
811
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
1.61k
      } else if (cfg.lsb_in_token == 2) {
186
811
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
811
      } else {
188
802
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
802
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
802
      }
191
3.23k
    }
192
5.65k
  } else if (cfg.split_exponent == 6) {
193
2.02k
    if (cfg.msb_in_token == 0) {
194
811
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
811
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
1.21k
    } else if (cfg.msb_in_token == 1) {
197
605
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
605
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
605
    } else {
200
605
      JXL_DASSERT(cfg.msb_in_token == 2);
201
605
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
605
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
605
    }
204
3.28k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
3.28k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
3.28k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
3.28k
    if (cfg.split_exponent == 7) {
208
811
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
2.47k
    } else if (cfg.split_exponent == 8) {
210
494
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
1.97k
    } else if (cfg.split_exponent == 9) {
212
494
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
1.48k
    } else if (cfg.split_exponent == 10) {
214
494
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
988
    } else if (cfg.split_exponent == 11) {
216
494
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
494
    } else {
218
494
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
494
    }
220
3.28k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
35.1k
#endif
225
35.1k
}
Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
Line
Count
Source
113
35.1k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
114
35.1k
  uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>();
115
#if HWY_TARGET == HWY_SCALAR
116
  uint32_t extra_bits = 0;
117
  for (size_t i = 0; i < len; ++i) {
118
    uint32_t v = values[i];
119
    uint32_t tok, nbits, bits;
120
    cfg.Encode(v, &tok, &nbits, &bits);
121
    extra_bits += nbits;
122
    out[i] = tok;
123
  }
124
  return extra_bits;
125
#else
126
35.1k
  if (cfg.split_exponent == 0) {
127
4.59k
    return EstimateTokenCostImpl<0, 0, 0>(values, len, out);
128
30.5k
  } else if (cfg.split_exponent == 2) {
129
4.36k
    JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1));
130
4.36k
    return EstimateTokenCostImpl<2, 0, 1>(values, len, out);
131
26.1k
  } else if (cfg.split_exponent == 3) {
132
3.24k
    if (cfg.msb_in_token == 1) {
133
1.62k
      if (cfg.lsb_in_token == 0) {
134
811
        return EstimateTokenCostImpl<3, 1, 0>(values, len, out);
135
811
      } else {
136
811
        JXL_DASSERT(cfg.lsb_in_token == 2);
137
811
        return EstimateTokenCostImpl<3, 1, 2>(values, len, out);
138
811
      }
139
1.62k
    } else {
140
1.62k
      JXL_DASSERT(cfg.msb_in_token == 2);
141
1.62k
      if (cfg.lsb_in_token == 0) {
142
811
        return EstimateTokenCostImpl<3, 2, 0>(values, len, out);
143
811
      } else {
144
811
        JXL_DASSERT(cfg.lsb_in_token == 1);
145
811
        return EstimateTokenCostImpl<3, 2, 1>(values, len, out);
146
811
      }
147
1.62k
    }
148
22.9k
  } else if (cfg.split_exponent == 4) {
149
11.9k
    if (cfg.msb_in_token == 1) {
150
5.98k
      if (cfg.lsb_in_token == 0) {
151
811
        return EstimateTokenCostImpl<4, 1, 0>(values, len, out);
152
5.17k
      } else if (cfg.lsb_in_token == 2) {
153
4.36k
        return EstimateTokenCostImpl<4, 1, 2>(values, len, out);
154
4.36k
      } else {
155
811
        JXL_DASSERT(cfg.lsb_in_token == 3);
156
811
        return EstimateTokenCostImpl<4, 1, 3>(values, len, out);
157
811
      }
158
5.98k
    } else {
159
5.98k
      JXL_DASSERT(cfg.msb_in_token == 2);
160
5.98k
      if (cfg.lsb_in_token == 0) {
161
4.36k
        return EstimateTokenCostImpl<4, 2, 0>(values, len, out);
162
4.36k
      } else if (cfg.lsb_in_token == 1) {
163
811
        return EstimateTokenCostImpl<4, 2, 1>(values, len, out);
164
811
      } else {
165
811
        JXL_DASSERT(cfg.lsb_in_token == 2);
166
811
        return EstimateTokenCostImpl<4, 2, 2>(values, len, out);
167
811
      }
168
5.98k
    }
169
11.9k
  } else if (cfg.split_exponent == 5) {
170
5.65k
    if (cfg.msb_in_token == 1) {
171
2.42k
      if (cfg.lsb_in_token == 0) {
172
811
        return EstimateTokenCostImpl<5, 1, 0>(values, len, out);
173
1.61k
      } else if (cfg.lsb_in_token == 2) {
174
811
        return EstimateTokenCostImpl<5, 1, 2>(values, len, out);
175
811
      } else {
176
802
        JXL_DASSERT(cfg.lsb_in_token == 4);
177
802
        return EstimateTokenCostImpl<5, 1, 4>(values, len, out);
178
802
      }
179
3.23k
    } else {
180
3.23k
      JXL_DASSERT(cfg.msb_in_token == 2);
181
3.23k
      if (cfg.lsb_in_token == 0) {
182
811
        return EstimateTokenCostImpl<5, 2, 0>(values, len, out);
183
2.42k
      } else if (cfg.lsb_in_token == 1) {
184
811
        return EstimateTokenCostImpl<5, 2, 1>(values, len, out);
185
1.61k
      } else if (cfg.lsb_in_token == 2) {
186
811
        return EstimateTokenCostImpl<5, 2, 2>(values, len, out);
187
811
      } else {
188
802
        JXL_DASSERT(cfg.lsb_in_token == 3);
189
802
        return EstimateTokenCostImpl<5, 2, 3>(values, len, out);
190
802
      }
191
3.23k
    }
192
5.65k
  } else if (cfg.split_exponent == 6) {
193
2.02k
    if (cfg.msb_in_token == 0) {
194
811
      JXL_DASSERT(cfg.lsb_in_token == 0);
195
811
      return EstimateTokenCostImpl<6, 0, 0>(values, len, out);
196
1.21k
    } else if (cfg.msb_in_token == 1) {
197
605
      JXL_DASSERT(cfg.lsb_in_token == 5);
198
605
      return EstimateTokenCostImpl<6, 1, 5>(values, len, out);
199
605
    } else {
200
605
      JXL_DASSERT(cfg.msb_in_token == 2);
201
605
      JXL_DASSERT(cfg.lsb_in_token == 4);
202
605
      return EstimateTokenCostImpl<6, 2, 4>(values, len, out);
203
605
    }
204
3.28k
  } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) {
205
3.28k
    JXL_DASSERT(cfg.msb_in_token == 0);
206
3.28k
    JXL_DASSERT(cfg.lsb_in_token == 0);
207
3.28k
    if (cfg.split_exponent == 7) {
208
811
      return EstimateTokenCostImpl<7, 0, 0>(values, len, out);
209
2.47k
    } else if (cfg.split_exponent == 8) {
210
494
      return EstimateTokenCostImpl<8, 0, 0>(values, len, out);
211
1.97k
    } else if (cfg.split_exponent == 9) {
212
494
      return EstimateTokenCostImpl<9, 0, 0>(values, len, out);
213
1.48k
    } else if (cfg.split_exponent == 10) {
214
494
      return EstimateTokenCostImpl<10, 0, 0>(values, len, out);
215
988
    } else if (cfg.split_exponent == 11) {
216
494
      return EstimateTokenCostImpl<11, 0, 0>(values, len, out);
217
494
    } else {
218
494
      return EstimateTokenCostImpl<12, 0, 0>(values, len, out);
219
494
    }
220
3.28k
  } else {
221
0
    JXL_DASSERT(false);
222
0
  }
223
0
  return ~0;
224
35.1k
#endif
225
35.1k
}
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&)
226
227
// NOLINTNEXTLINE(google-readability-namespace-comments)
228
}  // namespace HWY_NAMESPACE
229
}  // namespace jxl
230
HWY_AFTER_NAMESPACE();
231
232
#if HWY_ONCE
233
namespace jxl {
234
235
HWY_EXPORT(EstimateTokenCost);
236
237
// Public entry point: debug-asserts that the bits kept in the token
// (lsb_in_token + msb_in_token) do not exceed split_exponent, then forwards
// all arguments unchanged to the per-target EstimateTokenCost selected by
// HWY_DYNAMIC_DISPATCH and returns its result.
uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len,
238
35.1k
                           HybridUintConfig cfg, AlignedMemory& tokens) {
239
35.1k
  JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent);
240
35.1k
  return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens);
241
35.1k
}
242
243
}  // namespace jxl
244
#endif