/src/libjxl/lib/jxl/enc_ans_simd.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_ans_simd.h" |
7 | | |
8 | | #include <cstdint> |
9 | | |
10 | | #include "lib/jxl/base/compiler_specific.h" |
11 | | #include "lib/jxl/base/status.h" |
12 | | #include "lib/jxl/dec_ans.h" |
13 | | #include "lib/jxl/memory_manager_internal.h" |
14 | | |
15 | | #undef HWY_TARGET_INCLUDE |
16 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_ans_simd.cc" |
17 | | #include <hwy/foreach_target.h> |
18 | | #include <hwy/highway.h> |
19 | | |
20 | | HWY_BEFORE_NAMESPACE(); |
21 | | namespace jxl { |
22 | | namespace HWY_NAMESPACE { |
23 | | |
24 | | // These templates are not found via ADL. |
25 | | using hwy::HWY_NAMESPACE::Add; |
26 | | using hwy::HWY_NAMESPACE::And; |
27 | | using hwy::HWY_NAMESPACE::Ge; |
28 | | using hwy::HWY_NAMESPACE::GetLane; |
29 | | using hwy::HWY_NAMESPACE::Gt; |
30 | | using hwy::HWY_NAMESPACE::IfThenElse; |
31 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
32 | | using hwy::HWY_NAMESPACE::Iota; |
33 | | using hwy::HWY_NAMESPACE::LoadU; |
34 | | using hwy::HWY_NAMESPACE::Lt; |
35 | | using hwy::HWY_NAMESPACE::Mul; |
36 | | using hwy::HWY_NAMESPACE::Or; |
37 | | using hwy::HWY_NAMESPACE::Set; |
38 | | using hwy::HWY_NAMESPACE::ShiftRight; |
39 | | using hwy::HWY_NAMESPACE::Store; |
40 | | using hwy::HWY_NAMESPACE::Sub; |
41 | | using hwy::HWY_NAMESPACE::Zero; |
42 | | |
// EstimateTokenCostImpl<E, M, L>: converts each of the `len` entries of
// `values` into a token written to the same index of `out`, and returns the
// sum of the per-value "extra bits" counts accumulated in `extra_bits`.
//
// Per lane, as computed below:
//   * val < (1 << E): the value is stored to `out` unchanged and adds 0 to
//     the returned sum (the `not_literal` mask is false).
//   * otherwise: with `exp` = bits 23 and up of the float representation of
//     the value and n = exp - 127,
//       token = ((1 << E) - (E << (M + L)) + n * (1 << (M + L)))
//               | (val & ((1 << L) - 1))
//               | ((float_bits >> (23 - M - L)) & (((1 << M) - 1) << L))
//     and the lane adds exp - (127 + M + L), i.e. n - M - L, to the sum.
//
// NOTE(review): the tail path loads and stores one full vector starting at
// `last_full`, so both `values` and `out` are accessed up to the next multiple
// of Lanes(du) past `len`; presumably callers pad both buffers - confirm.
// NOTE(review): `out` is written with Store while `values` is read with
// LoadU; Highway documents Store as requiring a vector-aligned pointer -
// confirm that callers pass an aligned `out`.
43 | | template <size_t E, size_t M, size_t L> |
44 | | uint32_t EstimateTokenCostImpl(uint32_t* JXL_RESTRICT values, size_t len, |
45 | 2.80k | uint32_t* JXL_RESTRICT out) { |
46 | 2.80k | const HWY_FULL(uint32_t) du; |
47 | 2.80k | const HWY_FULL(float) df; |
48 | 2.80k | const auto kZero = Zero(du); |
// Values below kSplit = 1 << E are passed through as-is.
49 | 2.80k | const auto kSplit = Set(du, 1 << E); |
// 127 is subtracted from the extracted exponent field (the single-precision
// exponent bias); kEBOffset additionally removes the M + L bits that are
// folded into the token itself.
50 | 2.80k | const auto kExpOffset = Set(du, 127); |
51 | 2.80k | const auto kEBOffset = Set(du, 127 + M + L); |
52 | 2.80k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); |
53 | 2.80k | const auto kMulN = Set(du, 1 << (M + L)); |
// kMaskL selects the low L bits of the input; kMaskM selects M bits located
// directly above them.
54 | 2.80k | const auto kMaskL = Set(du, (1 << L) - 1); |
55 | 2.80k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); |
// Inputs greater than 2^22 - 1 are shifted right by kLargeShiftVal before the
// int-to-float conversion, and kLargeShiftVal is added back to the exponent
// afterwards. Presumably this keeps the conversion exact - TODO confirm.
56 | 2.80k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); |
57 | 2.80k | constexpr size_t kLargeShiftVal = 10; |
58 | 2.80k | const auto kLargeShift = Set(du, kLargeShiftVal); |
59 | | |
60 | 2.80k | auto extra_bits = kZero; |
// last_full = len rounded down to a multiple of the vector lane count.
61 | 2.80k | size_t last_full = Lanes(du) * (len / Lanes(du)); |
// Main loop: whole vectors only.
62 | 5.60k | for (size_t i = 0; i < last_full; i += Lanes(du)) { |
63 | 2.80k | const auto val = LoadU(du, values + i); |
64 | 2.80k | const auto is_large = Gt(val, kLargeThreshold); |
65 | 2.80k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); |
66 | 2.80k | const auto not_literal = Ge(val, kSplit); |
67 | 2.80k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); |
// b holds the raw bits of the float conversion of val_fixed.
68 | 2.80k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); |
69 | 2.80k | const auto l = And(val, kMaskL); |
70 | 2.80k | const auto exp = ShiftRight<23>(b); |
// Undo the pre-shift applied to large inputs.
71 | 2.80k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); |
72 | 2.80k | const auto n = Sub(exp_fixed, kExpOffset); |
73 | 2.80k | const auto eb = Sub(exp_fixed, kEBOffset); |
// Shifts the float bits right by 23 - M - L so that kMaskM (next use of m)
// picks M bits positioned above the L low bits of the token.
74 | 2.80k | const auto m = ShiftRight<23 - M - L>(b); |
75 | 2.80k | const auto a = Add(kBase, Mul(n, kMulN)); |
76 | 2.80k | const auto d = And(m, kMaskM); |
// Lanes below kSplit contribute no extra bits.
77 | 2.80k | const auto eb_fixed = IfThenElseZero(not_literal, eb); |
78 | 2.80k | const auto c = Or(a, l); |
79 | 2.80k | extra_bits = Add(extra_bits, eb_fixed); |
80 | 2.80k | const auto t = Or(c, d); |
81 | 2.80k | const auto t_fixed = IfThenElse(not_literal, t, val); |
82 | 2.80k | Store(t_fixed, du, out + i); |
83 | 2.80k | } |
// Tail: same computation for the final partial vector. `take` is true for
// lanes whose index is below len; it masks only the extra_bits accumulation,
// while the load and the store still cover the full vector.
84 | 2.80k | if (last_full < len) { |
// NOTE(review): len and last_full are size_t but become uint32_t lanes here.
85 | 1.40k | const auto stop = Set(du, len); |
86 | 1.40k | const auto fence = Iota(du, last_full); |
87 | 1.40k | const auto take = Lt(fence, stop); |
88 | 1.40k | const auto val = LoadU(du, values + last_full); |
89 | 1.40k | const auto is_large = Gt(val, kLargeThreshold); |
90 | 1.40k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); |
91 | 1.40k | const auto not_literal = Ge(val, kSplit); |
92 | 1.40k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); |
93 | 1.40k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); |
94 | 1.40k | const auto l = And(val, kMaskL); |
95 | 1.40k | const auto exp = ShiftRight<23>(b); |
96 | 1.40k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); |
97 | 1.40k | const auto n = Sub(exp_fixed, kExpOffset); |
98 | 1.40k | const auto eb = Sub(exp_fixed, kEBOffset); |
99 | 1.40k | const auto m = ShiftRight<23 - M - L>(b); |
100 | 1.40k | const auto a = Add(kBase, Mul(n, kMulN)); |
101 | 1.40k | const auto d = And(m, kMaskM); |
102 | 1.40k | const auto eb_fixed = IfThenElseZero(not_literal, eb); |
// Drop the contribution of lanes at or beyond len.
103 | 1.40k | const auto eb_masked = IfThenElseZero(take, eb_fixed); |
104 | 1.40k | const auto c = Or(a, l); |
105 | 1.40k | extra_bits = Add(extra_bits, eb_masked); |
106 | 1.40k | const auto t = Or(c, d); |
107 | 1.40k | const auto t_fixed = IfThenElse(not_literal, t, val); |
108 | 1.40k | Store(t_fixed, du, out + last_full); |
109 | 1.40k | } |
// Horizontal sum of the per-lane accumulators, returned as a scalar.
110 | 2.80k | return GetLane(SumOfLanes(du, extra_bits)); |
111 | 2.80k | } Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, 
unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 45 | 700 | uint32_t* JXL_RESTRICT out) { | 46 | 700 | const HWY_FULL(uint32_t) du; | 47 | 
700 | const HWY_FULL(float) df; | 48 | 700 | const auto kZero = Zero(du); | 49 | 700 | const auto kSplit = Set(du, 1 << E); | 50 | 700 | const auto kExpOffset = Set(du, 127); | 51 | 700 | const auto kEBOffset = Set(du, 127 + M + L); | 52 | 700 | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 53 | 700 | const auto kMulN = Set(du, 1 << (M + L)); | 54 | 700 | const auto kMaskL = Set(du, (1 << L) - 1); | 55 | 700 | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 56 | 700 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 57 | 700 | constexpr size_t kLargeShiftVal = 10; | 58 | 700 | const auto kLargeShift = Set(du, kLargeShiftVal); | 59 | | | 60 | 700 | auto extra_bits = kZero; | 61 | 700 | size_t last_full = Lanes(du) * (len / Lanes(du)); | 62 | 1.40k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 63 | 700 | const auto val = LoadU(du, values + i); | 64 | 700 | const auto is_large = Gt(val, kLargeThreshold); | 65 | 700 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 66 | 700 | const auto not_literal = Ge(val, kSplit); | 67 | 700 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 68 | 700 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 69 | 700 | const auto l = And(val, kMaskL); | 70 | 700 | const auto exp = ShiftRight<23>(b); | 71 | 700 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 72 | 700 | const auto n = Sub(exp_fixed, kExpOffset); | 73 | 700 | const auto eb = Sub(exp_fixed, kEBOffset); | 74 | 700 | const auto m = ShiftRight<23 - M - L>(b); | 75 | 700 | const auto a = Add(kBase, Mul(n, kMulN)); | 76 | 700 | const auto d = And(m, kMaskM); | 77 | 700 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 78 | 700 | const auto c = Or(a, l); | 79 | 700 | extra_bits = Add(extra_bits, eb_fixed); | 80 | 700 | const auto t = Or(c, d); | 81 | 700 | const auto t_fixed = IfThenElse(not_literal, t, val); | 82 | 700 | Store(t_fixed, du, out + i); | 83 | 
700 | } | 84 | 700 | if (last_full < len) { | 85 | 350 | const auto stop = Set(du, len); | 86 | 350 | const auto fence = Iota(du, last_full); | 87 | 350 | const auto take = Lt(fence, stop); | 88 | 350 | const auto val = LoadU(du, values + last_full); | 89 | 350 | const auto is_large = Gt(val, kLargeThreshold); | 90 | 350 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 91 | 350 | const auto not_literal = Ge(val, kSplit); | 92 | 350 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 93 | 350 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 94 | 350 | const auto l = And(val, kMaskL); | 95 | 350 | const auto exp = ShiftRight<23>(b); | 96 | 350 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 97 | 350 | const auto n = Sub(exp_fixed, kExpOffset); | 98 | 350 | const auto eb = Sub(exp_fixed, kEBOffset); | 99 | 350 | const auto m = ShiftRight<23 - M - L>(b); | 100 | 350 | const auto a = Add(kBase, Mul(n, kMulN)); | 101 | 350 | const auto d = And(m, kMaskM); | 102 | 350 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 103 | 350 | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 104 | 350 | const auto c = Or(a, l); | 105 | 350 | extra_bits = Add(extra_bits, eb_masked); | 106 | 350 | const auto t = Or(c, d); | 107 | 350 | const auto t_fixed = IfThenElse(not_literal, t, val); | 108 | 350 | Store(t_fixed, du, out + last_full); | 109 | 350 | } | 110 | 700 | return GetLane(SumOfLanes(du, extra_bits)); | 111 | 700 | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 45 | 700 | uint32_t* JXL_RESTRICT out) { | 46 | 700 | const HWY_FULL(uint32_t) du; | 47 | 700 | const HWY_FULL(float) df; | 48 | 700 | const auto kZero = Zero(du); | 49 | 700 | const auto kSplit = Set(du, 1 << E); | 50 | 700 | const auto kExpOffset = Set(du, 127); | 51 | 700 | const auto kEBOffset = Set(du, 127 + M + L); | 52 | 700 | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 53 | 700 | const auto kMulN = Set(du, 1 << (M + L)); | 54 | 700 | const auto kMaskL = Set(du, (1 << L) - 1); | 55 | 700 | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 56 | 700 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 57 | 700 | constexpr size_t kLargeShiftVal = 10; | 58 | 700 | const auto kLargeShift = Set(du, kLargeShiftVal); | 59 | | | 60 | 700 | auto extra_bits = kZero; | 61 | 700 | size_t last_full = Lanes(du) * (len / Lanes(du)); | 62 | 1.40k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 63 | 700 | const auto val = LoadU(du, values + i); | 64 | 700 | const auto is_large = Gt(val, kLargeThreshold); | 65 | 700 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 66 | 700 | const auto not_literal = Ge(val, kSplit); | 67 | 700 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 68 | 700 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 69 | 700 | const auto l = And(val, kMaskL); | 70 | 700 | const auto exp = ShiftRight<23>(b); | 71 | 700 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 72 | 700 | const auto n = Sub(exp_fixed, kExpOffset); | 73 | 700 | const auto eb = Sub(exp_fixed, kEBOffset); | 74 | 700 | const auto m = ShiftRight<23 - M - L>(b); | 75 | 700 | const auto a = Add(kBase, Mul(n, kMulN)); | 76 | 700 | const auto d = And(m, kMaskM); | 77 | 700 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 78 | 700 | const auto 
c = Or(a, l); | 79 | 700 | extra_bits = Add(extra_bits, eb_fixed); | 80 | 700 | const auto t = Or(c, d); | 81 | 700 | const auto t_fixed = IfThenElse(not_literal, t, val); | 82 | 700 | Store(t_fixed, du, out + i); | 83 | 700 | } | 84 | 700 | if (last_full < len) { | 85 | 350 | const auto stop = Set(du, len); | 86 | 350 | const auto fence = Iota(du, last_full); | 87 | 350 | const auto take = Lt(fence, stop); | 88 | 350 | const auto val = LoadU(du, values + last_full); | 89 | 350 | const auto is_large = Gt(val, kLargeThreshold); | 90 | 350 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 91 | 350 | const auto not_literal = Ge(val, kSplit); | 92 | 350 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 93 | 350 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 94 | 350 | const auto l = And(val, kMaskL); | 95 | 350 | const auto exp = ShiftRight<23>(b); | 96 | 350 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 97 | 350 | const auto n = Sub(exp_fixed, kExpOffset); | 98 | 350 | const auto eb = Sub(exp_fixed, kEBOffset); | 99 | 350 | const auto m = ShiftRight<23 - M - L>(b); | 100 | 350 | const auto a = Add(kBase, Mul(n, kMulN)); | 101 | 350 | const auto d = And(m, kMaskM); | 102 | 350 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 103 | 350 | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 104 | 350 | const auto c = Or(a, l); | 105 | 350 | extra_bits = Add(extra_bits, eb_masked); | 106 | 350 | const auto t = Or(c, d); | 107 | 350 | const auto t_fixed = IfThenElse(not_literal, t, val); | 108 | 350 | Store(t_fixed, du, out + last_full); | 109 | 350 | } | 110 | 700 | return GetLane(SumOfLanes(du, extra_bits)); | 111 | 700 | } |
Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 45 | 700 | uint32_t* JXL_RESTRICT out) { | 46 | 700 | const HWY_FULL(uint32_t) du; | 47 | 700 | const HWY_FULL(float) df; | 48 | 700 | const auto kZero = Zero(du); | 49 | 700 | const auto kSplit = Set(du, 1 << E); | 50 | 700 | const auto kExpOffset = Set(du, 127); | 51 | 700 | const auto kEBOffset = Set(du, 127 + M + L); | 52 | 700 | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 53 | 700 | const auto kMulN = Set(du, 1 << (M + L)); | 54 | 700 | const auto kMaskL = Set(du, (1 << L) - 1); | 55 | 700 | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 56 | 700 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 57 | 700 | constexpr size_t kLargeShiftVal = 10; | 58 | 700 | const auto kLargeShift = Set(du, kLargeShiftVal); | 59 | | | 60 | 700 | auto extra_bits = kZero; | 61 | 700 | size_t last_full = Lanes(du) * (len / Lanes(du)); | 62 | 1.40k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 63 | 700 | const auto val = LoadU(du, values + i); | 64 | 700 | const auto is_large = Gt(val, kLargeThreshold); | 65 | 700 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 66 | 700 | const auto not_literal = Ge(val, kSplit); | 67 | 700 | 
const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 68 | 700 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 69 | 700 | const auto l = And(val, kMaskL); | 70 | 700 | const auto exp = ShiftRight<23>(b); | 71 | 700 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 72 | 700 | const auto n = Sub(exp_fixed, kExpOffset); | 73 | 700 | const auto eb = Sub(exp_fixed, kEBOffset); | 74 | 700 | const auto m = ShiftRight<23 - M - L>(b); | 75 | 700 | const auto a = Add(kBase, Mul(n, kMulN)); | 76 | 700 | const auto d = And(m, kMaskM); | 77 | 700 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 78 | 700 | const auto c = Or(a, l); | 79 | 700 | extra_bits = Add(extra_bits, eb_fixed); | 80 | 700 | const auto t = Or(c, d); | 81 | 700 | const auto t_fixed = IfThenElse(not_literal, t, val); | 82 | 700 | Store(t_fixed, du, out + i); | 83 | 700 | } | 84 | 700 | if (last_full < len) { | 85 | 350 | const auto stop = Set(du, len); | 86 | 350 | const auto fence = Iota(du, last_full); | 87 | 350 | const auto take = Lt(fence, stop); | 88 | 350 | const auto val = LoadU(du, values + last_full); | 89 | 350 | const auto is_large = Gt(val, kLargeThreshold); | 90 | 350 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 91 | 350 | const auto not_literal = Ge(val, kSplit); | 92 | 350 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 93 | 350 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 94 | 350 | const auto l = And(val, kMaskL); | 95 | 350 | const auto exp = ShiftRight<23>(b); | 96 | 350 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 97 | 350 | const auto n = Sub(exp_fixed, kExpOffset); | 98 | 350 | const auto eb = Sub(exp_fixed, kEBOffset); | 99 | 350 | const auto m = ShiftRight<23 - M - L>(b); | 100 | 350 | const auto a = Add(kBase, Mul(n, kMulN)); | 101 | 350 | const auto d = And(m, kMaskM); | 102 | 350 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 103 | 
350 | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 104 | 350 | const auto c = Or(a, l); | 105 | 350 | extra_bits = Add(extra_bits, eb_masked); | 106 | 350 | const auto t = Or(c, d); | 107 | 350 | const auto t_fixed = IfThenElse(not_literal, t, val); | 108 | 350 | Store(t_fixed, du, out + last_full); | 109 | 350 | } | 110 | 700 | return GetLane(SumOfLanes(du, extra_bits)); | 111 | 700 | } |
Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 45 | 700 | uint32_t* JXL_RESTRICT out) { | 46 | 700 | const HWY_FULL(uint32_t) du; | 47 | 700 | const HWY_FULL(float) df; | 48 | 700 | const auto kZero = Zero(du); | 49 | 700 | const auto kSplit = Set(du, 1 << E); | 50 | 700 | const auto kExpOffset = Set(du, 127); | 51 | 700 | const auto kEBOffset = Set(du, 127 + M + L); | 52 | 700 | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 53 | 700 | const auto kMulN = Set(du, 1 << (M + L)); | 54 | 700 | const auto kMaskL = Set(du, (1 << L) - 1); | 55 | 700 | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 56 | 700 | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 57 | 700 | constexpr size_t kLargeShiftVal = 10; | 58 | 700 | const auto kLargeShift = Set(du, kLargeShiftVal); | 59 | | | 60 | 700 | auto extra_bits = kZero; | 61 | 700 | size_t last_full = Lanes(du) * (len / Lanes(du)); | 62 | 1.40k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 63 | 700 | const auto val = LoadU(du, values + i); | 64 | 700 | const auto is_large = Gt(val, kLargeThreshold); | 65 | 700 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 66 | 700 | const auto not_literal = Ge(val, kSplit); | 67 | 700 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 68 | 700 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 69 | 700 | const auto l = And(val, kMaskL); | 70 | 700 | const auto exp = ShiftRight<23>(b); | 71 | 700 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 72 | 700 | const auto n = Sub(exp_fixed, kExpOffset); | 73 | 700 | const auto eb = Sub(exp_fixed, kEBOffset); | 74 | 700 | const auto m = ShiftRight<23 - M - L>(b); | 75 | 700 | const auto a = Add(kBase, Mul(n, kMulN)); | 
76 | 700 | const auto d = And(m, kMaskM); | 77 | 700 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 78 | 700 | const auto c = Or(a, l); | 79 | 700 | extra_bits = Add(extra_bits, eb_fixed); | 80 | 700 | const auto t = Or(c, d); | 81 | 700 | const auto t_fixed = IfThenElse(not_literal, t, val); | 82 | 700 | Store(t_fixed, du, out + i); | 83 | 700 | } | 84 | 700 | if (last_full < len) { | 85 | 350 | const auto stop = Set(du, len); | 86 | 350 | const auto fence = Iota(du, last_full); | 87 | 350 | const auto take = Lt(fence, stop); | 88 | 350 | const auto val = LoadU(du, values + last_full); | 89 | 350 | const auto is_large = Gt(val, kLargeThreshold); | 90 | 350 | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 91 | 350 | const auto not_literal = Ge(val, kSplit); | 92 | 350 | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 93 | 350 | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 94 | 350 | const auto l = And(val, kMaskL); | 95 | 350 | const auto exp = ShiftRight<23>(b); | 96 | 350 | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 97 | 350 | const auto n = Sub(exp_fixed, kExpOffset); | 98 | 350 | const auto eb = Sub(exp_fixed, kEBOffset); | 99 | 350 | const auto m = ShiftRight<23 - M - L>(b); | 100 | 350 | const auto a = Add(kBase, Mul(n, kMulN)); | 101 | 350 | const auto d = And(m, kMaskM); | 102 | 350 | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 103 | 350 | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 104 | 350 | const auto c = Or(a, l); | 105 | 350 | extra_bits = Add(extra_bits, eb_masked); | 106 | 350 | const auto t = Or(c, d); | 107 | 350 | const auto t_fixed = IfThenElse(not_literal, t, val); | 108 | 350 | Store(t_fixed, du, out + last_full); | 109 | 350 | } | 110 | 700 | return GetLane(SumOfLanes(du, extra_bits)); | 111 | 700 | } |
Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, 
unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, 
unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 
0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) |
112 | | |
113 | | uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len, |
114 | 2.80k | HybridUintConfig cfg, AlignedMemory& tokens) { |
115 | 2.80k | uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>(); |
116 | | #if HWY_TARGET == HWY_SCALAR |
117 | | uint32_t extra_bits = 0; |
118 | | for (size_t i = 0; i < len; ++i) { |
119 | | uint32_t v = values[i]; |
120 | | uint32_t tok, nbits, bits; |
121 | | cfg.Encode(v, &tok, &nbits, &bits); |
122 | | extra_bits += nbits; |
123 | | out[i] = tok; |
124 | | } |
125 | | return extra_bits; |
126 | | #else |
127 | 2.80k | if (cfg.split_exponent == 0) { |
128 | 700 | return EstimateTokenCostImpl<0, 0, 0>(values, len, out); |
129 | 2.10k | } else if (cfg.split_exponent == 2) { |
130 | 700 | JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1)); |
131 | 700 | return EstimateTokenCostImpl<2, 0, 1>(values, len, out); |
132 | 1.40k | } else if (cfg.split_exponent == 3) { |
133 | 0 | if (cfg.msb_in_token == 1) { |
134 | 0 | if (cfg.lsb_in_token == 0) { |
135 | 0 | return EstimateTokenCostImpl<3, 1, 0>(values, len, out); |
136 | 0 | } else { |
137 | 0 | JXL_DASSERT(cfg.lsb_in_token == 2); |
138 | 0 | return EstimateTokenCostImpl<3, 1, 2>(values, len, out); |
139 | 0 | } |
140 | 0 | } else { |
141 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); |
142 | 0 | if (cfg.lsb_in_token == 0) { |
143 | 0 | return EstimateTokenCostImpl<3, 2, 0>(values, len, out); |
144 | 0 | } else { |
145 | 0 | JXL_DASSERT(cfg.lsb_in_token == 1); |
146 | 0 | return EstimateTokenCostImpl<3, 2, 1>(values, len, out); |
147 | 0 | } |
148 | 0 | } |
149 | 1.40k | } else if (cfg.split_exponent == 4) { |
150 | 1.40k | if (cfg.msb_in_token == 1) { |
151 | 700 | if (cfg.lsb_in_token == 0) { |
152 | 0 | return EstimateTokenCostImpl<4, 1, 0>(values, len, out); |
153 | 700 | } else if (cfg.lsb_in_token == 2) { |
154 | 700 | return EstimateTokenCostImpl<4, 1, 2>(values, len, out); |
155 | 700 | } else { |
156 | 0 | JXL_DASSERT(cfg.lsb_in_token == 3); |
157 | 0 | return EstimateTokenCostImpl<4, 1, 3>(values, len, out); |
158 | 0 | } |
159 | 700 | } else { |
160 | 700 | JXL_DASSERT(cfg.msb_in_token == 2); |
161 | 700 | if (cfg.lsb_in_token == 0) { |
162 | 700 | return EstimateTokenCostImpl<4, 2, 0>(values, len, out); |
163 | 700 | } else if (cfg.lsb_in_token == 1) { |
164 | 0 | return EstimateTokenCostImpl<4, 2, 1>(values, len, out); |
165 | 0 | } else { |
166 | 0 | JXL_DASSERT(cfg.lsb_in_token == 2); |
167 | 0 | return EstimateTokenCostImpl<4, 2, 2>(values, len, out); |
168 | 0 | } |
169 | 700 | } |
170 | 1.40k | } else if (cfg.split_exponent == 5) { |
171 | 0 | if (cfg.msb_in_token == 1) { |
172 | 0 | if (cfg.lsb_in_token == 0) { |
173 | 0 | return EstimateTokenCostImpl<5, 1, 0>(values, len, out); |
174 | 0 | } else if (cfg.lsb_in_token == 2) { |
175 | 0 | return EstimateTokenCostImpl<5, 1, 2>(values, len, out); |
176 | 0 | } else { |
177 | 0 | JXL_DASSERT(cfg.lsb_in_token == 4); |
178 | 0 | return EstimateTokenCostImpl<5, 1, 4>(values, len, out); |
179 | 0 | } |
180 | 0 | } else { |
181 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); |
182 | 0 | if (cfg.lsb_in_token == 0) { |
183 | 0 | return EstimateTokenCostImpl<5, 2, 0>(values, len, out); |
184 | 0 | } else if (cfg.lsb_in_token == 1) { |
185 | 0 | return EstimateTokenCostImpl<5, 2, 1>(values, len, out); |
186 | 0 | } else if (cfg.lsb_in_token == 2) { |
187 | 0 | return EstimateTokenCostImpl<5, 2, 2>(values, len, out); |
188 | 0 | } else { |
189 | 0 | JXL_DASSERT(cfg.lsb_in_token == 3); |
190 | 0 | return EstimateTokenCostImpl<5, 2, 3>(values, len, out); |
191 | 0 | } |
192 | 0 | } |
193 | 0 | } else if (cfg.split_exponent == 6) { |
194 | 0 | if (cfg.msb_in_token == 0) { |
195 | 0 | JXL_DASSERT(cfg.lsb_in_token == 0); |
196 | 0 | return EstimateTokenCostImpl<6, 0, 0>(values, len, out); |
197 | 0 | } else if (cfg.msb_in_token == 1) { |
198 | 0 | JXL_DASSERT(cfg.lsb_in_token == 5); |
199 | 0 | return EstimateTokenCostImpl<6, 1, 5>(values, len, out); |
200 | 0 | } else { |
201 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); |
202 | 0 | JXL_DASSERT(cfg.lsb_in_token == 4); |
203 | 0 | return EstimateTokenCostImpl<6, 2, 4>(values, len, out); |
204 | 0 | } |
205 | 0 | } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) { |
206 | 0 | JXL_DASSERT(cfg.msb_in_token == 0); |
207 | 0 | JXL_DASSERT(cfg.lsb_in_token == 0); |
208 | 0 | if (cfg.split_exponent == 7) { |
209 | 0 | return EstimateTokenCostImpl<7, 0, 0>(values, len, out); |
210 | 0 | } else if (cfg.split_exponent == 8) { |
211 | 0 | return EstimateTokenCostImpl<8, 0, 0>(values, len, out); |
212 | 0 | } else if (cfg.split_exponent == 9) { |
213 | 0 | return EstimateTokenCostImpl<9, 0, 0>(values, len, out); |
214 | 0 | } else if (cfg.split_exponent == 10) { |
215 | 0 | return EstimateTokenCostImpl<10, 0, 0>(values, len, out); |
216 | 0 | } else if (cfg.split_exponent == 11) { |
217 | 0 | return EstimateTokenCostImpl<11, 0, 0>(values, len, out); |
218 | 0 | } else { |
219 | 0 | return EstimateTokenCostImpl<12, 0, 0>(values, len, out); |
220 | 0 | } |
221 | 0 | } else { |
222 | 0 | JXL_DASSERT(false); |
223 | 0 | } |
224 | 0 | return ~0; |
225 | 2.80k | #endif |
226 | 2.80k | } Unexecuted instantiation: jxl::N_SSE4::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&) jxl::N_AVX2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&) Line | Count | Source | 114 | 2.80k | HybridUintConfig cfg, AlignedMemory& tokens) { | 115 | 2.80k | uint32_t* JXL_RESTRICT out = tokens.address<uint32_t>(); | 116 | | #if HWY_TARGET == HWY_SCALAR | 117 | | uint32_t extra_bits = 0; | 118 | | for (size_t i = 0; i < len; ++i) { | 119 | | uint32_t v = values[i]; | 120 | | uint32_t tok, nbits, bits; | 121 | | cfg.Encode(v, &tok, &nbits, &bits); | 122 | | extra_bits += nbits; | 123 | | out[i] = tok; | 124 | | } | 125 | | return extra_bits; | 126 | | #else | 127 | 2.80k | if (cfg.split_exponent == 0) { | 128 | 700 | return EstimateTokenCostImpl<0, 0, 0>(values, len, out); | 129 | 2.10k | } else if (cfg.split_exponent == 2) { | 130 | 700 | JXL_DASSERT((cfg.msb_in_token == 0) && (cfg.lsb_in_token == 1)); | 131 | 700 | return EstimateTokenCostImpl<2, 0, 1>(values, len, out); | 132 | 1.40k | } else if (cfg.split_exponent == 3) { | 133 | 0 | if (cfg.msb_in_token == 1) { | 134 | 0 | if (cfg.lsb_in_token == 0) { | 135 | 0 | return EstimateTokenCostImpl<3, 1, 0>(values, len, out); | 136 | 0 | } else { | 137 | 0 | JXL_DASSERT(cfg.lsb_in_token == 2); | 138 | 0 | return EstimateTokenCostImpl<3, 1, 2>(values, len, out); | 139 | 0 | } | 140 | 0 | } else { | 141 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); | 142 | 0 | if (cfg.lsb_in_token == 0) { | 143 | 0 | return EstimateTokenCostImpl<3, 2, 0>(values, len, out); | 144 | 0 | } else { | 145 | 0 | JXL_DASSERT(cfg.lsb_in_token == 1); | 146 | 0 | return EstimateTokenCostImpl<3, 2, 1>(values, len, out); | 147 | 0 | } | 148 | 0 | } | 149 | 1.40k | } else if (cfg.split_exponent == 4) { | 150 | 1.40k | if (cfg.msb_in_token == 1) { | 151 | 700 | if (cfg.lsb_in_token == 0) { | 152 | 0 | return EstimateTokenCostImpl<4, 1, 0>(values, len, out); | 153 
| 700 | } else if (cfg.lsb_in_token == 2) { | 154 | 700 | return EstimateTokenCostImpl<4, 1, 2>(values, len, out); | 155 | 700 | } else { | 156 | 0 | JXL_DASSERT(cfg.lsb_in_token == 3); | 157 | 0 | return EstimateTokenCostImpl<4, 1, 3>(values, len, out); | 158 | 0 | } | 159 | 700 | } else { | 160 | 700 | JXL_DASSERT(cfg.msb_in_token == 2); | 161 | 700 | if (cfg.lsb_in_token == 0) { | 162 | 700 | return EstimateTokenCostImpl<4, 2, 0>(values, len, out); | 163 | 700 | } else if (cfg.lsb_in_token == 1) { | 164 | 0 | return EstimateTokenCostImpl<4, 2, 1>(values, len, out); | 165 | 0 | } else { | 166 | 0 | JXL_DASSERT(cfg.lsb_in_token == 2); | 167 | 0 | return EstimateTokenCostImpl<4, 2, 2>(values, len, out); | 168 | 0 | } | 169 | 700 | } | 170 | 1.40k | } else if (cfg.split_exponent == 5) { | 171 | 0 | if (cfg.msb_in_token == 1) { | 172 | 0 | if (cfg.lsb_in_token == 0) { | 173 | 0 | return EstimateTokenCostImpl<5, 1, 0>(values, len, out); | 174 | 0 | } else if (cfg.lsb_in_token == 2) { | 175 | 0 | return EstimateTokenCostImpl<5, 1, 2>(values, len, out); | 176 | 0 | } else { | 177 | 0 | JXL_DASSERT(cfg.lsb_in_token == 4); | 178 | 0 | return EstimateTokenCostImpl<5, 1, 4>(values, len, out); | 179 | 0 | } | 180 | 0 | } else { | 181 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); | 182 | 0 | if (cfg.lsb_in_token == 0) { | 183 | 0 | return EstimateTokenCostImpl<5, 2, 0>(values, len, out); | 184 | 0 | } else if (cfg.lsb_in_token == 1) { | 185 | 0 | return EstimateTokenCostImpl<5, 2, 1>(values, len, out); | 186 | 0 | } else if (cfg.lsb_in_token == 2) { | 187 | 0 | return EstimateTokenCostImpl<5, 2, 2>(values, len, out); | 188 | 0 | } else { | 189 | 0 | JXL_DASSERT(cfg.lsb_in_token == 3); | 190 | 0 | return EstimateTokenCostImpl<5, 2, 3>(values, len, out); | 191 | 0 | } | 192 | 0 | } | 193 | 0 | } else if (cfg.split_exponent == 6) { | 194 | 0 | if (cfg.msb_in_token == 0) { | 195 | 0 | JXL_DASSERT(cfg.lsb_in_token == 0); | 196 | 0 | return EstimateTokenCostImpl<6, 0, 0>(values, len, 
out); | 197 | 0 | } else if (cfg.msb_in_token == 1) { | 198 | 0 | JXL_DASSERT(cfg.lsb_in_token == 5); | 199 | 0 | return EstimateTokenCostImpl<6, 1, 5>(values, len, out); | 200 | 0 | } else { | 201 | 0 | JXL_DASSERT(cfg.msb_in_token == 2); | 202 | 0 | JXL_DASSERT(cfg.lsb_in_token == 4); | 203 | 0 | return EstimateTokenCostImpl<6, 2, 4>(values, len, out); | 204 | 0 | } | 205 | 0 | } else if (cfg.split_exponent >= 7 && cfg.split_exponent <= 12) { | 206 | 0 | JXL_DASSERT(cfg.msb_in_token == 0); | 207 | 0 | JXL_DASSERT(cfg.lsb_in_token == 0); | 208 | 0 | if (cfg.split_exponent == 7) { | 209 | 0 | return EstimateTokenCostImpl<7, 0, 0>(values, len, out); | 210 | 0 | } else if (cfg.split_exponent == 8) { | 211 | 0 | return EstimateTokenCostImpl<8, 0, 0>(values, len, out); | 212 | 0 | } else if (cfg.split_exponent == 9) { | 213 | 0 | return EstimateTokenCostImpl<9, 0, 0>(values, len, out); | 214 | 0 | } else if (cfg.split_exponent == 10) { | 215 | 0 | return EstimateTokenCostImpl<10, 0, 0>(values, len, out); | 216 | 0 | } else if (cfg.split_exponent == 11) { | 217 | 0 | return EstimateTokenCostImpl<11, 0, 0>(values, len, out); | 218 | 0 | } else { | 219 | 0 | return EstimateTokenCostImpl<12, 0, 0>(values, len, out); | 220 | 0 | } | 221 | 0 | } else { | 222 | 0 | JXL_DASSERT(false); | 223 | 0 | } | 224 | 0 | return ~0; | 225 | 2.80k | #endif | 226 | 2.80k | } |
Unexecuted instantiation: jxl::N_SSE2::EstimateTokenCost(unsigned int*, unsigned long, jxl::HybridUintConfig, jxl::AlignedMemory&) |
227 | | |
228 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
229 | | } // namespace HWY_NAMESPACE |
230 | | } // namespace jxl |
231 | | HWY_AFTER_NAMESPACE(); |
232 | | |
233 | | #if HWY_ONCE |
234 | | namespace jxl { |
235 | | |
236 | | HWY_EXPORT(EstimateTokenCost); |
237 | | |
238 | | uint32_t EstimateTokenCost(uint32_t* JXL_RESTRICT values, size_t len, |
239 | 2.80k | HybridUintConfig cfg, AlignedMemory& tokens) { |
240 | 2.80k | JXL_DASSERT(cfg.lsb_in_token + cfg.msb_in_token <= cfg.split_exponent); |
241 | 2.80k | return HWY_DYNAMIC_DISPATCH(EstimateTokenCost)(values, len, cfg, tokens); |
242 | 2.80k | } |
243 | | |
244 | | } // namespace jxl |
245 | | #endif |