110 | 834k | } Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 100k | uint32_t* JXL_RESTRICT out) { | 45 | 100k | const HWY_FULL(uint32_t) du; | 46 | 100k | const HWY_FULL(float) df; | 47 | 100k | const auto kZero = Zero(du); | 48 | 100k | const auto kSplit = Set(du, 1 << E); | 49 | 100k | const auto kExpOffset = Set(du, 127); | 50 | 100k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 100k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 100k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 100k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 100k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 100k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 100k | constexpr size_t kLargeShiftVal = 10; | 57 | 100k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 100k | auto extra_bits = kZero; | 60 | 100k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 8.83M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 8.73M | const auto val = LoadU(du, values + i); | 63 | 8.73M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 8.73M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 8.73M | const auto not_literal = Ge(val, kSplit); | 66 | 8.73M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 8.73M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 8.73M | const auto l = And(val, kMaskL); | 69 | 8.73M | const auto exp = ShiftRight<23>(b); | 70 | 8.73M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 8.73M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 8.73M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 8.73M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 8.73M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 8.73M | const auto d = And(m, kMaskM); | 76 | 8.73M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 8.73M | const auto c = Or(a, l); | 78 | 8.73M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 8.73M | const auto t = Or(c, d); | 80 | 8.73M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 8.73M | Store(t_fixed, du, out + i); | 82 | 8.73M | } | 83 | 100k | if (last_full < len) { | 84 | 87.0k | const auto stop = Set(du, len); | 85 | 87.0k | const auto fence = Iota(du, last_full); | 86 | 87.0k | const auto take = Lt(fence, stop); | 87 | 87.0k | const auto val = LoadU(du, values + last_full); | 88 | 87.0k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 87.0k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 87.0k | const auto not_literal = Ge(val, kSplit); | 91 | 87.0k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 87.0k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 87.0k | const auto l = And(val, kMaskL); | 94 | 87.0k | const auto exp = ShiftRight<23>(b); | 95 | 87.0k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 87.0k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 87.0k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 87.0k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 87.0k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 87.0k | const auto d = And(m, kMaskM); | 101 | 87.0k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 87.0k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 87.0k | const auto c = Or(a, l); | 104 | 87.0k | extra_bits = Add(extra_bits, eb_masked); | 105 | 87.0k | const auto t = Or(c, d); | 106 | 87.0k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 87.0k | Store(t_fixed, du, out + last_full); | 108 | 87.0k | } | 109 | 100k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 100k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 97.9k | uint32_t* JXL_RESTRICT out) { | 45 | 97.9k | const HWY_FULL(uint32_t) du; | 46 | 97.9k | const HWY_FULL(float) df; | 47 | 97.9k | const auto kZero = Zero(du); | 48 | 97.9k | const auto kSplit = Set(du, 1 << E); | 49 | 97.9k | const auto kExpOffset = Set(du, 127); | 50 | 97.9k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 97.9k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 97.9k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 97.9k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 97.9k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 97.9k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 97.9k | constexpr size_t kLargeShiftVal = 10; | 57 | 97.9k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 97.9k | auto extra_bits = kZero; | 60 | 97.9k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 8.83M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 8.73M | const auto val = LoadU(du, values + i); | 63 | 8.73M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 8.73M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 8.73M | const auto not_literal = Ge(val, kSplit); | 66 | 8.73M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 8.73M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 8.73M | const auto l = And(val, kMaskL); | 69 | 8.73M | const auto exp = ShiftRight<23>(b); | 70 | 8.73M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 8.73M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 8.73M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 8.73M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 8.73M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 8.73M | const auto d = And(m, kMaskM); | 76 | 8.73M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 8.73M | const auto c = Or(a, l); | 78 | 8.73M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 8.73M | const auto t = Or(c, d); | 80 | 8.73M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 8.73M | Store(t_fixed, du, out + i); | 82 | 8.73M | } | 83 | 97.9k | if (last_full < len) { | 84 | 84.5k | const auto stop = Set(du, len); | 85 | 84.5k | const auto fence = Iota(du, last_full); | 86 | 84.5k | const auto take = Lt(fence, stop); | 87 | 84.5k | const auto val = LoadU(du, values + last_full); | 88 | 84.5k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 84.5k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 84.5k | const auto not_literal = Ge(val, kSplit); | 91 | 84.5k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 84.5k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 84.5k | const auto l = And(val, kMaskL); | 94 | 84.5k | const auto exp = ShiftRight<23>(b); | 95 | 84.5k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 84.5k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 84.5k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 84.5k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 84.5k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 84.5k | const auto d = And(m, kMaskM); | 101 | 84.5k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 84.5k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 84.5k | const auto c = Or(a, l); | 104 | 84.5k | extra_bits = Add(extra_bits, eb_masked); | 105 | 84.5k | const auto t = Or(c, d); | 106 | 84.5k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 84.5k | Store(t_fixed, du, out + last_full); | 108 | 84.5k | } | 109 | 97.9k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 97.9k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 97.9k | uint32_t* JXL_RESTRICT out) { | 45 | 97.9k | const HWY_FULL(uint32_t) du; | 46 | 97.9k | const HWY_FULL(float) df; | 47 | 97.9k | const auto kZero = Zero(du); | 48 | 97.9k | const auto kSplit = Set(du, 1 << E); | 49 | 97.9k | const auto kExpOffset = Set(du, 127); | 50 | 97.9k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 97.9k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 97.9k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 97.9k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 97.9k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 97.9k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 97.9k | constexpr size_t kLargeShiftVal = 10; | 57 | 97.9k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 97.9k | auto extra_bits = kZero; | 60 | 97.9k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 8.83M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 8.73M | const auto val = LoadU(du, values + i); | 63 | 8.73M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 8.73M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 8.73M | const auto not_literal = Ge(val, kSplit); | 66 | 8.73M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 8.73M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 8.73M | const auto l = And(val, kMaskL); | 69 | 8.73M | const auto exp = ShiftRight<23>(b); | 70 | 8.73M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 8.73M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 8.73M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 8.73M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 8.73M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 8.73M | const auto d = And(m, kMaskM); | 76 | 8.73M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 8.73M | const auto c = Or(a, l); | 78 | 8.73M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 8.73M | const auto t = Or(c, d); | 80 | 8.73M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 8.73M | Store(t_fixed, du, out + i); | 82 | 8.73M | } | 83 | 97.9k | if (last_full < len) { | 84 | 84.5k | const auto stop = Set(du, len); | 85 | 84.5k | const auto fence = Iota(du, last_full); | 86 | 84.5k | const auto take = Lt(fence, stop); | 87 | 84.5k | const auto val = LoadU(du, values + last_full); | 88 | 84.5k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 84.5k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 84.5k | const auto not_literal = Ge(val, kSplit); | 91 | 84.5k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 84.5k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 84.5k | const auto l = And(val, kMaskL); | 94 | 84.5k | const auto exp = ShiftRight<23>(b); | 95 | 84.5k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 84.5k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 84.5k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 84.5k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 84.5k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 84.5k | const auto d = And(m, kMaskM); | 101 | 84.5k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 84.5k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 84.5k | const auto c = Or(a, l); | 104 | 84.5k | extra_bits = Add(extra_bits, eb_masked); | 105 | 84.5k | const auto t = Or(c, d); | 106 | 84.5k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 84.5k | Store(t_fixed, du, out + last_full); | 108 | 84.5k | } | 109 | 97.9k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 97.9k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 97.9k | uint32_t* JXL_RESTRICT out) { | 45 | 97.9k | const HWY_FULL(uint32_t) du; | 46 | 97.9k | const HWY_FULL(float) df; | 47 | 97.9k | const auto kZero = Zero(du); | 48 | 97.9k | const auto kSplit = Set(du, 1 << E); | 49 | 97.9k | const auto kExpOffset = Set(du, 127); | 50 | 97.9k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 97.9k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 97.9k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 97.9k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 97.9k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 97.9k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 97.9k | constexpr size_t kLargeShiftVal = 10; | 57 | 97.9k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 97.9k | auto extra_bits = kZero; | 60 | 97.9k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 8.83M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 8.73M | const auto val = LoadU(du, values + i); | 63 | 8.73M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 8.73M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 8.73M | const auto not_literal = Ge(val, kSplit); | 66 | 8.73M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 8.73M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 8.73M | const auto l = And(val, kMaskL); | 69 | 8.73M | const auto exp = ShiftRight<23>(b); | 70 | 8.73M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 8.73M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 8.73M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 8.73M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 8.73M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 8.73M | const auto d = And(m, kMaskM); | 76 | 8.73M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 8.73M | const auto c = Or(a, l); | 78 | 8.73M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 8.73M | const auto t = Or(c, d); | 80 | 8.73M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 8.73M | Store(t_fixed, du, out + i); | 82 | 8.73M | } | 83 | 97.9k | if (last_full < len) { | 84 | 84.5k | const auto stop = Set(du, len); | 85 | 84.5k | const auto fence = Iota(du, last_full); | 86 | 84.5k | const auto take = Lt(fence, stop); | 87 | 84.5k | const auto val = LoadU(du, values + last_full); | 88 | 84.5k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 84.5k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 84.5k | const auto not_literal = Ge(val, kSplit); | 91 | 84.5k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 84.5k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 84.5k | const auto l = And(val, kMaskL); | 94 | 84.5k | const auto exp = ShiftRight<23>(b); | 95 | 84.5k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 84.5k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 84.5k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 84.5k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 84.5k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 84.5k | const auto d = And(m, kMaskM); | 101 | 84.5k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 84.5k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 84.5k | const auto c = Or(a, l); | 104 | 84.5k | extra_bits = Add(extra_bits, eb_masked); | 105 | 84.5k | const auto t = Or(c, d); | 106 | 84.5k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 84.5k | Store(t_fixed, du, out + last_full); | 108 | 84.5k | } | 109 | 97.9k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 97.9k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.4k | uint32_t* JXL_RESTRICT out) { | 45 | 19.4k | const HWY_FULL(uint32_t) du; | 46 | 19.4k | const HWY_FULL(float) df; | 47 | 19.4k | const auto kZero = Zero(du); | 48 | 19.4k | const auto kSplit = Set(du, 1 << E); | 49 | 19.4k | const auto kExpOffset = Set(du, 127); | 50 | 19.4k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.4k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.4k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.4k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.4k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.4k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.4k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.4k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.4k | auto extra_bits = kZero; | 60 | 19.4k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.4k | if (last_full < len) { | 84 | 16.6k | const auto stop = Set(du, len); | 85 | 16.6k | const auto fence = Iota(du, last_full); | 86 | 16.6k | const auto take = Lt(fence, stop); | 87 | 16.6k | const auto val = LoadU(du, values + last_full); | 88 | 16.6k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.6k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.6k | const auto not_literal = Ge(val, kSplit); | 91 | 16.6k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.6k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.6k | const auto l = And(val, kMaskL); | 94 | 16.6k | const auto exp = ShiftRight<23>(b); | 95 | 16.6k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.6k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.6k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.6k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.6k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.6k | const auto d = And(m, kMaskM); | 101 | 16.6k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.6k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.6k | const auto c = Or(a, l); | 104 | 16.6k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.6k | const auto t = Or(c, d); | 106 | 16.6k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.6k | Store(t_fixed, du, out + last_full); | 108 | 16.6k | } | 109 | 19.4k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.4k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.4k | uint32_t* JXL_RESTRICT out) { | 45 | 19.4k | const HWY_FULL(uint32_t) du; | 46 | 19.4k | const HWY_FULL(float) df; | 47 | 19.4k | const auto kZero = Zero(du); | 48 | 19.4k | const auto kSplit = Set(du, 1 << E); | 49 | 19.4k | const auto kExpOffset = Set(du, 127); | 50 | 19.4k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.4k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.4k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.4k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.4k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.4k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.4k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.4k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.4k | auto extra_bits = kZero; | 60 | 19.4k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.4k | if (last_full < len) { | 84 | 16.6k | const auto stop = Set(du, len); | 85 | 16.6k | const auto fence = Iota(du, last_full); | 86 | 16.6k | const auto take = Lt(fence, stop); | 87 | 16.6k | const auto val = LoadU(du, values + last_full); | 88 | 16.6k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.6k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.6k | const auto not_literal = Ge(val, kSplit); | 91 | 16.6k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.6k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.6k | const auto l = And(val, kMaskL); | 94 | 16.6k | const auto exp = ShiftRight<23>(b); | 95 | 16.6k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.6k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.6k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.6k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.6k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.6k | const auto d = And(m, kMaskM); | 101 | 16.6k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.6k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.6k | const auto c = Or(a, l); | 104 | 16.6k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.6k | const auto t = Or(c, d); | 106 | 16.6k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.6k | Store(t_fixed, du, out + last_full); | 108 | 16.6k | } | 109 | 19.4k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.4k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 17.1k | uint32_t* JXL_RESTRICT out) { | 45 | 17.1k | const HWY_FULL(uint32_t) du; | 46 | 17.1k | const HWY_FULL(float) df; | 47 | 17.1k | const auto kZero = Zero(du); | 48 | 17.1k | const auto kSplit = Set(du, 1 << E); | 49 | 17.1k | const auto kExpOffset = Set(du, 127); | 50 | 17.1k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 17.1k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 17.1k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 17.1k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 17.1k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 17.1k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 17.1k | constexpr size_t kLargeShiftVal = 10; | 57 | 17.1k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 17.1k | auto extra_bits = kZero; | 60 | 17.1k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 579k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 562k | const auto val = LoadU(du, values + i); | 63 | 562k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 562k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 562k | const auto not_literal = Ge(val, kSplit); | 66 | 562k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 562k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 562k | const auto l = And(val, kMaskL); | 69 | 562k | const auto exp = ShiftRight<23>(b); | 70 | 562k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 562k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 562k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 562k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 562k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 562k | const auto d = And(m, kMaskM); | 76 | 562k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 562k | const auto c = Or(a, l); | 78 | 562k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 562k | const auto t = Or(c, d); | 80 | 562k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 562k | Store(t_fixed, du, out + i); | 82 | 562k | } | 83 | 17.1k | if (last_full < len) { | 84 | 14.6k | const auto stop = Set(du, len); | 85 | 14.6k | const auto fence = Iota(du, last_full); | 86 | 14.6k | const auto take = Lt(fence, stop); | 87 | 14.6k | const auto val = LoadU(du, values + last_full); | 88 | 14.6k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 14.6k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 14.6k | const auto not_literal = Ge(val, kSplit); | 91 | 14.6k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 14.6k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 14.6k | const auto l = And(val, kMaskL); | 94 | 14.6k | const auto exp = ShiftRight<23>(b); | 95 | 14.6k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 14.6k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 14.6k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 14.6k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 14.6k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 14.6k | const auto d = And(m, kMaskM); | 101 | 14.6k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 14.6k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 14.6k | const auto c = Or(a, l); | 104 | 14.6k | extra_bits = Add(extra_bits, eb_masked); | 105 | 14.6k | const auto t = Or(c, d); | 106 | 14.6k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 14.6k | Store(t_fixed, du, out + last_full); | 108 | 14.6k | } | 109 | 17.1k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 17.1k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 17.1k | uint32_t* JXL_RESTRICT out) { | 45 | 17.1k | const HWY_FULL(uint32_t) du; | 46 | 17.1k | const HWY_FULL(float) df; | 47 | 17.1k | const auto kZero = Zero(du); | 48 | 17.1k | const auto kSplit = Set(du, 1 << E); | 49 | 17.1k | const auto kExpOffset = Set(du, 127); | 50 | 17.1k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 17.1k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 17.1k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 17.1k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 17.1k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 17.1k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 17.1k | constexpr size_t kLargeShiftVal = 10; | 57 | 17.1k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 17.1k | auto extra_bits = kZero; | 60 | 17.1k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 579k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 562k | const auto val = LoadU(du, values + i); | 63 | 562k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 562k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 562k | const auto not_literal = Ge(val, kSplit); | 66 | 562k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 562k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 562k | const auto l = And(val, kMaskL); | 69 | 562k | const auto exp = ShiftRight<23>(b); | 70 | 562k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 562k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 562k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 562k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 562k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 562k | const auto d = And(m, kMaskM); | 76 | 562k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 562k | const auto c = Or(a, l); | 78 | 562k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 562k | const auto t = Or(c, d); | 80 | 562k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 562k | Store(t_fixed, du, out + i); | 82 | 562k | } | 83 | 17.1k | if (last_full < len) { | 84 | 14.6k | const auto stop = Set(du, len); | 85 | 14.6k | const auto fence = Iota(du, last_full); | 86 | 14.6k | const auto take = Lt(fence, stop); | 87 | 14.6k | const auto val = LoadU(du, values + last_full); | 88 | 14.6k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 14.6k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 14.6k | const auto not_literal = Ge(val, kSplit); | 91 | 14.6k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 14.6k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 14.6k | const auto l = And(val, kMaskL); | 94 | 14.6k | const auto exp = ShiftRight<23>(b); | 95 | 14.6k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 14.6k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 14.6k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 14.6k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 14.6k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 14.6k | const auto d = And(m, kMaskM); | 101 | 14.6k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 14.6k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 14.6k | const auto c = Or(a, l); | 104 | 14.6k | extra_bits = Add(extra_bits, eb_masked); | 105 | 14.6k | const auto t = Or(c, d); | 106 | 14.6k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 14.6k | Store(t_fixed, du, out + last_full); | 108 | 14.6k | } | 109 | 17.1k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 17.1k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 19.5k | uint32_t* JXL_RESTRICT out) { | 45 | 19.5k | const HWY_FULL(uint32_t) du; | 46 | 19.5k | const HWY_FULL(float) df; | 47 | 19.5k | const auto kZero = Zero(du); | 48 | 19.5k | const auto kSplit = Set(du, 1 << E); | 49 | 19.5k | const auto kExpOffset = Set(du, 127); | 50 | 19.5k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 19.5k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 19.5k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 19.5k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 19.5k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 19.5k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 19.5k | constexpr size_t kLargeShiftVal = 10; | 57 | 19.5k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 19.5k | auto extra_bits = kZero; | 60 | 19.5k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 1.02M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 1.00M | const auto val = LoadU(du, values + i); | 63 | 1.00M | const auto is_large = Gt(val, kLargeThreshold); | 64 | 1.00M | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 1.00M | const auto not_literal = Ge(val, kSplit); | 66 | 1.00M | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 1.00M | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 1.00M | const auto l = And(val, kMaskL); | 69 | 1.00M | const auto exp = ShiftRight<23>(b); | 70 | 1.00M | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 1.00M | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 1.00M | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 1.00M | const auto m = ShiftRight<23 - M - L>(b); | 74 | 1.00M | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 1.00M | const auto d = And(m, kMaskM); | 76 | 1.00M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 1.00M | const auto c = Or(a, l); | 78 | 1.00M | extra_bits = Add(extra_bits, eb_fixed); | 79 | 1.00M | const auto t = Or(c, d); | 80 | 1.00M | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 1.00M | Store(t_fixed, du, out + i); | 82 | 1.00M | } | 83 | 19.5k | if (last_full < len) { | 84 | 16.7k | const auto stop = Set(du, len); | 85 | 16.7k | const auto fence = Iota(du, last_full); | 86 | 16.7k | const auto take = Lt(fence, stop); | 87 | 16.7k | const auto val = LoadU(du, values + last_full); | 88 | 16.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 16.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 16.7k | const auto not_literal = Ge(val, kSplit); | 91 | 16.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 16.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 16.7k | const auto l = And(val, kMaskL); | 94 | 16.7k | const auto exp = ShiftRight<23>(b); | 95 | 16.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 16.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 16.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 16.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 16.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 16.7k | const auto d = And(m, kMaskM); | 101 | 16.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 16.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 16.7k | const auto c = Or(a, l); | 104 | 16.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 16.7k | const auto t = Or(c, d); | 106 | 16.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 16.7k | Store(t_fixed, du, out + last_full); | 108 | 16.7k | } | 109 | 19.5k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 19.5k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 14.8k | uint32_t* JXL_RESTRICT out) { | 45 | 14.8k | const HWY_FULL(uint32_t) du; | 46 | 14.8k | const HWY_FULL(float) df; | 47 | 14.8k | const auto kZero = Zero(du); | 48 | 14.8k | const auto kSplit = Set(du, 1 << E); | 49 | 14.8k | const auto kExpOffset = Set(du, 127); | 50 | 14.8k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 14.8k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 14.8k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 14.8k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 14.8k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 14.8k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 14.8k | constexpr size_t kLargeShiftVal = 10; | 57 | 14.8k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 14.8k | auto extra_bits = kZero; | 60 | 14.8k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 493k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 478k | const auto val = LoadU(du, values + i); | 63 | 478k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 478k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 478k | const auto not_literal = Ge(val, kSplit); | 66 | 478k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 478k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 478k | const auto l = And(val, kMaskL); | 69 | 478k | const auto exp = ShiftRight<23>(b); | 70 | 478k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 478k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 478k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 478k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 478k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 478k | const auto d = And(m, kMaskM); | 76 | 478k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 478k | const auto c = Or(a, l); | 78 | 478k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 478k | const auto t = Or(c, d); | 80 | 478k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 478k | Store(t_fixed, du, out + i); | 82 | 478k | } | 83 | 14.8k | if (last_full < len) { | 84 | 12.7k | const auto stop = Set(du, len); | 85 | 12.7k | const auto fence = Iota(du, last_full); | 86 | 12.7k | const auto take = Lt(fence, stop); | 87 | 12.7k | const auto val = LoadU(du, values + last_full); | 88 | 12.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 12.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 12.7k | const auto not_literal = Ge(val, kSplit); | 91 | 12.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 12.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 12.7k | const auto l = And(val, kMaskL); | 94 | 12.7k | const auto exp = ShiftRight<23>(b); | 95 | 12.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 12.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 12.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 12.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 12.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 12.7k | const auto d = And(m, kMaskM); | 101 | 12.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 12.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 12.7k | const auto c = Or(a, l); | 104 | 12.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 12.7k | const auto t = Or(c, d); | 106 | 12.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 12.7k | Store(t_fixed, du, out + last_full); | 108 | 12.7k | } | 109 | 14.8k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 14.8k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 14.8k | uint32_t* JXL_RESTRICT out) { | 45 | 14.8k | const HWY_FULL(uint32_t) du; | 46 | 14.8k | const HWY_FULL(float) df; | 47 | 14.8k | const auto kZero = Zero(du); | 48 | 14.8k | const auto kSplit = Set(du, 1 << E); | 49 | 14.8k | const auto kExpOffset = Set(du, 127); | 50 | 14.8k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 14.8k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 14.8k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 14.8k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 14.8k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 14.8k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 14.8k | constexpr size_t kLargeShiftVal = 10; | 57 | 14.8k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 14.8k | auto extra_bits = kZero; | 60 | 14.8k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 493k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 478k | const auto val = LoadU(du, values + i); | 63 | 478k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 478k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 478k | const auto not_literal = Ge(val, kSplit); | 66 | 478k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 478k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 478k | const auto l = And(val, kMaskL); | 69 | 478k | const auto exp = ShiftRight<23>(b); | 70 | 478k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 478k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 478k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 478k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 478k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 478k | const auto d = And(m, kMaskM); | 76 | 478k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 478k | const auto c = Or(a, l); | 78 | 478k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 478k | const auto t = Or(c, d); | 80 | 478k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 478k | Store(t_fixed, du, out + i); | 82 | 478k | } | 83 | 14.8k | if (last_full < len) { | 84 | 12.7k | const auto stop = Set(du, len); | 85 | 12.7k | const auto fence = Iota(du, last_full); | 86 | 12.7k | const auto take = Lt(fence, stop); | 87 | 12.7k | const auto val = LoadU(du, values + last_full); | 88 | 12.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 12.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 12.7k | const auto not_literal = Ge(val, kSplit); | 91 | 12.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 12.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 12.7k | const auto l = And(val, kMaskL); | 94 | 12.7k | const auto exp = ShiftRight<23>(b); | 95 | 12.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 12.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 12.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 12.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 12.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 12.7k | const auto d = And(m, kMaskM); | 101 | 12.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 12.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 12.7k | const auto c = Or(a, l); | 104 | 12.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 12.7k | const auto t = Or(c, d); | 106 | 12.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 12.7k | Store(t_fixed, du, out + last_full); | 108 | 12.7k | } | 109 | 14.8k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 14.8k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 14.8k | uint32_t* JXL_RESTRICT out) { | 45 | 14.8k | const HWY_FULL(uint32_t) du; | 46 | 14.8k | const HWY_FULL(float) df; | 47 | 14.8k | const auto kZero = Zero(du); | 48 | 14.8k | const auto kSplit = Set(du, 1 << E); | 49 | 14.8k | const auto kExpOffset = Set(du, 127); | 50 | 14.8k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 14.8k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 14.8k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 14.8k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 14.8k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 14.8k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 14.8k | constexpr size_t kLargeShiftVal = 10; | 57 | 14.8k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 14.8k | auto extra_bits = kZero; | 60 | 14.8k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 493k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 478k | const auto val = LoadU(du, values + i); | 63 | 478k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 478k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 478k | const auto not_literal = Ge(val, kSplit); | 66 | 478k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 478k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 478k | const auto l = And(val, kMaskL); | 69 | 478k | const auto exp = ShiftRight<23>(b); | 70 | 478k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 478k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 478k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 478k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 478k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 478k | const auto d = And(m, kMaskM); | 76 | 478k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 478k | const auto c = Or(a, l); | 78 | 478k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 478k | const auto t = Or(c, d); | 80 | 478k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 478k | Store(t_fixed, du, out + i); | 82 | 478k | } | 83 | 14.8k | if (last_full < len) { | 84 | 12.7k | const auto stop = Set(du, len); | 85 | 12.7k | const auto fence = Iota(du, last_full); | 86 | 12.7k | const auto take = Lt(fence, stop); | 87 | 12.7k | const auto val = LoadU(du, values + last_full); | 88 | 12.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 12.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 12.7k | const auto not_literal = Ge(val, kSplit); | 91 | 12.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 12.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 12.7k | const auto l = And(val, kMaskL); | 94 | 12.7k | const auto exp = ShiftRight<23>(b); | 95 | 12.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 12.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 12.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 12.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 12.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 12.7k | const auto d = And(m, kMaskM); | 101 | 12.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 12.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 12.7k | const auto c = Or(a, l); | 104 | 12.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 12.7k | const auto t = Or(c, d); | 106 | 12.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 12.7k | Store(t_fixed, du, out + last_full); | 108 | 12.7k | } | 109 | 14.8k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 14.8k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 14.8k | uint32_t* JXL_RESTRICT out) { | 45 | 14.8k | const HWY_FULL(uint32_t) du; | 46 | 14.8k | const HWY_FULL(float) df; | 47 | 14.8k | const auto kZero = Zero(du); | 48 | 14.8k | const auto kSplit = Set(du, 1 << E); | 49 | 14.8k | const auto kExpOffset = Set(du, 127); | 50 | 14.8k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 14.8k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 14.8k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 14.8k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 14.8k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 14.8k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 14.8k | constexpr size_t kLargeShiftVal = 10; | 57 | 14.8k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 14.8k | auto extra_bits = kZero; | 60 | 14.8k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 493k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 478k | const auto val = LoadU(du, values + i); | 63 | 478k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 478k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 478k | const auto not_literal = Ge(val, kSplit); | 66 | 478k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 478k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 478k | const auto l = And(val, kMaskL); | 69 | 478k | const auto exp = ShiftRight<23>(b); | 70 | 478k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 478k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 478k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 478k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 478k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 478k | const auto d = And(m, kMaskM); | 76 | 478k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 478k | const auto c = Or(a, l); | 78 | 478k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 478k | const auto t = Or(c, d); | 80 | 478k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 478k | Store(t_fixed, du, out + i); | 82 | 478k | } | 83 | 14.8k | if (last_full < len) { | 84 | 12.7k | const auto stop = Set(du, len); | 85 | 12.7k | const auto fence = Iota(du, last_full); | 86 | 12.7k | const auto take = Lt(fence, stop); | 87 | 12.7k | const auto val = LoadU(du, values + last_full); | 88 | 12.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 12.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 12.7k | const auto not_literal = Ge(val, kSplit); | 91 | 12.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 12.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 12.7k | const auto l = And(val, kMaskL); | 94 | 12.7k | const auto exp = ShiftRight<23>(b); | 95 | 12.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 12.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 12.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 12.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 12.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 12.7k | const auto d = And(m, kMaskM); | 101 | 12.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 12.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 12.7k | const auto c = Or(a, l); | 104 | 12.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 12.7k | const auto t = Or(c, d); | 106 | 12.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 12.7k | Store(t_fixed, du, out + last_full); | 108 | 12.7k | } | 109 | 14.8k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 14.8k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 44 | 14.8k | uint32_t* JXL_RESTRICT out) { | 45 | 14.8k | const HWY_FULL(uint32_t) du; | 46 | 14.8k | const HWY_FULL(float) df; | 47 | 14.8k | const auto kZero = Zero(du); | 48 | 14.8k | const auto kSplit = Set(du, 1 << E); | 49 | 14.8k | const auto kExpOffset = Set(du, 127); | 50 | 14.8k | const auto kEBOffset = Set(du, 127 + M + L); | 51 | 14.8k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 52 | 14.8k | const auto kMulN = Set(du, 1 << (M + L)); | 53 | 14.8k | const auto kMaskL = Set(du, (1 << L) - 1); | 54 | 14.8k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 55 | 14.8k | const auto kLargeThreshold = Set(du, (1 << 22) - 1); | 56 | 14.8k | constexpr size_t kLargeShiftVal = 10; | 57 | 14.8k | const auto kLargeShift = Set(du, kLargeShiftVal); | 58 | | | 59 | 14.8k | auto extra_bits = kZero; | 60 | 14.8k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 61 | 493k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 62 | 478k | const auto val = LoadU(du, values + i); | 63 | 478k | const auto is_large = Gt(val, kLargeThreshold); | 64 | 478k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 65 | 478k | const auto not_literal = Ge(val, kSplit); | 66 | 478k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 67 | 478k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 68 | 478k | const auto l = And(val, kMaskL); | 69 | 478k | const auto exp = ShiftRight<23>(b); | 70 | 478k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 71 | 478k | const auto n = Sub(exp_fixed, kExpOffset); | 72 | 478k | const auto eb = Sub(exp_fixed, kEBOffset); | 73 | 478k | const auto m = ShiftRight<23 - M - L>(b); | 74 | 478k | const auto a = Add(kBase, Mul(n, kMulN)); | 75 | 478k | const auto d = And(m, kMaskM); | 76 | 478k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 77 | 478k | const auto c = Or(a, l); | 78 | 478k | extra_bits = Add(extra_bits, eb_fixed); | 79 | 478k | const auto t = Or(c, d); | 80 | 478k | const auto t_fixed = IfThenElse(not_literal, t, val); | 81 | 478k | Store(t_fixed, du, out + i); | 82 | 478k | } | 83 | 14.8k | if (last_full < len) { | 84 | 12.7k | const auto stop = Set(du, len); | 85 | 12.7k | const auto fence = Iota(du, last_full); | 86 | 12.7k | const auto take = Lt(fence, stop); | 87 | 12.7k | const auto val = LoadU(du, values + last_full); | 88 | 12.7k | const auto is_large = Gt(val, kLargeThreshold); | 89 | 12.7k | const auto val_shifted = ShiftRight<kLargeShiftVal>(val); | 90 | 12.7k | const auto not_literal = Ge(val, kSplit); | 91 | 12.7k | const auto val_fixed = IfThenElse(is_large, val_shifted, val); | 92 | 12.7k | const auto b = BitCast(du, ConvertTo(df, val_fixed)); | 93 | 12.7k | const auto l = And(val, kMaskL); | 94 | 12.7k | const auto exp = ShiftRight<23>(b); | 95 | 12.7k | const auto exp_fixed = IfThenElse(is_large, Add(exp, kLargeShift), exp); | 96 | 12.7k | const auto n = Sub(exp_fixed, kExpOffset); | 97 | 12.7k | const auto eb = Sub(exp_fixed, kEBOffset); | 98 | 12.7k | const auto m = ShiftRight<23 - M - L>(b); | 99 | 12.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 100 | 12.7k | const auto d = And(m, kMaskM); | 101 | 12.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 102 | 12.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 103 | 12.7k | const auto c = Or(a, l); | 104 | 12.7k | extra_bits = Add(extra_bits, eb_masked); | 105 | 12.7k | const auto t = Or(c, d); | 106 | 12.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 107 | 12.7k | Store(t_fixed, du, out + last_full); | 108 | 12.7k | } | 109 | 14.8k | return GetLane(SumOfLanes(du, extra_bits)); | 110 | 14.8k | } |
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) |