98 | 404k | } Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) unsigned int jxl::N_AVX2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 57.0k | uint32_t* JXL_RESTRICT out) { | 44 | 57.0k | const HWY_FULL(uint32_t) du; | 45 | 57.0k | const HWY_FULL(float) df; | 46 | 57.0k | const auto kZero = Zero(du); | 47 | 57.0k | const auto kSplit = Set(du, 1 << E); | 48 | 57.0k | const auto kExpOffset = Set(du, 127); | 49 | 57.0k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 57.0k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 57.0k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 57.0k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 57.0k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 57.0k | auto extra_bits = kZero; | 56 | 57.0k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 5.49M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 5.43M | const auto val = LoadU(du, values + i); | 59 | 5.43M | const auto not_literal = Ge(val, kSplit); | 60 | 5.43M | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 5.43M | const auto l = And(val, kMaskL); | 62 | 5.43M | const auto exp = ShiftRight<23>(b); | 63 | 5.43M | const auto n = Sub(exp, kExpOffset); | 64 | 5.43M | const auto eb = Sub(exp, kEBOffset); | 65 | 5.43M | const auto m = ShiftRight<23 - M - L>(b); | 66 | 5.43M | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 5.43M | const auto d = And(m, kMaskM); | 68 | 5.43M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 5.43M | const auto c = Or(a, l); | 70 | 5.43M | extra_bits = Add(extra_bits, eb_fixed); | 71 | 5.43M | const auto t = Or(c, d); | 72 | 5.43M | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 5.43M | Store(t_fixed, du, out + i); | 74 | 5.43M | } | 75 | 57.0k | if (last_full < len) { | 76 | 49.3k | const auto stop = Set(du, len); | 77 | 49.3k | const auto fence = Iota(du, last_full); | 78 | 49.3k | const auto take = Lt(fence, stop); | 79 | 49.3k | const auto val = LoadU(du, values + last_full); | 80 | 49.3k | const auto not_literal = Ge(val, kSplit); | 81 | 49.3k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 49.3k | const auto l = And(val, kMaskL); | 83 | 49.3k | const auto exp = ShiftRight<23>(b); | 84 | 49.3k | const auto n = Sub(exp, kExpOffset); | 85 | 49.3k | const auto eb = Sub(exp, kEBOffset); | 86 | 49.3k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 49.3k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 49.3k | const auto d = And(m, kMaskM); | 89 | 49.3k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 49.3k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 49.3k | const auto c = Or(a, l); | 92 | 49.3k | extra_bits = Add(extra_bits, eb_masked); | 93 | 49.3k | const auto t = Or(c, d); | 94 | 49.3k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 49.3k | Store(t_fixed, du, out + last_full); | 96 | 49.3k | } | 97 | 57.0k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 57.0k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 56.4k | uint32_t* JXL_RESTRICT out) { | 44 | 56.4k | const HWY_FULL(uint32_t) du; | 45 | 56.4k | const HWY_FULL(float) df; | 46 | 56.4k | const auto kZero = Zero(du); | 47 | 56.4k | const auto kSplit = Set(du, 1 << E); | 48 | 56.4k | const auto kExpOffset = Set(du, 127); | 49 | 56.4k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 56.4k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 56.4k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 56.4k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 56.4k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 56.4k | auto extra_bits = kZero; | 56 | 56.4k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 5.48M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 5.43M | const auto val = LoadU(du, values + i); | 59 | 5.43M | const auto not_literal = Ge(val, kSplit); | 60 | 5.43M | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 5.43M | const auto l = And(val, kMaskL); | 62 | 5.43M | const auto exp = ShiftRight<23>(b); | 63 | 5.43M | const auto n = Sub(exp, kExpOffset); | 64 | 5.43M | const auto eb = Sub(exp, kEBOffset); | 65 | 5.43M | const auto m = ShiftRight<23 - M - L>(b); | 66 | 5.43M | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 5.43M | const auto d = And(m, kMaskM); | 68 | 5.43M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 5.43M | const auto c = Or(a, l); | 70 | 5.43M | extra_bits = Add(extra_bits, eb_fixed); | 71 | 5.43M | const auto t = Or(c, d); | 72 | 5.43M | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 5.43M | Store(t_fixed, du, out + i); | 74 | 5.43M | } | 75 | 56.4k | if (last_full < len) { | 76 | 48.7k | const auto stop = Set(du, len); | 77 | 48.7k | const auto fence = Iota(du, last_full); | 78 | 48.7k | const auto take = Lt(fence, stop); | 79 | 48.7k | const auto val = LoadU(du, values + last_full); | 80 | 48.7k | const auto not_literal = Ge(val, kSplit); | 81 | 48.7k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 48.7k | const auto l = And(val, kMaskL); | 83 | 48.7k | const auto exp = ShiftRight<23>(b); | 84 | 48.7k | const auto n = Sub(exp, kExpOffset); | 85 | 48.7k | const auto eb = Sub(exp, kEBOffset); | 86 | 48.7k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 48.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 48.7k | const auto d = And(m, kMaskM); | 89 | 48.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 48.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 48.7k | const auto c = Or(a, l); | 92 | 48.7k | extra_bits = Add(extra_bits, eb_masked); | 93 | 48.7k | const auto t = Or(c, d); | 94 | 48.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 48.7k | Store(t_fixed, du, out + last_full); | 96 | 48.7k | } | 97 | 56.4k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 56.4k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 56.4k | uint32_t* JXL_RESTRICT out) { | 44 | 56.4k | const HWY_FULL(uint32_t) du; | 45 | 56.4k | const HWY_FULL(float) df; | 46 | 56.4k | const auto kZero = Zero(du); | 47 | 56.4k | const auto kSplit = Set(du, 1 << E); | 48 | 56.4k | const auto kExpOffset = Set(du, 127); | 49 | 56.4k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 56.4k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 56.4k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 56.4k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 56.4k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 56.4k | auto extra_bits = kZero; | 56 | 56.4k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 5.48M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 5.43M | const auto val = LoadU(du, values + i); | 59 | 5.43M | const auto not_literal = Ge(val, kSplit); | 60 | 5.43M | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 5.43M | const auto l = And(val, kMaskL); | 62 | 5.43M | const auto exp = ShiftRight<23>(b); | 63 | 5.43M | const auto n = Sub(exp, kExpOffset); | 64 | 5.43M | const auto eb = Sub(exp, kEBOffset); | 65 | 5.43M | const auto m = ShiftRight<23 - M - L>(b); | 66 | 5.43M | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 5.43M | const auto d = And(m, kMaskM); | 68 | 5.43M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 5.43M | const auto c = Or(a, l); | 70 | 5.43M | extra_bits = Add(extra_bits, eb_fixed); | 71 | 5.43M | const auto t = Or(c, d); | 72 | 5.43M | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 5.43M | Store(t_fixed, du, out + i); | 74 | 5.43M | } | 75 | 56.4k | if (last_full < len) { | 76 | 48.7k | const auto stop = Set(du, len); | 77 | 48.7k | const auto fence = Iota(du, last_full); | 78 | 48.7k | const auto take = Lt(fence, stop); | 79 | 48.7k | const auto val = LoadU(du, values + last_full); | 80 | 48.7k | const auto not_literal = Ge(val, kSplit); | 81 | 48.7k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 48.7k | const auto l = And(val, kMaskL); | 83 | 48.7k | const auto exp = ShiftRight<23>(b); | 84 | 48.7k | const auto n = Sub(exp, kExpOffset); | 85 | 48.7k | const auto eb = Sub(exp, kEBOffset); | 86 | 48.7k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 48.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 48.7k | const auto d = And(m, kMaskM); | 89 | 48.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 48.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 48.7k | const auto c = Or(a, l); | 92 | 48.7k | extra_bits = Add(extra_bits, eb_masked); | 93 | 48.7k | const auto t = Or(c, d); | 94 | 48.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 48.7k | Store(t_fixed, du, out + last_full); | 96 | 48.7k | } | 97 | 56.4k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 56.4k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 56.4k | uint32_t* JXL_RESTRICT out) { | 44 | 56.4k | const HWY_FULL(uint32_t) du; | 45 | 56.4k | const HWY_FULL(float) df; | 46 | 56.4k | const auto kZero = Zero(du); | 47 | 56.4k | const auto kSplit = Set(du, 1 << E); | 48 | 56.4k | const auto kExpOffset = Set(du, 127); | 49 | 56.4k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 56.4k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 56.4k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 56.4k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 56.4k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 56.4k | auto extra_bits = kZero; | 56 | 56.4k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 5.48M | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 5.43M | const auto val = LoadU(du, values + i); | 59 | 5.43M | const auto not_literal = Ge(val, kSplit); | 60 | 5.43M | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 5.43M | const auto l = And(val, kMaskL); | 62 | 5.43M | const auto exp = ShiftRight<23>(b); | 63 | 5.43M | const auto n = Sub(exp, kExpOffset); | 64 | 5.43M | const auto eb = Sub(exp, kEBOffset); | 65 | 5.43M | const auto m = ShiftRight<23 - M - L>(b); | 66 | 5.43M | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 5.43M | const auto d = And(m, kMaskM); | 68 | 5.43M | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 5.43M | const auto c = Or(a, l); | 70 | 5.43M | extra_bits = Add(extra_bits, eb_fixed); | 71 | 5.43M | const auto t = Or(c, d); | 72 | 5.43M | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 5.43M | Store(t_fixed, du, out + i); | 74 | 5.43M | } | 75 | 56.4k | if (last_full < len) { | 76 | 48.7k | const auto stop = Set(du, len); | 77 | 48.7k | const auto fence = Iota(du, last_full); | 78 | 48.7k | const auto take = Lt(fence, stop); | 79 | 48.7k | const auto val = LoadU(du, values + last_full); | 80 | 48.7k | const auto not_literal = Ge(val, kSplit); | 81 | 48.7k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 48.7k | const auto l = And(val, kMaskL); | 83 | 48.7k | const auto exp = ShiftRight<23>(b); | 84 | 48.7k | const auto n = Sub(exp, kExpOffset); | 85 | 48.7k | const auto eb = Sub(exp, kEBOffset); | 86 | 48.7k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 48.7k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 48.7k | const auto d = And(m, kMaskM); | 89 | 48.7k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 48.7k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 48.7k | const auto c = Or(a, l); | 92 | 48.7k | extra_bits = Add(extra_bits, eb_masked); | 93 | 48.7k | const auto t = Or(c, d); | 94 | 48.7k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 48.7k | Store(t_fixed, du, out + last_full); | 96 | 48.7k | } | 97 | 56.4k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 56.4k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.85k | uint32_t* JXL_RESTRICT out) { | 44 | 7.85k | const HWY_FULL(uint32_t) du; | 45 | 7.85k | const HWY_FULL(float) df; | 46 | 7.85k | const auto kZero = Zero(du); | 47 | 7.85k | const auto kSplit = Set(du, 1 << E); | 48 | 7.85k | const auto kExpOffset = Set(du, 127); | 49 | 7.85k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.85k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.85k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.85k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.85k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.85k | auto extra_bits = kZero; | 56 | 7.85k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.85k | if (last_full < len) { | 76 | 6.62k | const auto stop = Set(du, len); | 77 | 6.62k | const auto fence = Iota(du, last_full); | 78 | 6.62k | const auto take = Lt(fence, stop); | 79 | 6.62k | const auto val = LoadU(du, values + last_full); | 80 | 6.62k | const auto not_literal = Ge(val, kSplit); | 81 | 6.62k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.62k | const auto l = And(val, kMaskL); | 83 | 6.62k | const auto exp = ShiftRight<23>(b); | 84 | 6.62k | const auto n = Sub(exp, kExpOffset); | 85 | 6.62k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.62k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.62k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.62k | const auto d = And(m, kMaskM); | 89 | 6.62k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.62k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.62k | const auto c = Or(a, l); | 92 | 6.62k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.62k | const auto t = Or(c, d); | 94 | 6.62k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.62k | Store(t_fixed, du, out + last_full); | 96 | 6.62k | } | 97 | 7.85k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.85k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.85k | uint32_t* JXL_RESTRICT out) { | 44 | 7.85k | const HWY_FULL(uint32_t) du; | 45 | 7.85k | const HWY_FULL(float) df; | 46 | 7.85k | const auto kZero = Zero(du); | 47 | 7.85k | const auto kSplit = Set(du, 1 << E); | 48 | 7.85k | const auto kExpOffset = Set(du, 127); | 49 | 7.85k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.85k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.85k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.85k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.85k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.85k | auto extra_bits = kZero; | 56 | 7.85k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.85k | if (last_full < len) { | 76 | 6.62k | const auto stop = Set(du, len); | 77 | 6.62k | const auto fence = Iota(du, last_full); | 78 | 6.62k | const auto take = Lt(fence, stop); | 79 | 6.62k | const auto val = LoadU(du, values + last_full); | 80 | 6.62k | const auto not_literal = Ge(val, kSplit); | 81 | 6.62k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.62k | const auto l = And(val, kMaskL); | 83 | 6.62k | const auto exp = ShiftRight<23>(b); | 84 | 6.62k | const auto n = Sub(exp, kExpOffset); | 85 | 6.62k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.62k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.62k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.62k | const auto d = And(m, kMaskM); | 89 | 6.62k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.62k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.62k | const auto c = Or(a, l); | 92 | 6.62k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.62k | const auto t = Or(c, d); | 94 | 6.62k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.62k | Store(t_fixed, du, out + last_full); | 96 | 6.62k | } | 97 | 7.85k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.85k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.11k | uint32_t* JXL_RESTRICT out) { | 44 | 7.11k | const HWY_FULL(uint32_t) du; | 45 | 7.11k | const HWY_FULL(float) df; | 46 | 7.11k | const auto kZero = Zero(du); | 47 | 7.11k | const auto kSplit = Set(du, 1 << E); | 48 | 7.11k | const auto kExpOffset = Set(du, 127); | 49 | 7.11k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.11k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.11k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.11k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.11k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.11k | auto extra_bits = kZero; | 56 | 7.11k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 302k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 295k | const auto val = LoadU(du, values + i); | 59 | 295k | const auto not_literal = Ge(val, kSplit); | 60 | 295k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 295k | const auto l = And(val, kMaskL); | 62 | 295k | const auto exp = ShiftRight<23>(b); | 63 | 295k | const auto n = Sub(exp, kExpOffset); | 64 | 295k | const auto eb = Sub(exp, kEBOffset); | 65 | 295k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 295k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 295k | const auto d = And(m, kMaskM); | 68 | 295k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 295k | const auto c = Or(a, l); | 70 | 295k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 295k | const auto t = Or(c, d); | 72 | 295k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 295k | Store(t_fixed, du, out + i); | 74 | 295k | } | 75 | 7.11k | if (last_full < len) { | 76 | 6.02k | const auto stop = Set(du, len); | 77 | 6.02k | const auto fence = Iota(du, last_full); | 78 | 6.02k | const auto take = Lt(fence, stop); | 79 | 6.02k | const auto val = LoadU(du, values + last_full); | 80 | 6.02k | const auto not_literal = Ge(val, kSplit); | 81 | 6.02k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.02k | const auto l = And(val, kMaskL); | 83 | 6.02k | const auto exp = ShiftRight<23>(b); | 84 | 6.02k | const auto n = Sub(exp, kExpOffset); | 85 | 6.02k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.02k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.02k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.02k | const auto d = And(m, kMaskM); | 89 | 6.02k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.02k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.02k | const auto c = Or(a, l); | 92 | 6.02k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.02k | const auto t = Or(c, d); | 94 | 6.02k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.02k | Store(t_fixed, du, out + last_full); | 96 | 6.02k | } | 97 | 7.11k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.11k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.11k | uint32_t* JXL_RESTRICT out) { | 44 | 7.11k | const HWY_FULL(uint32_t) du; | 45 | 7.11k | const HWY_FULL(float) df; | 46 | 7.11k | const auto kZero = Zero(du); | 47 | 7.11k | const auto kSplit = Set(du, 1 << E); | 48 | 7.11k | const auto kExpOffset = Set(du, 127); | 49 | 7.11k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.11k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.11k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.11k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.11k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.11k | auto extra_bits = kZero; | 56 | 7.11k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 302k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 295k | const auto val = LoadU(du, values + i); | 59 | 295k | const auto not_literal = Ge(val, kSplit); | 60 | 295k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 295k | const auto l = And(val, kMaskL); | 62 | 295k | const auto exp = ShiftRight<23>(b); | 63 | 295k | const auto n = Sub(exp, kExpOffset); | 64 | 295k | const auto eb = Sub(exp, kEBOffset); | 65 | 295k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 295k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 295k | const auto d = And(m, kMaskM); | 68 | 295k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 295k | const auto c = Or(a, l); | 70 | 295k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 295k | const auto t = Or(c, d); | 72 | 295k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 295k | Store(t_fixed, du, out + i); | 74 | 295k | } | 75 | 7.11k | if (last_full < len) { | 76 | 6.02k | const auto stop = Set(du, len); | 77 | 6.02k | const auto fence = Iota(du, last_full); | 78 | 6.02k | const auto take = Lt(fence, stop); | 79 | 6.02k | const auto val = LoadU(du, values + last_full); | 80 | 6.02k | const auto not_literal = Ge(val, kSplit); | 81 | 6.02k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.02k | const auto l = And(val, kMaskL); | 83 | 6.02k | const auto exp = ShiftRight<23>(b); | 84 | 6.02k | const auto n = Sub(exp, kExpOffset); | 85 | 6.02k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.02k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.02k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.02k | const auto d = And(m, kMaskM); | 89 | 6.02k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.02k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.02k | const auto c = Or(a, l); | 92 | 6.02k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.02k | const auto t = Or(c, d); | 94 | 6.02k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.02k | Store(t_fixed, du, out + last_full); | 96 | 6.02k | } | 97 | 7.11k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.11k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 7.86k | uint32_t* JXL_RESTRICT out) { | 44 | 7.86k | const HWY_FULL(uint32_t) du; | 45 | 7.86k | const HWY_FULL(float) df; | 46 | 7.86k | const auto kZero = Zero(du); | 47 | 7.86k | const auto kSplit = Set(du, 1 << E); | 48 | 7.86k | const auto kExpOffset = Set(du, 127); | 49 | 7.86k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 7.86k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 7.86k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 7.86k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 7.86k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 7.86k | auto extra_bits = kZero; | 56 | 7.86k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 575k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 567k | const auto val = LoadU(du, values + i); | 59 | 567k | const auto not_literal = Ge(val, kSplit); | 60 | 567k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 567k | const auto l = And(val, kMaskL); | 62 | 567k | const auto exp = ShiftRight<23>(b); | 63 | 567k | const auto n = Sub(exp, kExpOffset); | 64 | 567k | const auto eb = Sub(exp, kEBOffset); | 65 | 567k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 567k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 567k | const auto d = And(m, kMaskM); | 68 | 567k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 567k | const auto c = Or(a, l); | 70 | 567k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 567k | const auto t = Or(c, d); | 72 | 567k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 567k | Store(t_fixed, du, out + i); | 74 | 567k | } | 75 | 7.86k | if (last_full < len) { | 76 | 6.63k | const auto stop = Set(du, len); | 77 | 6.63k | const auto fence = Iota(du, last_full); | 78 | 6.63k | const auto take = Lt(fence, stop); | 79 | 6.63k | const auto val = LoadU(du, values + last_full); | 80 | 6.63k | const auto not_literal = Ge(val, kSplit); | 81 | 6.63k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 6.63k | const auto l = And(val, kMaskL); | 83 | 6.63k | const auto exp = ShiftRight<23>(b); | 84 | 6.63k | const auto n = Sub(exp, kExpOffset); | 85 | 6.63k | const auto eb = Sub(exp, kEBOffset); | 86 | 6.63k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 6.63k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 6.63k | const auto d = And(m, kMaskM); | 89 | 6.63k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 6.63k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 6.63k | const auto c = Or(a, l); | 92 | 6.63k | extra_bits = Add(extra_bits, eb_masked); | 93 | 6.63k | const auto t = Or(c, d); | 94 | 6.63k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 6.63k | Store(t_fixed, du, out + last_full); | 96 | 6.63k | } | 97 | 7.86k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 7.86k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 6.03k | uint32_t* JXL_RESTRICT out) { | 44 | 6.03k | const HWY_FULL(uint32_t) du; | 45 | 6.03k | const HWY_FULL(float) df; | 46 | 6.03k | const auto kZero = Zero(du); | 47 | 6.03k | const auto kSplit = Set(du, 1 << E); | 48 | 6.03k | const auto kExpOffset = Set(du, 127); | 49 | 6.03k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 6.03k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 6.03k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 6.03k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 6.03k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 6.03k | auto extra_bits = kZero; | 56 | 6.03k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 263k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 257k | const auto val = LoadU(du, values + i); | 59 | 257k | const auto not_literal = Ge(val, kSplit); | 60 | 257k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 257k | const auto l = And(val, kMaskL); | 62 | 257k | const auto exp = ShiftRight<23>(b); | 63 | 257k | const auto n = Sub(exp, kExpOffset); | 64 | 257k | const auto eb = Sub(exp, kEBOffset); | 65 | 257k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 257k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 257k | const auto d = And(m, kMaskM); | 68 | 257k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 257k | const auto c = Or(a, l); | 70 | 257k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 257k | const auto t = Or(c, d); | 72 | 257k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 257k | Store(t_fixed, du, out + i); | 74 | 257k | } | 75 | 6.03k | if (last_full < len) { | 76 | 5.12k | const auto stop = Set(du, len); | 77 | 5.12k | const auto fence = Iota(du, last_full); | 78 | 5.12k | const auto take = Lt(fence, stop); | 79 | 5.12k | const auto val = LoadU(du, values + last_full); | 80 | 5.12k | const auto not_literal = Ge(val, kSplit); | 81 | 5.12k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 5.12k | const auto l = And(val, kMaskL); | 83 | 5.12k | const auto exp = ShiftRight<23>(b); | 84 | 5.12k | const auto n = Sub(exp, kExpOffset); | 85 | 5.12k | const auto eb = Sub(exp, kEBOffset); | 86 | 5.12k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 5.12k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 5.12k | const auto d = And(m, kMaskM); | 89 | 5.12k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 5.12k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 5.12k | const auto c = Or(a, l); | 92 | 5.12k | extra_bits = Add(extra_bits, eb_masked); | 93 | 5.12k | const auto t = Or(c, d); | 94 | 5.12k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 5.12k | Store(t_fixed, du, out + last_full); | 96 | 5.12k | } | 97 | 6.03k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 6.03k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 6.03k | uint32_t* JXL_RESTRICT out) { | 44 | 6.03k | const HWY_FULL(uint32_t) du; | 45 | 6.03k | const HWY_FULL(float) df; | 46 | 6.03k | const auto kZero = Zero(du); | 47 | 6.03k | const auto kSplit = Set(du, 1 << E); | 48 | 6.03k | const auto kExpOffset = Set(du, 127); | 49 | 6.03k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 6.03k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 6.03k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 6.03k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 6.03k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 6.03k | auto extra_bits = kZero; | 56 | 6.03k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 263k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 257k | const auto val = LoadU(du, values + i); | 59 | 257k | const auto not_literal = Ge(val, kSplit); | 60 | 257k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 257k | const auto l = And(val, kMaskL); | 62 | 257k | const auto exp = ShiftRight<23>(b); | 63 | 257k | const auto n = Sub(exp, kExpOffset); | 64 | 257k | const auto eb = Sub(exp, kEBOffset); | 65 | 257k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 257k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 257k | const auto d = And(m, kMaskM); | 68 | 257k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 257k | const auto c = Or(a, l); | 70 | 257k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 257k | const auto t = Or(c, d); | 72 | 257k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 257k | Store(t_fixed, du, out + i); | 74 | 257k | } | 75 | 6.03k | if (last_full < len) { | 76 | 5.12k | const auto stop = Set(du, len); | 77 | 5.12k | const auto fence = Iota(du, last_full); | 78 | 5.12k | const auto take = Lt(fence, stop); | 79 | 5.12k | const auto val = LoadU(du, values + last_full); | 80 | 5.12k | const auto not_literal = Ge(val, kSplit); | 81 | 5.12k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 5.12k | const auto l = And(val, kMaskL); | 83 | 5.12k | const auto exp = ShiftRight<23>(b); | 84 | 5.12k | const auto n = Sub(exp, kExpOffset); | 85 | 5.12k | const auto eb = Sub(exp, kEBOffset); | 86 | 5.12k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 5.12k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 5.12k | const auto d = And(m, kMaskM); | 89 | 5.12k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 5.12k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 5.12k | const auto c = Or(a, l); | 92 | 5.12k | extra_bits = Add(extra_bits, eb_masked); | 93 | 5.12k | const auto t = Or(c, d); | 94 | 5.12k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 5.12k | Store(t_fixed, du, out + last_full); | 96 | 5.12k | } | 97 | 6.03k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 6.03k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 6.03k | uint32_t* JXL_RESTRICT out) { | 44 | 6.03k | const HWY_FULL(uint32_t) du; | 45 | 6.03k | const HWY_FULL(float) df; | 46 | 6.03k | const auto kZero = Zero(du); | 47 | 6.03k | const auto kSplit = Set(du, 1 << E); | 48 | 6.03k | const auto kExpOffset = Set(du, 127); | 49 | 6.03k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 6.03k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 6.03k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 6.03k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 6.03k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 6.03k | auto extra_bits = kZero; | 56 | 6.03k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 263k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 257k | const auto val = LoadU(du, values + i); | 59 | 257k | const auto not_literal = Ge(val, kSplit); | 60 | 257k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 257k | const auto l = And(val, kMaskL); | 62 | 257k | const auto exp = ShiftRight<23>(b); | 63 | 257k | const auto n = Sub(exp, kExpOffset); | 64 | 257k | const auto eb = Sub(exp, kEBOffset); | 65 | 257k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 257k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 257k | const auto d = And(m, kMaskM); | 68 | 257k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 257k | const auto c = Or(a, l); | 70 | 257k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 257k | const auto t = Or(c, d); | 72 | 257k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 257k | Store(t_fixed, du, out + i); | 74 | 257k | } | 75 | 6.03k | if (last_full < len) { | 76 | 5.12k | const auto stop = Set(du, len); | 77 | 5.12k | const auto fence = Iota(du, last_full); | 78 | 5.12k | const auto take = Lt(fence, stop); | 79 | 5.12k | const auto val = LoadU(du, values + last_full); | 80 | 5.12k | const auto not_literal = Ge(val, kSplit); | 81 | 5.12k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 5.12k | const auto l = And(val, kMaskL); | 83 | 5.12k | const auto exp = ShiftRight<23>(b); | 84 | 5.12k | const auto n = Sub(exp, kExpOffset); | 85 | 5.12k | const auto eb = Sub(exp, kEBOffset); | 86 | 5.12k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 5.12k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 5.12k | const auto d = And(m, kMaskM); | 89 | 5.12k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 5.12k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 5.12k | const auto c = Or(a, l); | 92 | 5.12k | extra_bits = Add(extra_bits, eb_masked); | 93 | 5.12k | const auto t = Or(c, d); | 94 | 5.12k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 5.12k | Store(t_fixed, du, out + last_full); | 96 | 5.12k | } | 97 | 6.03k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 6.03k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 6.03k | uint32_t* JXL_RESTRICT out) { | 44 | 6.03k | const HWY_FULL(uint32_t) du; | 45 | 6.03k | const HWY_FULL(float) df; | 46 | 6.03k | const auto kZero = Zero(du); | 47 | 6.03k | const auto kSplit = Set(du, 1 << E); | 48 | 6.03k | const auto kExpOffset = Set(du, 127); | 49 | 6.03k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 6.03k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 6.03k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 6.03k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 6.03k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 6.03k | auto extra_bits = kZero; | 56 | 6.03k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 263k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 257k | const auto val = LoadU(du, values + i); | 59 | 257k | const auto not_literal = Ge(val, kSplit); | 60 | 257k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 257k | const auto l = And(val, kMaskL); | 62 | 257k | const auto exp = ShiftRight<23>(b); | 63 | 257k | const auto n = Sub(exp, kExpOffset); | 64 | 257k | const auto eb = Sub(exp, kEBOffset); | 65 | 257k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 257k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 257k | const auto d = And(m, kMaskM); | 68 | 257k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 257k | const auto c = Or(a, l); | 70 | 257k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 257k | const auto t = Or(c, d); | 72 | 257k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 257k | Store(t_fixed, du, out + i); | 74 | 257k | } | 75 | 6.03k | if (last_full < len) { | 76 | 5.12k | const auto stop = Set(du, len); | 77 | 5.12k | const auto fence = Iota(du, last_full); | 78 | 5.12k | const auto take = Lt(fence, stop); | 79 | 5.12k | const auto val = LoadU(du, values + last_full); | 80 | 5.12k | const auto not_literal = Ge(val, kSplit); | 81 | 5.12k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 5.12k | const auto l = And(val, kMaskL); | 83 | 5.12k | const auto exp = ShiftRight<23>(b); | 84 | 5.12k | const auto n = Sub(exp, kExpOffset); | 85 | 5.12k | const auto eb = Sub(exp, kEBOffset); | 86 | 5.12k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 5.12k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 5.12k | const auto d = And(m, kMaskM); | 89 | 5.12k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 5.12k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 5.12k | const auto c = Or(a, l); | 92 | 5.12k | extra_bits = Add(extra_bits, eb_masked); | 93 | 5.12k | const auto t = Or(c, d); | 94 | 5.12k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 5.12k | Store(t_fixed, du, out + last_full); | 96 | 5.12k | } | 97 | 6.03k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 6.03k | } |
unsigned int jxl::N_AVX2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Line | Count | Source | 43 | 6.03k | uint32_t* JXL_RESTRICT out) { | 44 | 6.03k | const HWY_FULL(uint32_t) du; | 45 | 6.03k | const HWY_FULL(float) df; | 46 | 6.03k | const auto kZero = Zero(du); | 47 | 6.03k | const auto kSplit = Set(du, 1 << E); | 48 | 6.03k | const auto kExpOffset = Set(du, 127); | 49 | 6.03k | const auto kEBOffset = Set(du, 127 + M + L); | 50 | 6.03k | const auto kBase = Set(du, static_cast<uint32_t>((1 << E) - (E << (M + L)))); | 51 | 6.03k | const auto kMulN = Set(du, 1 << (M + L)); | 52 | 6.03k | const auto kMaskL = Set(du, (1 << L) - 1); | 53 | 6.03k | const auto kMaskM = Set(du, ((1 << M) - 1) << L); | 54 | | | 55 | 6.03k | auto extra_bits = kZero; | 56 | 6.03k | size_t last_full = Lanes(du) * (len / Lanes(du)); | 57 | 263k | for (size_t i = 0; i < last_full; i += Lanes(du)) { | 58 | 257k | const auto val = LoadU(du, values + i); | 59 | 257k | const auto not_literal = Ge(val, kSplit); | 60 | 257k | const auto b = BitCast(du, ConvertTo(df, val)); | 61 | 257k | const auto l = And(val, kMaskL); | 62 | 257k | const auto exp = ShiftRight<23>(b); | 63 | 257k | const auto n = Sub(exp, kExpOffset); | 64 | 257k | const auto eb = Sub(exp, kEBOffset); | 65 | 257k | const auto m = ShiftRight<23 - M - L>(b); | 66 | 257k | const auto a = Add(kBase, Mul(n, kMulN)); | 67 | 257k | const auto d = And(m, kMaskM); | 68 | 257k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 69 | 257k | const auto c = Or(a, l); | 70 | 257k | extra_bits = Add(extra_bits, eb_fixed); | 71 | 257k | const auto t = Or(c, d); | 72 | 257k | const auto t_fixed = IfThenElse(not_literal, t, val); | 73 | 257k | Store(t_fixed, du, out + i); | 74 | 257k | } | 75 | 6.03k | if (last_full < len) { | 76 | 5.12k | const auto stop = Set(du, len); | 77 | 5.12k | const auto fence = Iota(du, last_full); | 78 | 5.12k | const auto take = Lt(fence, stop); | 79 | 5.12k | const auto val = LoadU(du, values + last_full); | 80 | 5.12k | const auto not_literal = Ge(val, kSplit); | 81 | 5.12k | const auto b = BitCast(du, ConvertTo(df, val)); | 82 | 5.12k | const auto l = And(val, kMaskL); | 83 | 5.12k | const auto exp = ShiftRight<23>(b); | 84 | 5.12k | const auto n = Sub(exp, kExpOffset); | 85 | 5.12k | const auto eb = Sub(exp, kEBOffset); | 86 | 5.12k | const auto m = ShiftRight<23 - M - L>(b); | 87 | 5.12k | const auto a = Add(kBase, Mul(n, kMulN)); | 88 | 5.12k | const auto d = And(m, kMaskM); | 89 | 5.12k | const auto eb_fixed = IfThenElseZero(not_literal, eb); | 90 | 5.12k | const auto eb_masked = IfThenElseZero(take, eb_fixed); | 91 | 5.12k | const auto c = Or(a, l); | 92 | 5.12k | extra_bits = Add(extra_bits, eb_masked); | 93 | 5.12k | const auto t = Or(c, d); | 94 | 5.12k | const auto t_fixed = IfThenElse(not_literal, t, val); | 95 | 5.12k | Store(t_fixed, du, out + last_full); | 96 | 5.12k | } | 97 | 6.03k | return GetLane(SumOfLanes(du, extra_bits)); | 98 | 6.03k | } |
Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_ZEN4::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_AVX3_SPR::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<0ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<2ul, 0ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<3ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 1ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<4ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 1ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 1ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 2ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<5ul, 2ul, 3ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 1ul, 5ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<6ul, 2ul, 4ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<7ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<8ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<9ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<10ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<11ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) Unexecuted instantiation: unsigned int jxl::N_SSE2::EstimateTokenCostImpl<12ul, 0ul, 0ul>(unsigned int*, unsigned long, unsigned int*) |