/src/libjxl/lib/jxl/base/fast_math-inl.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // Fast SIMD math ops (log2, encoder only, cos, erf for splines) |
7 | | |
8 | | #include <cstdint> |
9 | | |
10 | | #if defined(LIB_JXL_BASE_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE) |
11 | | #ifdef LIB_JXL_BASE_FAST_MATH_INL_H_ |
12 | | #undef LIB_JXL_BASE_FAST_MATH_INL_H_ |
13 | | #else |
14 | | #define LIB_JXL_BASE_FAST_MATH_INL_H_ |
15 | | #endif |
16 | | |
17 | | #include <hwy/highway.h> |
18 | | |
19 | | #include "lib/jxl/base/common.h" |
20 | | #include "lib/jxl/base/rational_polynomial-inl.h" |
21 | | HWY_BEFORE_NAMESPACE(); |
22 | | namespace jxl { |
23 | | namespace HWY_NAMESPACE { |
24 | | |
25 | | // These templates are not found via ADL. |
26 | | using hwy::HWY_NAMESPACE::Abs; |
27 | | using hwy::HWY_NAMESPACE::Add; |
28 | | using hwy::HWY_NAMESPACE::Eq; |
29 | | using hwy::HWY_NAMESPACE::Floor; |
30 | | using hwy::HWY_NAMESPACE::Ge; |
31 | | using hwy::HWY_NAMESPACE::GetLane; |
32 | | using hwy::HWY_NAMESPACE::IfThenElse; |
33 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
34 | | using hwy::HWY_NAMESPACE::Le; |
35 | | using hwy::HWY_NAMESPACE::Min; |
36 | | using hwy::HWY_NAMESPACE::Mul; |
37 | | using hwy::HWY_NAMESPACE::MulAdd; |
38 | | using hwy::HWY_NAMESPACE::NegMulAdd; |
39 | | using hwy::HWY_NAMESPACE::Rebind; |
40 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
41 | | using hwy::HWY_NAMESPACE::ShiftRight; |
42 | | using hwy::HWY_NAMESPACE::Sub; |
43 | | using hwy::HWY_NAMESPACE::Xor; |
44 | | |
45 | | // Computes base-2 logarithm like std::log2. Undefined if negative / NaN. |
46 | | // L1 error ~3.9E-6 |
47 | | template <class DF, class V> |
48 | 110M | V FastLog2f(const DF df, V x) { |
49 | | // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). |
50 | 110M | HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), |
51 | 110M | HWY_REP4(1.4287160470083755E+00f), |
52 | 110M | HWY_REP4(7.4245873327820566E-01f)}; |
53 | 110M | HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), |
54 | 110M | HWY_REP4(1.0096718572241148E+00f), |
55 | 110M | HWY_REP4(1.7409343003366853E-01f)}; |
56 | | |
57 | 110M | const Rebind<int32_t, DF> di; |
58 | 110M | const auto x_bits = BitCast(di, x); |
59 | | |
60 | | // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops |
61 | 110M | const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 |
62 | | // Shifted exponent = log2; also used to clear mantissa. |
63 | 110M | const auto exp_shifted = ShiftRight<23>(exp_bits); |
64 | 110M | const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); |
65 | 110M | const auto exp_val = ConvertTo(df, exp_shifted); |
66 | 110M | return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), |
67 | 110M | exp_val); |
68 | 110M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 48 | 108M | V FastLog2f(const DF df, V x) { | 49 | | // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). | 50 | 108M | HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), | 51 | 108M | HWY_REP4(1.4287160470083755E+00f), | 52 | 108M | HWY_REP4(7.4245873327820566E-01f)}; | 53 | 108M | HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), | 54 | 108M | HWY_REP4(1.0096718572241148E+00f), | 55 | 108M | HWY_REP4(1.7409343003366853E-01f)}; | 56 | | | 57 | 108M | const Rebind<int32_t, DF> di; | 58 | 108M | const auto x_bits = BitCast(di, x); | 59 | | | 60 | | // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops | 61 | 108M | const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 | 62 | | // Shifted exponent = log2; also used to clear mantissa. | 63 | 108M | const auto exp_shifted = ShiftRight<23>(exp_bits); | 64 | 108M | const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); | 65 | 108M | const auto exp_val = ConvertTo(df, exp_shifted); | 66 | 108M | return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), | 67 | 108M | exp_val); | 68 | 108M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastLog2f<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>) hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) Line | Count | Source | 48 | 398k | V FastLog2f(const DF df, V x) { | 49 | | // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). | 50 | 398k | HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), | 51 | 398k | HWY_REP4(1.4287160470083755E+00f), | 52 | 398k | HWY_REP4(7.4245873327820566E-01f)}; | 53 | 398k | HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), | 54 | 398k | HWY_REP4(1.0096718572241148E+00f), | 55 | 398k | HWY_REP4(1.7409343003366853E-01f)}; | 56 | | | 57 | 398k | const Rebind<int32_t, DF> di; | 58 | 398k | const auto x_bits = BitCast(di, x); | 59 | | | 60 | | // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops | 61 | 398k | const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 | 62 | | // Shifted exponent = log2; also used to clear mantissa. | 63 | 398k | const auto exp_shifted = ShiftRight<23>(exp_bits); | 64 | 398k | const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); | 65 | 398k | const auto exp_val = ConvertTo(df, exp_shifted); | 66 | 398k | return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), | 67 | 398k | exp_val); | 68 | 398k | } |
hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastLog2f<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>) Line | Count | Source | 48 | 90.7k | V FastLog2f(const DF df, V x) { | 49 | | // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). | 50 | 90.7k | HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), | 51 | 90.7k | HWY_REP4(1.4287160470083755E+00f), | 52 | 90.7k | HWY_REP4(7.4245873327820566E-01f)}; | 53 | 90.7k | HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), | 54 | 90.7k | HWY_REP4(1.0096718572241148E+00f), | 55 | 90.7k | HWY_REP4(1.7409343003366853E-01f)}; | 56 | | | 57 | 90.7k | const Rebind<int32_t, DF> di; | 58 | 90.7k | const auto x_bits = BitCast(di, x); | 59 | | | 60 | | // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops | 61 | 90.7k | const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 | 62 | | // Shifted exponent = log2; also used to clear mantissa. | 63 | 90.7k | const auto exp_shifted = ShiftRight<23>(exp_bits); | 64 | 90.7k | const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); | 65 | 90.7k | const auto exp_val = ConvertTo(df, exp_shifted); | 66 | 90.7k | return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), | 67 | 90.7k | exp_val); | 68 | 90.7k | } |
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>) Line | Count | Source | 48 | 1.28M | V FastLog2f(const DF df, V x) { | 49 | | // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2). | 50 | 1.28M | HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f), | 51 | 1.28M | HWY_REP4(1.4287160470083755E+00f), | 52 | 1.28M | HWY_REP4(7.4245873327820566E-01f)}; | 53 | 1.28M | HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f), | 54 | 1.28M | HWY_REP4(1.0096718572241148E+00f), | 55 | 1.28M | HWY_REP4(1.7409343003366853E-01f)}; | 56 | | | 57 | 1.28M | const Rebind<int32_t, DF> di; | 58 | 1.28M | const auto x_bits = BitCast(di, x); | 59 | | | 60 | | // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops | 61 | 1.28M | const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3 | 62 | | // Shifted exponent = log2; also used to clear mantissa. | 63 | 1.28M | const auto exp_shifted = ShiftRight<23>(exp_bits); | 64 | 1.28M | const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); | 65 | 1.28M | const auto exp_val = ConvertTo(df, exp_shifted); | 66 | 1.28M | return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q), | 67 | 1.28M | exp_val); | 68 | 1.28M | } |
|
69 | | |
70 | | // max relative error ~3e-7 |
71 | | template <class DF, class V> |
72 | 2.10M | V FastPow2f(const DF df, V x) { |
73 | 2.10M | const Rebind<int32_t, DF> di; |
74 | 2.10M | auto floorx = Floor(x); |
75 | 2.10M | auto exp = |
76 | 2.10M | BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); |
77 | 2.10M | auto frac = Sub(x, floorx); |
78 | 2.10M | auto num = Add(frac, Set(df, 1.01749063e+01)); |
79 | 2.10M | num = MulAdd(num, frac, Set(df, 4.88687798e+01)); |
80 | 2.10M | num = MulAdd(num, frac, Set(df, 9.85506591e+01)); |
81 | 2.10M | num = Mul(num, exp); |
82 | 2.10M | auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); |
83 | 2.10M | den = MulAdd(den, frac, Set(df, -1.94414990e+01)); |
84 | 2.10M | den = MulAdd(den, frac, Set(df, 9.85506633e+01)); |
85 | 2.10M | return Div(num, den); |
86 | 2.10M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>) hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) Line | Count | Source | 72 | 796k | V FastPow2f(const DF df, V x) { | 73 | 796k | const Rebind<int32_t, DF> di; | 74 | 796k | auto floorx = Floor(x); | 75 | 796k | auto exp = | 76 | 796k | BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); | 77 | 796k | auto frac = Sub(x, floorx); | 78 | 796k | auto num = Add(frac, Set(df, 1.01749063e+01)); | 79 | 796k | num = MulAdd(num, frac, Set(df, 4.88687798e+01)); | 80 | 796k | num = MulAdd(num, frac, Set(df, 9.85506591e+01)); | 81 | 796k | num = Mul(num, exp); | 82 | 796k | auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); | 83 | 796k | den = MulAdd(den, frac, Set(df, -1.94414990e+01)); | 84 | 796k | den = MulAdd(den, frac, Set(df, 9.85506633e+01)); | 85 | 796k | return Div(num, den); | 86 | 796k | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastPow2f<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>) Line | Count | Source | 72 | 1.28M | V FastPow2f(const DF df, V x) { | 73 | 1.28M | const Rebind<int32_t, DF> di; | 74 | 1.28M | auto floorx = Floor(x); | 75 | 1.28M | auto exp = | 76 | 1.28M | BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); | 77 | 1.28M | auto frac = Sub(x, floorx); | 78 | 1.28M | auto num = Add(frac, Set(df, 1.01749063e+01)); | 79 | 1.28M | num = MulAdd(num, frac, Set(df, 4.88687798e+01)); | 80 | 1.28M | num = MulAdd(num, frac, Set(df, 9.85506591e+01)); | 81 | 1.28M | num = Mul(num, exp); | 82 | 1.28M | auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); | 83 | 1.28M | den = MulAdd(den, frac, Set(df, -1.94414990e+01)); | 84 | 1.28M | den = MulAdd(den, frac, Set(df, 9.85506633e+01)); | 85 | 1.28M | return Div(num, den); | 86 | 1.28M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastPow2f<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 72 | 16.0k | V FastPow2f(const DF df, V x) { | 73 | 16.0k | const Rebind<int32_t, DF> di; | 74 | 16.0k | auto floorx = Floor(x); | 75 | 16.0k | auto exp = | 76 | 16.0k | BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); | 77 | 16.0k | auto frac = Sub(x, floorx); | 78 | 16.0k | auto num = Add(frac, Set(df, 1.01749063e+01)); | 79 | 16.0k | num = MulAdd(num, frac, Set(df, 4.88687798e+01)); | 80 | 16.0k | num = MulAdd(num, frac, Set(df, 9.85506591e+01)); | 81 | 16.0k | num = Mul(num, exp); | 82 | 16.0k | auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02)); | 83 | 16.0k | den = MulAdd(den, frac, Set(df, -1.94414990e+01)); | 84 | 16.0k | den = MulAdd(den, frac, Set(df, 9.85506633e+01)); | 85 | 16.0k | return Div(num, den); | 86 | 16.0k | } |
|
87 | | |
88 | | // max relative error ~3e-5 |
89 | | template <class DF, class V> |
90 | 1.70M | V FastPowf(const DF df, V base, V exponent) { |
91 | 1.70M | return FastPow2f(df, Mul(FastLog2f(df, base), exponent)); |
92 | 1.70M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>) hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>) Line | Count | Source | 90 | 398k | V FastPowf(const DF df, V base, V exponent) { | 91 | 398k | return FastPow2f(df, Mul(FastLog2f(df, base), exponent)); | 92 | 398k | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastPowf<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>, hwy::N_SSE2::Vec128<float, 1ul>) Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>, hwy::N_AVX2::Vec128<float, 4ul>) Line | Count | Source | 90 | 1.28M | V FastPowf(const DF df, V base, V exponent) { | 91 | 1.28M | return FastPow2f(df, Mul(FastLog2f(df, base), exponent)); | 92 | 1.28M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastPowf<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 90 | 16.0k | V FastPowf(const DF df, V base, V exponent) { | 91 | 16.0k | return FastPow2f(df, Mul(FastLog2f(df, base), exponent)); | 92 | 16.0k | } |
|
93 | | |
94 | | // Computes cosine like std::cos. |
95 | | // L1 error 7e-5. |
96 | | template <class DF, class V> |
97 | 192M | V FastCosf(const DF df, V x) { |
98 | | // Step 1: range reduction to [0, 2pi) |
99 | 192M | const auto pi2 = Set(df, kPi * 2.0f); |
100 | 192M | const auto pi2_inv = Set(df, 0.5f / kPi); |
101 | 192M | const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2); |
102 | 192M | const auto xmodpi2 = Sub(x, npi2); |
103 | | // Step 2: range reduction to [0, pi] |
104 | 192M | const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2)); |
105 | | // Step 3: range reduction to [0, pi/2] |
106 | 192M | const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f)); |
107 | 192M | const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi); |
108 | | // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle |
109 | | // duplication steps faster, on x/4. |
110 | 192M | const auto xs = Mul(x_pihalf, Set(df, 0.25f)); |
111 | 192M | const auto x2 = Mul(xs, xs); |
112 | 192M | const auto x4 = Mul(x2, x2); |
113 | 192M | const auto cosx_prescaling = |
114 | 192M | MulAdd(x4, Set(df, 0.06960438), |
115 | 192M | MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268))); |
116 | | // Step 5: angle duplication. |
117 | 192M | const auto cosx_scale1 = |
118 | 192M | MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562)); |
119 | 192M | const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1)); |
120 | | // Step 6: change sign if needed. |
121 | 192M | const Rebind<uint32_t, DF> du; |
122 | 192M | auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf))); |
123 | 192M | return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2))); |
124 | 192M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>) Unexecuted instantiation: hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastCosf<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 97 | 192M | V FastCosf(const DF df, V x) { | 98 | | // Step 1: range reduction to [0, 2pi) | 99 | 192M | const auto pi2 = Set(df, kPi * 2.0f); | 100 | 192M | const auto pi2_inv = Set(df, 0.5f / kPi); | 101 | 192M | const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2); | 102 | 192M | const auto xmodpi2 = Sub(x, npi2); | 103 | | // Step 2: range reduction to [0, pi] | 104 | 192M | const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2)); | 105 | | // Step 3: range reduction to [0, pi/2] | 106 | 192M | const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f)); | 107 | 192M | const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi); | 108 | | // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle | 109 | | // duplication steps faster, on x/4. | 110 | 192M | const auto xs = Mul(x_pihalf, Set(df, 0.25f)); | 111 | 192M | const auto x2 = Mul(xs, xs); | 112 | 192M | const auto x4 = Mul(x2, x2); | 113 | 192M | const auto cosx_prescaling = | 114 | 192M | MulAdd(x4, Set(df, 0.06960438), | 115 | 192M | MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268))); | 116 | | // Step 5: angle duplication. | 117 | 192M | const auto cosx_scale1 = | 118 | 192M | MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562)); | 119 | 192M | const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1)); | 120 | | // Step 6: change sign if needed. | 121 | 192M | const Rebind<uint32_t, DF> du; | 122 | 192M | auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf))); | 123 | 192M | return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2))); | 124 | 192M | } |
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastCosf<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) |
125 | | |
126 | | // Computes the error function like std::erf. |
127 | | // L1 error 7e-4. |
128 | | template <class DF, class V> |
129 | 1.41M | V FastErff(const DF df, V x) { |
130 | | // Formula from |
131 | | // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations |
132 | | // but constants have been recomputed. |
133 | 1.41M | const auto xle0 = Le(x, Zero(df)); |
134 | 1.41M | const auto absx = Abs(x); |
135 | | // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 |
136 | 1.41M | const auto denom1 = |
137 | 1.41M | MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04)); |
138 | 1.41M | const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01)); |
139 | 1.41M | const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01)); |
140 | 1.41M | const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f)); |
141 | 1.41M | const auto denom5 = Mul(denom4, denom4); |
142 | 1.41M | const auto inv_denom5 = Div(Set(df, 1.0f), denom5); |
143 | 1.41M | const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f)); |
144 | | // Change sign if needed. |
145 | 1.41M | const Rebind<uint32_t, DF> du; |
146 | 1.41M | auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0))); |
147 | 1.41M | return BitCast(df, Xor(signbit, BitCast(du, result))); |
148 | 1.41M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>) hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>) Line | Count | Source | 129 | 766k | V FastErff(const DF df, V x) { | 130 | | // Formula from | 131 | | // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations | 132 | | // but constants have been recomputed. | 133 | 766k | const auto xle0 = Le(x, Zero(df)); | 134 | 766k | const auto absx = Abs(x); | 135 | | // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 | 136 | 766k | const auto denom1 = | 137 | 766k | MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04)); | 138 | 766k | const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01)); | 139 | 766k | const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01)); | 140 | 766k | const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f)); | 141 | 766k | const auto denom5 = Mul(denom4, denom4); | 142 | 766k | const auto inv_denom5 = Div(Set(df, 1.0f), denom5); | 143 | 766k | const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f)); | 144 | | // Change sign if needed. | 145 | 766k | const Rebind<uint32_t, DF> du; | 146 | 766k | auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0))); | 147 | 766k | return BitCast(df, Xor(signbit, BitCast(du, result))); | 148 | 766k | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastErff<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 129 | 650k | V FastErff(const DF df, V x) { | 130 | | // Formula from | 131 | | // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations | 132 | | // but constants have been recomputed. | 133 | 650k | const auto xle0 = Le(x, Zero(df)); | 134 | 650k | const auto absx = Abs(x); | 135 | | // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 | 136 | 650k | const auto denom1 = | 137 | 650k | MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04)); | 138 | 650k | const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01)); | 139 | 650k | const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01)); | 140 | 650k | const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f)); | 141 | 650k | const auto denom5 = Mul(denom4, denom4); | 142 | 650k | const auto inv_denom5 = Div(Set(df, 1.0f), denom5); | 143 | 650k | const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f)); | 144 | | // Change sign if needed. | 145 | 650k | const Rebind<uint32_t, DF> du; | 146 | 650k | auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0))); | 147 | 650k | return BitCast(df, Xor(signbit, BitCast(du, result))); | 148 | 650k | } |
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastErff<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) |
149 | | |
150 | 90.7k | inline float FastLog2f(float f) { |
151 | 90.7k | HWY_CAPPED(float, 1) D; |
152 | 90.7k | return GetLane(FastLog2f(D, Set(D, f))); |
153 | 90.7k | } Unexecuted instantiation: jxl::N_SSE4::FastLog2f(float) Unexecuted instantiation: jxl::N_AVX2::FastLog2f(float) jxl::N_SSE2::FastLog2f(float) Line | Count | Source | 150 | 90.7k | inline float FastLog2f(float f) { | 151 | 90.7k | HWY_CAPPED(float, 1) D; | 152 | 90.7k | return GetLane(FastLog2f(D, Set(D, f))); | 153 | 90.7k | } |
|
154 | | |
155 | 398k | inline float FastPow2f(float f) { |
156 | 398k | HWY_CAPPED(float, 1) D; |
157 | 398k | return GetLane(FastPow2f(D, Set(D, f))); |
158 | 398k | } Unexecuted instantiation: jxl::N_SSE4::FastPow2f(float) jxl::N_AVX2::FastPow2f(float) Line | Count | Source | 155 | 398k | inline float FastPow2f(float f) { | 156 | 398k | HWY_CAPPED(float, 1) D; | 157 | 398k | return GetLane(FastPow2f(D, Set(D, f))); | 158 | 398k | } |
Unexecuted instantiation: jxl::N_SSE2::FastPow2f(float) |
159 | | |
160 | 398k | inline float FastPowf(float b, float e) { |
161 | 398k | HWY_CAPPED(float, 1) D; |
162 | 398k | return GetLane(FastPowf(D, Set(D, b), Set(D, e))); |
163 | 398k | } Unexecuted instantiation: jxl::N_SSE4::FastPowf(float, float) jxl::N_AVX2::FastPowf(float, float) Line | Count | Source | 160 | 398k | inline float FastPowf(float b, float e) { | 161 | 398k | HWY_CAPPED(float, 1) D; | 162 | 398k | return GetLane(FastPowf(D, Set(D, b), Set(D, e))); | 163 | 398k | } |
Unexecuted instantiation: jxl::N_SSE2::FastPowf(float, float) |
164 | | |
165 | 0 | inline float FastCosf(float f) { |
166 | 0 | HWY_CAPPED(float, 1) D; |
167 | 0 | return GetLane(FastCosf(D, Set(D, f))); |
168 | 0 | } Unexecuted instantiation: jxl::N_SSE4::FastCosf(float) Unexecuted instantiation: jxl::N_AVX2::FastCosf(float) Unexecuted instantiation: jxl::N_SSE2::FastCosf(float) |
169 | | |
170 | 0 | inline float FastErff(float f) { |
171 | 0 | HWY_CAPPED(float, 1) D; |
172 | 0 | return GetLane(FastErff(D, Set(D, f))); |
173 | 0 | } Unexecuted instantiation: jxl::N_SSE4::FastErff(float) Unexecuted instantiation: jxl::N_AVX2::FastErff(float) Unexecuted instantiation: jxl::N_SSE2::FastErff(float) |
174 | | |
175 | | // Returns cbrt(x) + add with 6 ulp max error. |
176 | | // Modified from vectormath_exp.h, Apache 2 license. |
177 | | // https://www.agner.org/optimize/vectorclass.zip |
178 | | template <class V> |
179 | 9.50M | V CubeRootAndAdd(const V x, const V add) { |
180 | 9.50M | const HWY_FULL(float) df; |
181 | 9.50M | const HWY_FULL(int32_t) di; |
182 | | |
183 | 9.50M | const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) / 3 |
184 | 9.50M | const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3 |
185 | 9.50M | const auto k1_3 = Set(df, 1.0f / 3); |
186 | 9.50M | const auto k4_3 = Set(df, 4.0f / 3); |
187 | | |
188 | 9.50M | const auto xa = x; // assume inputs never negative |
189 | 9.50M | const auto xa_3 = Mul(k1_3, xa); |
190 | | |
191 | | // Multiply exponent by -1/3 |
192 | 9.50M | const auto m1 = BitCast(di, xa); |
193 | | // Special case for 0. 0 is represented with an exponent of 0, so the |
194 | | // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse() |
195 | | // sets those values as 0, which prevents having NaNs in the computations |
196 | | // below. |
197 | | // TODO(eustas): use fused op |
198 | 9.50M | const auto m2 = IfThenZeroElse( |
199 | 9.50M | Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul))); |
200 | 9.50M | auto r = BitCast(df, m2); |
201 | | |
202 | | // Newton-Raphson iterations |
203 | 38.0M | for (int i = 0; i < 3; i++) { |
204 | 28.5M | const auto r2 = Mul(r, r); |
205 | 28.5M | r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r)); |
206 | 28.5M | } |
207 | | // Final iteration |
208 | 9.50M | auto r2 = Mul(r, r); |
209 | 9.50M | r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r); |
210 | 9.50M | r2 = Mul(r, r); |
211 | 9.50M | r = MulAdd(r2, x, add); |
212 | | |
213 | 9.50M | return r; |
214 | 9.50M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::CubeRootAndAdd<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::CubeRootAndAdd<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>) Line | Count | Source | 179 | 9.50M | V CubeRootAndAdd(const V x, const V add) { | 180 | 9.50M | const HWY_FULL(float) df; | 181 | 9.50M | const HWY_FULL(int32_t) di; | 182 | | | 183 | 9.50M | const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) / 3 | 184 | 9.50M | const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3 | 185 | 9.50M | const auto k1_3 = Set(df, 1.0f / 3); | 186 | 9.50M | const auto k4_3 = Set(df, 4.0f / 3); | 187 | | | 188 | 9.50M | const auto xa = x; // assume inputs never negative | 189 | 9.50M | const auto xa_3 = Mul(k1_3, xa); | 190 | | | 191 | | // Multiply exponent by -1/3 | 192 | 9.50M | const auto m1 = BitCast(di, xa); | 193 | | // Special case for 0. 0 is represented with an exponent of 0, so the | 194 | | // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse() | 195 | | // sets those values as 0, which prevents having NaNs in the computations | 196 | | // below. | 197 | | // TODO(eustas): use fused op | 198 | 9.50M | const auto m2 = IfThenZeroElse( | 199 | 9.50M | Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul))); | 200 | 9.50M | auto r = BitCast(df, m2); | 201 | | | 202 | | // Newton-Raphson iterations | 203 | 38.0M | for (int i = 0; i < 3; i++) { | 204 | 28.5M | const auto r2 = Mul(r, r); | 205 | 28.5M | r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r)); | 206 | 28.5M | } | 207 | | // Final iteration | 208 | 9.50M | auto r2 = Mul(r, r); | 209 | 9.50M | r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r); | 210 | 9.50M | r2 = Mul(r, r); | 211 | 9.50M | r = MulAdd(r2, x, add); | 212 | | | 213 | 9.50M | return r; | 214 | 9.50M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::CubeRootAndAdd<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>) |
215 | | |
216 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
217 | | } // namespace HWY_NAMESPACE |
218 | | } // namespace jxl |
219 | | HWY_AFTER_NAMESPACE(); |
220 | | |
221 | | #endif // LIB_JXL_BASE_FAST_MATH_INL_H_ |
222 | | |
223 | | #if HWY_ONCE |
224 | | #ifndef LIB_JXL_BASE_FAST_MATH_ONCE |
225 | | #define LIB_JXL_BASE_FAST_MATH_ONCE |
226 | | |
227 | | namespace jxl { |
228 | 90.7k | inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); } |
229 | 0 | inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); } |
230 | 0 | inline float FastPowf(float b, float e) { |
231 | 0 | return HWY_STATIC_DISPATCH(FastPowf)(b, e); |
232 | 0 | } |
233 | 0 | inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); } |
234 | 0 | inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); } |
235 | | } // namespace jxl |
236 | | |
237 | | #endif // LIB_JXL_BASE_FAST_MATH_ONCE |
238 | | #endif // HWY_ONCE |