Coverage Report

Created: 2025-07-23 08:18

/src/libjxl/lib/jxl/base/fast_math-inl.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
// Fast SIMD math ops (log2, encoder only, cos, erf for splines)
7
8
#include <cstdint>
9
10
#if defined(LIB_JXL_BASE_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
11
#ifdef LIB_JXL_BASE_FAST_MATH_INL_H_
12
#undef LIB_JXL_BASE_FAST_MATH_INL_H_
13
#else
14
#define LIB_JXL_BASE_FAST_MATH_INL_H_
15
#endif
16
17
#include <hwy/highway.h>
18
19
#include "lib/jxl/base/common.h"
20
#include "lib/jxl/base/rational_polynomial-inl.h"
21
HWY_BEFORE_NAMESPACE();
22
namespace jxl {
23
namespace HWY_NAMESPACE {
24
25
// These templates are not found via ADL.
26
using hwy::HWY_NAMESPACE::Abs;
27
using hwy::HWY_NAMESPACE::Add;
28
using hwy::HWY_NAMESPACE::Eq;
29
using hwy::HWY_NAMESPACE::Floor;
30
using hwy::HWY_NAMESPACE::Ge;
31
using hwy::HWY_NAMESPACE::GetLane;
32
using hwy::HWY_NAMESPACE::IfThenElse;
33
using hwy::HWY_NAMESPACE::IfThenZeroElse;
34
using hwy::HWY_NAMESPACE::Le;
35
using hwy::HWY_NAMESPACE::Min;
36
using hwy::HWY_NAMESPACE::Mul;
37
using hwy::HWY_NAMESPACE::MulAdd;
38
using hwy::HWY_NAMESPACE::NegMulAdd;
39
using hwy::HWY_NAMESPACE::Rebind;
40
using hwy::HWY_NAMESPACE::ShiftLeft;
41
using hwy::HWY_NAMESPACE::ShiftRight;
42
using hwy::HWY_NAMESPACE::Sub;
43
using hwy::HWY_NAMESPACE::Xor;
44
45
// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
46
// L1 error ~3.9E-6
47
template <class DF, class V>
48
443M
V FastLog2f(const DF df, V x) {
49
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50
443M
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51
443M
                                          HWY_REP4(1.4287160470083755E+00f),
52
443M
                                          HWY_REP4(7.4245873327820566E-01f)};
53
443M
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54
443M
                                          HWY_REP4(1.0096718572241148E+00f),
55
443M
                                          HWY_REP4(1.7409343003366853E-01f)};
56
57
443M
  const Rebind<int32_t, DF> di;
58
443M
  const auto x_bits = BitCast(di, x);
59
60
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61
443M
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
62
  // Shifted exponent = log2; also used to clear mantissa.
63
443M
  const auto exp_shifted = ShiftRight<23>(exp_bits);
64
443M
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65
443M
  const auto exp_val = ConvertTo(df, exp_shifted);
66
443M
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67
443M
             exp_val);
68
443M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
48
2.82M
V FastLog2f(const DF df, V x) {
49
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50
2.82M
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51
2.82M
                                          HWY_REP4(1.4287160470083755E+00f),
52
2.82M
                                          HWY_REP4(7.4245873327820566E-01f)};
53
2.82M
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54
2.82M
                                          HWY_REP4(1.0096718572241148E+00f),
55
2.82M
                                          HWY_REP4(1.7409343003366853E-01f)};
56
57
2.82M
  const Rebind<int32_t, DF> di;
58
2.82M
  const auto x_bits = BitCast(di, x);
59
60
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61
2.82M
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
62
  // Shifted exponent = log2; also used to clear mantissa.
63
2.82M
  const auto exp_shifted = ShiftRight<23>(exp_bits);
64
2.82M
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65
2.82M
  const auto exp_val = ConvertTo(df, exp_shifted);
66
2.82M
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67
2.82M
             exp_val);
68
2.82M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::FastLog2f<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::FastLog2f<hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastLog2f<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Line
Count
Source
48
1.19M
V FastLog2f(const DF df, V x) {
49
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50
1.19M
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51
1.19M
                                          HWY_REP4(1.4287160470083755E+00f),
52
1.19M
                                          HWY_REP4(7.4245873327820566E-01f)};
53
1.19M
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54
1.19M
                                          HWY_REP4(1.0096718572241148E+00f),
55
1.19M
                                          HWY_REP4(1.7409343003366853E-01f)};
56
57
1.19M
  const Rebind<int32_t, DF> di;
58
1.19M
  const auto x_bits = BitCast(di, x);
59
60
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61
1.19M
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
62
  // Shifted exponent = log2; also used to clear mantissa.
63
1.19M
  const auto exp_shifted = ShiftRight<23>(exp_bits);
64
1.19M
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65
1.19M
  const auto exp_val = ConvertTo(df, exp_shifted);
66
1.19M
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67
1.19M
             exp_val);
68
1.19M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
48
13.6M
V FastLog2f(const DF df, V x) {
49
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50
13.6M
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51
13.6M
                                          HWY_REP4(1.4287160470083755E+00f),
52
13.6M
                                          HWY_REP4(7.4245873327820566E-01f)};
53
13.6M
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54
13.6M
                                          HWY_REP4(1.0096718572241148E+00f),
55
13.6M
                                          HWY_REP4(1.7409343003366853E-01f)};
56
57
13.6M
  const Rebind<int32_t, DF> di;
58
13.6M
  const auto x_bits = BitCast(di, x);
59
60
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61
13.6M
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
62
  // Shifted exponent = log2; also used to clear mantissa.
63
13.6M
  const auto exp_shifted = ShiftRight<23>(exp_bits);
64
13.6M
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65
13.6M
  const auto exp_val = ConvertTo(df, exp_shifted);
66
13.6M
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67
13.6M
             exp_val);
68
13.6M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 4ul> jxl::N_AVX3_ZEN4::FastLog2f<hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul> >(hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 4ul> jxl::N_AVX3_SPR::FastLog2f<hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul> >(hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastLog2f<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
48
425M
V FastLog2f(const DF df, V x) {
49
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50
425M
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51
425M
                                          HWY_REP4(1.4287160470083755E+00f),
52
425M
                                          HWY_REP4(7.4245873327820566E-01f)};
53
425M
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54
425M
                                          HWY_REP4(1.0096718572241148E+00f),
55
425M
                                          HWY_REP4(1.7409343003366853E-01f)};
56
57
425M
  const Rebind<int32_t, DF> di;
58
425M
  const auto x_bits = BitCast(di, x);
59
60
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61
425M
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
62
  // Shifted exponent = log2; also used to clear mantissa.
63
425M
  const auto exp_shifted = ShiftRight<23>(exp_bits);
64
425M
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65
425M
  const auto exp_val = ConvertTo(df, exp_shifted);
66
425M
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67
425M
             exp_val);
68
425M
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::FastLog2f<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::FastLog2f<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec256<float> jxl::N_AVX3_SPR::FastLog2f<hwy::N_AVX3_SPR::Simd<float, 8ul, 0>, hwy::N_AVX3_SPR::Vec256<float> >(hwy::N_AVX3_SPR::Simd<float, 8ul, 0>, hwy::N_AVX3_SPR::Vec256<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec256<float> jxl::N_AVX3_ZEN4::FastLog2f<hwy::N_AVX3_ZEN4::Simd<float, 8ul, 0>, hwy::N_AVX3_ZEN4::Vec256<float> >(hwy::N_AVX3_ZEN4::Simd<float, 8ul, 0>, hwy::N_AVX3_ZEN4::Vec256<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec256<float> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 8ul, 0>, hwy::N_AVX3::Vec256<float> >(hwy::N_AVX3::Simd<float, 8ul, 0>, hwy::N_AVX3::Vec256<float>)
69
70
// max relative error ~3e-7
71
template <class DF, class V>
72
19.6M
V FastPow2f(const DF df, V x) {
73
19.6M
  const Rebind<int32_t, DF> di;
74
19.6M
  auto floorx = Floor(x);
75
19.6M
  auto exp =
76
19.6M
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
77
19.6M
  auto frac = Sub(x, floorx);
78
19.6M
  auto num = Add(frac, Set(df, 1.01749063e+01));
79
19.6M
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
80
19.6M
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
81
19.6M
  num = Mul(num, exp);
82
19.6M
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
83
19.6M
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
84
19.6M
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
85
19.6M
  return Div(num, den);
86
19.6M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
72
5.72M
V FastPow2f(const DF df, V x) {
73
5.72M
  const Rebind<int32_t, DF> di;
74
5.72M
  auto floorx = Floor(x);
75
5.72M
  auto exp =
76
5.72M
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
77
5.72M
  auto frac = Sub(x, floorx);
78
5.72M
  auto num = Add(frac, Set(df, 1.01749063e+01));
79
5.72M
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
80
5.72M
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
81
5.72M
  num = Mul(num, exp);
82
5.72M
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
83
5.72M
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
84
5.72M
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
85
5.72M
  return Div(num, den);
86
5.72M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::FastPow2f<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::FastPow2f<hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastPow2f<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
72
13.6M
V FastPow2f(const DF df, V x) {
73
13.6M
  const Rebind<int32_t, DF> di;
74
13.6M
  auto floorx = Floor(x);
75
13.6M
  auto exp =
76
13.6M
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
77
13.6M
  auto frac = Sub(x, floorx);
78
13.6M
  auto num = Add(frac, Set(df, 1.01749063e+01));
79
13.6M
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
80
13.6M
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
81
13.6M
  num = Mul(num, exp);
82
13.6M
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
83
13.6M
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
84
13.6M
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
85
13.6M
  return Div(num, den);
86
13.6M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 4ul> jxl::N_AVX3_ZEN4::FastPow2f<hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul> >(hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 4ul> jxl::N_AVX3_SPR::FastPow2f<hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul> >(hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastPow2f<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::FastPow2f<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::FastPow2f<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
72
330k
V FastPow2f(const DF df, V x) {
73
330k
  const Rebind<int32_t, DF> di;
74
330k
  auto floorx = Floor(x);
75
330k
  auto exp =
76
330k
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
77
330k
  auto frac = Sub(x, floorx);
78
330k
  auto num = Add(frac, Set(df, 1.01749063e+01));
79
330k
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
80
330k
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
81
330k
  num = Mul(num, exp);
82
330k
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
83
330k
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
84
330k
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
85
330k
  return Div(num, den);
86
330k
}
87
88
// max relative error ~3e-5
89
template <class DF, class V>
90
16.7M
V FastPowf(const DF df, V base, V exponent) {
91
16.7M
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
92
16.7M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
90
2.82M
V FastPowf(const DF df, V base, V exponent) {
91
2.82M
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
92
2.82M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::FastPowf<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::FastPowf<hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastPowf<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
90
13.6M
V FastPowf(const DF df, V base, V exponent) {
91
13.6M
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
92
13.6M
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 4ul> jxl::N_AVX3_ZEN4::FastPowf<hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul> >(hwy::N_AVX3_ZEN4::Simd<float, 4ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 4ul> jxl::N_AVX3_SPR::FastPowf<hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul> >(hwy::N_AVX3_SPR::Simd<float, 4ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastPowf<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::FastPowf<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::FastPowf<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
90
330k
V FastPowf(const DF df, V base, V exponent) {
91
330k
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
92
330k
}
93
94
// Computes cosine like std::cos.
95
// L1 error 7e-5.
96
template <class DF, class V>
97
125M
V FastCosf(const DF df, V x) {
98
  // Step 1: range reduction to [0, 2pi)
99
125M
  const auto pi2 = Set(df, kPi * 2.0f);
100
125M
  const auto pi2_inv = Set(df, 0.5f / kPi);
101
125M
  const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2);
102
125M
  const auto xmodpi2 = Sub(x, npi2);
103
  // Step 2: range reduction to [0, pi]
104
125M
  const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2));
105
  // Step 3: range reduction to [0, pi/2]
106
125M
  const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f));
107
125M
  const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi);
108
  // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle
109
  // duplication steps faster, on x/4.
110
125M
  const auto xs = Mul(x_pihalf, Set(df, 0.25f));
111
125M
  const auto x2 = Mul(xs, xs);
112
125M
  const auto x4 = Mul(x2, x2);
113
125M
  const auto cosx_prescaling =
114
125M
      MulAdd(x4, Set(df, 0.06960438),
115
125M
             MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268)));
116
  // Step 5: angle duplication.
117
125M
  const auto cosx_scale1 =
118
125M
      MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562));
119
125M
  const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1));
120
  // Step 6: change sign if needed.
121
125M
  const Rebind<uint32_t, DF> du;
122
125M
  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf)));
123
125M
  return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2)));
124
125M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastCosf<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::FastCosf<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::FastCosf<hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastCosf<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::FastCosf<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::FastCosf<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastCosf<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
97
125M
V FastCosf(const DF df, V x) {
98
  // Step 1: range reduction to [0, 2pi)
99
125M
  const auto pi2 = Set(df, kPi * 2.0f);
100
125M
  const auto pi2_inv = Set(df, 0.5f / kPi);
101
125M
  const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2);
102
125M
  const auto xmodpi2 = Sub(x, npi2);
103
  // Step 2: range reduction to [0, pi]
104
125M
  const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2));
105
  // Step 3: range reduction to [0, pi/2]
106
125M
  const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f));
107
125M
  const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi);
108
  // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle
109
  // duplication steps faster, on x/4.
110
125M
  const auto xs = Mul(x_pihalf, Set(df, 0.25f));
111
125M
  const auto x2 = Mul(xs, xs);
112
125M
  const auto x4 = Mul(x2, x2);
113
125M
  const auto cosx_prescaling =
114
125M
      MulAdd(x4, Set(df, 0.06960438),
115
125M
             MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268)));
116
  // Step 5: angle duplication.
117
125M
  const auto cosx_scale1 =
118
125M
      MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562));
119
125M
  const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1));
120
  // Step 6: change sign if needed.
121
125M
  const Rebind<uint32_t, DF> du;
122
125M
  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf)));
123
125M
  return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2)));
124
125M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastCosf<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
125
126
// Computes the error function like std::erf.
127
// L1 error 7e-4.
128
template <class DF, class V>
129
4.41M
V FastErff(const DF df, V x) {
130
  // Formula from
131
  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
132
  // but constants have been recomputed.
133
4.41M
  const auto xle0 = Le(x, Zero(df));
134
4.41M
  const auto absx = Abs(x);
135
  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
136
4.41M
  const auto denom1 =
137
4.41M
      MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
138
4.41M
  const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01));
139
4.41M
  const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01));
140
4.41M
  const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f));
141
4.41M
  const auto denom5 = Mul(denom4, denom4);
142
4.41M
  const auto inv_denom5 = Div(Set(df, 1.0f), denom5);
143
4.41M
  const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f));
144
  // Change sign if needed.
145
4.41M
  const Rebind<uint32_t, DF> du;
146
4.41M
  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0)));
147
4.41M
  return BitCast(df, Xor(signbit, BitCast(du, result)));
148
4.41M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
129
3.88M
V FastErff(const DF df, V x) {
  // Lane-wise approximation of the error function.
  // Formula from
  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
  // but constants have been recomputed.
  const Rebind<uint32_t, DF> du;
  const auto one = Set(df, 1.0f);

  // The approximation is evaluated on |x|; lanes with x <= 0 are negated at
  // the end.
  const auto negate = Le(x, Zero(df));
  const auto magnitude = Abs(x);

  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
  // The inner polynomial is built up with one multiply-add per coefficient.
  auto poly =
      MulAdd(magnitude, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
  poly = MulAdd(poly, magnitude, Set(df, 2.32120216e-01));
  poly = MulAdd(poly, magnitude, Set(df, 2.77820801e-01));
  poly = MulAdd(poly, magnitude, one);

  // 1 / poly**4 is obtained as (1 / poly**2)**2 inside the final NegMulAdd.
  const auto poly_squared = Mul(poly, poly);
  const auto inv_poly_squared = Div(one, poly_squared);
  const auto erf_magnitude =
      NegMulAdd(inv_poly_squared, inv_poly_squared, one);

  // Change sign if needed: reduce the mask to the sign bit and XOR it in.
  const auto sign_flip = ShiftLeft<31>(BitCast(du, VecFromMask(df, negate)));
  return BitCast(df, Xor(sign_flip, BitCast(du, erf_magnitude)));
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastErff<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec128<float, 1ul> jxl::N_AVX3_ZEN4::FastErff<hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul> >(hwy::N_AVX3_ZEN4::Simd<float, 1ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec128<float, 1ul> jxl::N_AVX3_SPR::FastErff<hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul> >(hwy::N_AVX3_SPR::Simd<float, 1ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 1ul> jxl::N_SSE2::FastErff<hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul> >(hwy::N_SSE2::Simd<float, 1ul, 0>, hwy::N_SSE2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::FastErff<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::FastErff<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastErff<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
129
527k
V FastErff(const DF df, V x) {
  // Lane-wise approximation of the error function.
  // Formula from
  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
  // but constants have been recomputed.
  const Rebind<uint32_t, DF> du;
  const auto one = Set(df, 1.0f);

  // The approximation is evaluated on |x|; lanes with x <= 0 are negated at
  // the end.
  const auto negate = Le(x, Zero(df));
  const auto magnitude = Abs(x);

  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
  // The inner polynomial is built up with one multiply-add per coefficient.
  auto poly =
      MulAdd(magnitude, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
  poly = MulAdd(poly, magnitude, Set(df, 2.32120216e-01));
  poly = MulAdd(poly, magnitude, Set(df, 2.77820801e-01));
  poly = MulAdd(poly, magnitude, one);

  // 1 / poly**4 is obtained as (1 / poly**2)**2 inside the final NegMulAdd.
  const auto poly_squared = Mul(poly, poly);
  const auto inv_poly_squared = Div(one, poly_squared);
  const auto erf_magnitude =
      NegMulAdd(inv_poly_squared, inv_poly_squared, one);

  // Change sign if needed: reduce the mask to the sign bit and XOR it in.
  const auto sign_flip = ShiftLeft<31>(BitCast(du, VecFromMask(df, negate)));
  return BitCast(df, Xor(sign_flip, BitCast(du, erf_magnitude)));
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::FastErff<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>)
149
150
1.19M
inline float FastLog2f(float f) {
151
1.19M
  HWY_CAPPED(float, 1) D;
152
1.19M
  return GetLane(FastLog2f(D, Set(D, f)));
153
1.19M
}
Unexecuted instantiation: jxl::N_SSE4::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX2::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX3::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastLog2f(float)
jxl::N_SSE2::FastLog2f(float)
Line
Count
Source
150
1.19M
inline float FastLog2f(float f) {
151
1.19M
  HWY_CAPPED(float, 1) D;
152
1.19M
  return GetLane(FastLog2f(D, Set(D, f)));
153
1.19M
}
154
155
2.89M
inline float FastPow2f(float f) {
156
2.89M
  HWY_CAPPED(float, 1) D;
157
2.89M
  return GetLane(FastPow2f(D, Set(D, f)));
158
2.89M
}
Unexecuted instantiation: jxl::N_SSE4::FastPow2f(float)
jxl::N_AVX2::FastPow2f(float)
Line
Count
Source
155
2.89M
inline float FastPow2f(float f) {
156
2.89M
  HWY_CAPPED(float, 1) D;
157
2.89M
  return GetLane(FastPow2f(D, Set(D, f)));
158
2.89M
}
Unexecuted instantiation: jxl::N_AVX3::FastPow2f(float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastPow2f(float)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastPow2f(float)
Unexecuted instantiation: jxl::N_SSE2::FastPow2f(float)
159
160
2.82M
inline float FastPowf(float b, float e) {
161
2.82M
  HWY_CAPPED(float, 1) D;
162
2.82M
  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
163
2.82M
}
Unexecuted instantiation: jxl::N_SSE4::FastPowf(float, float)
jxl::N_AVX2::FastPowf(float, float)
Line
Count
Source
160
2.82M
inline float FastPowf(float b, float e) {
161
2.82M
  HWY_CAPPED(float, 1) D;
162
2.82M
  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
163
2.82M
}
Unexecuted instantiation: jxl::N_AVX3::FastPowf(float, float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastPowf(float, float)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastPowf(float, float)
Unexecuted instantiation: jxl::N_SSE2::FastPowf(float, float)
164
165
0
inline float FastCosf(float f) {
166
0
  HWY_CAPPED(float, 1) D;
167
0
  return GetLane(FastCosf(D, Set(D, f)));
168
0
}
Unexecuted instantiation: jxl::N_SSE4::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX2::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX3::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastCosf(float)
Unexecuted instantiation: jxl::N_SSE2::FastCosf(float)
169
170
0
inline float FastErff(float f) {
171
0
  HWY_CAPPED(float, 1) D;
172
0
  return GetLane(FastErff(D, Set(D, f)));
173
0
}
Unexecuted instantiation: jxl::N_SSE4::FastErff(float)
Unexecuted instantiation: jxl::N_AVX2::FastErff(float)
Unexecuted instantiation: jxl::N_AVX3::FastErff(float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastErff(float)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastErff(float)
Unexecuted instantiation: jxl::N_SSE2::FastErff(float)
174
175
// Returns cbrt(x) + add with 6 ulp max error.
176
// Modified from vectormath_exp.h, Apache 2 license.
177
// https://www.agner.org/optimize/vectorclass.zip
178
template <class V>
179
69.0M
V CubeRootAndAdd(const V x, const V add) {
180
69.0M
  const HWY_FULL(float) df;
181
69.0M
  const HWY_FULL(int32_t) di;
182
183
69.0M
  const auto kExpBias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
184
69.0M
  const auto kExpMul = Set(di, 0x002AAAAA);   // shifted 1/3
185
69.0M
  const auto k1_3 = Set(df, 1.0f / 3);
186
69.0M
  const auto k4_3 = Set(df, 4.0f / 3);
187
188
69.0M
  const auto xa = x;  // assume inputs never negative
189
69.0M
  const auto xa_3 = Mul(k1_3, xa);
190
191
  // Multiply exponent by -1/3
192
69.0M
  const auto m1 = BitCast(di, xa);
193
  // Special case for 0. 0 is represented with an exponent of 0, so the
194
  // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
195
  // sets those values as 0, which prevents having NaNs in the computations
196
  // below.
197
  // TODO(eustas): use fused op
198
69.0M
  const auto m2 = IfThenZeroElse(
199
69.0M
      Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul)));
200
69.0M
  auto r = BitCast(df, m2);
201
202
  // Newton-Raphson iterations
203
276M
  for (int i = 0; i < 3; i++) {
204
207M
    const auto r2 = Mul(r, r);
205
207M
    r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r));
206
207M
  }
207
  // Final iteration
208
69.0M
  auto r2 = Mul(r, r);
209
69.0M
  r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r);
210
69.0M
  r2 = Mul(r, r);
211
69.0M
  r = MulAdd(r2, x, add);
212
213
69.0M
  return r;
214
69.0M
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::CubeRootAndAdd<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::CubeRootAndAdd<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
179
69.0M
V CubeRootAndAdd(const V x, const V add) {
  // Computes cbrt(x) + add. The iteration below refines r towards
  // x**(-1/3); the result is then formed as r * r * x + add, so the addition
  // is folded into the last multiply-add.
  const HWY_FULL(float) df;
  const HWY_FULL(int32_t) di;

  const auto one_third = Set(df, 1.0f / 3);
  const auto four_thirds = Set(df, 4.0f / 3);
  const auto exp_bias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
  const auto exp_mul = Set(di, 0x002AAAAA);   // shifted 1/3

  const auto value = x;  // assume inputs never negative
  const auto value_third = Mul(one_third, value);

  // Multiply exponent by -1/3 to obtain the initial estimate.
  const auto value_bits = BitCast(di, value);
  const auto scaled_exp = Mul((ShiftRight<23>(value_bits)), exp_mul);
  const auto estimate_bits = Sub(exp_bias, scaled_exp);
  // Special case for 0. 0 is represented with an exponent of 0, so the
  // "exp_bias - 1/3 * exp" above gives the wrong result. The IfThenZeroElse()
  // sets those values as 0, which prevents having NaNs in the computations
  // below.
  // TODO(eustas): use fused op
  const auto is_zero = Eq(value_bits, Zero(di));
  const auto seed_bits = IfThenZeroElse(is_zero, estimate_bits);
  auto r = BitCast(df, seed_bits);

  // Newton-Raphson iterations: r <- 4/3 * r - x/3 * r**4.
  for (int step = 0; step < 3; ++step) {
    const auto r_sq = Mul(r, r);
    const auto r_pow4 = Mul(r_sq, r_sq);
    const auto r_scaled = Mul(four_thirds, r);
    r = NegMulAdd(value_third, r_pow4, r_scaled);
  }

  // Final iteration, written as a correction term added to r:
  // r <- r + 1/3 * (r - x * r**4).
  const auto r_sq = Mul(r, r);
  const auto correction = NegMulAdd(value, Mul(r_sq, r_sq), r);
  r = MulAdd(one_third, correction, r);

  // r * r * x + add.
  const auto refined_sq = Mul(r, r);
  return MulAdd(refined_sq, x, add);
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::CubeRootAndAdd<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::CubeRootAndAdd<hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::CubeRootAndAdd<hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::CubeRootAndAdd<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>)
215
216
// NOLINTNEXTLINE(google-readability-namespace-comments)
217
}  // namespace HWY_NAMESPACE
218
}  // namespace jxl
219
HWY_AFTER_NAMESPACE();
220
221
#endif  // LIB_JXL_BASE_FAST_MATH_INL_H_
222
223
#if HWY_ONCE
224
#ifndef LIB_JXL_BASE_FAST_MATH_ONCE
225
#define LIB_JXL_BASE_FAST_MATH_ONCE
226
227
namespace jxl {
228
1.19M
inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
229
0
inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); }
230
0
inline float FastPowf(float b, float e) {
231
0
  return HWY_STATIC_DISPATCH(FastPowf)(b, e);
232
0
}
233
0
inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
234
0
inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
235
}  // namespace jxl
236
237
#endif  // LIB_JXL_BASE_FAST_MATH_ONCE
238
#endif  // HWY_ONCE