Coverage Report

Created: 2022-08-24 06:04

/src/libjxl/lib/jxl/fast_math-inl.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
// Fast SIMD math ops (log2, encoder only, cos, erf for splines)
7
8
#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
9
#ifdef LIB_JXL_FAST_MATH_INL_H_
10
#undef LIB_JXL_FAST_MATH_INL_H_
11
#else
12
#define LIB_JXL_FAST_MATH_INL_H_
13
#endif
14
15
#include <hwy/highway.h>
16
17
#include "lib/jxl/common.h"
18
#include "lib/jxl/rational_polynomial-inl.h"
19
HWY_BEFORE_NAMESPACE();
20
namespace jxl {
21
namespace HWY_NAMESPACE {
22
23
// These templates are not found via ADL.
24
using hwy::HWY_NAMESPACE::Abs;
25
using hwy::HWY_NAMESPACE::Add;
26
using hwy::HWY_NAMESPACE::Eq;
27
using hwy::HWY_NAMESPACE::Floor;
28
using hwy::HWY_NAMESPACE::Ge;
29
using hwy::HWY_NAMESPACE::GetLane;
30
using hwy::HWY_NAMESPACE::IfThenElse;
31
using hwy::HWY_NAMESPACE::IfThenZeroElse;
32
using hwy::HWY_NAMESPACE::Le;
33
using hwy::HWY_NAMESPACE::Min;
34
using hwy::HWY_NAMESPACE::Mul;
35
using hwy::HWY_NAMESPACE::MulAdd;
36
using hwy::HWY_NAMESPACE::NegMulAdd;
37
using hwy::HWY_NAMESPACE::Rebind;
38
using hwy::HWY_NAMESPACE::ShiftLeft;
39
using hwy::HWY_NAMESPACE::ShiftRight;
40
using hwy::HWY_NAMESPACE::Sub;
41
using hwy::HWY_NAMESPACE::Xor;
42
43
// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
44
// L1 error ~3.9E-6
45
template <class DF, class V>
46
596k
V FastLog2f(const DF df, V x) {
47
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
48
596k
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
49
596k
                                          HWY_REP4(1.4287160470083755E+00f),
50
596k
                                          HWY_REP4(7.4245873327820566E-01f)};
51
596k
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
52
596k
                                          HWY_REP4(1.0096718572241148E+00f),
53
596k
                                          HWY_REP4(1.7409343003366853E-01f)};
54
55
596k
  const Rebind<int32_t, DF> di;
56
596k
  const auto x_bits = BitCast(di, x);
57
58
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
59
596k
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
60
  // Shifted exponent = log2; also used to clear mantissa.
61
596k
  const auto exp_shifted = ShiftRight<23>(exp_bits);
62
596k
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
63
596k
  const auto exp_val = ConvertTo(df, exp_shifted);
64
596k
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
65
596k
             exp_val);
66
596k
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
46
3.31k
V FastLog2f(const DF df, V x) {
47
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
48
3.31k
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
49
3.31k
                                          HWY_REP4(1.4287160470083755E+00f),
50
3.31k
                                          HWY_REP4(7.4245873327820566E-01f)};
51
3.31k
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
52
3.31k
                                          HWY_REP4(1.0096718572241148E+00f),
53
3.31k
                                          HWY_REP4(1.7409343003366853E-01f)};
54
55
3.31k
  const Rebind<int32_t, DF> di;
56
3.31k
  const auto x_bits = BitCast(di, x);
57
58
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
59
3.31k
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
60
  // Shifted exponent = log2; also used to clear mantissa.
61
3.31k
  const auto exp_shifted = ShiftRight<23>(exp_bits);
62
3.31k
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
63
3.31k
  const auto exp_val = ConvertTo(df, exp_shifted);
64
3.31k
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
65
3.31k
             exp_val);
66
3.31k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::FastLog2f<hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul>)
Line
Count
Source
46
4.14k
V FastLog2f(const DF df, V x) {
47
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
48
4.14k
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
49
4.14k
                                          HWY_REP4(1.4287160470083755E+00f),
50
4.14k
                                          HWY_REP4(7.4245873327820566E-01f)};
51
4.14k
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
52
4.14k
                                          HWY_REP4(1.0096718572241148E+00f),
53
4.14k
                                          HWY_REP4(1.7409343003366853E-01f)};
54
55
4.14k
  const Rebind<int32_t, DF> di;
56
4.14k
  const auto x_bits = BitCast(di, x);
57
58
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
59
4.14k
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
60
  // Shifted exponent = log2; also used to clear mantissa.
61
4.14k
  const auto exp_shifted = ShiftRight<23>(exp_bits);
62
4.14k
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
63
4.14k
  const auto exp_val = ConvertTo(df, exp_shifted);
64
4.14k
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
65
4.14k
             exp_val);
66
4.14k
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastLog2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
46
585k
V FastLog2f(const DF df, V x) {
47
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
48
585k
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
49
585k
                                          HWY_REP4(1.4287160470083755E+00f),
50
585k
                                          HWY_REP4(7.4245873327820566E-01f)};
51
585k
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
52
585k
                                          HWY_REP4(1.0096718572241148E+00f),
53
585k
                                          HWY_REP4(1.7409343003366853E-01f)};
54
55
585k
  const Rebind<int32_t, DF> di;
56
585k
  const auto x_bits = BitCast(di, x);
57
58
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
59
585k
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
60
  // Shifted exponent = log2; also used to clear mantissa.
61
585k
  const auto exp_shifted = ShiftRight<23>(exp_bits);
62
585k
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
63
585k
  const auto exp_val = ConvertTo(df, exp_shifted);
64
585k
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
65
585k
             exp_val);
66
585k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::FastLog2f<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastLog2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
46
3.68k
V FastLog2f(const DF df, V x) {
47
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
48
3.68k
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
49
3.68k
                                          HWY_REP4(1.4287160470083755E+00f),
50
3.68k
                                          HWY_REP4(7.4245873327820566E-01f)};
51
3.68k
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
52
3.68k
                                          HWY_REP4(1.0096718572241148E+00f),
53
3.68k
                                          HWY_REP4(1.7409343003366853E-01f)};
54
55
3.68k
  const Rebind<int32_t, DF> di;
56
3.68k
  const auto x_bits = BitCast(di, x);
57
58
  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
59
3.68k
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
60
  // Shifted exponent = log2; also used to clear mantissa.
61
3.68k
  const auto exp_shifted = ShiftRight<23>(exp_bits);
62
3.68k
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
63
3.68k
  const auto exp_val = ConvertTo(df, exp_shifted);
64
3.68k
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
65
3.68k
             exp_val);
66
3.68k
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec256<float> jxl::N_AVX3::FastLog2f<hwy::N_AVX3::Simd<float, 8ul, 0>, hwy::N_AVX3::Vec256<float> >(hwy::N_AVX3::Simd<float, 8ul, 0>, hwy::N_AVX3::Vec256<float>)
67
68
// max relative error ~3e-7
69
template <class DF, class V>
70
588k
V FastPow2f(const DF df, V x) {
71
588k
  const Rebind<int32_t, DF> di;
72
588k
  auto floorx = Floor(x);
73
588k
  auto exp =
74
588k
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
75
588k
  auto frac = Sub(x, floorx);
76
588k
  auto num = Add(frac, Set(df, 1.01749063e+01));
77
588k
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
78
588k
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
79
588k
  num = Mul(num, exp);
80
588k
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
81
588k
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
82
588k
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
83
588k
  return Div(num, den);
84
588k
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
70
3.40k
V FastPow2f(const DF df, V x) {
71
3.40k
  const Rebind<int32_t, DF> di;
72
3.40k
  auto floorx = Floor(x);
73
3.40k
  auto exp =
74
3.40k
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
75
3.40k
  auto frac = Sub(x, floorx);
76
3.40k
  auto num = Add(frac, Set(df, 1.01749063e+01));
77
3.40k
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
78
3.40k
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
79
3.40k
  num = Mul(num, exp);
80
3.40k
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
81
3.40k
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
82
3.40k
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
83
3.40k
  return Div(num, den);
84
3.40k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::FastPow2f<hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPow2f<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
70
585k
V FastPow2f(const DF df, V x) {
71
585k
  const Rebind<int32_t, DF> di;
72
585k
  auto floorx = Floor(x);
73
585k
  auto exp =
74
585k
      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
75
585k
  auto frac = Sub(x, floorx);
76
585k
  auto num = Add(frac, Set(df, 1.01749063e+01));
77
585k
  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
78
585k
  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
79
585k
  num = Mul(num, exp);
80
585k
  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
81
585k
  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
82
585k
  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
83
585k
  return Div(num, den);
84
585k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::FastPow2f<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPow2f<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastPow2f<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
85
86
// max relative error ~3e-5
87
template <class DF, class V>
88
588k
V FastPowf(const DF df, V base, V exponent) {
89
588k
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
90
588k
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Line
Count
Source
88
3.31k
V FastPowf(const DF df, V base, V exponent) {
89
3.31k
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
90
3.31k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::FastPowf<hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastPowf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec128<float, 4ul> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul> >(hwy::N_AVX2::Simd<float, 4ul, 0>, hwy::N_AVX2::Vec128<float, 4ul>, hwy::N_AVX2::Vec128<float, 4ul>)
Line
Count
Source
88
585k
V FastPowf(const DF df, V base, V exponent) {
89
585k
  return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
90
585k
}
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 4ul> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 4ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::FastPowf<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastPowf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastPowf<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
91
92
// Computes cosine like std::cos.
93
// L1 error 7e-5.
94
template <class DF, class V>
95
0
V FastCosf(const DF df, V x) {
96
  // Step 1: range reduction to [0, 2pi)
97
0
  const auto pi2 = Set(df, kPi * 2.0f);
98
0
  const auto pi2_inv = Set(df, 0.5f / kPi);
99
0
  const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2);
100
0
  const auto xmodpi2 = Sub(x, npi2);
101
  // Step 2: range reduction to [0, pi]
102
0
  const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2));
103
  // Step 3: range reduction to [0, pi/2]
104
0
  const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f));
105
0
  const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi);
106
  // Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle
107
  // duplication steps faster, on x/4.
108
0
  const auto xs = Mul(x_pihalf, Set(df, 0.25f));
109
0
  const auto x2 = Mul(xs, xs);
110
0
  const auto x4 = Mul(x2, x2);
111
0
  const auto cosx_prescaling =
112
0
      MulAdd(x4, Set(df, 0.06960438),
113
0
             MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268)));
114
  // Step 5: angle duplication.
115
0
  const auto cosx_scale1 =
116
0
      MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562));
117
0
  const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1));
118
  // Step 6: change sign if needed.
119
0
  const Rebind<uint32_t, DF> du;
120
0
  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf)));
121
0
  return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2)));
122
0
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastCosf<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::FastCosf<hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastCosf<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastCosf<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastCosf<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::FastCosf<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>)
123
124
// Computes the error function like std::erf.
125
// L1 error 7e-4.
126
template <class DF, class V>
127
0
V FastErff(const DF df, V x) {
128
  // Formula from
129
  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
130
  // but constants have been recomputed.
131
0
  const auto xle0 = Le(x, Zero(df));
132
0
  const auto absx = Abs(x);
133
  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
134
0
  const auto denom1 =
135
0
      MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
136
0
  const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01));
137
0
  const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01));
138
0
  const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f));
139
0
  const auto denom5 = Mul(denom4, denom4);
140
0
  const auto inv_denom5 = Div(Set(df, 1.0f), denom5);
141
0
  const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f));
142
  // Change sign if needed.
143
0
  const Rebind<uint32_t, DF> du;
144
0
  auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0)));
145
0
  return BitCast(df, Xor(signbit, BitCast(du, result)));
146
0
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Simd<float, 1ul, 0>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Simd<float, 1ul, 0>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::FastErff<hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Simd<float, 1ul, 0>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::FastErff<hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Simd<float, 1ul, 0>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::FastErff<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::FastErff<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::FastErff<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::FastErff<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>)
147
148
4.14k
inline float FastLog2f(float f) {
149
4.14k
  HWY_CAPPED(float, 1) D;
150
4.14k
  return GetLane(FastLog2f(D, Set(D, f)));
151
4.14k
}
Unexecuted instantiation: jxl::N_SSE4::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX2::FastLog2f(float)
Unexecuted instantiation: jxl::N_AVX3::FastLog2f(float)
jxl::N_EMU128::FastLog2f(float)
Line
Count
Source
148
4.14k
inline float FastLog2f(float f) {
149
4.14k
  HWY_CAPPED(float, 1) D;
150
4.14k
  return GetLane(FastLog2f(D, Set(D, f)));
151
4.14k
}
152
153
92
inline float FastPow2f(float f) {
154
92
  HWY_CAPPED(float, 1) D;
155
92
  return GetLane(FastPow2f(D, Set(D, f)));
156
92
}
Unexecuted instantiation: jxl::N_SSE4::FastPow2f(float)
jxl::N_AVX2::FastPow2f(float)
Line
Count
Source
153
92
inline float FastPow2f(float f) {
154
92
  HWY_CAPPED(float, 1) D;
155
92
  return GetLane(FastPow2f(D, Set(D, f)));
156
92
}
Unexecuted instantiation: jxl::N_AVX3::FastPow2f(float)
Unexecuted instantiation: jxl::N_EMU128::FastPow2f(float)
157
158
3.31k
inline float FastPowf(float b, float e) {
159
3.31k
  HWY_CAPPED(float, 1) D;
160
3.31k
  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
161
3.31k
}
Unexecuted instantiation: jxl::N_SSE4::FastPowf(float, float)
jxl::N_AVX2::FastPowf(float, float)
Line
Count
Source
158
3.31k
inline float FastPowf(float b, float e) {
159
3.31k
  HWY_CAPPED(float, 1) D;
160
3.31k
  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
161
3.31k
}
Unexecuted instantiation: jxl::N_AVX3::FastPowf(float, float)
Unexecuted instantiation: jxl::N_EMU128::FastPowf(float, float)
162
163
0
inline float FastCosf(float f) {
164
0
  HWY_CAPPED(float, 1) D;
165
0
  return GetLane(FastCosf(D, Set(D, f)));
166
0
}
Unexecuted instantiation: jxl::N_SSE4::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX2::FastCosf(float)
Unexecuted instantiation: jxl::N_AVX3::FastCosf(float)
Unexecuted instantiation: jxl::N_EMU128::FastCosf(float)
167
168
0
inline float FastErff(float f) {
169
0
  HWY_CAPPED(float, 1) D;
170
0
  return GetLane(FastErff(D, Set(D, f)));
171
0
}
Unexecuted instantiation: jxl::N_SSE4::FastErff(float)
Unexecuted instantiation: jxl::N_AVX2::FastErff(float)
Unexecuted instantiation: jxl::N_AVX3::FastErff(float)
Unexecuted instantiation: jxl::N_EMU128::FastErff(float)
172
173
// Returns cbrt(x) + add with 6 ulp max error.
174
// Modified from vectormath_exp.h, Apache 2 license.
175
// https://www.agner.org/optimize/vectorclass.zip
176
template <class V>
177
276
V CubeRootAndAdd(const V x, const V add) {
178
276
  const HWY_FULL(float) df;
179
276
  const HWY_FULL(int32_t) di;
180
181
276
  const auto kExpBias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
182
276
  const auto kExpMul = Set(di, 0x002AAAAA);   // shifted 1/3
183
276
  const auto k1_3 = Set(df, 1.0f / 3);
184
276
  const auto k4_3 = Set(df, 4.0f / 3);
185
186
276
  const auto xa = x;  // assume inputs never negative
187
276
  const auto xa_3 = Mul(k1_3, xa);
188
189
  // Multiply exponent by -1/3
190
276
  const auto m1 = BitCast(di, xa);
191
  // Special case for 0. 0 is represented with an exponent of 0, so the
192
  // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
193
  // sets those values as 0, which prevents having NaNs in the computations
194
  // below.
195
  // TODO(eustas): use fused op
196
276
  const auto m2 = IfThenZeroElse(
197
276
      Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul)));
198
276
  auto r = BitCast(df, m2);
199
200
  // Newton-Raphson iterations
201
1.10k
  for (int i = 0; i < 3; i++) {
202
828
    const auto r2 = Mul(r, r);
203
828
    r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r));
204
828
  }
205
  // Final iteration
206
276
  auto r2 = Mul(r, r);
207
276
  r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r);
208
276
  r2 = Mul(r, r);
209
276
  r = MulAdd(r2, x, add);
210
211
276
  return r;
212
276
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::CubeRootAndAdd<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::CubeRootAndAdd<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Line
Count
Source
177
276
V CubeRootAndAdd(const V x, const V add) {
178
276
  const HWY_FULL(float) df;
179
276
  const HWY_FULL(int32_t) di;
180
181
276
  const auto kExpBias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
182
276
  const auto kExpMul = Set(di, 0x002AAAAA);   // shifted 1/3
183
276
  const auto k1_3 = Set(df, 1.0f / 3);
184
276
  const auto k4_3 = Set(df, 4.0f / 3);
185
186
276
  const auto xa = x;  // assume inputs never negative
187
276
  const auto xa_3 = Mul(k1_3, xa);
188
189
  // Multiply exponent by -1/3
190
276
  const auto m1 = BitCast(di, xa);
191
  // Special case for 0. 0 is represented with an exponent of 0, so the
192
  // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
193
  // sets those values as 0, which prevents having NaNs in the computations
194
  // below.
195
  // TODO(eustas): use fused op
196
276
  const auto m2 = IfThenZeroElse(
197
276
      Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul)));
198
276
  auto r = BitCast(df, m2);
199
200
  // Newton-Raphson iterations
201
1.10k
  for (int i = 0; i < 3; i++) {
202
828
    const auto r2 = Mul(r, r);
203
828
    r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r));
204
828
  }
205
  // Final iteration
206
276
  auto r2 = Mul(r, r);
207
276
  r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r);
208
276
  r2 = Mul(r, r);
209
276
  r = MulAdd(r2, x, add);
210
211
276
  return r;
212
276
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::CubeRootAndAdd<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::CubeRootAndAdd<hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
213
214
// NOLINTNEXTLINE(google-readability-namespace-comments)
215
}  // namespace HWY_NAMESPACE
216
}  // namespace jxl
217
HWY_AFTER_NAMESPACE();
218
219
#endif  // LIB_JXL_FAST_MATH_INL_H_
220
221
#if HWY_ONCE
222
#ifndef FAST_MATH_ONCE
223
#define FAST_MATH_ONCE
224
225
namespace jxl {
226
4.14k
inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
227
0
inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); }
228
0
inline float FastPowf(float b, float e) {
229
0
  return HWY_STATIC_DISPATCH(FastPowf)(b, e);
230
0
}
231
0
inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
232
0
inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
233
}  // namespace jxl
234
235
#endif  // FAST_MATH_ONCE
236
#endif  // HWY_ONCE