/src/libjxl/lib/jxl/base/fast_math-inl.h

Source
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Fast SIMD math approximations: log2 (encoder only), pow2, pow, cos and erf
// (for splines), and cube root.

#include <cstdint>

#if defined(LIB_JXL_BASE_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef LIB_JXL_BASE_FAST_MATH_INL_H_
#undef LIB_JXL_BASE_FAST_MATH_INL_H_
#else
#define LIB_JXL_BASE_FAST_MATH_INL_H_
#endif

#include <hwy/highway.h>

#include "lib/jxl/base/common.h"
#include "lib/jxl/base/rational_polynomial-inl.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::Eq;
using hwy::HWY_NAMESPACE::Floor;
using hwy::HWY_NAMESPACE::Ge;
using hwy::HWY_NAMESPACE::GetLane;
using hwy::HWY_NAMESPACE::IfThenElse;
using hwy::HWY_NAMESPACE::IfThenZeroElse;
using hwy::HWY_NAMESPACE::Le;
using hwy::HWY_NAMESPACE::Min;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::ShiftLeft;
using hwy::HWY_NAMESPACE::ShiftRight;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Xor;

// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
// L1 error ~3.9E-6
template <class DF, class V>
V FastLog2f(const DF df, V x) {
  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
                                          HWY_REP4(1.4287160470083755E+00f),
                                          HWY_REP4(7.4245873327820566E-01f)};
  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
                                          HWY_REP4(1.0096718572241148E+00f),
                                          HWY_REP4(1.7409343003366853E-01f)};

  const Rebind<int32_t, DF> di;
  const auto x_bits = BitCast(di, x);

  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
  // Shifted exponent = log2; also used to clear mantissa.
  const auto exp_shifted = ShiftRight<23>(exp_bits);
  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
  const auto exp_val = ConvertTo(df, exp_shifted);
  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
             exp_val);
}

// Approximates 2^x for every lane of `x`: floor(x) is written straight into
// the float exponent field, and the fractional remainder is handled by a
// rational polynomial.
// max relative error ~3e-7
template <class DF, class V>
V FastPow2f(const DF df, V x) {
  const Rebind<int32_t, DF> di;

  const auto whole = Floor(x);
  const auto frac = Sub(x, whole);

  // Float whose exponent field holds floor(x) + 127 and whose mantissa is 0.
  const auto biased_exp = Add(ConvertTo(di, whole), Set(di, 127));
  const auto pow2_whole = BitCast(df, ShiftLeft<23>(biased_exp));

  // Numerator in Horner form, then scaled by the exponent factor.
  const auto num_a = Add(frac, Set(df, 1.01749063e+01));
  const auto num_b = MulAdd(num_a, frac, Set(df, 4.88687798e+01));
  const auto num_c = MulAdd(num_b, frac, Set(df, 9.85506591e+01));
  const auto numerator = Mul(num_c, pow2_whole);

  // Denominator in Horner form.
  const auto den_a =
      MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
  const auto den_b = MulAdd(den_a, frac, Set(df, -1.94414990e+01));
  const auto denominator = MulAdd(den_b, frac, Set(df, 9.85506633e+01));

  return Div(numerator, denominator);
}

// Approximates base^exponent per lane as 2^(log2(base) * exponent), built
// from FastLog2f and FastPow2f.
// max relative error ~3e-5
template <class DF, class V>
V FastPowf(const DF df, V base, V exponent) {
  const auto log2_base = FastLog2f(df, base);
  const auto scaled_log = Mul(log2_base, exponent);
  return FastPow2f(df, scaled_log);
}

// Approximates the cosine of every lane of `x`, mirroring std::cos.
// L1 error 7e-5.
template <class DF, class V>
V FastCosf(const DF df, V x) {
  const Rebind<uint32_t, DF> du;
  const auto two_pi = Set(df, kPi * 2.0f);
  const auto inv_two_pi = Set(df, 0.5f / kPi);
  const auto pi = Set(df, kPi);
  const auto half_pi = Set(df, kPi / 2.0f);

  // Step 1: range reduction to [0, 2pi) by removing whole periods.
  const auto whole_periods = Mul(Floor(Mul(x, inv_two_pi)), two_pi);
  const auto in_period = Sub(x, whole_periods);
  // Step 2: range reduction to [0, pi], using the symmetry around pi.
  const auto folded = Min(in_period, Sub(two_pi, in_period));
  // Step 3: range reduction to [0, pi/2]; lanes that get mirrored here need
  // their sign flipped at the end.
  const auto mirrored = Ge(folded, half_pi);
  const auto reduced = IfThenElse(mirrored, Sub(pi, folded), folded);

  // Step 4: Taylor-like approximation evaluated on x/4. It is scaled by
  // 2**0.75 to make the angle duplication steps faster.
  const auto quarter = Mul(reduced, Set(df, 0.25f));
  const auto quarter_sq = Mul(quarter, quarter);
  const auto quarter_4th = Mul(quarter_sq, quarter_sq);
  const auto low_order =
      MulAdd(quarter_sq, Set(df, -0.84087373), Set(df, 1.68179268));
  const auto cos_quarter = MulAdd(quarter_4th, Set(df, 0.06960438), low_order);

  // Step 5: two angle duplications take x/4 back to x.
  const auto cos_half =
      MulAdd(cos_quarter, cos_quarter, Set(df, -1.414213562));
  const auto cos_full = MulAdd(cos_half, cos_half, Set(df, -1));

  // Step 6: move the mask into the sign bit and XOR it in to flip the sign of
  // the mirrored lanes.
  const auto sign_flip = ShiftLeft<31>(BitCast(du, VecFromMask(df, mirrored)));
  return BitCast(df, Xor(sign_flip, BitCast(du, cos_full)));
}

// Approximates the error function of every lane of `x`, mirroring std::erf.
// L1 error 7e-4.
template <class DF, class V>
V FastErff(const DF df, V x) {
  // The formula comes from
  // https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
  // with recomputed constants.
  const Rebind<uint32_t, DF> du;
  const auto magnitude = Abs(x);
  const auto flip_sign = Le(x, Zero(df));

  // Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4 on |x|.
  // The inner polynomial is accumulated in Horner form.
  auto poly =
      MulAdd(magnitude, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
  poly = MulAdd(poly, magnitude, Set(df, 2.32120216e-01));
  poly = MulAdd(poly, magnitude, Set(df, 2.77820801e-01));
  poly = MulAdd(poly, magnitude, Set(df, 1.0f));
  // Squaring the polynomial, inverting, and squaring again yields the fourth
  // power of the reciprocal.
  const auto poly_sq = Mul(poly, poly);
  const auto inv_poly_sq = Div(Set(df, 1.0f), poly_sq);
  const auto erf_magnitude =
      NegMulAdd(inv_poly_sq, inv_poly_sq, Set(df, 1.0f));

  // Lanes with x <= 0 get their sign bit toggled.
  const auto sign_flip =
      ShiftLeft<31>(BitCast(du, VecFromMask(df, flip_sign)));
  return BitCast(df, Xor(sign_flip, BitCast(du, erf_magnitude)));
}

inline float FastLog2f(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastLog2f(D, Set(D, f)));
}

inline float FastPow2f(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastPow2f(D, Set(D, f)));
}

inline float FastPowf(float b, float e) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
}

inline float FastCosf(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastCosf(D, Set(D, f)));
}

inline float FastErff(float f) {
  HWY_CAPPED(float, 1) D;
  return GetLane(FastErff(D, Set(D, f)));
}

// Returns cbrt(x) + add with 6 ulp max error. Inputs are assumed to be
// non-negative.
// Modified from vectormath_exp.h, Apache 2 license.
// https://www.agner.org/optimize/vectorclass.zip
template <class V>
V CubeRootAndAdd(const V x, const V add) {
  const HWY_FULL(float) df;
  const HWY_FULL(int32_t) di;

  const auto one_third = Set(df, 1.0f / 3);
  const auto four_thirds = Set(df, 4.0f / 3);
  const auto exp_bias = Set(di, 0x54800000);   // cast(1.) + cast(1.) / 3
  const auto exp_scale = Set(di, 0x002AAAAA);  // shifted 1/3

  // No absolute value is taken: inputs are assumed never negative.
  const auto x_third = Mul(one_third, x);

  // Initial estimate: multiply the exponent by -1/3.
  const auto x_bits = BitCast(di, x);
  const auto scaled_exp = Mul(ShiftRight<23>(x_bits), exp_scale);
  const auto estimate_bits = Sub(exp_bias, scaled_exp);
  // Special case for 0. 0 is represented with an exponent of 0, so the
  // "exp_bias - 1/3 * exp" estimate gives the wrong result. Forcing those
  // lanes to 0 prevents NaNs in the computations below.
  // TODO(eustas): use fused op
  const auto input_is_zero = Eq(x_bits, Zero(di));
  auto r = BitCast(df, IfThenZeroElse(input_is_zero, estimate_bits));

  // Newton-Raphson iterations: r <- 4/3 * r - x/3 * r^4.
  for (int iter = 0; iter < 3; ++iter) {
    const auto r_sq = Mul(r, r);
    const auto r_4th = Mul(r_sq, r_sq);
    r = NegMulAdd(x_third, r_4th, Mul(four_thirds, r));
  }

  // Final iteration, written as r + 1/3 * (r - x * r^4).
  const auto r_sq = Mul(r, r);
  const auto r_4th = Mul(r_sq, r_sq);
  r = MulAdd(one_third, NegMulAdd(x, r_4th, r), r);

  // r^2 * x is the cube root of x; the addend is fused into the same op.
  return MulAdd(Mul(r, r), x, add);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#endif  // LIB_JXL_BASE_FAST_MATH_INL_H_

#if HWY_ONCE
#ifndef LIB_JXL_BASE_FAST_MATH_ONCE
#define LIB_JXL_BASE_FAST_MATH_ONCE

// Scalar entry points in namespace jxl. Each one forwards its arguments to
// the same-named function selected by HWY_STATIC_DISPATCH.
namespace jxl {

inline float FastLog2f(float f) {
  return HWY_STATIC_DISPATCH(FastLog2f)(f);
}

inline float FastPow2f(float f) {
  return HWY_STATIC_DISPATCH(FastPow2f)(f);
}

inline float FastPowf(float b, float e) {
  return HWY_STATIC_DISPATCH(FastPowf)(b, e);
}

inline float FastCosf(float f) {
  return HWY_STATIC_DISPATCH(FastCosf)(f);
}

inline float FastErff(float f) {
  return HWY_STATIC_DISPATCH(FastErff)(f);
}

}  // namespace jxl

#endif  // LIB_JXL_BASE_FAST_MATH_ONCE
#endif  // HWY_ONCE

Coverage Report

Created: 2026-02-14 07:09

Line	Count	Source
1		// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2		//
3		// Use of this source code is governed by a BSD-style
4		// license that can be found in the LICENSE file.
5
6		// Fast SIMD math ops (log2, encoder only, cos, erf for splines)
7
8		#include <cstdint>
9
10		#if defined(LIB_JXL_BASE_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
11		#ifdef LIB_JXL_BASE_FAST_MATH_INL_H_
12		#undef LIB_JXL_BASE_FAST_MATH_INL_H_
13		#else
14		#define LIB_JXL_BASE_FAST_MATH_INL_H_
15		#endif
16
17		#include <hwy/highway.h>
18
19		#include "lib/jxl/base/common.h"
20		#include "lib/jxl/base/rational_polynomial-inl.h"
21		HWY_BEFORE_NAMESPACE();
22		namespace jxl {
23		namespace HWY_NAMESPACE {
24
25		// These templates are not found via ADL.
26		using hwy::HWY_NAMESPACE::Abs;
27		using hwy::HWY_NAMESPACE::Add;
28		using hwy::HWY_NAMESPACE::Eq;
29		using hwy::HWY_NAMESPACE::Floor;
30		using hwy::HWY_NAMESPACE::Ge;
31		using hwy::HWY_NAMESPACE::GetLane;
32		using hwy::HWY_NAMESPACE::IfThenElse;
33		using hwy::HWY_NAMESPACE::IfThenZeroElse;
34		using hwy::HWY_NAMESPACE::Le;
35		using hwy::HWY_NAMESPACE::Min;
36		using hwy::HWY_NAMESPACE::Mul;
37		using hwy::HWY_NAMESPACE::MulAdd;
38		using hwy::HWY_NAMESPACE::NegMulAdd;
39		using hwy::HWY_NAMESPACE::Rebind;
40		using hwy::HWY_NAMESPACE::ShiftLeft;
41		using hwy::HWY_NAMESPACE::ShiftRight;
42		using hwy::HWY_NAMESPACE::Sub;
43		using hwy::HWY_NAMESPACE::Xor;
44
45		// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
46		// L1 error ~3.9E-6
47		template <class DF, class V>
48	2.88M	V FastLog2f(const DF df, V x) {
49		// 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
50	2.88M	HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
51	2.88M	HWY_REP4(1.4287160470083755E+00f),
52	2.88M	HWY_REP4(7.4245873327820566E-01f)};
53	2.88M	HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
54	2.88M	HWY_REP4(1.0096718572241148E+00f),
55	2.88M	HWY_REP4(1.7409343003366853E-01f)};
56
57	2.88M	const Rebind<int32_t, DF> di;
58	2.88M	const auto x_bits = BitCast(di, x);
59
60		// Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
61	2.88M	const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); // = 2/3
62		// Shifted exponent = log2; also used to clear mantissa.
63	2.88M	const auto exp_shifted = ShiftRight<23>(exp_bits);
64	2.88M	const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
65	2.88M	const auto exp_val = ConvertTo(df, exp_shifted);
66	2.88M	return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
67	2.88M	exp_val);
68	2.88M	}
69
70		// max relative error ~3e-7
71		template <class DF, class V>
72	2.88M	V FastPow2f(const DF df, V x) {
73	2.88M	const Rebind<int32_t, DF> di;
74	2.88M	auto floorx = Floor(x);
75	2.88M	auto exp =
76	2.88M	BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
77	2.88M	auto frac = Sub(x, floorx);
78	2.88M	auto num = Add(frac, Set(df, 1.01749063e+01));
79	2.88M	num = MulAdd(num, frac, Set(df, 4.88687798e+01));
80	2.88M	num = MulAdd(num, frac, Set(df, 9.85506591e+01));
81	2.88M	num = Mul(num, exp);
82	2.88M	auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
83	2.88M	den = MulAdd(den, frac, Set(df, -1.94414990e+01));
84	2.88M	den = MulAdd(den, frac, Set(df, 9.85506633e+01));
85	2.88M	return Div(num, den);
86	2.88M	}
87
88		// max relative error ~3e-5
89		template <class DF, class V>
90	2.88M	V FastPowf(const DF df, V base, V exponent) {
91	2.88M	return FastPow2f(df, Mul(FastLog2f(df, base), exponent));
92	2.88M	}
93
94		// Computes cosine like std::cos.
95		// L1 error 7e-5.
96		template <class DF, class V>
97	69.8k	V FastCosf(const DF df, V x) {
98		// Step 1: range reduction to [0, 2pi)
99	69.8k	const auto pi2 = Set(df, kPi * 2.0f);
100	69.8k	const auto pi2_inv = Set(df, 0.5f / kPi);
101	69.8k	const auto npi2 = Mul(Floor(Mul(x, pi2_inv)), pi2);
102	69.8k	const auto xmodpi2 = Sub(x, npi2);
103		// Step 2: range reduction to [0, pi]
104	69.8k	const auto x_pi = Min(xmodpi2, Sub(pi2, xmodpi2));
105		// Step 3: range reduction to [0, pi/2]
106	69.8k	const auto above_pihalf = Ge(x_pi, Set(df, kPi / 2.0f));
107	69.8k	const auto x_pihalf = IfThenElse(above_pihalf, Sub(Set(df, kPi), x_pi), x_pi);
108		// Step 4: Taylor-like approximation, scaled by 2**0.75 to make angle
109		// duplication steps faster, on x/4.
110	69.8k	const auto xs = Mul(x_pihalf, Set(df, 0.25f));
111	69.8k	const auto x2 = Mul(xs, xs);
112	69.8k	const auto x4 = Mul(x2, x2);
113	69.8k	const auto cosx_prescaling =
114	69.8k	MulAdd(x4, Set(df, 0.06960438),
115	69.8k	MulAdd(x2, Set(df, -0.84087373), Set(df, 1.68179268)));
116		// Step 5: angle duplication.
117	69.8k	const auto cosx_scale1 =
118	69.8k	MulAdd(cosx_prescaling, cosx_prescaling, Set(df, -1.414213562));
119	69.8k	const auto cosx_scale2 = MulAdd(cosx_scale1, cosx_scale1, Set(df, -1));
120		// Step 6: change sign if needed.
121	69.8k	const Rebind<uint32_t, DF> du;
122	69.8k	auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, above_pihalf)));
123	69.8k	return BitCast(df, Xor(signbit, BitCast(du, cosx_scale2)));
124	69.8k	}
125
126		// Computes the error function like std::erf.
127		// L1 error 7e-4.
128		template <class DF, class V>
129	49.0k	V FastErff(const DF df, V x) {
130		// Formula from
131		// https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
132		// but constants have been recomputed.
133	49.0k	const auto xle0 = Le(x, Zero(df));
134	49.0k	const auto absx = Abs(x);
135		// Compute 1 - 1 / ((((x * a + b) * x + c) * x + d) * x + 1)**4
136	49.0k	const auto denom1 =
137	49.0k	MulAdd(absx, Set(df, 7.77394369e-02), Set(df, 2.05260015e-04));
138	49.0k	const auto denom2 = MulAdd(denom1, absx, Set(df, 2.32120216e-01));
139	49.0k	const auto denom3 = MulAdd(denom2, absx, Set(df, 2.77820801e-01));
140	49.0k	const auto denom4 = MulAdd(denom3, absx, Set(df, 1.0f));
141	49.0k	const auto denom5 = Mul(denom4, denom4);
142	49.0k	const auto inv_denom5 = Div(Set(df, 1.0f), denom5);
143	49.0k	const auto result = NegMulAdd(inv_denom5, inv_denom5, Set(df, 1.0f));
144		// Change sign if needed.
145	49.0k	const Rebind<uint32_t, DF> du;
146	49.0k	auto signbit = ShiftLeft<31>(BitCast(du, VecFromMask(df, xle0)));
147	49.0k	return BitCast(df, Xor(signbit, BitCast(du, result)));
148	49.0k	}
149
150	0	inline float FastLog2f(float f) {
151	0	HWY_CAPPED(float, 1) D;
152	0	return GetLane(FastLog2f(D, Set(D, f)));
153	0	}
154
155	0	inline float FastPow2f(float f) {
156	0	HWY_CAPPED(float, 1) D;
157	0	return GetLane(FastPow2f(D, Set(D, f)));
158	0	}
159
160	85.6k	inline float FastPowf(float b, float e) {
161	85.6k	HWY_CAPPED(float, 1) D;
162	85.6k	return GetLane(FastPowf(D, Set(D, b), Set(D, e)));
163	85.6k	}
164
165	0	inline float FastCosf(float f) {
166	0	HWY_CAPPED(float, 1) D;
167	0	return GetLane(FastCosf(D, Set(D, f)));
168	0	}
169
170	0	inline float FastErff(float f) {
171	0	HWY_CAPPED(float, 1) D;
172	0	return GetLane(FastErff(D, Set(D, f)));
173	0	}
174
175		// Returns cbrt(x) + add with 6 ulp max error.
176		// Modified from vectormath_exp.h, Apache 2 license.
177		// https://www.agner.org/optimize/vectorclass.zip
178		template <class V>
179	0	V CubeRootAndAdd(const V x, const V add) {
180	0	const HWY_FULL(float) df;
181	0	const HWY_FULL(int32_t) di;
182
183	0	const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) / 3
184	0	const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3
185	0	const auto k1_3 = Set(df, 1.0f / 3);
186	0	const auto k4_3 = Set(df, 4.0f / 3);
187
188	0	const auto xa = x; // assume inputs never negative
189	0	const auto xa_3 = Mul(k1_3, xa);
190
191		// Multiply exponent by -1/3
192	0	const auto m1 = BitCast(di, xa);
193		// Special case for 0. 0 is represented with an exponent of 0, so the
194		// "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
195		// sets those values as 0, which prevents having NaNs in the computations
196		// below.
197		// TODO(eustas): use fused op
198	0	const auto m2 = IfThenZeroElse(
199	0	Eq(m1, Zero(di)), Sub(kExpBias, Mul((ShiftRight<23>(m1)), kExpMul)));
200	0	auto r = BitCast(df, m2);
201
202		// Newton-Raphson iterations
203	0	for (int i = 0; i < 3; i++) {
204	0	const auto r2 = Mul(r, r);
205	0	r = NegMulAdd(xa_3, Mul(r2, r2), Mul(k4_3, r));
206	0	}
207		// Final iteration
208	0	auto r2 = Mul(r, r);
209	0	r = MulAdd(k1_3, NegMulAdd(xa, Mul(r2, r2), r), r);
210	0	r2 = Mul(r, r);
211	0	r = MulAdd(r2, x, add);
212
213	0	return r;
214	0	}
215
216		// NOLINTNEXTLINE(google-readability-namespace-comments)
217		} // namespace HWY_NAMESPACE
218		} // namespace jxl
219		HWY_AFTER_NAMESPACE();
220
221		#endif // LIB_JXL_BASE_FAST_MATH_INL_H_
222
223		#if HWY_ONCE
224		#ifndef LIB_JXL_BASE_FAST_MATH_ONCE
225		#define LIB_JXL_BASE_FAST_MATH_ONCE
226
227		namespace jxl {
228	0	inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
229	0	inline float FastPow2f(float f) { return HWY_STATIC_DISPATCH(FastPow2f)(f); }
230	0	inline float FastPowf(float b, float e) {
231	0	return HWY_STATIC_DISPATCH(FastPowf)(b, e);
232	0	}
233	0	inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
234	0	inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
235		} // namespace jxl
236
237		#endif // LIB_JXL_BASE_FAST_MATH_ONCE
238		#endif // HWY_ONCE