Coverage Report

Created: 2025-06-22 08:04

/src/libjxl/lib/jxl/base/rational_polynomial-inl.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
// Fast SIMD evaluation of rational polynomials for approximating functions.
7
8
#if defined(LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_) == \
9
    defined(HWY_TARGET_TOGGLE)
10
#ifdef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
11
#undef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
12
#else
13
#define LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
14
#endif
15
16
#include <jxl/types.h>
17
#include <stddef.h>
18
19
#include <hwy/highway.h>
20
HWY_BEFORE_NAMESPACE();
21
namespace jxl {
22
namespace HWY_NAMESPACE {
23
namespace {
24
25
// These templates are not found via ADL.
26
using hwy::HWY_NAMESPACE::Div;
27
using hwy::HWY_NAMESPACE::MulAdd;
28
29
// Primary template: default to actual division.
30
template <typename T, class V>
31
struct FastDivision {
32
  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
33
};
34
// Partial specialization for float vectors.
35
template <class V>
36
struct FastDivision<float, V> {
37
  // One Newton-Raphson iteration.
38
  static HWY_INLINE V ReciprocalNR(const V x) {
39
    const auto rcp = ApproximateReciprocal(x);
40
    const auto sum = Add(rcp, rcp);
41
    const auto x_rcp = Mul(x, rcp);
42
    return NegMulAdd(x_rcp, rcp, sum);
43
  }
44
45
927M
  V operator()(const V n, const V d) const {
46
#if JXL_TRUE  // Faster on SKX
47
927M
    return Div(n, d);
48
#else
49
    return n * ReciprocalNR(d);
50
#endif
51
927M
  }
quant_weights.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Line
Count
Source
45
5.43M
  V operator()(const V n, const V d) const {
46
#if JXL_TRUE  // Faster on SKX
47
5.43M
    return Div(n, d);
48
#else
49
    return n * ReciprocalNR(d);
50
#endif
51
5.43M
  }
Unexecuted instantiation: enc_xyb.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: enc_ma.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
stage_from_linear.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Line
Count
Source
45
793M
  V operator()(const V n, const V d) const {
46
#if JXL_TRUE  // Faster on SKX
47
793M
    return Div(n, d);
48
#else
49
    return n * ReciprocalNR(d);
50
#endif
51
793M
  }
Unexecuted instantiation: stage_to_linear.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: stage_tone_mapping.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: splines.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: butteraugli.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: enc_ac_strategy.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: enc_adaptive_quantization.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: enc_ans.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Unexecuted instantiation: enc_cluster.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
jxl_cms.cc:jxl::N_SCALAR::(anonymous namespace)::FastDivision<float, hwy::N_SCALAR::Vec1<float> >::operator()(hwy::N_SCALAR::Vec1<float>, hwy::N_SCALAR::Vec1<float>) const
Line
Count
Source
45
127M
  V operator()(const V n, const V d) const {
46
#if JXL_TRUE  // Faster on SKX
47
127M
    return Div(n, d);
48
#else
49
    return n * ReciprocalNR(d);
50
#endif
51
127M
  }
52
};
53
54
// Approximates smooth functions via rational polynomials (i.e. dividing two
55
// polynomials). Evaluates polynomials via Horner's scheme, which is faster than
56
// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
57
// specify constants (replicated 4x) independently of the lane count.
58
template <size_t NP, size_t NQ, class D, class V, typename T>
59
HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
60
                                                     const T (&p)[NP],
61
927M
                                                     const T (&q)[NQ]) {
62
927M
  constexpr size_t kDegP = NP / 4 - 1;
63
927M
  constexpr size_t kDegQ = NQ / 4 - 1;
64
927M
  auto yp = LoadDup128(d, &p[kDegP * 4]);
65
927M
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
66
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
67
  // compiler warning that the index is out of bounds since we are already
68
  // checking that it is not out of bounds with (kDegP >= n) and the access
69
  // will be optimized away. Similarly with q and kDegQ.
70
927M
  HWY_FENCE;
71
927M
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
72
927M
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
73
927M
  HWY_FENCE;
74
927M
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
75
927M
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
76
927M
  HWY_FENCE;
77
927M
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
78
927M
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
79
927M
  HWY_FENCE;
80
927M
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
81
927M
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
82
927M
  HWY_FENCE;
83
927M
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
84
927M
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
85
927M
  HWY_FENCE;
86
927M
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
87
927M
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
88
927M
  HWY_FENCE;
89
927M
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
90
927M
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
91
92
927M
  static_assert(kDegP < 8, "Polynomial degree is too high");
93
927M
  static_assert(kDegQ < 8, "Polynomial degree is too high");
94
95
927M
  return FastDivision<T, V>()(yp, yq);
96
927M
}
quant_weights.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Line
Count
Source
61
5.43M
                                                     const T (&q)[NQ]) {
62
5.43M
  constexpr size_t kDegP = NP / 4 - 1;
63
5.43M
  constexpr size_t kDegQ = NQ / 4 - 1;
64
5.43M
  auto yp = LoadDup128(d, &p[kDegP * 4]);
65
5.43M
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
66
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
67
  // compiler warning that the index is out of bounds since we are already
68
  // checking that it is not out of bounds with (kDegP >= n) and the access
69
  // will be optimized away. Similarly with q and kDegQ.
70
5.43M
  HWY_FENCE;
71
5.43M
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
72
5.43M
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
73
5.43M
  HWY_FENCE;
74
5.43M
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
75
5.43M
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
76
5.43M
  HWY_FENCE;
77
5.43M
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
78
5.43M
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
79
5.43M
  HWY_FENCE;
80
5.43M
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
81
5.43M
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
82
5.43M
  HWY_FENCE;
83
5.43M
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
84
5.43M
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
85
5.43M
  HWY_FENCE;
86
5.43M
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
87
5.43M
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
88
5.43M
  HWY_FENCE;
89
5.43M
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
90
5.43M
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
91
92
5.43M
  static_assert(kDegP < 8, "Polynomial degree is too high");
93
5.43M
  static_assert(kDegQ < 8, "Polynomial degree is too high");
94
95
5.43M
  return FastDivision<T, V>()(yp, yq);
96
5.43M
}
Unexecuted instantiation: enc_xyb.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<20ul, 20ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [20ul], float const (&) [20ul])
Unexecuted instantiation: enc_xyb.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: enc_ma.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
stage_from_linear.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<20ul, 20ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [20ul], float const (&) [20ul])
Line
Count
Source
61
794M
                                                     const T (&q)[NQ]) {
62
794M
  constexpr size_t kDegP = NP / 4 - 1;
63
794M
  constexpr size_t kDegQ = NQ / 4 - 1;
64
794M
  auto yp = LoadDup128(d, &p[kDegP * 4]);
65
794M
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
66
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
67
  // compiler warning that the index is out of bounds since we are already
68
  // checking that it is not out of bounds with (kDegP >= n) and the access
69
  // will be optimized away. Similarly with q and kDegQ.
70
794M
  HWY_FENCE;
71
794M
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
72
794M
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
73
794M
  HWY_FENCE;
74
794M
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
75
794M
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
76
794M
  HWY_FENCE;
77
794M
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
78
794M
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
79
794M
  HWY_FENCE;
80
794M
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
81
794M
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
82
794M
  HWY_FENCE;
83
794M
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
84
794M
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
85
794M
  HWY_FENCE;
86
794M
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
87
794M
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
88
794M
  HWY_FENCE;
89
794M
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
90
794M
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
91
92
794M
  static_assert(kDegP < 8, "Polynomial degree is too high");
93
794M
  static_assert(kDegQ < 8, "Polynomial degree is too high");
94
95
794M
  return FastDivision<T, V>()(yp, yq);
96
794M
}
Unexecuted instantiation: stage_from_linear.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: stage_to_linear.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<20ul, 20ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [20ul], float const (&) [20ul])
Unexecuted instantiation: stage_to_linear.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: stage_tone_mapping.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<20ul, 20ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [20ul], float const (&) [20ul])
Unexecuted instantiation: stage_tone_mapping.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: splines.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: butteraugli.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: enc_ac_strategy.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: enc_adaptive_quantization.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: enc_ans.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
Unexecuted instantiation: enc_cluster.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
jxl_cms.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<20ul, 20ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [20ul], float const (&) [20ul])
Line
Count
Source
61
127M
                                                     const T (&q)[NQ]) {
62
127M
  constexpr size_t kDegP = NP / 4 - 1;
63
127M
  constexpr size_t kDegQ = NQ / 4 - 1;
64
127M
  auto yp = LoadDup128(d, &p[kDegP * 4]);
65
127M
  auto yq = LoadDup128(d, &q[kDegQ * 4]);
66
  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
67
  // compiler warning that the index is out of bounds since we are already
68
  // checking that it is not out of bounds with (kDegP >= n) and the access
69
  // will be optimized away. Similarly with q and kDegQ.
70
127M
  HWY_FENCE;
71
127M
  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
72
127M
  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
73
127M
  HWY_FENCE;
74
127M
  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
75
127M
  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
76
127M
  HWY_FENCE;
77
127M
  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
78
127M
  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
79
127M
  HWY_FENCE;
80
127M
  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
81
127M
  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
82
127M
  HWY_FENCE;
83
127M
  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
84
127M
  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
85
127M
  HWY_FENCE;
86
127M
  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
87
127M
  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
88
127M
  HWY_FENCE;
89
127M
  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
90
127M
  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
91
92
127M
  static_assert(kDegP < 8, "Polynomial degree is too high");
93
127M
  static_assert(kDegQ < 8, "Polynomial degree is too high");
94
95
127M
  return FastDivision<T, V>()(yp, yq);
96
127M
}
Unexecuted instantiation: jxl_cms.cc:hwy::N_SCALAR::Vec1<float> jxl::N_SCALAR::(anonymous namespace)::EvalRationalPolynomial<12ul, 12ul, hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float>(hwy::N_SCALAR::Simd<float, 1ul, 0>, hwy::N_SCALAR::Vec1<float>, float const (&) [12ul], float const (&) [12ul])
97
98
}  // namespace
99
// NOLINTNEXTLINE(google-readability-namespace-comments)
100
}  // namespace HWY_NAMESPACE
101
}  // namespace jxl
102
HWY_AFTER_NAMESPACE();
103
#endif  // LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_