Coverage Report

Created: 2025-07-23 08:18

/src/libjxl/lib/jxl/cms/transfer_functions-inl.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
// Transfer functions for color encodings.
7
8
#include <cstdint>
9
10
#include "lib/jxl/base/common.h"
11
#if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
12
#ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
13
#undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
14
#else
15
#define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
16
#endif
17
18
#include <cmath>
19
#include <hwy/highway.h>
20
21
#include "lib/jxl/base/compiler_specific.h"
22
#include "lib/jxl/base/fast_math-inl.h"
23
#include "lib/jxl/base/rational_polynomial-inl.h"
24
#include "lib/jxl/cms/transfer_functions.h"
25
26
HWY_BEFORE_NAMESPACE();
27
namespace jxl {
28
namespace HWY_NAMESPACE {
29
30
// These templates are not found via ADL.
31
using hwy::HWY_NAMESPACE::And;
32
using hwy::HWY_NAMESPACE::AndNot;
33
using hwy::HWY_NAMESPACE::Gt;
34
using hwy::HWY_NAMESPACE::IfThenElse;
35
using hwy::HWY_NAMESPACE::Lt;
36
using hwy::HWY_NAMESPACE::Or;
37
using hwy::HWY_NAMESPACE::Sqrt;
38
using hwy::HWY_NAMESPACE::TableLookupBytes;
39
40
// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD):
41
// "display" is linear light (nits) normalized to [0, 1].
42
// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1].
43
// "scene" is a linear function of photon counts, normalized to [0, 1].
44
45
// Despite the stated ranges, we need unbounded transfer functions: see
46
// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or
47
// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused
48
// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see
49
// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb)
50
// and extend the function domains above 1.
51
52
// Hybrid Log-Gamma.
53
class TF_HLG : TF_HLG_Base {
54
 public:
55
  // Maximum error 5e-7.
56
  template <class D, class V>
57
105k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
58
105k
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
59
105k
    const V kSign = BitCast(d, Set(du, 0x80000000u));
60
105k
    const V original_sign = And(x, kSign);
61
105k
    x = AndNot(kSign, x);  // abs
62
105k
    const auto belowInv12 = Le(x, Set(d, kInv12));
63
105k
    const V lo = Sqrt(Mul(Set(d, k3), x));
64
105k
    const V hi =
65
105k
        MulAdd(Set(d, kA * kInvLog2e),
66
105k
               FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC));
67
105k
    const V magnitude = IfThenElse(belowInv12, lo, hi);
68
105k
    return Or(AndNot(kSign, magnitude), original_sign);
69
105k
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_HLG::EncodedFromDisplay<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_HLG::EncodedFromDisplay<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_HLG::EncodedFromDisplay<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_HLG::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Line
Count
Source
57
105k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
58
105k
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
59
105k
    const V kSign = BitCast(d, Set(du, 0x80000000u));
60
105k
    const V original_sign = And(x, kSign);
61
105k
    x = AndNot(kSign, x);  // abs
62
105k
    const auto belowInv12 = Le(x, Set(d, kInv12));
63
105k
    const V lo = Sqrt(Mul(Set(d, k3), x));
64
105k
    const V hi =
65
105k
        MulAdd(Set(d, kA * kInvLog2e),
66
105k
               FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC));
67
105k
    const V magnitude = IfThenElse(belowInv12, lo, hi);
68
105k
    return Or(AndNot(kSign, magnitude), original_sign);
69
105k
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_HLG::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_HLG::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
70
71
  template <class D, class V>
72
0
  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
73
0
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
74
0
    const V kSign = BitCast(d, Set(du, 0x80000000u));
75
0
    const V original_sign = And(x, kSign);
76
0
    x = AndNot(kSign, x);  // abs
77
0
    const auto below05 = Le(x, Set(d, k05));
78
0
    const V lo = Mul(x, Mul(x, Set(d, kInv3)));
79
0
    const V hi = MulAdd(FastPow2f(d, Mul(x, Set(d, kHiPow))), Set(d, kHiMul),
80
0
                        Set(d, kHiAdd));
81
0
    const V magnitude = IfThenElse(below05, lo, hi);
82
0
    return Or(AndNot(kSign, magnitude), original_sign);
83
0
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_HLG::DisplayFromEncoded<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_HLG::DisplayFromEncoded<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_HLG::DisplayFromEncoded<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_HLG::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_HLG::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_HLG::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
84
85
 private:
86
  static constexpr double k05 = 0.5;
87
  static constexpr double k3 = 3.0;
88
  static constexpr double kInv3 = 1.0 / 3.0;
89
  static constexpr double kHiAdd = kB * kInv12;
90
  // std::exp(-kC * kRA) * kInv12;
91
  static constexpr double kHiMul = 0.003639807079052639;
92
  // kRA * std::log2e_v
93
  static constexpr double kHiPow = 8.067285659607931;
94
};
95
96
class TF_709 {
97
 public:
98
0
  static JXL_INLINE double EncodedFromDisplay(const double d) {
99
0
    if (d < kThresh) return kMulLow * d;
100
0
    return kMulHi * std::pow(d, kPowHi) + kSub;
101
0
  }
Unexecuted instantiation: jxl::N_SSE4::TF_709::EncodedFromDisplay(double)
Unexecuted instantiation: jxl::N_AVX2::TF_709::EncodedFromDisplay(double)
Unexecuted instantiation: jxl::N_AVX3::TF_709::EncodedFromDisplay(double)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::TF_709::EncodedFromDisplay(double)
Unexecuted instantiation: jxl::N_AVX3_SPR::TF_709::EncodedFromDisplay(double)
Unexecuted instantiation: jxl::N_SSE2::TF_709::EncodedFromDisplay(double)
102
103
  // Maximum error 1e-6.
104
  template <class D, class V>
105
111k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
106
111k
    auto low = Mul(Set(d, kMulLow), x);
107
111k
    auto hi =
108
111k
        MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub));
109
111k
    return IfThenElse(Le(x, Set(d, kThresh)), low, hi);
110
111k
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_709::EncodedFromDisplay<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_709::EncodedFromDisplay<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_709::EncodedFromDisplay<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_709::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Line
Count
Source
105
111k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
106
111k
    auto low = Mul(Set(d, kMulLow), x);
107
111k
    auto hi =
108
111k
        MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub));
109
111k
    return IfThenElse(Le(x, Set(d, kThresh)), low, hi);
110
111k
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_709::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_709::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
111
112
  template <class D, class V>
113
0
  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
114
0
    auto low = Mul(Set(d, kInvMulLow), x);
115
0
    auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)),
116
0
                       Set(d, kInvPowHi));
117
0
    return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi);
118
0
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_709::DisplayFromEncoded<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_709::DisplayFromEncoded<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_709::DisplayFromEncoded<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_709::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_709::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_709::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
119
120
 private:
121
  static constexpr double kThresh = 0.018;
122
  static constexpr double kMulLow = 4.5;
123
  static constexpr double kMulHi = 1.099;
124
  static constexpr double kPowHi = 0.45;
125
  static constexpr double kSub = -0.099;
126
127
  static constexpr double kInvThresh = 0.081;
128
  static constexpr double kInvMulLow = 1 / 4.5;
129
  static constexpr double kInvMulHi = 1 / 1.099;
130
  static constexpr double kInvPowHi = 1 / 0.45;
131
  static constexpr double kInvAdd = 0.099 * kInvMulHi;
132
};
133
134
// Perceptual Quantization
135
class TF_PQ : TF_PQ_Base {
136
 public:
137
  explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget)
138
79
      : display_scaling_factor_to_10000_nits_(display_intensity_target *
139
79
                                              (1.0f / 10000.0f)),
140
79
        display_scaling_factor_from_10000_nits_(10000.0f /
141
79
                                                display_intensity_target) {}
Unexecuted instantiation: jxl::N_SSE4::TF_PQ::TF_PQ(float)
jxl::N_AVX2::TF_PQ::TF_PQ(float)
Line
Count
Source
138
79
      : display_scaling_factor_to_10000_nits_(display_intensity_target *
139
79
                                              (1.0f / 10000.0f)),
140
79
        display_scaling_factor_from_10000_nits_(10000.0f /
141
79
                                                display_intensity_target) {}
Unexecuted instantiation: jxl::N_AVX3::TF_PQ::TF_PQ(float)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::TF_PQ::TF_PQ(float)
Unexecuted instantiation: jxl::N_AVX3_SPR::TF_PQ::TF_PQ(float)
Unexecuted instantiation: jxl::N_SSE2::TF_PQ::TF_PQ(float)
142
143
  // Maximum error 3e-6
144
  template <class D, class V>
145
0
  JXL_INLINE V DisplayFromEncoded(D d, V x) const {
146
0
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
147
0
    const V kSign = BitCast(d, Set(du, 0x80000000u));
148
0
    const V original_sign = And(x, kSign);
149
0
    x = AndNot(kSign, x);  // abs
150
    // 4-over-4-degree rational polynomial approximation on x+x*x. This improves
151
    // the maximum error by about 5x over a rational polynomial for x.
152
0
    auto xpxx = MulAdd(x, x, x);
153
0
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
154
0
        HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f),
155
0
        HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f),
156
0
        HWY_REP4(5.50034862e-01f),
157
0
    };
158
0
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
159
0
        HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f),
160
0
        HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f),
161
0
        HWY_REP4(2.67718770e+00f),
162
0
    };
163
0
    auto magnitude = EvalRationalPolynomial(d, xpxx, p, q);
164
0
    return Or(
165
0
        AndNot(kSign,
166
0
               Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))),
167
0
        original_sign);
168
0
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_PQ::DisplayFromEncoded<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_PQ::DisplayFromEncoded<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_PQ::DisplayFromEncoded<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_PQ::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_PQ::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_PQ::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
169
170
  // Maximum error 7e-7.
171
  template <class D, class V>
172
895k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
173
895k
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
174
895k
    const V kSign = BitCast(d, Set(du, 0x80000000u));
175
895k
    const V original_sign = And(x, kSign);
176
895k
    x = AndNot(kSign, x);  // abs
177
    // 4-over-4-degree rational polynomial approximation on x**0.25, with two
178
    // different polynomials above and below 1e-4.
179
895k
    auto xto025 =
180
895k
        Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_))));
181
895k
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
182
895k
        HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
183
895k
        HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
184
895k
        HWY_REP4(4.838434e+01f),
185
895k
    };
186
895k
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
187
895k
        HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f),
188
895k
        HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f),
189
895k
        HWY_REP4(2.590418e+01f),
190
895k
    };
191
192
895k
    HWY_ALIGN constexpr float plo[(4 + 1) * 4] = {
193
895k
        HWY_REP4(9.863406e-06f),  HWY_REP4(3.881234e-01f),
194
895k
        HWY_REP4(1.352821e+02f),  HWY_REP4(6.889862e+04f),
195
895k
        HWY_REP4(-2.864824e+05f),
196
895k
    };
197
895k
    HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = {
198
895k
        HWY_REP4(3.371868e+01f),  HWY_REP4(1.477719e+03f),
199
895k
        HWY_REP4(1.608477e+04f),  HWY_REP4(-4.389884e+04f),
200
895k
        HWY_REP4(-2.072546e+05f),
201
895k
    };
202
203
895k
    auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)),
204
895k
                                EvalRationalPolynomial(d, xto025, plo, qlo),
205
895k
                                EvalRationalPolynomial(d, xto025, p, q));
206
895k
    return Or(AndNot(kSign, magnitude), original_sign);
207
895k
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_PQ::EncodedFromDisplay<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_PQ::EncodedFromDisplay<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_PQ::EncodedFromDisplay<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_PQ::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Line
Count
Source
172
895k
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
173
895k
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
174
895k
    const V kSign = BitCast(d, Set(du, 0x80000000u));
175
895k
    const V original_sign = And(x, kSign);
176
895k
    x = AndNot(kSign, x);  // abs
177
    // 4-over-4-degree rational polynomial approximation on x**0.25, with two
178
    // different polynomials above and below 1e-4.
179
895k
    auto xto025 =
180
895k
        Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_))));
181
895k
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
182
895k
        HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
183
895k
        HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
184
895k
        HWY_REP4(4.838434e+01f),
185
895k
    };
186
895k
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
187
895k
        HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f),
188
895k
        HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f),
189
895k
        HWY_REP4(2.590418e+01f),
190
895k
    };
191
192
895k
    HWY_ALIGN constexpr float plo[(4 + 1) * 4] = {
193
895k
        HWY_REP4(9.863406e-06f),  HWY_REP4(3.881234e-01f),
194
895k
        HWY_REP4(1.352821e+02f),  HWY_REP4(6.889862e+04f),
195
895k
        HWY_REP4(-2.864824e+05f),
196
895k
    };
197
895k
    HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = {
198
895k
        HWY_REP4(3.371868e+01f),  HWY_REP4(1.477719e+03f),
199
895k
        HWY_REP4(1.608477e+04f),  HWY_REP4(-4.389884e+04f),
200
895k
        HWY_REP4(-2.072546e+05f),
201
895k
    };
202
203
895k
    auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)),
204
895k
                                EvalRationalPolynomial(d, xto025, plo, qlo),
205
895k
                                EvalRationalPolynomial(d, xto025, p, q));
206
895k
    return Or(AndNot(kSign, magnitude), original_sign);
207
895k
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_PQ::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_PQ::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
208
209
 private:
210
  const float display_scaling_factor_to_10000_nits_;
211
  const float display_scaling_factor_from_10000_nits_;
212
};
213
214
// sRGB
215
class TF_SRGB {
216
 public:
217
  template <typename V>
218
69.0M
  JXL_INLINE V DisplayFromEncoded(V x) const {
219
69.0M
    const HWY_FULL(float) d;
220
69.0M
    const HWY_FULL(uint32_t) du;
221
69.0M
    const V kSign = BitCast(d, Set(du, 0x80000000u));
222
69.0M
    const V original_sign = And(x, kSign);
223
69.0M
    x = AndNot(kSign, x);  // abs
224
225
    // TODO(janwas): range reduction
226
    // Computed via af_cheb_rational (k=100); replicated 4x.
227
69.0M
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
228
69.0M
        HWY_REP4(2.200248328e-04f), HWY_REP4(1.043637593e-02f),
229
69.0M
        HWY_REP4(1.624820318e-01f), HWY_REP4(7.961564959e-01f),
230
69.0M
        HWY_REP4(8.210152774e-01f),
231
69.0M
    };
232
69.0M
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
233
69.0M
        HWY_REP4(2.631846970e-01f), HWY_REP4(1.076976492e+00f),
234
69.0M
        HWY_REP4(4.987528350e-01f), HWY_REP4(-5.512498495e-02f),
235
69.0M
        HWY_REP4(6.521209011e-03f),
236
69.0M
    };
237
69.0M
    const V linear = Mul(x, Set(d, kLowDivInv));
238
69.0M
    const V poly = EvalRationalPolynomial(d, x, p, q);
239
69.0M
    const V magnitude =
240
69.0M
        IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear);
241
69.0M
    return Or(AndNot(kSign, magnitude), original_sign);
242
69.0M
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_SRGB::DisplayFromEncoded<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>) const
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_SRGB::DisplayFromEncoded<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>) const
Line
Count
Source
218
69.0M
  JXL_INLINE V DisplayFromEncoded(V x) const {
219
69.0M
    const HWY_FULL(float) d;
220
69.0M
    const HWY_FULL(uint32_t) du;
221
69.0M
    const V kSign = BitCast(d, Set(du, 0x80000000u));
222
69.0M
    const V original_sign = And(x, kSign);
223
69.0M
    x = AndNot(kSign, x);  // abs
224
225
    // TODO(janwas): range reduction
226
    // Computed via af_cheb_rational (k=100); replicated 4x.
227
69.0M
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
228
69.0M
        HWY_REP4(2.200248328e-04f), HWY_REP4(1.043637593e-02f),
229
69.0M
        HWY_REP4(1.624820318e-01f), HWY_REP4(7.961564959e-01f),
230
69.0M
        HWY_REP4(8.210152774e-01f),
231
69.0M
    };
232
69.0M
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
233
69.0M
        HWY_REP4(2.631846970e-01f), HWY_REP4(1.076976492e+00f),
234
69.0M
        HWY_REP4(4.987528350e-01f), HWY_REP4(-5.512498495e-02f),
235
69.0M
        HWY_REP4(6.521209011e-03f),
236
69.0M
    };
237
69.0M
    const V linear = Mul(x, Set(d, kLowDivInv));
238
69.0M
    const V poly = EvalRationalPolynomial(d, x, p, q);
239
69.0M
    const V magnitude =
240
69.0M
        IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear);
241
69.0M
    return Or(AndNot(kSign, magnitude), original_sign);
242
69.0M
  }
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_SRGB::DisplayFromEncoded<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_SRGB::DisplayFromEncoded<hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_SRGB::DisplayFromEncoded<hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_SRGB::DisplayFromEncoded<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>) const
243
244
  // Error ~5e-07
245
  template <class D, class V>
246
87.9M
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
247
87.9M
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
248
87.9M
    const V kSign = BitCast(d, Set(du, 0x80000000u));
249
87.9M
    const V original_sign = And(x, kSign);
250
87.9M
    x = AndNot(kSign, x);  // abs
251
252
    // Computed via af_cheb_rational (k=100); replicated 4x.
253
87.9M
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
254
87.9M
        HWY_REP4(-5.135152395e-04f), HWY_REP4(5.287254571e-03f),
255
87.9M
        HWY_REP4(3.903842876e-01f),  HWY_REP4(1.474205315e+00f),
256
87.9M
        HWY_REP4(7.352629620e-01f),
257
87.9M
    };
258
87.9M
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
259
87.9M
        HWY_REP4(1.004519624e-02f), HWY_REP4(3.036675394e-01f),
260
87.9M
        HWY_REP4(1.340816930e+00f), HWY_REP4(9.258482155e-01f),
261
87.9M
        HWY_REP4(2.424867759e-02f),
262
87.9M
    };
263
87.9M
    const V linear = Mul(x, Set(d, kLowDiv));
264
87.9M
    const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q);
265
87.9M
    const V magnitude =
266
87.9M
        IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear);
267
87.9M
    return Or(AndNot(kSign, magnitude), original_sign);
268
87.9M
  }
Unexecuted instantiation: hwy::N_AVX3_SPR::Vec512<float> jxl::N_AVX3_SPR::TF_SRGB::EncodedFromDisplay<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3_ZEN4::Vec512<float> jxl::N_AVX3_ZEN4::TF_SRGB::EncodedFromDisplay<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>) const
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::TF_SRGB::EncodedFromDisplay<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>) const
hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_SRGB::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const
Line
Count
Source
246
87.9M
  JXL_INLINE V EncodedFromDisplay(D d, V x) const {
247
87.9M
    const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
248
87.9M
    const V kSign = BitCast(d, Set(du, 0x80000000u));
249
87.9M
    const V original_sign = And(x, kSign);
250
87.9M
    x = AndNot(kSign, x);  // abs
251
252
    // Computed via af_cheb_rational (k=100); replicated 4x.
253
87.9M
    HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
254
87.9M
        HWY_REP4(-5.135152395e-04f), HWY_REP4(5.287254571e-03f),
255
87.9M
        HWY_REP4(3.903842876e-01f),  HWY_REP4(1.474205315e+00f),
256
87.9M
        HWY_REP4(7.352629620e-01f),
257
87.9M
    };
258
87.9M
    HWY_ALIGN constexpr float q[(4 + 1) * 4] = {
259
87.9M
        HWY_REP4(1.004519624e-02f), HWY_REP4(3.036675394e-01f),
260
87.9M
        HWY_REP4(1.340816930e+00f), HWY_REP4(9.258482155e-01f),
261
87.9M
        HWY_REP4(2.424867759e-02f),
262
87.9M
    };
263
87.9M
    const V linear = Mul(x, Set(d, kLowDiv));
264
87.9M
    const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q);
265
87.9M
    const V magnitude =
266
87.9M
        IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear);
267
87.9M
    return Or(AndNot(kSign, magnitude), original_sign);
268
87.9M
  }
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_SRGB::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_SRGB::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const
269
270
 private:
271
  static constexpr float kThreshSRGBToLinear = 0.04045f;
272
  static constexpr float kThreshLinearToSRGB = 0.0031308f;
273
  static constexpr float kLowDiv = 12.92f;
274
  static constexpr float kLowDivInv = 1.0f / kLowDiv;
275
};
276
277
// Linear to sRGB conversion with error of at most 1.2e-4.
278
template <typename D, typename V>
279
V FastLinearToSRGB(D d, V v) {
280
  const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du;
281
  const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di;
282
  // Convert to 0.25 - 0.5 range.
283
  auto v025_05 = BitCast(
284
      d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff)));
285
  // third degree polynomial approximation between 0.25 and 0.5
286
  // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves
287
  // accuracy by about 3x.
288
  auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f));
289
  auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f));
290
  auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f));
291
  // Compute extra multiplier depending on exponent. Valid exponent range for
292
  // [0.0031308f, 1.0) is 0...8 after subtracting 118.
293
  // The next three constants contain a representation of the powers of
294
  // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are
295
  // always the same and in k2to512powers_basebits, and the two arrays contain
296
  // the next groups of 8 bits. This ends up being a 22-bit representation (with
297
  // a mantissa of 13 bits). The choice of polynomial to approximate is such
298
  // that the multiplication factor has the highest 5 bits constant, and that
299
  // the factor for the lowest possible exponent is a power of two (thus making
300
  // the additional bits 0, which is used to correctly merge back together the
301
  // floats).
302
  constexpr uint32_t k2to512powers_basebits = 0x40000000;
303
  HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = {
304
      0x0,  0xa,  0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c,
305
      0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6,
306
  };
307
  HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = {
308
      0x0,  0xb7, 0x4,  0xd,  0xcb, 0xe7, 0x41, 0x68,
309
      0x51, 0xd1, 0xeb, 0xf2, 0x0,  0xb7, 0x4,  0xd,
310
  };
311
  // Note that vld1q_s8_x2 on ARM seems to actually be slower.
312
#if HWY_TARGET != HWY_SCALAR
313
  using hwy::HWY_NAMESPACE::ShiftLeft;
314
  using hwy::HWY_NAMESPACE::ShiftRight;
315
  // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}.
316
  auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118));
317
  auto pow25to18bits = TableLookupBytes(
318
      LoadDup128(di,
319
                 reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)),
320
      exp);
321
  auto pow17to10bits = TableLookupBytes(
322
      LoadDup128(di,
323
                 reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)),
324
      exp);
325
  // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here
326
  // we take advantage of the fact that each table has its position 0 equal to
327
  // 0.
328
  // We can now just reassemble the float.
329
  auto mul = BitCast(
330
      d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)),
331
            Set(di, k2to512powers_basebits)));
332
#else
333
  // Fallback for scalar.
334
  uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf;
335
  auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) |
336
                                    (k2to512powers_17to10bits[exp] << 10) |
337
                                    k2to512powers_basebits));
338
#endif
339
  return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)),
340
                    MulAdd(pow, mul, Set(d, -0.055)));
341
}
342
343
// NOLINTNEXTLINE(google-readability-namespace-comments)
344
}  // namespace HWY_NAMESPACE
345
}  // namespace jxl
346
HWY_AFTER_NAMESPACE();
347
348
#endif  // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_