/src/libjxl/lib/jxl/cms/transfer_functions-inl.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // Transfer functions for color encodings. |
7 | | |
8 | | #include <cstdint> |
9 | | |
10 | | #include "lib/jxl/base/common.h" |
11 | | #if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE) |
12 | | #ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
13 | | #undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
14 | | #else |
15 | | #define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
16 | | #endif |
17 | | |
18 | | #include <cmath> |
19 | | #include <hwy/highway.h> |
20 | | |
21 | | #include "lib/jxl/base/compiler_specific.h" |
22 | | #include "lib/jxl/base/fast_math-inl.h" |
23 | | #include "lib/jxl/base/rational_polynomial-inl.h" |
24 | | #include "lib/jxl/cms/transfer_functions.h" |
25 | | |
26 | | HWY_BEFORE_NAMESPACE(); |
27 | | namespace jxl { |
28 | | namespace HWY_NAMESPACE { |
29 | | |
30 | | // These templates are not found via ADL. |
31 | | using hwy::HWY_NAMESPACE::And; |
32 | | using hwy::HWY_NAMESPACE::AndNot; |
33 | | using hwy::HWY_NAMESPACE::Gt; |
34 | | using hwy::HWY_NAMESPACE::IfThenElse; |
35 | | using hwy::HWY_NAMESPACE::Lt; |
36 | | using hwy::HWY_NAMESPACE::Or; |
37 | | using hwy::HWY_NAMESPACE::Sqrt; |
38 | | using hwy::HWY_NAMESPACE::TableLookupBytes; |
39 | | |
40 | | // Definitions for BT.2100-2 transfer functions (used inside/outside SIMD): |
41 | | // "display" is linear light (nits) normalized to [0, 1]. |
42 | | // "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1]. |
43 | | // "scene" is a linear function of photon counts, normalized to [0, 1]. |
44 | | |
45 | | // Despite the stated ranges, we need unbounded transfer functions: see |
46 | | // http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or |
47 | | // above 1 due to chromatic adaptation. To avoid severe round-trip errors caused |
48 | | // by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see |
49 | | // https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb) |
50 | | // and extend the function domains above 1. |
51 | | |
52 | | // Hybrid Log-Gamma. |
53 | | class TF_HLG : TF_HLG_Base { |
54 | | public: |
55 | | // Maximum error 5e-7. |
56 | | template <class D, class V> |
57 | 267 | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
58 | 267 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
59 | 267 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
60 | 267 | const V original_sign = And(x, kSign); |
61 | 267 | x = AndNot(kSign, x); // abs |
62 | 267 | const auto belowInv12 = Le(x, Set(d, kInv12)); |
63 | 267 | const V lo = Sqrt(Mul(Set(d, k3), x)); |
64 | 267 | const V hi = |
65 | 267 | MulAdd(Set(d, kA * kInvLog2e), |
66 | 267 | FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); |
67 | 267 | const V magnitude = IfThenElse(belowInv12, lo, hi); |
68 | 267 | return Or(AndNot(kSign, magnitude), original_sign); |
69 | 267 | } hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_HLG::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Line | Count | Source | 57 | 267 | JXL_INLINE V EncodedFromDisplay(D d, V x) const { | 58 | 267 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; | 59 | 267 | const V kSign = BitCast(d, Set(du, 0x80000000u)); | 60 | 267 | const V original_sign = And(x, kSign); | 61 | 267 | x = AndNot(kSign, x); // abs | 62 | 267 | const auto belowInv12 = Le(x, Set(d, kInv12)); | 63 | 267 | const V lo = Sqrt(Mul(Set(d, k3), x)); | 64 | 267 | const V hi = | 65 | 267 | MulAdd(Set(d, kA * kInvLog2e), | 66 | 267 | FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); | 67 | 267 | const V magnitude = IfThenElse(belowInv12, lo, hi); | 68 | 267 | return Or(AndNot(kSign, magnitude), original_sign); | 69 | 267 | } |
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_HLG::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_HLG::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
70 | | |
71 | | template <class D, class V> |
72 | 0 | JXL_INLINE V DisplayFromEncoded(D d, V x) const { |
73 | 0 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
74 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
75 | 0 | const V original_sign = And(x, kSign); |
76 | 0 | x = AndNot(kSign, x); // abs |
77 | 0 | const auto below05 = Le(x, Set(d, k05)); |
78 | 0 | const V lo = Mul(x, Mul(x, Set(d, kInv3))); |
79 | 0 | const V hi = MulAdd(FastPow2f(d, Mul(x, Set(d, kHiPow))), Set(d, kHiMul), |
80 | 0 | Set(d, kHiAdd)); |
81 | 0 | const V magnitude = IfThenElse(below05, lo, hi); |
82 | 0 | return Or(AndNot(kSign, magnitude), original_sign); |
83 | 0 | } Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_HLG::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_HLG::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_HLG::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
84 | | |
85 | | private: |
86 | | static constexpr double k05 = 0.5; |
87 | | static constexpr double k3 = 3.0; |
88 | | static constexpr double kInv3 = 1.0 / 3.0; |
89 | | static constexpr double kHiAdd = kB * kInv12; |
90 | | // std::exp(-kC * kRA) * kInv12; |
91 | | static constexpr double kHiMul = 0.003639807079052639; |
92 | | // kRA * std::log2e_v |
93 | | static constexpr double kHiPow = 8.067285659607931; |
94 | | }; |
95 | | |
96 | | class TF_709 { |
97 | | public: |
98 | 0 | static JXL_INLINE double EncodedFromDisplay(const double d) { |
99 | 0 | if (d < kThresh) return kMulLow * d; |
100 | 0 | return kMulHi * std::pow(d, kPowHi) + kSub; |
101 | 0 | } Unexecuted instantiation: jxl::N_SSE4::TF_709::EncodedFromDisplay(double) Unexecuted instantiation: jxl::N_AVX2::TF_709::EncodedFromDisplay(double) Unexecuted instantiation: jxl::N_SSE2::TF_709::EncodedFromDisplay(double) |
102 | | |
103 | | // Maximum error 1e-6. |
104 | | template <class D, class V> |
105 | 15.5k | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
106 | 15.5k | auto low = Mul(Set(d, kMulLow), x); |
107 | 15.5k | auto hi = |
108 | 15.5k | MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); |
109 | 15.5k | return IfThenElse(Le(x, Set(d, kThresh)), low, hi); |
110 | 15.5k | } hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_709::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Line | Count | Source | 105 | 15.5k | JXL_INLINE V EncodedFromDisplay(D d, V x) const { | 106 | 15.5k | auto low = Mul(Set(d, kMulLow), x); | 107 | 15.5k | auto hi = | 108 | 15.5k | MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); | 109 | 15.5k | return IfThenElse(Le(x, Set(d, kThresh)), low, hi); | 110 | 15.5k | } |
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_709::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_709::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
111 | | |
112 | | template <class D, class V> |
113 | 0 | JXL_INLINE V DisplayFromEncoded(D d, V x) const { |
114 | 0 | auto low = Mul(Set(d, kInvMulLow), x); |
115 | 0 | auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)), |
116 | 0 | Set(d, kInvPowHi)); |
117 | 0 | return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi); |
118 | 0 | } Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_709::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_709::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_709::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
119 | | |
120 | | private: |
121 | | static constexpr double kThresh = 0.018; |
122 | | static constexpr double kMulLow = 4.5; |
123 | | static constexpr double kMulHi = 1.099; |
124 | | static constexpr double kPowHi = 0.45; |
125 | | static constexpr double kSub = -0.099; |
126 | | |
127 | | static constexpr double kInvThresh = 0.081; |
128 | | static constexpr double kInvMulLow = 1 / 4.5; |
129 | | static constexpr double kInvMulHi = 1 / 1.099; |
130 | | static constexpr double kInvPowHi = 1 / 0.45; |
131 | | static constexpr double kInvAdd = 0.099 * kInvMulHi; |
132 | | }; |
133 | | |
134 | | // Perceptual Quantization |
135 | | class TF_PQ : TF_PQ_Base { |
136 | | public: |
137 | | explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget) |
138 | 28 | : display_scaling_factor_to_10000_nits_(display_intensity_target * |
139 | 28 | (1.0f / 10000.0f)), |
140 | 28 | display_scaling_factor_from_10000_nits_(10000.0f / |
141 | 28 | display_intensity_target) {} Unexecuted instantiation: jxl::N_SSE4::TF_PQ::TF_PQ(float) jxl::N_AVX2::TF_PQ::TF_PQ(float) Line | Count | Source | 138 | 28 | : display_scaling_factor_to_10000_nits_(display_intensity_target * | 139 | 28 | (1.0f / 10000.0f)), | 140 | 28 | display_scaling_factor_from_10000_nits_(10000.0f / | 141 | 28 | display_intensity_target) {} |
Unexecuted instantiation: jxl::N_SSE2::TF_PQ::TF_PQ(float) |
142 | | |
143 | | // Maximum error 3e-6 |
144 | | template <class D, class V> |
145 | 0 | JXL_INLINE V DisplayFromEncoded(D d, V x) const { |
146 | 0 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
147 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
148 | 0 | const V original_sign = And(x, kSign); |
149 | 0 | x = AndNot(kSign, x); // abs |
150 | | // 4-over-4-degree rational polynomial approximation on x+x*x. This improves |
151 | | // the maximum error by about 5x over a rational polynomial for x. |
152 | 0 | auto xpxx = MulAdd(x, x, x); |
153 | 0 | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
154 | 0 | HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f), |
155 | 0 | HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f), |
156 | 0 | HWY_REP4(5.50034862e-01f), |
157 | 0 | }; |
158 | 0 | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
159 | 0 | HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f), |
160 | 0 | HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f), |
161 | 0 | HWY_REP4(2.67718770e+00f), |
162 | 0 | }; |
163 | 0 | auto magnitude = EvalRationalPolynomial(d, xpxx, p, q); |
164 | 0 | return Or( |
165 | 0 | AndNot(kSign, |
166 | 0 | Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))), |
167 | 0 | original_sign); |
168 | 0 | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_PQ::DisplayFromEncoded<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_PQ::DisplayFromEncoded<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_PQ::DisplayFromEncoded<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
169 | | |
170 | | // Maximum error 7e-7. |
171 | | template <class D, class V> |
172 | 14.1k | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
173 | 14.1k | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
174 | 14.1k | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
175 | 14.1k | const V original_sign = And(x, kSign); |
176 | 14.1k | x = AndNot(kSign, x); // abs |
177 | | // 4-over-4-degree rational polynomial approximation on x**0.25, with two |
178 | | // different polynomials above and below 1e-4. |
179 | 14.1k | auto xto025 = |
180 | 14.1k | Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_)))); |
181 | 14.1k | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
182 | 14.1k | HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), |
183 | 14.1k | HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), |
184 | 14.1k | HWY_REP4(4.838434e+01f), |
185 | 14.1k | }; |
186 | 14.1k | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
187 | 14.1k | HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), |
188 | 14.1k | HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), |
189 | 14.1k | HWY_REP4(2.590418e+01f), |
190 | 14.1k | }; |
191 | | |
192 | 14.1k | HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { |
193 | 14.1k | HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), |
194 | 14.1k | HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), |
195 | 14.1k | HWY_REP4(-2.864824e+05f), |
196 | 14.1k | }; |
197 | 14.1k | HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { |
198 | 14.1k | HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), |
199 | 14.1k | HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), |
200 | 14.1k | HWY_REP4(-2.072546e+05f), |
201 | 14.1k | }; |
202 | | |
203 | 14.1k | auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)), |
204 | 14.1k | EvalRationalPolynomial(d, xto025, plo, qlo), |
205 | 14.1k | EvalRationalPolynomial(d, xto025, p, q)); |
206 | 14.1k | return Or(AndNot(kSign, magnitude), original_sign); |
207 | 14.1k | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_PQ::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_PQ::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Line | Count | Source | 172 | 14.1k | JXL_INLINE V EncodedFromDisplay(D d, V x) const { | 173 | 14.1k | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; | 174 | 14.1k | const V kSign = BitCast(d, Set(du, 0x80000000u)); | 175 | 14.1k | const V original_sign = And(x, kSign); | 176 | 14.1k | x = AndNot(kSign, x); // abs | 177 | | // 4-over-4-degree rational polynomial approximation on x**0.25, with two | 178 | | // different polynomials above and below 1e-4. | 179 | 14.1k | auto xto025 = | 180 | 14.1k | Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_)))); | 181 | 14.1k | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { | 182 | 14.1k | HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), | 183 | 14.1k | HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), | 184 | 14.1k | HWY_REP4(4.838434e+01f), | 185 | 14.1k | }; | 186 | 14.1k | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { | 187 | 14.1k | HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), | 188 | 14.1k | HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), | 189 | 14.1k | HWY_REP4(2.590418e+01f), | 190 | 14.1k | }; | 191 | | | 192 | 14.1k | HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { | 193 | 14.1k | HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), | 194 | 14.1k | HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), | 195 | 14.1k | HWY_REP4(-2.864824e+05f), | 196 | 14.1k | }; | 197 | 14.1k | HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { | 198 | 14.1k | HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), | 199 | 14.1k | HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), | 200 | 14.1k | HWY_REP4(-2.072546e+05f), | 201 | 14.1k | }; | 202 | | | 203 | 14.1k | auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)), | 204 | 14.1k | EvalRationalPolynomial(d, xto025, plo, qlo), | 205 | 14.1k | EvalRationalPolynomial(d, xto025, p, q)); | 206 | 14.1k | return Or(AndNot(kSign, magnitude), original_sign); | 207 | 14.1k | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_PQ::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
208 | | |
209 | | private: |
210 | | const float display_scaling_factor_to_10000_nits_; |
211 | | const float display_scaling_factor_from_10000_nits_; |
212 | | }; |
213 | | |
214 | | // sRGB |
215 | | class TF_SRGB { |
216 | | public: |
217 | | template <typename V> |
218 | 9.50M | JXL_INLINE V DisplayFromEncoded(V x) const { |
219 | 9.50M | const HWY_FULL(float) d; |
220 | 9.50M | const HWY_FULL(uint32_t) du; |
221 | 9.50M | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
222 | 9.50M | const V original_sign = And(x, kSign); |
223 | 9.50M | x = AndNot(kSign, x); // abs |
224 | | |
225 | | // TODO(janwas): range reduction |
226 | | // Computed via af_cheb_rational (k=100); replicated 4x. |
227 | 9.50M | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
228 | 9.50M | HWY_REP4(2.200248328e-04f), HWY_REP4(1.043637593e-02f), |
229 | 9.50M | HWY_REP4(1.624820318e-01f), HWY_REP4(7.961564959e-01f), |
230 | 9.50M | HWY_REP4(8.210152774e-01f), |
231 | 9.50M | }; |
232 | 9.50M | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
233 | 9.50M | HWY_REP4(2.631846970e-01f), HWY_REP4(1.076976492e+00f), |
234 | 9.50M | HWY_REP4(4.987528350e-01f), HWY_REP4(-5.512498495e-02f), |
235 | 9.50M | HWY_REP4(6.521209011e-03f), |
236 | 9.50M | }; |
237 | 9.50M | const V linear = Mul(x, Set(d, kLowDivInv)); |
238 | 9.50M | const V poly = EvalRationalPolynomial(d, x, p, q); |
239 | 9.50M | const V magnitude = |
240 | 9.50M | IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear); |
241 | 9.50M | return Or(AndNot(kSign, magnitude), original_sign); |
242 | 9.50M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_SRGB::DisplayFromEncoded<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>) const hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_SRGB::DisplayFromEncoded<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>) const Line | Count | Source | 218 | 9.50M | JXL_INLINE V DisplayFromEncoded(V x) const { | 219 | 9.50M | const HWY_FULL(float) d; | 220 | 9.50M | const HWY_FULL(uint32_t) du; | 221 | 9.50M | const V kSign = BitCast(d, Set(du, 0x80000000u)); | 222 | 9.50M | const V original_sign = And(x, kSign); | 223 | 9.50M | x = AndNot(kSign, x); // abs | 224 | | | 225 | | // TODO(janwas): range reduction | 226 | | // Computed via af_cheb_rational (k=100); replicated 4x. | 227 | 9.50M | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { | 228 | 9.50M | HWY_REP4(2.200248328e-04f), HWY_REP4(1.043637593e-02f), | 229 | 9.50M | HWY_REP4(1.624820318e-01f), HWY_REP4(7.961564959e-01f), | 230 | 9.50M | HWY_REP4(8.210152774e-01f), | 231 | 9.50M | }; | 232 | 9.50M | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { | 233 | 9.50M | HWY_REP4(2.631846970e-01f), HWY_REP4(1.076976492e+00f), | 234 | 9.50M | HWY_REP4(4.987528350e-01f), HWY_REP4(-5.512498495e-02f), | 235 | 9.50M | HWY_REP4(6.521209011e-03f), | 236 | 9.50M | }; | 237 | 9.50M | const V linear = Mul(x, Set(d, kLowDivInv)); | 238 | 9.50M | const V poly = EvalRationalPolynomial(d, x, p, q); | 239 | 9.50M | const V magnitude = | 240 | 9.50M | IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear); | 241 | 9.50M | return Or(AndNot(kSign, magnitude), original_sign); | 242 | 9.50M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_SRGB::DisplayFromEncoded<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>) const |
243 | | |
244 | | // Error ~5e-07 |
245 | | template <class D, class V> |
246 | 127M | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
247 | 127M | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
248 | 127M | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
249 | 127M | const V original_sign = And(x, kSign); |
250 | 127M | x = AndNot(kSign, x); // abs |
251 | | |
252 | | // Computed via af_cheb_rational (k=100); replicated 4x. |
253 | 127M | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
254 | 127M | HWY_REP4(-5.135152395e-04f), HWY_REP4(5.287254571e-03f), |
255 | 127M | HWY_REP4(3.903842876e-01f), HWY_REP4(1.474205315e+00f), |
256 | 127M | HWY_REP4(7.352629620e-01f), |
257 | 127M | }; |
258 | 127M | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
259 | 127M | HWY_REP4(1.004519624e-02f), HWY_REP4(3.036675394e-01f), |
260 | 127M | HWY_REP4(1.340816930e+00f), HWY_REP4(9.258482155e-01f), |
261 | 127M | HWY_REP4(2.424867759e-02f), |
262 | 127M | }; |
263 | 127M | const V linear = Mul(x, Set(d, kLowDiv)); |
264 | 127M | const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); |
265 | 127M | const V magnitude = |
266 | 127M | IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear); |
267 | 127M | return Or(AndNot(kSign, magnitude), original_sign); |
268 | 127M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::TF_SRGB::EncodedFromDisplay<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>) const hwy::N_AVX2::Vec256<float> jxl::N_AVX2::TF_SRGB::EncodedFromDisplay<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>) const Line | Count | Source | 246 | 127M | JXL_INLINE V EncodedFromDisplay(D d, V x) const { | 247 | 127M | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; | 248 | 127M | const V kSign = BitCast(d, Set(du, 0x80000000u)); | 249 | 127M | const V original_sign = And(x, kSign); | 250 | 127M | x = AndNot(kSign, x); // abs | 251 | | | 252 | | // Computed via af_cheb_rational (k=100); replicated 4x. | 253 | 127M | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { | 254 | 127M | HWY_REP4(-5.135152395e-04f), HWY_REP4(5.287254571e-03f), | 255 | 127M | HWY_REP4(3.903842876e-01f), HWY_REP4(1.474205315e+00f), | 256 | 127M | HWY_REP4(7.352629620e-01f), | 257 | 127M | }; | 258 | 127M | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { | 259 | 127M | HWY_REP4(1.004519624e-02f), HWY_REP4(3.036675394e-01f), | 260 | 127M | HWY_REP4(1.340816930e+00f), HWY_REP4(9.258482155e-01f), | 261 | 127M | HWY_REP4(2.424867759e-02f), | 262 | 127M | }; | 263 | 127M | const V linear = Mul(x, Set(d, kLowDiv)); | 264 | 127M | const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); | 265 | 127M | const V magnitude = | 266 | 127M | IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear); | 267 | 127M | return Or(AndNot(kSign, magnitude), original_sign); | 268 | 127M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::TF_SRGB::EncodedFromDisplay<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>) const |
269 | | |
270 | | private: |
271 | | static constexpr float kThreshSRGBToLinear = 0.04045f; |
272 | | static constexpr float kThreshLinearToSRGB = 0.0031308f; |
273 | | static constexpr float kLowDiv = 12.92f; |
274 | | static constexpr float kLowDivInv = 1.0f / kLowDiv; |
275 | | }; |
276 | | |
277 | | // Linear to sRGB conversion with error of at most 1.2e-4. |
278 | | template <typename D, typename V> |
279 | | V FastLinearToSRGB(D d, V v) { |
280 | | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
281 | | const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di; |
282 | | // Convert to 0.25 - 0.5 range. |
283 | | auto v025_05 = BitCast( |
284 | | d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff))); |
285 | | // third degree polynomial approximation between 0.25 and 0.5 |
286 | | // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves |
287 | | // accuracy by about 3x. |
288 | | auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f)); |
289 | | auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f)); |
290 | | auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f)); |
291 | | // Compute extra multiplier depending on exponent. Valid exponent range for |
292 | | // [0.0031308f, 1.0) is 0...8 after subtracting 118. |
293 | | // The next three constants contain a representation of the powers of |
294 | | // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are |
295 | | // always the same and in k2to512powers_basebits, and the two arrays contain |
296 | | // the next groups of 8 bits. This ends up being a 22-bit representation (with |
297 | | // a mantissa of 13 bits). The choice of polynomial to approximate is such |
298 | | // that the multiplication factor has the highest 5 bits constant, and that |
299 | | // the factor for the lowest possible exponent is a power of two (thus making |
300 | | // the additional bits 0, which is used to correctly merge back together the |
301 | | // floats). |
302 | | constexpr uint32_t k2to512powers_basebits = 0x40000000; |
303 | | HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = { |
304 | | 0x0, 0xa, 0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c, |
305 | | 0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6, |
306 | | }; |
307 | | HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = { |
308 | | 0x0, 0xb7, 0x4, 0xd, 0xcb, 0xe7, 0x41, 0x68, |
309 | | 0x51, 0xd1, 0xeb, 0xf2, 0x0, 0xb7, 0x4, 0xd, |
310 | | }; |
311 | | // Note that vld1q_s8_x2 on ARM seems to actually be slower. |
312 | | #if HWY_TARGET != HWY_SCALAR |
313 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
314 | | using hwy::HWY_NAMESPACE::ShiftRight; |
315 | | // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}. |
316 | | auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118)); |
317 | | auto pow25to18bits = TableLookupBytes( |
318 | | LoadDup128(di, |
319 | | reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)), |
320 | | exp); |
321 | | auto pow17to10bits = TableLookupBytes( |
322 | | LoadDup128(di, |
323 | | reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)), |
324 | | exp); |
325 | | // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here |
326 | | // we take advantage of the fact that each table has its position 0 equal to |
327 | | // 0. |
328 | | // We can now just reassemble the float. |
329 | | auto mul = BitCast( |
330 | | d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)), |
331 | | Set(di, k2to512powers_basebits))); |
332 | | #else |
333 | | // Fallback for scalar. |
334 | | uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf; |
335 | | auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) | |
336 | | (k2to512powers_17to10bits[exp] << 10) | |
337 | | k2to512powers_basebits)); |
338 | | #endif |
339 | | return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)), |
340 | | MulAdd(pow, mul, Set(d, -0.055))); |
341 | | } |
342 | | |
343 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
344 | | } // namespace HWY_NAMESPACE |
345 | | } // namespace jxl |
346 | | HWY_AFTER_NAMESPACE(); |
347 | | |
348 | | #endif // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |