/src/libjxl/lib/jxl/cms/transfer_functions-inl.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // Transfer functions for color encodings. |
7 | | |
8 | | #if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE) |
9 | | #ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
10 | | #undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
11 | | #else |
12 | | #define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |
13 | | #endif |
14 | | |
15 | | #include <algorithm> |
16 | | #include <cmath> |
17 | | #include <hwy/highway.h> |
18 | | |
19 | | #include "lib/jxl/base/compiler_specific.h" |
20 | | #include "lib/jxl/base/fast_math-inl.h" |
21 | | #include "lib/jxl/base/rational_polynomial-inl.h" |
22 | | #include "lib/jxl/base/status.h" |
23 | | #include "lib/jxl/cms/transfer_functions.h" |
24 | | |
25 | | HWY_BEFORE_NAMESPACE(); |
26 | | namespace jxl { |
27 | | namespace HWY_NAMESPACE { |
28 | | |
29 | | // These templates are not found via ADL. |
30 | | using hwy::HWY_NAMESPACE::And; |
31 | | using hwy::HWY_NAMESPACE::AndNot; |
32 | | using hwy::HWY_NAMESPACE::Gt; |
33 | | using hwy::HWY_NAMESPACE::IfThenElse; |
34 | | using hwy::HWY_NAMESPACE::Lt; |
35 | | using hwy::HWY_NAMESPACE::Or; |
36 | | using hwy::HWY_NAMESPACE::Sqrt; |
37 | | using hwy::HWY_NAMESPACE::TableLookupBytes; |
38 | | |
39 | | // Definitions for BT.2100-2 transfer functions (used inside/outside SIMD): |
40 | | // "display" is linear light (nits) normalized to [0, 1]. |
41 | | // "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1]. |
42 | | // "scene" is a linear function of photon counts, normalized to [0, 1]. |
43 | | |
44 | | // Despite the stated ranges, we need unbounded transfer functions: see |
45 | | // http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or |
46 | | // above 1 due to chromatic adaptation. To avoid severe round-trip errors caused |
47 | | // by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see |
48 | | // https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb) |
49 | | // and extend the function domains above 1. |
50 | | |
51 | | // Hybrid Log-Gamma. |
52 | | class TF_HLG : TF_HLG_Base { |
53 | | public: |
54 | | // Maximum error 5e-7. |
55 | | template <class D, class V> |
56 | 0 | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
57 | 0 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
58 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
59 | 0 | const V original_sign = And(x, kSign); |
60 | 0 | x = AndNot(kSign, x); // abs |
61 | 0 | const V below_div12 = Sqrt(Mul(Set(d, 3.0f), x)); |
62 | 0 | const V e = |
63 | 0 | MulAdd(Set(d, kA * 0.693147181f), |
64 | 0 | FastLog2f(d, MulAdd(Set(d, 12), x, Set(d, -kB))), Set(d, kC)); |
65 | 0 | const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e); |
66 | 0 | return Or(AndNot(kSign, magnitude), original_sign); |
67 | 0 | } |
68 | | }; |
69 | | |
70 | | class TF_709 { |
71 | | public: |
72 | 0 | static JXL_INLINE double EncodedFromDisplay(const double d) { |
73 | 0 | if (d < kThresh) return kMulLow * d; |
74 | 0 | return kMulHi * std::pow(d, kPowHi) + kSub; |
75 | 0 | } |
76 | | |
77 | | // Maximum error 1e-6. |
78 | | template <class D, class V> |
79 | 0 | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
80 | 0 | auto low = Mul(Set(d, kMulLow), x); |
81 | 0 | auto hi = |
82 | 0 | MulAdd(Set(d, kMulHi), FastPowf(d, x, Set(d, kPowHi)), Set(d, kSub)); |
83 | 0 | return IfThenElse(Le(x, Set(d, kThresh)), low, hi); |
84 | 0 | } |
85 | | |
86 | | template <class D, class V> |
87 | 0 | JXL_INLINE V DisplayFromEncoded(D d, V x) const { |
88 | 0 | auto low = Mul(Set(d, kInvMulLow), x); |
89 | 0 | auto hi = FastPowf(d, MulAdd(x, Set(d, kInvMulHi), Set(d, kInvAdd)), |
90 | 0 | Set(d, kInvPowHi)); |
91 | 0 | return IfThenElse(Lt(x, Set(d, kInvThresh)), low, hi); |
92 | 0 | } |
93 | | |
94 | | private: |
95 | | static constexpr double kThresh = 0.018; |
96 | | static constexpr double kMulLow = 4.5; |
97 | | static constexpr double kMulHi = 1.099; |
98 | | static constexpr double kPowHi = 0.45; |
99 | | static constexpr double kSub = -0.099; |
100 | | |
101 | | static constexpr double kInvThresh = 0.081; |
102 | | static constexpr double kInvMulLow = 1 / 4.5; |
103 | | static constexpr double kInvMulHi = 1 / 1.099; |
104 | | static constexpr double kInvPowHi = 1 / 0.45; |
105 | | static constexpr double kInvAdd = 0.099 * kInvMulHi; |
106 | | }; |
107 | | |
108 | | // Perceptual Quantization |
109 | | class TF_PQ : TF_PQ_Base { |
110 | | public: |
111 | | explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget) |
112 | 0 | : display_scaling_factor_to_10000_nits_(display_intensity_target * |
113 | 0 | (1.0f / 10000.0f)), |
114 | 0 | display_scaling_factor_from_10000_nits_(10000.0f / |
115 | 0 | display_intensity_target) {} |
116 | | |
117 | | // Maximum error 3e-6 |
118 | | template <class D, class V> |
119 | 0 | JXL_INLINE V DisplayFromEncoded(D d, V x) const { |
120 | 0 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
121 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
122 | 0 | const V original_sign = And(x, kSign); |
123 | 0 | x = AndNot(kSign, x); // abs |
124 | | // 4-over-4-degree rational polynomial approximation on x+x*x. This improves |
125 | | // the maximum error by about 5x over a rational polynomial for x. |
126 | 0 | auto xpxx = MulAdd(x, x, x); |
127 | 0 | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
128 | 0 | HWY_REP4(2.62975656e-04f), HWY_REP4(-6.23553089e-03f), |
129 | 0 | HWY_REP4(7.38602301e-01f), HWY_REP4(2.64553172e+00f), |
130 | 0 | HWY_REP4(5.50034862e-01f), |
131 | 0 | }; |
132 | 0 | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
133 | 0 | HWY_REP4(4.21350107e+02f), HWY_REP4(-4.28736818e+02f), |
134 | 0 | HWY_REP4(1.74364667e+02f), HWY_REP4(-3.39078883e+01f), |
135 | 0 | HWY_REP4(2.67718770e+00f), |
136 | 0 | }; |
137 | 0 | auto magnitude = EvalRationalPolynomial(d, xpxx, p, q); |
138 | 0 | return Or( |
139 | 0 | AndNot(kSign, |
140 | 0 | Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))), |
141 | 0 | original_sign); |
142 | 0 | } |
143 | | |
144 | | // Maximum error 7e-7. |
145 | | template <class D, class V> |
146 | 0 | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
147 | 0 | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
148 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
149 | 0 | const V original_sign = And(x, kSign); |
150 | 0 | x = AndNot(kSign, x); // abs |
151 | | // 4-over-4-degree rational polynomial approximation on x**0.25, with two |
152 | | // different polynomials above and below 1e-4. |
153 | 0 | auto xto025 = |
154 | 0 | Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_)))); |
155 | 0 | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
156 | 0 | HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f), |
157 | 0 | HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f), |
158 | 0 | HWY_REP4(4.838434e+01f), |
159 | 0 | }; |
160 | 0 | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
161 | 0 | HWY_REP4(1.012416e+00f), HWY_REP4(2.016708e+01f), |
162 | 0 | HWY_REP4(9.263710e+01f), HWY_REP4(1.120607e+02f), |
163 | 0 | HWY_REP4(2.590418e+01f), |
164 | 0 | }; |
165 | |
|
166 | 0 | HWY_ALIGN constexpr float plo[(4 + 1) * 4] = { |
167 | 0 | HWY_REP4(9.863406e-06f), HWY_REP4(3.881234e-01f), |
168 | 0 | HWY_REP4(1.352821e+02f), HWY_REP4(6.889862e+04f), |
169 | 0 | HWY_REP4(-2.864824e+05f), |
170 | 0 | }; |
171 | 0 | HWY_ALIGN constexpr float qlo[(4 + 1) * 4] = { |
172 | 0 | HWY_REP4(3.371868e+01f), HWY_REP4(1.477719e+03f), |
173 | 0 | HWY_REP4(1.608477e+04f), HWY_REP4(-4.389884e+04f), |
174 | 0 | HWY_REP4(-2.072546e+05f), |
175 | 0 | }; |
176 | |
|
177 | 0 | auto magnitude = IfThenElse(Lt(x, Set(d, 1e-4f)), |
178 | 0 | EvalRationalPolynomial(d, xto025, plo, qlo), |
179 | 0 | EvalRationalPolynomial(d, xto025, p, q)); |
180 | 0 | return Or(AndNot(kSign, magnitude), original_sign); |
181 | 0 | } |
182 | | |
183 | | private: |
184 | | const float display_scaling_factor_to_10000_nits_; |
185 | | const float display_scaling_factor_from_10000_nits_; |
186 | | }; |
187 | | |
188 | | // sRGB |
189 | | class TF_SRGB { |
190 | | public: |
191 | | template <typename V> |
192 | 0 | JXL_INLINE V DisplayFromEncoded(V x) const { |
193 | 0 | const HWY_FULL(float) d; |
194 | 0 | const HWY_FULL(uint32_t) du; |
195 | 0 | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
196 | 0 | const V original_sign = And(x, kSign); |
197 | 0 | x = AndNot(kSign, x); // abs |
198 | | |
199 | | // TODO(janwas): range reduction |
200 | | // Computed via af_cheb_rational (k=100); replicated 4x. |
201 | 0 | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
202 | 0 | HWY_REP4(2.200248328e-04f), HWY_REP4(1.043637593e-02f), |
203 | 0 | HWY_REP4(1.624820318e-01f), HWY_REP4(7.961564959e-01f), |
204 | 0 | HWY_REP4(8.210152774e-01f), |
205 | 0 | }; |
206 | 0 | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
207 | 0 | HWY_REP4(2.631846970e-01f), HWY_REP4(1.076976492e+00f), |
208 | 0 | HWY_REP4(4.987528350e-01f), HWY_REP4(-5.512498495e-02f), |
209 | 0 | HWY_REP4(6.521209011e-03f), |
210 | 0 | }; |
211 | 0 | const V linear = Mul(x, Set(d, kLowDivInv)); |
212 | 0 | const V poly = EvalRationalPolynomial(d, x, p, q); |
213 | 0 | const V magnitude = |
214 | 0 | IfThenElse(Gt(x, Set(d, kThreshSRGBToLinear)), poly, linear); |
215 | 0 | return Or(AndNot(kSign, magnitude), original_sign); |
216 | 0 | } |
217 | | |
218 | | // Error ~5e-07 |
219 | | template <class D, class V> |
220 | 922M | JXL_INLINE V EncodedFromDisplay(D d, V x) const { |
221 | 922M | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
222 | 922M | const V kSign = BitCast(d, Set(du, 0x80000000u)); |
223 | 922M | const V original_sign = And(x, kSign); |
224 | 922M | x = AndNot(kSign, x); // abs |
225 | | |
226 | | // Computed via af_cheb_rational (k=100); replicated 4x. |
227 | 922M | HWY_ALIGN constexpr float p[(4 + 1) * 4] = { |
228 | 922M | HWY_REP4(-5.135152395e-04f), HWY_REP4(5.287254571e-03f), |
229 | 922M | HWY_REP4(3.903842876e-01f), HWY_REP4(1.474205315e+00f), |
230 | 922M | HWY_REP4(7.352629620e-01f), |
231 | 922M | }; |
232 | 922M | HWY_ALIGN constexpr float q[(4 + 1) * 4] = { |
233 | 922M | HWY_REP4(1.004519624e-02f), HWY_REP4(3.036675394e-01f), |
234 | 922M | HWY_REP4(1.340816930e+00f), HWY_REP4(9.258482155e-01f), |
235 | 922M | HWY_REP4(2.424867759e-02f), |
236 | 922M | }; |
237 | 922M | const V linear = Mul(x, Set(d, kLowDiv)); |
238 | 922M | const V poly = EvalRationalPolynomial(d, Sqrt(x), p, q); |
239 | 922M | const V magnitude = |
240 | 922M | IfThenElse(Gt(x, Set(d, kThreshLinearToSRGB)), poly, linear); |
241 | 922M | return Or(AndNot(kSign, magnitude), original_sign); |
242 | 922M | } |
243 | | |
244 | | private: |
245 | | static constexpr float kThreshSRGBToLinear = 0.04045f; |
246 | | static constexpr float kThreshLinearToSRGB = 0.0031308f; |
247 | | static constexpr float kLowDiv = 12.92f; |
248 | | static constexpr float kLowDivInv = 1.0f / kLowDiv; |
249 | | }; |
250 | | |
251 | | // Linear to sRGB conversion with error of at most 1.2e-4. |
252 | | template <typename D, typename V> |
253 | | V FastLinearToSRGB(D d, V v) { |
254 | | const hwy::HWY_NAMESPACE::Rebind<uint32_t, D> du; |
255 | | const hwy::HWY_NAMESPACE::Rebind<int32_t, D> di; |
256 | | // Convert to 0.25 - 0.5 range. |
257 | | auto v025_05 = BitCast( |
258 | | d, And(Or(BitCast(du, v), Set(du, 0x3e800000)), Set(du, 0x3effffff))); |
259 | | // third degree polynomial approximation between 0.25 and 0.5 |
260 | | // of 1.055/2^(7/2.4) * x^(1/2.4) * 0.5. A degree 4 polynomial only improves |
261 | | // accuracy by about 3x. |
262 | | auto d1 = MulAdd(v025_05, Set(d, 0.059914046f), Set(d, -0.108894556f)); |
263 | | auto d2 = MulAdd(d1, v025_05, Set(d, 0.107963754f)); |
264 | | auto pow = MulAdd(d2, v025_05, Set(d, 0.018092343f)); |
265 | | // Compute extra multiplier depending on exponent. Valid exponent range for |
266 | | // [0.0031308f, 1.0) is 0...8 after subtracting 118. |
267 | | // The next three constants contain a representation of the powers of |
268 | | // 2**(1/2.4) = 2**(5/12) times two; in particular, bits from 26 to 31 are |
269 | | // always the same and in k2to512powers_basebits, and the two arrays contain |
270 | | // the next groups of 8 bits. This ends up being a 22-bit representation (with |
271 | | // a mantissa of 13 bits). The choice of polynomial to approximate is such |
272 | | // that the multiplication factor has the highest 5 bits constant, and that |
273 | | // the factor for the lowest possible exponent is a power of two (thus making |
274 | | // the additional bits 0, which is used to correctly merge back together the |
275 | | // floats). |
276 | | constexpr uint32_t k2to512powers_basebits = 0x40000000; |
277 | | HWY_ALIGN constexpr uint8_t k2to512powers_25to18bits[16] = { |
278 | | 0x0, 0xa, 0x19, 0x26, 0x32, 0x41, 0x4d, 0x5c, |
279 | | 0x68, 0x75, 0x83, 0x8f, 0xa0, 0xaa, 0xb9, 0xc6, |
280 | | }; |
281 | | HWY_ALIGN constexpr uint8_t k2to512powers_17to10bits[16] = { |
282 | | 0x0, 0xb7, 0x4, 0xd, 0xcb, 0xe7, 0x41, 0x68, |
283 | | 0x51, 0xd1, 0xeb, 0xf2, 0x0, 0xb7, 0x4, 0xd, |
284 | | }; |
285 | | // Note that vld1q_s8_x2 on ARM seems to actually be slower. |
286 | | #if HWY_TARGET != HWY_SCALAR |
287 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
288 | | using hwy::HWY_NAMESPACE::ShiftRight; |
289 | | // Every lane of exp is now (if cast to byte) {0, 0, 0, <index for lookup>}. |
290 | | auto exp = Sub(ShiftRight<23>(BitCast(di, v)), Set(di, 118)); |
291 | | auto pow25to18bits = TableLookupBytes( |
292 | | LoadDup128(di, |
293 | | reinterpret_cast<const int32_t*>(k2to512powers_25to18bits)), |
294 | | exp); |
295 | | auto pow17to10bits = TableLookupBytes( |
296 | | LoadDup128(di, |
297 | | reinterpret_cast<const int32_t*>(k2to512powers_17to10bits)), |
298 | | exp); |
299 | | // Now, pow* contain {0, 0, 0, <part of float repr of multiplier>}. Here |
300 | | // we take advantage of the fact that each table has its position 0 equal to |
301 | | // 0. |
302 | | // We can now just reassemble the float. |
303 | | auto mul = BitCast( |
304 | | d, Or(Or(ShiftLeft<18>(pow25to18bits), ShiftLeft<10>(pow17to10bits)), |
305 | | Set(di, k2to512powers_basebits))); |
306 | | #else |
307 | | // Fallback for scalar. |
308 | | uint32_t exp = ((BitCast(di, v).raw >> 23) - 118) & 0xf; |
309 | | auto mul = BitCast(d, Set(di, (k2to512powers_25to18bits[exp] << 18) | |
310 | | (k2to512powers_17to10bits[exp] << 10) | |
311 | | k2to512powers_basebits)); |
312 | | #endif |
313 | | return IfThenElse(Lt(v, Set(d, 0.0031308f)), Mul(v, Set(d, 12.92f)), |
314 | | MulAdd(pow, mul, Set(d, -0.055))); |
315 | | } |
316 | | |
317 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
318 | | } // namespace HWY_NAMESPACE |
319 | | } // namespace jxl |
320 | | HWY_AFTER_NAMESPACE(); |
321 | | |
322 | | #endif // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_ |