/src/libjxl/lib/jxl/dec_xyb-inl.h
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // XYB -> linear sRGB helper function. |
7 | | |
8 | | #if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE) |
9 | | #ifdef LIB_JXL_DEC_XYB_INL_H_ |
10 | | #undef LIB_JXL_DEC_XYB_INL_H_ |
11 | | #else |
12 | | #define LIB_JXL_DEC_XYB_INL_H_ |
13 | | #endif |
14 | | |
15 | | #include <hwy/highway.h> |
16 | | |
17 | | #include "lib/jxl/common.h" |
18 | | #include "lib/jxl/dec_xyb.h" |
19 | | |
20 | | HWY_BEFORE_NAMESPACE(); |
21 | | namespace jxl { |
22 | | namespace HWY_NAMESPACE { |
23 | | namespace { |
24 | | |
25 | | // These templates are not found via ADL. |
26 | | using hwy::HWY_NAMESPACE::Add; |
27 | | using hwy::HWY_NAMESPACE::Broadcast; |
28 | | using hwy::HWY_NAMESPACE::Mul; |
29 | | using hwy::HWY_NAMESPACE::MulAdd; |
30 | | using hwy::HWY_NAMESPACE::Sub; |
31 | | |
32 | | // Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including |
33 | | // the gamma mixing and simple gamma). Avoids clamping to [0, 1]: values outside |
34 | | // the sRGB gamut may be in-gamut after transforming to a wider space. |
35 | | // "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries |
36 | | // of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its |
37 | | // entries by c is equivalent to multiplying linear_* by c afterwards. |
38 | | template <class D, class V> |
39 | | HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y, |
40 | | const V opsin_b, |
41 | | const OpsinParams& opsin_params, |
42 | | V* const HWY_RESTRICT linear_r, |
43 | | V* const HWY_RESTRICT linear_g, |
44 | 42.6M | V* const HWY_RESTRICT linear_b) { |
45 | | #if HWY_TARGET == HWY_SCALAR |
46 | | const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]); |
47 | | const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]); |
48 | | const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]); |
49 | | #else |
50 | 42.6M | const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases); |
51 | 42.6M | const auto neg_bias_r = Broadcast<0>(neg_bias_rgb); |
52 | 42.6M | const auto neg_bias_g = Broadcast<1>(neg_bias_rgb); |
53 | 42.6M | const auto neg_bias_b = Broadcast<2>(neg_bias_rgb); |
54 | 42.6M | #endif |
55 | | |
56 | | // Color space: XYB -> RGB |
57 | 42.6M | auto gamma_r = Add(opsin_y, opsin_x); |
58 | 42.6M | auto gamma_g = Sub(opsin_y, opsin_x); |
59 | 42.6M | auto gamma_b = opsin_b; |
60 | | |
61 | 42.6M | gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0])); |
62 | 42.6M | gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1])); |
63 | 42.6M | gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2])); |
64 | | |
65 | | // Undo gamma compression: linear = gamma^3 for efficiency. |
66 | 42.6M | const auto gamma_r2 = Mul(gamma_r, gamma_r); |
67 | 42.6M | const auto gamma_g2 = Mul(gamma_g, gamma_g); |
68 | 42.6M | const auto gamma_b2 = Mul(gamma_b, gamma_b); |
69 | 42.6M | const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r); |
70 | 42.6M | const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g); |
71 | 42.6M | const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b); |
72 | | |
73 | 42.6M | const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix; |
74 | | |
75 | | // Unmix (multiply by 3x3 inverse_matrix) |
76 | | // TODO(eustas): ref would be more readable than pointer |
77 | 42.6M | *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r); |
78 | 42.6M | *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r); |
79 | 42.6M | *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r); |
80 | 42.6M | *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r); |
81 | 42.6M | *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g); |
82 | 42.6M | *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b); |
83 | 42.6M | *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r); |
84 | 42.6M | *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g); |
85 | 42.6M | *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b); |
86 | 42.6M | }
87 | | |
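For reference, a scalar sketch of what XybToRgb computes per pixel. This is a hypothetical helper, not part of the library; it assumes the OpsinParams fields used above, with inverse_opsin_matrix storing each row-major 3x3 entry broadcast four times (hence the "* 4" strides in the loads).

#include "lib/jxl/dec_xyb.h"

// Hypothetical scalar equivalent of XybToRgb (sketch only).
static void XybToRgbScalar(float opsin_x, float opsin_y, float opsin_b,
                           const jxl::OpsinParams& p, float rgb[3]) {
  // XYB -> gamma-compressed RGB, removing the cube-root biases.
  const float gamma[3] = {opsin_y + opsin_x - p.opsin_biases_cbrt[0],
                          opsin_y - opsin_x - p.opsin_biases_cbrt[1],
                          opsin_b - p.opsin_biases_cbrt[2]};
  // Undo gamma compression (cube) and add back the (negated) bias.
  float mixed[3];
  for (int c = 0; c < 3; ++c) {
    mixed[c] = gamma[c] * gamma[c] * gamma[c] + p.opsin_biases[c];
  }
  // Unmix: row-major 3x3 inverse opsin matrix; entry (i, j) sits at index
  // (3 * i + j) * 4 because each entry is stored broadcast across 4 floats.
  for (int i = 0; i < 3; ++i) {
    rgb[i] = 0.0f;
    for (int j = 0; j < 3; ++j) {
      rgb[i] += p.inverse_opsin_matrix[(3 * i + j) * 4] * mixed[j];
    }
  }
}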
88 | | #if !JXL_HIGH_PRECISION |
89 | | inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() { |
90 | | #if HWY_TARGET == HWY_NEON |
91 | | return true; |
92 | | #else |
93 | | return false; |
94 | | #endif |
95 | | } |
96 | | |
97 | | inline HWY_MAYBE_UNUSED Status FastXYBTosRGB8(const float* input[4], |
98 | | uint8_t* output, bool is_rgba, |
99 | | size_t xsize) { |
100 | | // This function is very NEON-specific. As such, it uses intrinsics directly. |
101 | | #if HWY_TARGET == HWY_NEON |
102 | | // WARNING: doing fixed point arithmetic correctly is very complicated. |
103 | | // Changes to this function should be thoroughly tested. |
104 | | |
105 | | // Note that the input is assumed to have 13 bits of mantissa, and the output |
106 | | // will have 14 bits. |
107 | | auto srgb_tf = [&](int16x8_t v16) { |
108 | | int16x8_t clz = vclzq_s16(v16); |
109 | | // Convert to [0.25, 0.5) range. |
110 | | int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2))); |
111 | | |
112 | | // third degree polynomial approximation between 0.25 and 0.5 |
113 | | // of 1.055/2^(7/2.4) * x^(1/2.4) / 32. |
114 | | // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29 |
115 | | // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72 |
116 | | // overflow our fixed point representation. |
117 | | |
118 | | int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16); |
119 | | |
120 | | // 0.47 * x |
121 | | int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706); |
122 | | // - 0.87 |
123 | | int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546)); |
124 | | // * x |
125 | | int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16); |
126 | | // + 0.86 |
127 | | int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302)); |
128 | | // * 2x |
129 | | int16x8_t step5 = vqrdmulhq_s16(step4, twov); |
130 | | // + 0.29 |
131 | | int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485)); |
132 | | |
133 | | int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz); |
134 | | // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are |
135 | | // capped to 1. |
136 | | // Generated with the following Python script: |
137 | | // a = [] |
138 | | // b = [] |
139 | | // |
140 | | // for i in range(0, 16): |
141 | | // v = 2**(5/12.*i) |
142 | | // v /= 16 |
143 | | // v *= 256 * 128 |
144 | | // v = int(v) |
145 | | // a.append(v // 256) |
146 | | // b.append(v % 256) |
147 | | // |
148 | | // print(", ".join("0x%02x" % x for x in a)) |
149 | | // |
150 | | // print(", ".join("0x%02x" % x for x in b)) |
151 | | |
152 | | HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = { |
153 | | 0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c, |
154 | | 0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, |
155 | | }; |
156 | | HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = { |
157 | | 0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68, |
158 | | 0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
159 | | }; |
160 | | // Using the highway implementation here since vqtbl1q is aarch64-only. |
161 | | using hwy::HWY_NAMESPACE::Vec128; |
162 | | uint8x16_t pow_low = |
163 | | TableLookupBytes( |
164 | | Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)), |
165 | | Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16))) |
166 | | .raw; |
167 | | uint8x16_t pow_high = |
168 | | TableLookupBytes( |
169 | | Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)), |
170 | | Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16))) |
171 | | .raw; |
172 | | int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16( |
173 | | vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8)); |
174 | | |
175 | | // approximation of v * 12.92, divided by 2 |
176 | | // Note that our input is using 13 mantissa bits instead of 15. |
177 | | int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5); |
178 | | // 1.055*pow(v, 1/2.4) - 0.055, divided by 2 |
179 | | auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901)); |
180 | | // > 0.0031308f (note that v16 has 13 mantissa bits) |
181 | | return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear); |
182 | | }; |
183 | | |
184 | | const float* JXL_RESTRICT row_in_x = input[0]; |
185 | | const float* JXL_RESTRICT row_in_y = input[1]; |
186 | | const float* JXL_RESTRICT row_in_b = input[2]; |
187 | | const float* JXL_RESTRICT row_in_a = input[3]; |
188 | | for (size_t x = 0; x < xsize; x += 8) { |
189 | | // Normal ranges for xyb for in-gamut sRGB colors: |
190 | | // x: -0.015386 0.028100 |
191 | | // y: 0.000000 0.845308 |
192 | | // b: 0.000000 0.845308 |
193 | | |
194 | | // We actually want x * 8 to have some extra precision. |
195 | | // TODO(veluca): consider different approaches here, like vld1q_f32_x2. |
196 | | float32x4_t opsin_x_left = vld1q_f32(row_in_x + x); |
197 | | int16x4_t opsin_x16_times8_left = |
198 | | vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18)); |
199 | | float32x4_t opsin_x_right = |
200 | | vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0)); |
201 | | int16x4_t opsin_x16_times8_right = |
202 | | vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18)); |
203 | | int16x8_t opsin_x16_times8 = |
204 | | vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right); |
205 | | |
206 | | float32x4_t opsin_y_left = vld1q_f32(row_in_y + x); |
207 | | int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15)); |
208 | | float32x4_t opsin_y_right = |
209 | | vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0)); |
210 | | int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15)); |
211 | | int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right); |
212 | | |
213 | | float32x4_t opsin_b_left = vld1q_f32(row_in_b + x); |
214 | | int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15)); |
215 | | float32x4_t opsin_b_right = |
216 | | vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0)); |
217 | | int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15)); |
218 | | int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right); |
219 | | |
220 | | int16x8_t neg_bias16 = vdupq_n_s16(-124); // -0.0037930732552754493 |
221 | | int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110); // -0.155954201 |
222 | | int16x8_t neg_bias_half16 = vdupq_n_s16(-62); |
223 | | |
224 | | // Color space: XYB -> RGB |
225 | | // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2, |
226 | | // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias. |
228 | | // Note that ignoring x^2 in the formulas below (as x << y) results in |
228 | | // errors of at least 3 in the final sRGB values. |
229 | | int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16); |
230 | | int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16); |
231 | | int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3); |
232 | | int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16); |
233 | | int16x8_t threexsq16 = |
234 | | vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6); |
235 | | |
236 | | // We can ignore x^3 here. Note that this is multiplied by 8. |
237 | | int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16); |
238 | | |
239 | | int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16); |
240 | | int16x8_t mixed_rpg16 = vhaddq_s16( |
241 | | vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16); |
242 | | |
243 | | int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16); |
244 | | int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16); |
245 | | int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16); |
246 | | int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16); |
247 | | // mixed_rpg and mixed_b are in 0-1 range. |
248 | | // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note |
249 | | // that at this point it is already multiplied by 8. |
250 | | |
251 | | // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit |
252 | | // fixed point) to ensure intermediate quantities are in range. Note that |
253 | | // r-g is not shifted, and was x8 before here; this corresponds to a x32 |
254 | | // overall multiplicative factor and ensures that all the matrix constants |
255 | | // are in 0-1 range. |
256 | | // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two |
257 | | // vhadd + using neg_bias_half. |
258 | | mixed_b16 = vshrq_n_s16(mixed_b16, 2); |
259 | | |
260 | | // Unmix (multiply by 3x3 inverse_matrix) |
261 | | // For increased precision, we use a matrix for converting from |
262 | | // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This |
263 | | // avoids cancellation effects when computing (y+x)^3-(y-x)^3. |
264 | | // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c * |
265 | | // mixed_b pattern is repeated frequently in the code below. This allows |
266 | | // us to save a multiply per channel, and removes the presence of |
267 | | // some constants above 1. Moreover, mixed_rpg - mixed_b is in (-1, 1) |
268 | | // range, so the subtraction is safe. |
269 | | // All the magic-looking constants here are derived by computing the |
270 | | // inverse opsin matrix for the transformation modified as described |
271 | | // above. |
272 | | |
273 | | // Precomputation common to multiple color values. |
274 | | int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16); |
275 | | int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394); |
276 | | int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16); |
277 | | |
278 | | // R |
279 | | int16x8_t linear_r16 = |
280 | | vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400)); |
281 | | |
282 | | // G |
283 | | int16x8_t linear_g16 = |
284 | | vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857)); |
285 | | |
286 | | // B |
287 | | int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996); |
288 | | linear_b16 = vqaddq_s16(linear_b16, mixed_b16); |
289 | | linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525)); |
290 | | |
291 | | // Apply SRGB transfer function. |
292 | | int16x8_t r = srgb_tf(linear_r16); |
293 | | int16x8_t g = srgb_tf(linear_g16); |
294 | | int16x8_t b = srgb_tf(linear_b16); |
295 | | |
296 | | uint8x8_t r8 = |
297 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6)); |
298 | | uint8x8_t g8 = |
299 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6)); |
300 | | uint8x8_t b8 = |
301 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6)); |
302 | | |
303 | | size_t n = xsize - x; |
304 | | if (is_rgba) { |
305 | | float32x4_t a_f32_left = |
306 | | row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f); |
307 | | float32x4_t a_f32_right = |
308 | | row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0)) |
309 | | : vdupq_n_f32(1.0f); |
310 | | int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8)); |
311 | | int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8)); |
312 | | uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right)); |
313 | | uint8_t* buf = output + 4 * x; |
314 | | uint8x8x4_t data = {r8, g8, b8, a8}; |
315 | | if (n >= 8) { |
316 | | vst4_u8(buf, data); |
317 | | } else { |
318 | | uint8_t tmp[8 * 4]; |
319 | | vst4_u8(tmp, data); |
320 | | memcpy(buf, tmp, n * 4); |
321 | | } |
322 | | } else { |
323 | | uint8_t* buf = output + 3 * x; |
324 | | uint8x8x3_t data = {r8, g8, b8}; |
325 | | if (n >= 8) { |
326 | | vst3_u8(buf, data); |
327 | | } else { |
328 | | uint8_t tmp[8 * 3]; |
329 | | vst3_u8(tmp, data); |
330 | | memcpy(buf, tmp, n * 3); |
331 | | } |
332 | | } |
333 | | } |
334 | | return true; |
335 | | #else // HWY_TARGET == HWY_NEON |
336 | | (void)input; |
337 | | (void)output; |
338 | | (void)is_rgba; |
339 | | (void)xsize; |
340 | | return JXL_UNREACHABLE("unsupported platform"); |
341 | | #endif // HWY_TARGET == HWY_NEON |
342 | | } |
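The mixing step in the loop above never forms mixed_r and mixed_g directly; it relies on the cube expansions (with the cbrt bias already folded into y):

  (y+x)^3 - (y-x)^3 = 6xy^2 + 2x^3
  (y+x)^3 + (y-x)^3 = 2y^3 + 6x^2y

So (r-g)/2 = 3xy^2 + x^3, where mixed_rmg16 drops the x^3 term, and (r+g)/2 = y^3 + 3x^2y (plus the bias), where mixed_rpg16 keeps the 3x^2y contribution; dropping it is the "ignoring x^2" case that the comment above warns causes errors of at least 3 in the final sRGB values.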
343 | | #endif // !JXL_HIGH_PRECISION |
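For comparison, the float transfer function that the fixed-point srgb_tf lambda above approximates; this is a sketch using the standard sRGB formula, not code from this file:

#include <cmath>

// Float reference for srgb_tf (sketch). Encodes a linear sample in [0, 1].
static float SrgbTfReference(float linear) {
  // Linear segment below 0.0031308 (matching the vcgeq_s16 test against 26
  // above: 0.0031308 * 2^13 is roughly 26), power segment above it.
  return linear <= 0.0031308f
             ? 12.92f * linear
             : 1.055f * std::pow(linear, 1.0f / 2.4f) - 0.055f;
}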
344 | | |
345 | | } // namespace |
346 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
347 | | } // namespace HWY_NAMESPACE |
348 | | } // namespace jxl |
349 | | HWY_AFTER_NAMESPACE(); |
350 | | |
351 | | #endif // LIB_JXL_DEC_XYB_INL_H_ |
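The HWY_TARGET_TOGGLE guard at the top exists because this header is compiled once per SIMD target rather than once per translation unit. A sketch of the usual highway consumption pattern, assuming dec_xyb.cc as the including file:

// In the including .cc file (illustrative; standard foreach_target pattern).
#define HWY_TARGET_INCLUDE "lib/jxl/dec_xyb.cc"
#include <hwy/foreach_target.h>  // re-includes this .cc once per target
#include <hwy/highway.h>

#include "lib/jxl/dec_xyb-inl.h"  // toggle guard makes it safe to re-include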