/src/libjxl/lib/jxl/dec_xyb-inl.h
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | // XYB -> linear sRGB helper function. |
7 | | |
8 | | #if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE) |
9 | | #ifdef LIB_JXL_DEC_XYB_INL_H_ |
10 | | #undef LIB_JXL_DEC_XYB_INL_H_ |
11 | | #else |
12 | | #define LIB_JXL_DEC_XYB_INL_H_ |
13 | | #endif |
14 | | |
15 | | #include <hwy/highway.h> |
16 | | |
17 | | #include "lib/jxl/dec_xyb.h" |
18 | | HWY_BEFORE_NAMESPACE(); |
19 | | namespace jxl { |
20 | | namespace HWY_NAMESPACE { |
21 | | namespace { |
22 | | |
23 | | // These templates are not found via ADL. |
24 | | using hwy::HWY_NAMESPACE::Add; |
25 | | using hwy::HWY_NAMESPACE::Broadcast; |
26 | | using hwy::HWY_NAMESPACE::Mul; |
27 | | using hwy::HWY_NAMESPACE::MulAdd; |
28 | | using hwy::HWY_NAMESPACE::Sub; |
29 | | |
30 | | // Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including |
31 | | // the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB) |
32 | | // gamut values may be in-gamut after transforming to a wider space. |
33 | | // "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries |
34 | | // of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its |
35 | | // entries by c is equivalent to multiplying linear_* by c afterwards. |
36 | | template <class D, class V> |
37 | | HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y, |
38 | | const V opsin_b, |
39 | | const OpsinParams& opsin_params, |
40 | | V* const HWY_RESTRICT linear_r, |
41 | | V* const HWY_RESTRICT linear_g, |
42 | 322M | V* const HWY_RESTRICT linear_b) { |
43 | 322M | #if HWY_TARGET == HWY_SCALAR |
44 | 322M | const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]); |
45 | 322M | const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]); |
46 | 322M | const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]); |
47 | | #else |
48 | | const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases); |
49 | | const auto neg_bias_r = Broadcast<0>(neg_bias_rgb); |
50 | | const auto neg_bias_g = Broadcast<1>(neg_bias_rgb); |
51 | | const auto neg_bias_b = Broadcast<2>(neg_bias_rgb); |
52 | | #endif |
53 | | |
54 | | // Color space: XYB -> RGB |
55 | 322M | auto gamma_r = Add(opsin_y, opsin_x); |
56 | 322M | auto gamma_g = Sub(opsin_y, opsin_x); |
57 | 322M | auto gamma_b = opsin_b; |
58 | | |
59 | 322M | gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0])); |
60 | 322M | gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1])); |
61 | 322M | gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2])); |
62 | | |
63 | | // Undo gamma compression: linear = gamma^3 for efficiency. |
64 | 322M | const auto gamma_r2 = Mul(gamma_r, gamma_r); |
65 | 322M | const auto gamma_g2 = Mul(gamma_g, gamma_g); |
66 | 322M | const auto gamma_b2 = Mul(gamma_b, gamma_b); |
67 | 322M | const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r); |
68 | 322M | const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g); |
69 | 322M | const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b); |
70 | | |
71 | 322M | const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix; |
72 | | |
73 | | // Unmix (multiply by 3x3 inverse_matrix) |
74 | | // TODO(eustas): ref would be more readable than pointer |
75 | 322M | *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r); |
76 | 322M | *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r); |
77 | 322M | *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r); |
78 | 322M | *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r); |
79 | 322M | *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g); |
80 | 322M | *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b); |
81 | 322M | *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r); |
82 | 322M | *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g); |
83 | 322M | *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b); |
84 | 322M | } |
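
The SIMD code above expresses the inverse transform through highway ops; the same arithmetic in plain scalar C++ may be easier to follow. The sketch below assumes a flat row-major 3x3 inverse matrix (the vector path stores each of the nine entries broadcast across a 128-bit group, which is why its loads use a stride of four floats); the helper name and signature are illustrative only, not part of the library.

// Minimal scalar sketch of the arithmetic in XybToRgb() above. Names and
// signature are hypothetical; only the math mirrors the vector code.
static inline void XybToLinearScalar(float opsin_x, float opsin_y, float opsin_b,
                                     const float neg_bias[3],       // opsin_biases
                                     const float neg_bias_cbrt[3],  // opsin_biases_cbrt
                                     const float inverse[9],        // row-major 3x3
                                     float linear[3]) {
  // Color space: XYB -> gamma-compressed RGB.
  const float gamma[3] = {opsin_y + opsin_x - neg_bias_cbrt[0],
                          opsin_y - opsin_x - neg_bias_cbrt[1],
                          opsin_b - neg_bias_cbrt[2]};
  // Undo gamma compression (cube) and restore the bias.
  float mixed[3];
  for (int c = 0; c < 3; ++c) {
    mixed[c] = gamma[c] * gamma[c] * gamma[c] + neg_bias[c];
  }
  // Unmix: multiply by the 3x3 inverse opsin matrix.
  for (int r = 0; r < 3; ++r) {
    linear[r] = inverse[3 * r + 0] * mixed[0] +
                inverse[3 * r + 1] * mixed[1] +
                inverse[3 * r + 2] * mixed[2];
  }
}

Note that the cube and the bias restore collapse into a single MulAdd per channel in the vector version.
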
85 | | |
86 | 0 | inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() { |
87 | | #if HWY_TARGET == HWY_NEON |
88 | | return true; |
89 | | #else |
90 | 0 | return false; |
91 | 0 | #endif |
92 | 0 | } |
93 | | |
94 | | inline HWY_MAYBE_UNUSED Status FastXYBTosRGB8(const float* input[4], |
95 | | uint8_t* output, bool is_rgba, |
96 | 0 | size_t xsize) { |
97 | | // This function is very NEON-specific. As such, it uses intrinsics directly. |
98 | | #if HWY_TARGET == HWY_NEON |
99 | | // WARNING: doing fixed point arithmetic correctly is very complicated. |
100 | | // Changes to this function should be thoroughly tested. |
101 | | |
102 | | // Note that the input is assumed to have 13 bits of mantissa, and the output |
103 | | // will have 14 bits. |
104 | | auto srgb_tf = [&](int16x8_t v16) { |
105 | | int16x8_t clz = vclzq_s16(v16); |
106 | | // Convert to [0.25, 0.5) range. |
107 | | int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2))); |
108 | | |
109 | | // third degree polynomial approximation between 0.25 and 0.5 |
110 | | // of 1.055/2^(7/2.4) * x^(1/2.4) / 32. |
111 | | // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29 |
112 | | // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72 |
113 | | // overflow our fixed point representation. |
114 | | |
115 | | int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16); |
116 | | |
117 | | // 0.47 * x |
118 | | int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706); |
119 | | // - 0.87 |
120 | | int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546)); |
121 | | // * x |
122 | | int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16); |
123 | | // + 0.86 |
124 | | int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302)); |
125 | | // * 2x |
126 | | int16x8_t step5 = vqrdmulhq_s16(step4, twov); |
127 | | // + 0.29 |
128 | | int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485)); |
129 | | |
130 | | int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz); |
131 | | // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are |
132 | | // capped to 1. |
133 | | // Generated with the following Python script: |
134 | | // a = [] |
135 | | // b = [] |
136 | | // |
137 | | // for i in range(0, 16): |
138 | | // v = 2**(5/12.*i) |
139 | | // v /= 16 |
140 | | // v *= 256 * 128 |
141 | | // v = int(v) |
142 | | // a.append(v // 256) |
143 | | // b.append(v % 256) |
144 | | // |
145 | | // print(", ".join("0x%02x" % x for x in a)) |
146 | | // |
147 | | // print(", ".join("0x%02x" % x for x in b)) |
148 | | |
149 | | HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = { |
150 | | 0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c, |
151 | | 0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, |
152 | | }; |
153 | | HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = { |
154 | | 0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68, |
155 | | 0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
156 | | }; |
157 | | // Using the highway implementation here since vqtbl1q is aarch64-only. |
158 | | using hwy::HWY_NAMESPACE::Vec128; |
159 | | uint8x16_t pow_low = |
160 | | TableLookupBytes( |
161 | | Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)), |
162 | | Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16))) |
163 | | .raw; |
164 | | uint8x16_t pow_high = |
165 | | TableLookupBytes( |
166 | | Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)), |
167 | | Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16))) |
168 | | .raw; |
169 | | int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16( |
170 | | vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8)); |
171 | | |
172 | | // approximation of v * 12.92, divided by 2 |
173 | | // Note that our input is using 13 mantissa bits instead of 15. |
174 | | int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5); |
175 | | // 1.055*pow(v, 1/2.4) - 0.055, divided by 2 |
176 | | auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901)); |
177 | | // > 0.0031308f (note that v16 has 13 mantissa bits) |
178 | | return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear); |
179 | | }; |
180 | | |
181 | | const float* JXL_RESTRICT row_in_x = input[0]; |
182 | | const float* JXL_RESTRICT row_in_y = input[1]; |
183 | | const float* JXL_RESTRICT row_in_b = input[2]; |
184 | | const float* JXL_RESTRICT row_in_a = input[3]; |
185 | | for (size_t x = 0; x < xsize; x += 8) { |
186 | | // Normal ranges for xyb for in-gamut sRGB colors: |
187 | | // x: -0.015386 0.028100 |
188 | | // y: 0.000000 0.845308 |
189 | | // b: 0.000000 0.845308 |
190 | | |
191 | | // We actually want x * 8 to have some extra precision. |
192 | | // TODO(veluca): consider different approaches here, like vld1q_f32_x2. |
193 | | float32x4_t opsin_x_left = vld1q_f32(row_in_x + x); |
194 | | int16x4_t opsin_x16_times8_left = |
195 | | vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18)); |
196 | | float32x4_t opsin_x_right = |
197 | | vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0)); |
198 | | int16x4_t opsin_x16_times8_right = |
199 | | vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18)); |
200 | | int16x8_t opsin_x16_times8 = |
201 | | vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right); |
202 | | |
203 | | float32x4_t opsin_y_left = vld1q_f32(row_in_y + x); |
204 | | int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15)); |
205 | | float32x4_t opsin_y_right = |
206 | | vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0)); |
207 | | int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15)); |
208 | | int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right); |
209 | | |
210 | | float32x4_t opsin_b_left = vld1q_f32(row_in_b + x); |
211 | | int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15)); |
212 | | float32x4_t opsin_b_right = |
213 | | vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0)); |
214 | | int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15)); |
215 | | int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right); |
216 | | |
217 | | int16x8_t neg_bias16 = vdupq_n_s16(-124); // -0.0037930732552754493 |
218 | | int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110); // -0.155954201 |
219 | | int16x8_t neg_bias_half16 = vdupq_n_s16(-62); |
220 | | |
221 | | // Color space: XYB -> RGB |
222 | | // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2, |
223 | | // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias. |
224 | | // Note that ignoring x2 in the formulas below (as x << y) results in |
225 | | // errors of at least 3 in the final sRGB values. |
226 | | int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16); |
227 | | int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16); |
228 | | int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3); |
229 | | int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16); |
230 | | int16x8_t threexsq16 = |
231 | | vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6); |
232 | | |
233 | | // We can ignore x^3 here. Note that this is multiplied by 8. |
234 | | int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16); |
235 | | |
236 | | int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16); |
237 | | int16x8_t mixed_rpg16 = vhaddq_s16( |
238 | | vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16); |
239 | | |
240 | | int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16); |
241 | | int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16); |
242 | | int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16); |
243 | | int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16); |
244 | | // mixed_rpg and mixed_b are in 0-1 range. |
245 | | // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note |
246 | | // that at this point it is already multiplied by 8. |
247 | | |
248 | | // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit |
249 | | // fixed point) to ensure intermediate quantities are in range. Note that |
250 | | // r-g is not shifted, and was x8 before here; this corresponds to a x32 |
251 | | // overall multiplicative factor and ensures that all the matrix constants |
252 | | // are in 0-1 range. |
253 | | // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two |
254 | | // vhadd + using neg_bias_half. |
255 | | mixed_b16 = vshrq_n_s16(mixed_b16, 2); |
256 | | |
257 | | // Unmix (multiply by 3x3 inverse_matrix) |
258 | | // For increased precision, we use a matrix for converting from |
259 | | // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This |
260 | | // avoids cancellation effects when computing (y+x)^3-(y-x)^3. |
261 | | // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c * |
262 | | // mixed_b pattern is repeated frequently in the code below. This allows |
263 | | // us to save a multiply per channel, and removes the presence of |
264 | | // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1) |
265 | | // range, so the subtraction is safe. |
266 | | // All the magic-looking constants here are derived by computing the |
267 | | // inverse opsin matrix for the transformation modified as described |
268 | | // above. |
269 | | |
270 | | // Precomputation common to multiple color values. |
271 | | int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16); |
272 | | int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394); |
273 | | int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16); |
274 | | |
275 | | // R |
276 | | int16x8_t linear_r16 = |
277 | | vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400)); |
278 | | |
279 | | // G |
280 | | int16x8_t linear_g16 = |
281 | | vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857)); |
282 | | |
283 | | // B |
284 | | int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996); |
285 | | linear_b16 = vqaddq_s16(linear_b16, mixed_b16); |
286 | | linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525)); |
287 | | |
288 | | // Apply SRGB transfer function. |
289 | | int16x8_t r = srgb_tf(linear_r16); |
290 | | int16x8_t g = srgb_tf(linear_g16); |
291 | | int16x8_t b = srgb_tf(linear_b16); |
292 | | |
293 | | uint8x8_t r8 = |
294 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6)); |
295 | | uint8x8_t g8 = |
296 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6)); |
297 | | uint8x8_t b8 = |
298 | | vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6)); |
299 | | |
300 | | size_t n = xsize - x; |
301 | | if (is_rgba) { |
302 | | float32x4_t a_f32_left = |
303 | | row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f); |
304 | | float32x4_t a_f32_right = |
305 | | row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0)) |
306 | | : vdupq_n_f32(1.0f); |
307 | | int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8)); |
308 | | int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8)); |
309 | | uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right)); |
310 | | uint8_t* buf = output + 4 * x; |
311 | | uint8x8x4_t data = {r8, g8, b8, a8}; |
312 | | if (n >= 8) { |
313 | | vst4_u8(buf, data); |
314 | | } else { |
315 | | uint8_t tmp[8 * 4]; |
316 | | vst4_u8(tmp, data); |
317 | | memcpy(buf, tmp, n * 4); |
318 | | } |
319 | | } else { |
320 | | uint8_t* buf = output + 3 * x; |
321 | | uint8x8x3_t data = {r8, g8, b8}; |
322 | | if (n >= 8) { |
323 | | vst3_u8(buf, data); |
324 | | } else { |
325 | | uint8_t tmp[8 * 3]; |
326 | | vst3_u8(tmp, data); |
327 | | memcpy(buf, tmp, n * 3); |
328 | | } |
329 | | } |
330 | | } |
331 | | return true; |
332 | | #else |
333 | 0 | (void)input; |
334 | 0 | (void)output; |
335 | 0 | (void)is_rgba; |
336 | 0 | (void)xsize; |
337 | 0 | return JXL_UNREACHABLE("unsupported platform"); |
338 | 0 | #endif |
339 | 0 | } |
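
The fixed-point mixing step in FastXYBTosRGB8() leans on a bit of algebra that is easier to check in floating point. With yp = y - bias_cbrt, the identities (yp+x)^3 - (yp-x)^3 = 2*x*(3*yp^2 + x^2) and (yp+x)^3 + (yp-x)^3 = 2*yp*(yp^2 + 3*x^2) give the half-difference and half-sum of the r/g channels directly, which is what mixed_rmg16 and mixed_rpg16 approximate (the NEON code additionally drops the x^3 term as negligible). A plain-float sketch with illustrative names, assuming the single shared bias constants this path uses (-0.0037930732552754493 and -0.155954201):

// Float model of the (r-g)/2, (r+g)/2, b quantities computed in fixed point
// above; neg_bias / neg_bias_cbrt play the role of the -124 / -5110 constants.
static inline void MixXybHalves(float x, float y, float b,
                                float neg_bias, float neg_bias_cbrt,
                                float* mixed_rmg_half, float* mixed_rpg_half,
                                float* mixed_b) {
  const float yp = y - neg_bias_cbrt;
  // ((y + x - bias_cbrt)^3 - (y - x - bias_cbrt)^3) / 2
  *mixed_rmg_half = x * (3.0f * yp * yp + x * x);
  // ((y + x - bias_cbrt)^3 + (y - x - bias_cbrt)^3) / 2 + bias
  *mixed_rpg_half = yp * (yp * yp + 3.0f * x * x) + neg_bias;
  // (b - bias_cbrt)^3 + bias
  const float gb = b - neg_bias_cbrt;
  *mixed_b = gb * gb * gb + neg_bias;
}

Forming the half-difference directly avoids computing the two nearly equal cubes and subtracting them, which is the cancellation effect the unmix comments warn about.
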
340 | | |
341 | | } // namespace |
342 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
343 | | } // namespace HWY_NAMESPACE |
344 | | } // namespace jxl |
345 | | HWY_AFTER_NAMESPACE(); |
346 | | |
347 | | #endif // LIB_JXL_DEC_XYB_INL_H_ |
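
For reference when validating the fixed-point srgb_tf lambda: the curve it approximates is the standard sRGB transfer function quoted in its comments (the 12.92 linear segment below 0.0031308 and 1.055*x^(1/2.4) - 0.055 above it). The float version below is a sketch for comparison only, with a hypothetical name, not library code.

#include <cmath>
#include <cstdint>

// Straightforward float sRGB encoding of one linear sample in [0, 1].
// The NEON lambda reproduces this curve in 16-bit fixed point: a cubic
// polynomial covers the [0.25, 0.5) range and the small k2to512powersm1div32
// tables of 2^(k/2.4) factors restore the exponent.
static inline uint8_t LinearToSrgb8(float linear) {
  const float v = linear < 0.0f ? 0.0f : (linear > 1.0f ? 1.0f : linear);
  const float srgb = (v <= 0.0031308f)
                         ? 12.92f * v
                         : 1.055f * std::pow(v, 1.0f / 2.4f) - 0.055f;
  return static_cast<uint8_t>(std::lround(srgb * 255.0f));
}
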