Coverage Report

Created: 2025-06-16 07:00

/src/libjxl/lib/jxl/dec_xyb-inl.h
Line | Count | Source
   1 |       | // Copyright (c) the JPEG XL Project Authors. All rights reserved.
   2 |       | //
   3 |       | // Use of this source code is governed by a BSD-style
   4 |       | // license that can be found in the LICENSE file.
   5 |       |
   6 |       | // XYB -> linear sRGB helper function.
   7 |       |
   8 |       | #if defined(LIB_JXL_DEC_XYB_INL_H_) == defined(HWY_TARGET_TOGGLE)
   9 |       | #ifdef LIB_JXL_DEC_XYB_INL_H_
  10 |       | #undef LIB_JXL_DEC_XYB_INL_H_
  11 |       | #else
  12 |       | #define LIB_JXL_DEC_XYB_INL_H_
  13 |       | #endif
  14 |       |
  15 |       | #include <hwy/highway.h>
  16 |       |
  17 |       | #include "lib/jxl/common.h"
  18 |       | #include "lib/jxl/dec_xyb.h"
  19 |       |
  20 |       | HWY_BEFORE_NAMESPACE();
  21 |       | namespace jxl {
  22 |       | namespace HWY_NAMESPACE {
  23 |       | namespace {
  24 |       |
  25 |       | // These templates are not found via ADL.
  26 |       | using hwy::HWY_NAMESPACE::Add;
  27 |       | using hwy::HWY_NAMESPACE::Broadcast;
  28 |       | using hwy::HWY_NAMESPACE::Mul;
  29 |       | using hwy::HWY_NAMESPACE::MulAdd;
  30 |       | using hwy::HWY_NAMESPACE::Sub;
  31 |       |
  32 |       | // Inverts the pixel-wise RGB->XYB conversion in OpsinDynamicsImage() (including
  33 |       | // the gamma mixing and simple gamma). Avoids clamping to [0, 1] - out of (sRGB)
  34 |       | // gamut values may be in-gamut after transforming to a wider space.
  35 |       | // "inverse_matrix" points to 9 broadcasted vectors, which are the 3x3 entries
  36 |       | // of the (row-major) opsin absorbance matrix inverse. Pre-multiplying its
  37 |       | // entries by c is equivalent to multiplying linear_* by c afterwards.
  38 |       | template <class D, class V>
  39 |       | HWY_INLINE HWY_MAYBE_UNUSED void XybToRgb(D d, const V opsin_x, const V opsin_y,
  40 |       |                                           const V opsin_b,
  41 |       |                                           const OpsinParams& opsin_params,
  42 |       |                                           V* const HWY_RESTRICT linear_r,
  43 |       |                                           V* const HWY_RESTRICT linear_g,
  44 | 42.6M |                                           V* const HWY_RESTRICT linear_b) {
  45 |       | #if HWY_TARGET == HWY_SCALAR
  46 |       |   const auto neg_bias_r = Set(d, opsin_params.opsin_biases[0]);
  47 |       |   const auto neg_bias_g = Set(d, opsin_params.opsin_biases[1]);
  48 |       |   const auto neg_bias_b = Set(d, opsin_params.opsin_biases[2]);
  49 |       | #else
  50 | 42.6M |   const auto neg_bias_rgb = LoadDup128(d, opsin_params.opsin_biases);
  51 | 42.6M |   const auto neg_bias_r = Broadcast<0>(neg_bias_rgb);
  52 | 42.6M |   const auto neg_bias_g = Broadcast<1>(neg_bias_rgb);
  53 | 42.6M |   const auto neg_bias_b = Broadcast<2>(neg_bias_rgb);
  54 | 42.6M | #endif
  55 |       |
  56 |       |   // Color space: XYB -> RGB
  57 | 42.6M |   auto gamma_r = Add(opsin_y, opsin_x);
  58 | 42.6M |   auto gamma_g = Sub(opsin_y, opsin_x);
  59 | 42.6M |   auto gamma_b = opsin_b;
  60 |       |
  61 | 42.6M |   gamma_r = Sub(gamma_r, Set(d, opsin_params.opsin_biases_cbrt[0]));
  62 | 42.6M |   gamma_g = Sub(gamma_g, Set(d, opsin_params.opsin_biases_cbrt[1]));
  63 | 42.6M |   gamma_b = Sub(gamma_b, Set(d, opsin_params.opsin_biases_cbrt[2]));
  64 |       |
  65 |       |   // Undo gamma compression: linear = gamma^3 for efficiency.
  66 | 42.6M |   const auto gamma_r2 = Mul(gamma_r, gamma_r);
  67 | 42.6M |   const auto gamma_g2 = Mul(gamma_g, gamma_g);
  68 | 42.6M |   const auto gamma_b2 = Mul(gamma_b, gamma_b);
  69 | 42.6M |   const auto mixed_r = MulAdd(gamma_r2, gamma_r, neg_bias_r);
  70 | 42.6M |   const auto mixed_g = MulAdd(gamma_g2, gamma_g, neg_bias_g);
  71 | 42.6M |   const auto mixed_b = MulAdd(gamma_b2, gamma_b, neg_bias_b);
  72 |       |
  73 | 42.6M |   const float* HWY_RESTRICT inverse_matrix = opsin_params.inverse_opsin_matrix;
  74 |       |
  75 |       |   // Unmix (multiply by 3x3 inverse_matrix)
  76 |       |   // TODO(eustas): ref would be more readable than pointer
  77 | 42.6M |   *linear_r = Mul(LoadDup128(d, &inverse_matrix[0 * 4]), mixed_r);
  78 | 42.6M |   *linear_g = Mul(LoadDup128(d, &inverse_matrix[3 * 4]), mixed_r);
  79 | 42.6M |   *linear_b = Mul(LoadDup128(d, &inverse_matrix[6 * 4]), mixed_r);
  80 | 42.6M |   *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[1 * 4]), mixed_g, *linear_r);
  81 | 42.6M |   *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[4 * 4]), mixed_g, *linear_g);
  82 | 42.6M |   *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[7 * 4]), mixed_g, *linear_b);
  83 | 42.6M |   *linear_r = MulAdd(LoadDup128(d, &inverse_matrix[2 * 4]), mixed_b, *linear_r);
  84 | 42.6M |   *linear_g = MulAdd(LoadDup128(d, &inverse_matrix[5 * 4]), mixed_b, *linear_g);
  85 | 42.6M |   *linear_b = MulAdd(LoadDup128(d, &inverse_matrix[8 * 4]), mixed_b, *linear_b);
  86 | 42.6M | }
Unexecuted instantiation: dec_xyb.cc:void jxl::N_SSE4::(anonymous namespace)::XybToRgb<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, jxl::OpsinParams const&, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*)
Unexecuted instantiation: dec_xyb.cc:void jxl::N_AVX2::(anonymous namespace)::XybToRgb<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, jxl::OpsinParams const&, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*)
Unexecuted instantiation: dec_xyb.cc:void jxl::N_SSE2::(anonymous namespace)::XybToRgb<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, jxl::OpsinParams const&, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*)
Unexecuted instantiation: stage_xyb.cc:void jxl::N_SSE4::(anonymous namespace)::XybToRgb<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, jxl::OpsinParams const&, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*)
Executed instantiation: stage_xyb.cc:void jxl::N_AVX2::(anonymous namespace)::XybToRgb<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, jxl::OpsinParams const&, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*); its per-line counts (42.6M) are identical to those shown in the table above.
Unexecuted instantiation: stage_xyb.cc:void jxl::N_SSE2::(anonymous namespace)::XybToRgb<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, jxl::OpsinParams const&, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*)
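
For reference, the arithmetic that each SIMD lane of XybToRgb performs can be written out in scalar form. The sketch below is an illustration derived from the listing above, not code from the library; the pointer parameters stand in for the OpsinParams fields used above, and the "* 4" stride reflects that each of the 9 inverse-matrix entries is stored broadcast as four floats.

// Scalar sketch of XybToRgb, derived from the listing above (illustrative only).
void XybToRgbScalar(float opsin_x, float opsin_y, float opsin_b,
                    const float* opsin_biases,          // negative biases (neg_bias_* above)
                    const float* opsin_biases_cbrt,     // cube roots of the biases
                    const float* inverse_opsin_matrix,  // 9 entries, each broadcast to 4 floats
                    float* linear_r, float* linear_g, float* linear_b) {
  // Color space: XYB -> gamma-compressed RGB.
  const float gamma_r = opsin_y + opsin_x - opsin_biases_cbrt[0];
  const float gamma_g = opsin_y - opsin_x - opsin_biases_cbrt[1];
  const float gamma_b = opsin_b - opsin_biases_cbrt[2];
  // Undo gamma compression (linear = gamma^3) and re-add the negative biases.
  const float mixed_r = gamma_r * gamma_r * gamma_r + opsin_biases[0];
  const float mixed_g = gamma_g * gamma_g * gamma_g + opsin_biases[1];
  const float mixed_b = gamma_b * gamma_b * gamma_b + opsin_biases[2];
  // Unmix: multiply by the row-major 3x3 inverse opsin matrix.
  *linear_r = inverse_opsin_matrix[0 * 4] * mixed_r +
              inverse_opsin_matrix[1 * 4] * mixed_g +
              inverse_opsin_matrix[2 * 4] * mixed_b;
  *linear_g = inverse_opsin_matrix[3 * 4] * mixed_r +
              inverse_opsin_matrix[4 * 4] * mixed_g +
              inverse_opsin_matrix[5 * 4] * mixed_b;
  *linear_b = inverse_opsin_matrix[6 * 4] * mixed_r +
              inverse_opsin_matrix[7 * 4] * mixed_g +
              inverse_opsin_matrix[8 * 4] * mixed_b;
}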
Line | Count | Source
  87 |       |
  88 |       | #if !JXL_HIGH_PRECISION
  89 |       | inline HWY_MAYBE_UNUSED bool HasFastXYBTosRGB8() {
  90 |       | #if HWY_TARGET == HWY_NEON
  91 |       |   return true;
  92 |       | #else
  93 |       |   return false;
  94 |       | #endif
  95 |       | }
  96 |       |
  97 |       | inline HWY_MAYBE_UNUSED Status FastXYBTosRGB8(const float* input[4],
  98 |       |                                               uint8_t* output, bool is_rgba,
  99 |       |                                               size_t xsize) {
 100 |       |   // This function is very NEON-specific. As such, it uses intrinsics directly.
 101 |       | #if HWY_TARGET == HWY_NEON
 102 |       |   // WARNING: doing fixed point arithmetic correctly is very complicated.
 103 |       |   // Changes to this function should be thoroughly tested.
 104 |       |
 105 |       |   // Note that the input is assumed to have 13 bits of mantissa, and the output
 106 |       |   // will have 14 bits.
 107 |       |   auto srgb_tf = [&](int16x8_t v16) {
 108 |       |     int16x8_t clz = vclzq_s16(v16);
 109 |       |     // Convert to [0.25, 0.5) range.
 110 |       |     int16x8_t v025_05_16 = vqshlq_s16(v16, vqsubq_s16(clz, vdupq_n_s16(2)));
 111 |       |
 112 |       |     // third degree polynomial approximation between 0.25 and 0.5
 113 |       |     // of 1.055/2^(7/2.4) * x^(1/2.4) / 32.
 114 |       |     // poly ~ ((0.95x-1.75)*x+1.72)*x+0.29
 115 |       |     // We actually compute ~ ((0.47x-0.87)*x+0.86)*(2x)+0.29 as 1.75 and 1.72
 116 |       |     // overflow our fixed point representation.
 117 |       |
 118 |       |     int16x8_t twov = vqaddq_s16(v025_05_16, v025_05_16);
 119 |       |
 120 |       |     // 0.47 * x
 121 |       |     int16x8_t step1 = vqrdmulhq_n_s16(v025_05_16, 15706);
 122 |       |     // - 0.87
 123 |       |     int16x8_t step2 = vsubq_s16(step1, vdupq_n_s16(28546));
 124 |       |     // * x
 125 |       |     int16x8_t step3 = vqrdmulhq_s16(step2, v025_05_16);
 126 |       |     // + 0.86
 127 |       |     int16x8_t step4 = vaddq_s16(step3, vdupq_n_s16(28302));
 128 |       |     // * 2x
 129 |       |     int16x8_t step5 = vqrdmulhq_s16(step4, twov);
 130 |       |     // + 0.29
 131 |       |     int16x8_t mul16 = vaddq_s16(step5, vdupq_n_s16(9485));
 132 |       |
 133 |       |     int16x8_t exp16 = vsubq_s16(vdupq_n_s16(11), clz);
 134 |       |     // Compute 2**(1/2.4*exp16)/32. Values of exp16 that would overflow are
 135 |       |     // capped to 1.
 136 |       |     // Generated with the following Python script:
 137 |       |     // a = []
 138 |       |     // b = []
 139 |       |     //
 140 |       |     // for i in range(0, 16):
 141 |       |     //   v = 2**(5/12.*i)
 142 |       |     //   v /= 16
 143 |       |     //   v *= 256 * 128
 144 |       |     //   v = int(v)
 145 |       |     //   a.append(v // 256)
 146 |       |     //   b.append(v % 256)
 147 |       |     //
 148 |       |     // print(", ".join("0x%02x" % x for x in a))
 149 |       |     //
 150 |       |     // print(", ".join("0x%02x" % x for x in b))
 151 |       |
 152 |       |     HWY_ALIGN constexpr uint8_t k2to512powersm1div32_high[16] = {
 153 |       |         0x08, 0x0a, 0x0e, 0x13, 0x19, 0x21, 0x2d, 0x3c,
 154 |       |         0x50, 0x6b, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f, 0x8f,
 155 |       |     };
 156 |       |     HWY_ALIGN constexpr uint8_t k2to512powersm1div32_low[16] = {
 157 |       |         0x00, 0xad, 0x41, 0x06, 0x65, 0xe7, 0x41, 0x68,
 158 |       |         0xa2, 0xa2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 159 |       |     };
 160 |       |     // Using the highway implementation here since vqtbl1q is aarch64-only.
 161 |       |     using hwy::HWY_NAMESPACE::Vec128;
 162 |       |     uint8x16_t pow_low =
 163 |       |         TableLookupBytes(
 164 |       |             Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_low)),
 165 |       |             Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
 166 |       |             .raw;
 167 |       |     uint8x16_t pow_high =
 168 |       |         TableLookupBytes(
 169 |       |             Vec128<uint8_t, 16>(vld1q_u8(k2to512powersm1div32_high)),
 170 |       |             Vec128<uint8_t, 16>(vreinterpretq_u8_s16(exp16)))
 171 |       |             .raw;
 172 |       |     int16x8_t pow16 = vreinterpretq_s16_u16(vsliq_n_u16(
 173 |       |         vreinterpretq_u16_u8(pow_low), vreinterpretq_u16_u8(pow_high), 8));
 174 |       |
 175 |       |     // approximation of v * 12.92, divided by 2
 176 |       |     // Note that our input is using 13 mantissa bits instead of 15.
 177 |       |     int16x8_t v16_linear = vrshrq_n_s16(vmulq_n_s16(v16, 826), 5);
 178 |       |     // 1.055*pow(v, 1/2.4) - 0.055, divided by 2
 179 |       |     auto v16_pow = vsubq_s16(vqrdmulhq_s16(mul16, pow16), vdupq_n_s16(901));
 180 |       |     // > 0.0031308f (note that v16 has 13 mantissa bits)
 181 |       |     return vbslq_s16(vcgeq_s16(v16, vdupq_n_s16(26)), v16_pow, v16_linear);
 182 |       |   };
 183 |       |
 184 |       |   const float* JXL_RESTRICT row_in_x = input[0];
 185 |       |   const float* JXL_RESTRICT row_in_y = input[1];
 186 |       |   const float* JXL_RESTRICT row_in_b = input[2];
 187 |       |   const float* JXL_RESTRICT row_in_a = input[3];
 188 |       |   for (size_t x = 0; x < xsize; x += 8) {
 189 |       |     // Normal ranges for xyb for in-gamut sRGB colors:
 190 |       |     // x: -0.015386 0.028100
 191 |       |     // y: 0.000000 0.845308
 192 |       |     // b: 0.000000 0.845308
 193 |       |
 194 |       |     // We actually want x * 8 to have some extra precision.
 195 |       |     // TODO(veluca): consider different approaches here, like vld1q_f32_x2.
 196 |       |     float32x4_t opsin_x_left = vld1q_f32(row_in_x + x);
 197 |       |     int16x4_t opsin_x16_times8_left =
 198 |       |         vqmovn_s32(vcvtq_n_s32_f32(opsin_x_left, 18));
 199 |       |     float32x4_t opsin_x_right =
 200 |       |         vld1q_f32(row_in_x + x + (x + 4 < xsize ? 4 : 0));
 201 |       |     int16x4_t opsin_x16_times8_right =
 202 |       |         vqmovn_s32(vcvtq_n_s32_f32(opsin_x_right, 18));
 203 |       |     int16x8_t opsin_x16_times8 =
 204 |       |         vcombine_s16(opsin_x16_times8_left, opsin_x16_times8_right);
 205 |       |
 206 |       |     float32x4_t opsin_y_left = vld1q_f32(row_in_y + x);
 207 |       |     int16x4_t opsin_y16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_left, 15));
 208 |       |     float32x4_t opsin_y_right =
 209 |       |         vld1q_f32(row_in_y + x + (x + 4 < xsize ? 4 : 0));
 210 |       |     int16x4_t opsin_y16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_y_right, 15));
 211 |       |     int16x8_t opsin_y16 = vcombine_s16(opsin_y16_left, opsin_y16_right);
 212 |       |
 213 |       |     float32x4_t opsin_b_left = vld1q_f32(row_in_b + x);
 214 |       |     int16x4_t opsin_b16_left = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_left, 15));
 215 |       |     float32x4_t opsin_b_right =
 216 |       |         vld1q_f32(row_in_b + x + (x + 4 < xsize ? 4 : 0));
 217 |       |     int16x4_t opsin_b16_right = vqmovn_s32(vcvtq_n_s32_f32(opsin_b_right, 15));
 218 |       |     int16x8_t opsin_b16 = vcombine_s16(opsin_b16_left, opsin_b16_right);
 219 |       |
 220 |       |     int16x8_t neg_bias16 = vdupq_n_s16(-124);        // -0.0037930732552754493
 221 |       |     int16x8_t neg_bias_cbrt16 = vdupq_n_s16(-5110);  // -0.155954201
 222 |       |     int16x8_t neg_bias_half16 = vdupq_n_s16(-62);
 223 |       |
 224 |       |     // Color space: XYB -> RGB
 225 |       |     // Compute ((y+x-bias_cbrt)^3-(y-x-bias_cbrt)^3)/2,
 226 |       |     // ((y+x-bias_cbrt)^3+(y-x-bias_cbrt)^3)/2+bias, (b-bias_cbrt)^3+bias.
 227 |       |     // Note that ignoring x2 in the formulas below (as x << y) results in
 228 |       |     // errors of at least 3 in the final sRGB values.
 229 |       |     int16x8_t opsin_yp16 = vqsubq_s16(opsin_y16, neg_bias_cbrt16);
 230 |       |     int16x8_t ysq16 = vqrdmulhq_s16(opsin_yp16, opsin_yp16);
 231 |       |     int16x8_t twentyfourx16 = vmulq_n_s16(opsin_x16_times8, 3);
 232 |       |     int16x8_t twentyfourxy16 = vqrdmulhq_s16(opsin_yp16, twentyfourx16);
 233 |       |     int16x8_t threexsq16 =
 234 |       |         vrshrq_n_s16(vqrdmulhq_s16(opsin_x16_times8, twentyfourx16), 6);
 235 |       |
 236 |       |     // We can ignore x^3 here. Note that this is multiplied by 8.
 237 |       |     int16x8_t mixed_rmg16 = vqrdmulhq_s16(twentyfourxy16, opsin_yp16);
 238 |       |
 239 |       |     int16x8_t mixed_rpg_sos_half = vhaddq_s16(ysq16, threexsq16);
 240 |       |     int16x8_t mixed_rpg16 = vhaddq_s16(
 241 |       |         vqrdmulhq_s16(opsin_yp16, mixed_rpg_sos_half), neg_bias_half16);
 242 |       |
 243 |       |     int16x8_t gamma_b16 = vqsubq_s16(opsin_b16, neg_bias_cbrt16);
 244 |       |     int16x8_t gamma_bsq16 = vqrdmulhq_s16(gamma_b16, gamma_b16);
 245 |       |     int16x8_t gamma_bcb16 = vqrdmulhq_s16(gamma_bsq16, gamma_b16);
 246 |       |     int16x8_t mixed_b16 = vqaddq_s16(gamma_bcb16, neg_bias16);
 247 |       |     // mixed_rpg and mixed_b are in 0-1 range.
 248 |       |     // mixed_rmg has a smaller range (-0.035 to 0.035 for valid sRGB). Note
 249 |       |     // that at this point it is already multiplied by 8.
 250 |       |
 251 |       |     // We multiply all the mixed values by 1/4 (i.e. shift them to 13-bit
 252 |       |     // fixed point) to ensure intermediate quantities are in range. Note that
 253 |       |     // r-g is not shifted, and was x8 before here; this corresponds to a x32
 254 |       |     // overall multiplicative factor and ensures that all the matrix constants
 255 |       |     // are in 0-1 range.
 256 |       |     // Similarly, mixed_rpg16 is already multiplied by 1/4 because of the two
 257 |       |     // vhadd + using neg_bias_half.
 258 |       |     mixed_b16 = vshrq_n_s16(mixed_b16, 2);
 259 |       |
 260 |       |     // Unmix (multiply by 3x3 inverse_matrix)
 261 |       |     // For increased precision, we use a matrix for converting from
 262 |       |     // ((mixed_r - mixed_g)/2, (mixed_r + mixed_g)/2, mixed_b) to rgb. This
 263 |       |     // avoids cancellation effects when computing (y+x)^3-(y-x)^3.
 264 |       |     // We compute mixed_rpg - mixed_b because the (1+c)*mixed_rpg - c *
 265 |       |     // mixed_b pattern is repeated frequently in the code below. This allows
 266 |       |     // us to save a multiply per channel, and removes the presence of
 267 |       |     // some constants above 1. Moreover, mixed_rmg - mixed_b is in (-1, 1)
 268 |       |     // range, so the subtraction is safe.
 269 |       |     // All the magic-looking constants here are derived by computing the
 270 |       |     // inverse opsin matrix for the transformation modified as described
 271 |       |     // above.
 272 |       |
 273 |       |     // Precomputation common to multiple color values.
 274 |       |     int16x8_t mixed_rpgmb16 = vqsubq_s16(mixed_rpg16, mixed_b16);
 275 |       |     int16x8_t mixed_rpgmb_times_016 = vqrdmulhq_n_s16(mixed_rpgmb16, 5394);
 276 |       |     int16x8_t mixed_rg16 = vqaddq_s16(mixed_rpgmb_times_016, mixed_rpg16);
 277 |       |
 278 |       |     // R
 279 |       |     int16x8_t linear_r16 =
 280 |       |         vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, 21400));
 281 |       |
 282 |       |     // G
 283 |       |     int16x8_t linear_g16 =
 284 |       |         vqaddq_s16(mixed_rg16, vqrdmulhq_n_s16(mixed_rmg16, -7857));
 285 |       |
 286 |       |     // B
 287 |       |     int16x8_t linear_b16 = vqrdmulhq_n_s16(mixed_rpgmb16, -30996);
 288 |       |     linear_b16 = vqaddq_s16(linear_b16, mixed_b16);
 289 |       |     linear_b16 = vqaddq_s16(linear_b16, vqrdmulhq_n_s16(mixed_rmg16, -6525));
 290 |       |
 291 |       |     // Apply SRGB transfer function.
 292 |       |     int16x8_t r = srgb_tf(linear_r16);
 293 |       |     int16x8_t g = srgb_tf(linear_g16);
 294 |       |     int16x8_t b = srgb_tf(linear_b16);
 295 |       |
 296 |       |     uint8x8_t r8 =
 297 |       |         vqmovun_s16(vrshrq_n_s16(vsubq_s16(r, vshrq_n_s16(r, 8)), 6));
 298 |       |     uint8x8_t g8 =
 299 |       |         vqmovun_s16(vrshrq_n_s16(vsubq_s16(g, vshrq_n_s16(g, 8)), 6));
 300 |       |     uint8x8_t b8 =
 301 |       |         vqmovun_s16(vrshrq_n_s16(vsubq_s16(b, vshrq_n_s16(b, 8)), 6));
 302 |       |
 303 |       |     size_t n = xsize - x;
 304 |       |     if (is_rgba) {
 305 |       |       float32x4_t a_f32_left =
 306 |       |           row_in_a ? vld1q_f32(row_in_a + x) : vdupq_n_f32(1.0f);
 307 |       |       float32x4_t a_f32_right =
 308 |       |           row_in_a ? vld1q_f32(row_in_a + x + (x + 4 < xsize ? 4 : 0))
 309 |       |                    : vdupq_n_f32(1.0f);
 310 |       |       int16x4_t a16_left = vqmovn_s32(vcvtq_n_s32_f32(a_f32_left, 8));
 311 |       |       int16x4_t a16_right = vqmovn_s32(vcvtq_n_s32_f32(a_f32_right, 8));
 312 |       |       uint8x8_t a8 = vqmovun_s16(vcombine_s16(a16_left, a16_right));
 313 |       |       uint8_t* buf = output + 4 * x;
 314 |       |       uint8x8x4_t data = {r8, g8, b8, a8};
 315 |       |       if (n >= 8) {
 316 |       |         vst4_u8(buf, data);
 317 |       |       } else {
 318 |       |         uint8_t tmp[8 * 4];
 319 |       |         vst4_u8(tmp, data);
 320 |       |         memcpy(buf, tmp, n * 4);
 321 |       |       }
 322 |       |     } else {
 323 |       |       uint8_t* buf = output + 3 * x;
 324 |       |       uint8x8x3_t data = {r8, g8, b8};
 325 |       |       if (n >= 8) {
 326 |       |         vst3_u8(buf, data);
 327 |       |       } else {
 328 |       |         uint8_t tmp[8 * 3];
 329 |       |         vst3_u8(tmp, data);
 330 |       |         memcpy(buf, tmp, n * 3);
 331 |       |       }
 332 |       |     }
 333 |       |   }
 334 |       |   return true;
 335 |       | #else   // HWY_TARGET == HWY_NEON
 336 |       |   (void)input;
 337 |       |   (void)output;
 338 |       |   (void)is_rgba;
 339 |       |   (void)xsize;
 340 |       |   return JXL_UNREACHABLE("unsupported platform");
 341 |       | #endif  // HWY_TARGET == HWY_NEON
 342 |       | }
 343 |       | #endif  // !JXL_HIGH_PRECISION
 344 |       |
 345 |       | }  // namespace
 346 |       | // NOLINTNEXTLINE(google-readability-namespace-comments)
 347 |       | }  // namespace HWY_NAMESPACE
 348 |       | }  // namespace jxl
 349 |       | HWY_AFTER_NAMESPACE();
 350 |       |
 351 |       | #endif  // LIB_JXL_DEC_XYB_INL_H_
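
The srgb_tf lambda above is a 16-bit fixed-point approximation of the standard sRGB encoding curve its comments refer to (v * 12.92 below the cutoff, 1.055 * pow(v, 1/2.4) - 0.055 above it). A floating-point reference of that curve, shown here only for comparison and not part of the library, is:

#include <cmath>

// Floating-point reference for the sRGB transfer function that srgb_tf
// approximates in fixed point (illustrative only, not part of libjxl).
float SrgbTfReference(float v_linear) {
  if (v_linear <= 0.0031308f) {
    return 12.92f * v_linear;  // linear segment near black
  }
  return 1.055f * std::pow(v_linear, 1.0f / 2.4f) - 0.055f;  // power segment
}

In the fixed-point version above, the power segment is evaluated as a third-degree polynomial on [0.25, 0.5) scaled by a per-exponent factor taken from the k2to512powersm1div32 lookup tables.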