/src/libjxl/lib/jxl/enc_xyb.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/enc_xyb.h" |
7 | | |
8 | | #include <jxl/cms_interface.h> |
9 | | #include <jxl/memory_manager.h> |
10 | | |
11 | | #include <algorithm> |
12 | | #include <cmath> |
13 | | #include <cstdint> |
14 | | #include <cstdlib> |
15 | | |
16 | | #include "lib/jxl/base/common.h" |
17 | | #include "lib/jxl/frame_dimensions.h" |
18 | | #include "lib/jxl/image.h" |
19 | | |
20 | | #undef HWY_TARGET_INCLUDE |
21 | | #define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc" |
22 | | #include <hwy/foreach_target.h> |
23 | | #include <hwy/highway.h> |
24 | | |
25 | | #include "lib/jxl/base/compiler_specific.h" |
26 | | #include "lib/jxl/base/data_parallel.h" |
27 | | #include "lib/jxl/base/fast_math-inl.h" |
28 | | #include "lib/jxl/base/rect.h" |
29 | | #include "lib/jxl/base/status.h" |
30 | | #include "lib/jxl/cms/opsin_params.h" |
31 | | #include "lib/jxl/cms/transfer_functions-inl.h" |
32 | | #include "lib/jxl/color_encoding_internal.h" |
33 | | #include "lib/jxl/enc_image_bundle.h" |
34 | | #include "lib/jxl/image_ops.h" |
35 | | #include "lib/jxl/memory_manager_internal.h" |
36 | | |
37 | | HWY_BEFORE_NAMESPACE(); |
38 | | namespace jxl { |
39 | | namespace HWY_NAMESPACE { |
40 | | |
41 | | // These templates are not found via ADL. |
42 | | using hwy::HWY_NAMESPACE::Add; |
43 | | using hwy::HWY_NAMESPACE::Mul; |
44 | | using hwy::HWY_NAMESPACE::MulAdd; |
45 | | using hwy::HWY_NAMESPACE::Sub; |
46 | | using hwy::HWY_NAMESPACE::ZeroIfNegative; |
47 | | |
48 | | // 4x3 matrix * 3x1 SIMD vectors |
49 | | template <class V> |
50 | | JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b, |
51 | | const float* JXL_RESTRICT premul_absorb, |
52 | | V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1, |
53 | 3.16M | V* JXL_RESTRICT mixed2) { |
54 | 3.16M | const float* bias = jxl::cms::kOpsinAbsorbanceBias.data(); |
55 | 3.16M | const HWY_FULL(float) d; |
56 | 3.16M | const size_t N = Lanes(d); |
57 | 3.16M | const auto m0 = Load(d, premul_absorb + 0 * N); |
58 | 3.16M | const auto m1 = Load(d, premul_absorb + 1 * N); |
59 | 3.16M | const auto m2 = Load(d, premul_absorb + 2 * N); |
60 | 3.16M | const auto m3 = Load(d, premul_absorb + 3 * N); |
61 | 3.16M | const auto m4 = Load(d, premul_absorb + 4 * N); |
62 | 3.16M | const auto m5 = Load(d, premul_absorb + 5 * N); |
63 | 3.16M | const auto m6 = Load(d, premul_absorb + 6 * N); |
64 | 3.16M | const auto m7 = Load(d, premul_absorb + 7 * N); |
65 | 3.16M | const auto m8 = Load(d, premul_absorb + 8 * N); |
66 | 3.16M | *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); |
67 | 3.16M | *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); |
68 | 3.16M | *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); |
69 | 3.16M | } Unexecuted instantiation: void jxl::N_SSE4::OpsinAbsorbance<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*) void jxl::N_AVX2::OpsinAbsorbance<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*) Line | Count | Source | 53 | 3.16M | V* JXL_RESTRICT mixed2) { | 54 | 3.16M | const float* bias = jxl::cms::kOpsinAbsorbanceBias.data(); | 55 | 3.16M | const HWY_FULL(float) d; | 56 | 3.16M | const size_t N = Lanes(d); | 57 | 3.16M | const auto m0 = Load(d, premul_absorb + 0 * N); | 58 | 3.16M | const auto m1 = Load(d, premul_absorb + 1 * N); | 59 | 3.16M | const auto m2 = Load(d, premul_absorb + 2 * N); | 60 | 3.16M | const auto m3 = Load(d, premul_absorb + 3 * N); | 61 | 3.16M | const auto m4 = Load(d, premul_absorb + 4 * N); | 62 | 3.16M | const auto m5 = Load(d, premul_absorb + 5 * N); | 63 | 3.16M | const auto m6 = Load(d, premul_absorb + 6 * N); | 64 | 3.16M | const auto m7 = Load(d, premul_absorb + 7 * N); | 65 | 3.16M | const auto m8 = Load(d, premul_absorb + 8 * N); | 66 | 3.16M | *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); | 67 | 3.16M | *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); | 68 | 3.16M | *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); | 69 | 3.16M | } |
Unexecuted instantiation: void jxl::N_SSE2::OpsinAbsorbance<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*, hwy::N_SSE2::Vec128<float, 4ul>*) |
70 | | |
71 | | template <class V> |
72 | | void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx, |
73 | 3.16M | float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { |
74 | 3.16M | const HWY_FULL(float) d; |
75 | 3.16M | const V half = Set(d, 0.5f); |
76 | 3.16M | Store(Mul(half, Sub(r, g)), d, valx); |
77 | 3.16M | Store(Mul(half, Add(r, g)), d, valy); |
78 | 3.16M | Store(b, d, valz); |
79 | 3.16M | } Unexecuted instantiation: void jxl::N_SSE4::StoreXYB<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*, float*, float*) void jxl::N_AVX2::StoreXYB<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*, float*, float*) Line | Count | Source | 73 | 3.16M | float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { | 74 | 3.16M | const HWY_FULL(float) d; | 75 | 3.16M | const V half = Set(d, 0.5f); | 76 | 3.16M | Store(Mul(half, Sub(r, g)), d, valx); | 77 | 3.16M | Store(Mul(half, Add(r, g)), d, valy); | 78 | 3.16M | Store(b, d, valz); | 79 | 3.16M | } |
Unexecuted instantiation: void jxl::N_SSE2::StoreXYB<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*, float*, float*) |
80 | | |
81 | | // Converts one RGB vector to XYB. |
82 | | template <class V> |
83 | | void LinearRGBToXYB(const V r, const V g, const V b, |
84 | | const float* JXL_RESTRICT premul_absorb, |
85 | | float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, |
86 | 3.16M | float* JXL_RESTRICT valz) { |
87 | 3.16M | V mixed0; |
88 | 3.16M | V mixed1; |
89 | 3.16M | V mixed2; |
90 | 3.16M | OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); |
91 | | |
92 | | // mixed* should be non-negative even for wide-gamut, so clamp to zero. |
93 | 3.16M | mixed0 = ZeroIfNegative(mixed0); |
94 | 3.16M | mixed1 = ZeroIfNegative(mixed1); |
95 | 3.16M | mixed2 = ZeroIfNegative(mixed2); |
96 | | |
97 | 3.16M | const HWY_FULL(float) d; |
98 | 3.16M | const size_t N = Lanes(d); |
99 | 3.16M | mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); |
100 | 3.16M | mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); |
101 | 3.16M | mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); |
102 | 3.16M | StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); |
103 | | |
104 | | // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. |
105 | 3.16M | } Unexecuted instantiation: void jxl::N_SSE4::LinearRGBToXYB<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float const*, float*, float*, float*) void jxl::N_AVX2::LinearRGBToXYB<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float const*, float*, float*, float*) Line | Count | Source | 86 | 3.16M | float* JXL_RESTRICT valz) { | 87 | 3.16M | V mixed0; | 88 | 3.16M | V mixed1; | 89 | 3.16M | V mixed2; | 90 | 3.16M | OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); | 91 | | | 92 | | // mixed* should be non-negative even for wide-gamut, so clamp to zero. | 93 | 3.16M | mixed0 = ZeroIfNegative(mixed0); | 94 | 3.16M | mixed1 = ZeroIfNegative(mixed1); | 95 | 3.16M | mixed2 = ZeroIfNegative(mixed2); | 96 | | | 97 | 3.16M | const HWY_FULL(float) d; | 98 | 3.16M | const size_t N = Lanes(d); | 99 | 3.16M | mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); | 100 | 3.16M | mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); | 101 | 3.16M | mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); | 102 | 3.16M | StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); | 103 | | | 104 | | // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. | 105 | 3.16M | } |
Unexecuted instantiation: void jxl::N_SSE2::LinearRGBToXYB<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float const*, float*, float*, float*) |
106 | | |
107 | | void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, |
108 | | float* JXL_RESTRICT row2, |
109 | 0 | const float* JXL_RESTRICT premul_absorb, size_t xsize) { |
110 | 0 | const HWY_FULL(float) d; |
111 | 0 | for (size_t x = 0; x < xsize; x += Lanes(d)) { |
112 | 0 | const auto r = Load(d, row0 + x); |
113 | 0 | const auto g = Load(d, row1 + x); |
114 | 0 | const auto b = Load(d, row2 + x); |
115 | 0 | LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x); |
116 | 0 | } |
117 | 0 | } Unexecuted instantiation: jxl::N_SSE4::LinearRGBRowToXYB(float*, float*, float*, float const*, unsigned long) Unexecuted instantiation: jxl::N_AVX2::LinearRGBRowToXYB(float*, float*, float*, float const*, unsigned long) Unexecuted instantiation: jxl::N_SSE2::LinearRGBRowToXYB(float*, float*, float*, float const*, unsigned long) |
118 | | |
119 | | // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut. |
120 | | template <class V> |
121 | 9.50M | V LinearFromSRGB(V encoded) { |
122 | 9.50M | return TF_SRGB().DisplayFromEncoded(encoded); |
123 | 9.50M | } Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::LinearFromSRGB<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>) hwy::N_AVX2::Vec256<float> jxl::N_AVX2::LinearFromSRGB<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>) Line | Count | Source | 121 | 9.50M | V LinearFromSRGB(V encoded) { | 122 | 9.50M | return TF_SRGB().DisplayFromEncoded(encoded); | 123 | 9.50M | } |
Unexecuted instantiation: hwy::N_SSE2::Vec128<float, 4ul> jxl::N_SSE2::LinearFromSRGB<hwy::N_SSE2::Vec128<float, 4ul> >(hwy::N_SSE2::Vec128<float, 4ul>) |
124 | | |
125 | | Status LinearSRGBToXYB(const float* JXL_RESTRICT premul_absorb, |
126 | 0 | ThreadPool* pool, Image3F* JXL_RESTRICT image) { |
127 | 0 | const size_t xsize = image->xsize(); |
128 | |
|
129 | 0 | const HWY_FULL(float) d; |
130 | 0 | const auto process_row = [&](const uint32_t task, |
131 | 0 | size_t /*thread*/) -> Status { |
132 | 0 | const size_t y = static_cast<size_t>(task); |
133 | 0 | float* JXL_RESTRICT row0 = image->PlaneRow(0, y); |
134 | 0 | float* JXL_RESTRICT row1 = image->PlaneRow(1, y); |
135 | 0 | float* JXL_RESTRICT row2 = image->PlaneRow(2, y); |
136 | |
|
137 | 0 | for (size_t x = 0; x < xsize; x += Lanes(d)) { |
138 | 0 | const auto in_r = Load(d, row0 + x); |
139 | 0 | const auto in_g = Load(d, row1 + x); |
140 | 0 | const auto in_b = Load(d, row2 + x); |
141 | 0 | LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, |
142 | 0 | row2 + x); |
143 | 0 | } |
144 | 0 | return true; |
145 | 0 | }; Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE4::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: enc_xyb.cc:jxl::N_AVX2::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE2::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const |
146 | 0 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), |
147 | 0 | ThreadPool::NoInit, process_row, |
148 | 0 | "LinearToXYB")); |
149 | 0 | return true; |
150 | 0 | } Unexecuted instantiation: jxl::N_SSE4::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) Unexecuted instantiation: jxl::N_AVX2::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) Unexecuted instantiation: jxl::N_SSE2::LinearSRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) |
151 | | |
152 | | Status SRGBToXYB(const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, |
153 | 186 | Image3F* JXL_RESTRICT image) { |
154 | 186 | const size_t xsize = image->xsize(); |
155 | | |
156 | 186 | const HWY_FULL(float) d; |
157 | 186 | const auto process_row = [&](const uint32_t task, |
158 | 58.2k | size_t /*thread*/) -> Status { |
159 | 58.2k | const size_t y = static_cast<size_t>(task); |
160 | 58.2k | float* JXL_RESTRICT row0 = image->PlaneRow(0, y); |
161 | 58.2k | float* JXL_RESTRICT row1 = image->PlaneRow(1, y); |
162 | 58.2k | float* JXL_RESTRICT row2 = image->PlaneRow(2, y); |
163 | | |
164 | 3.22M | for (size_t x = 0; x < xsize; x += Lanes(d)) { |
165 | 3.16M | const auto in_r = LinearFromSRGB(Load(d, row0 + x)); |
166 | 3.16M | const auto in_g = LinearFromSRGB(Load(d, row1 + x)); |
167 | 3.16M | const auto in_b = LinearFromSRGB(Load(d, row2 + x)); |
168 | 3.16M | LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, |
169 | 3.16M | row2 + x); |
170 | 3.16M | } |
171 | 58.2k | return true; |
172 | 58.2k | }; Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE4::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const enc_xyb.cc:jxl::N_AVX2::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 158 | 58.2k | size_t /*thread*/) -> Status { | 159 | 58.2k | const size_t y = static_cast<size_t>(task); | 160 | 58.2k | float* JXL_RESTRICT row0 = image->PlaneRow(0, y); | 161 | 58.2k | float* JXL_RESTRICT row1 = image->PlaneRow(1, y); | 162 | 58.2k | float* JXL_RESTRICT row2 = image->PlaneRow(2, y); | 163 | | | 164 | 3.22M | for (size_t x = 0; x < xsize; x += Lanes(d)) { | 165 | 3.16M | const auto in_r = LinearFromSRGB(Load(d, row0 + x)); | 166 | 3.16M | const auto in_g = LinearFromSRGB(Load(d, row1 + x)); | 167 | 3.16M | const auto in_b = LinearFromSRGB(Load(d, row2 + x)); | 168 | 3.16M | LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, | 169 | 3.16M | row2 + x); | 170 | 3.16M | } | 171 | 58.2k | return true; | 172 | 58.2k | }; |
Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE2::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const |
173 | 186 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), |
174 | 186 | ThreadPool::NoInit, process_row, "SRGBToXYB")); |
175 | 186 | return true; |
176 | 186 | } Unexecuted instantiation: jxl::N_SSE4::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) jxl::N_AVX2::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) Line | Count | Source | 153 | 186 | Image3F* JXL_RESTRICT image) { | 154 | 186 | const size_t xsize = image->xsize(); | 155 | | | 156 | 186 | const HWY_FULL(float) d; | 157 | 186 | const auto process_row = [&](const uint32_t task, | 158 | 186 | size_t /*thread*/) -> Status { | 159 | 186 | const size_t y = static_cast<size_t>(task); | 160 | 186 | float* JXL_RESTRICT row0 = image->PlaneRow(0, y); | 161 | 186 | float* JXL_RESTRICT row1 = image->PlaneRow(1, y); | 162 | 186 | float* JXL_RESTRICT row2 = image->PlaneRow(2, y); | 163 | | | 164 | 186 | for (size_t x = 0; x < xsize; x += Lanes(d)) { | 165 | 186 | const auto in_r = LinearFromSRGB(Load(d, row0 + x)); | 166 | 186 | const auto in_g = LinearFromSRGB(Load(d, row1 + x)); | 167 | 186 | const auto in_b = LinearFromSRGB(Load(d, row2 + x)); | 168 | 186 | LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x, | 169 | 186 | row2 + x); | 170 | 186 | } | 171 | 186 | return true; | 172 | 186 | }; | 173 | 186 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), | 174 | 186 | ThreadPool::NoInit, process_row, "SRGBToXYB")); | 175 | 186 | return true; | 176 | 186 | } |
Unexecuted instantiation: jxl::N_SSE2::SRGBToXYB(float const*, jxl::ThreadPool*, jxl::Image3<float>*) |
177 | | |
178 | | Status SRGBToXYBAndLinear(const float* JXL_RESTRICT premul_absorb, |
179 | | ThreadPool* pool, Image3F* JXL_RESTRICT image, |
180 | 0 | Image3F* JXL_RESTRICT linear) { |
181 | 0 | const size_t xsize = image->xsize(); |
182 | |
|
183 | 0 | const HWY_FULL(float) d; |
184 | 0 | const auto process_row = [&](const uint32_t task, |
185 | 0 | size_t /*thread*/) -> Status { |
186 | 0 | const size_t y = static_cast<size_t>(task); |
187 | 0 | float* JXL_RESTRICT row_image0 = image->PlaneRow(0, y); |
188 | 0 | float* JXL_RESTRICT row_image1 = image->PlaneRow(1, y); |
189 | 0 | float* JXL_RESTRICT row_image2 = image->PlaneRow(2, y); |
190 | 0 | float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y); |
191 | 0 | float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y); |
192 | 0 | float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y); |
193 | |
|
194 | 0 | for (size_t x = 0; x < xsize; x += Lanes(d)) { |
195 | 0 | const auto in_r = LinearFromSRGB(Load(d, row_image0 + x)); |
196 | 0 | const auto in_g = LinearFromSRGB(Load(d, row_image1 + x)); |
197 | 0 | const auto in_b = LinearFromSRGB(Load(d, row_image2 + x)); |
198 | |
|
199 | 0 | Store(in_r, d, row_linear0 + x); |
200 | 0 | Store(in_g, d, row_linear1 + x); |
201 | 0 | Store(in_b, d, row_linear2 + x); |
202 | |
|
203 | 0 | LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_image0 + x, |
204 | 0 | row_image1 + x, row_image2 + x); |
205 | 0 | } |
206 | 0 | return true; |
207 | 0 | }; Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE4::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: enc_xyb.cc:jxl::N_AVX2::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE2::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*)::$_0::operator()(unsigned int, unsigned long) const |
208 | 0 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(image->ysize()), |
209 | 0 | ThreadPool::NoInit, process_row, |
210 | 0 | "SRGBToXYBAndLinear")); |
211 | 0 | return true; |
212 | 0 | } Unexecuted instantiation: jxl::N_SSE4::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*) Unexecuted instantiation: jxl::N_AVX2::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*) Unexecuted instantiation: jxl::N_SSE2::SRGBToXYBAndLinear(float const*, jxl::ThreadPool*, jxl::Image3<float>*, jxl::Image3<float>*) |
213 | | |
214 | 186 | void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { |
215 | 186 | const HWY_FULL(float) d; |
216 | 186 | const size_t N = Lanes(d); |
217 | 186 | const float mul = intensity_target / 255.0f; |
218 | 744 | for (size_t j = 0; j < 3; ++j) { |
219 | 2.23k | for (size_t i = 0; i < 3; ++i) { |
220 | 1.67k | const auto absorb = Set(d, jxl::cms::kOpsinAbsorbanceMatrix[j][i] * mul); |
221 | 1.67k | Store(absorb, d, premul_absorb + (j * 3 + i) * N); |
222 | 1.67k | } |
223 | 558 | } |
224 | 744 | for (size_t i = 0; i < 3; ++i) { |
225 | 558 | const auto neg_bias_cbrt = |
226 | 558 | Set(d, -cbrtf(jxl::cms::kOpsinAbsorbanceBias[i])); |
227 | 558 | Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); |
228 | 558 | } |
229 | 186 | } Unexecuted instantiation: jxl::N_SSE4::ComputePremulAbsorb(float, float*) jxl::N_AVX2::ComputePremulAbsorb(float, float*) Line | Count | Source | 214 | 186 | void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { | 215 | 186 | const HWY_FULL(float) d; | 216 | 186 | const size_t N = Lanes(d); | 217 | 186 | const float mul = intensity_target / 255.0f; | 218 | 744 | for (size_t j = 0; j < 3; ++j) { | 219 | 2.23k | for (size_t i = 0; i < 3; ++i) { | 220 | 1.67k | const auto absorb = Set(d, jxl::cms::kOpsinAbsorbanceMatrix[j][i] * mul); | 221 | 1.67k | Store(absorb, d, premul_absorb + (j * 3 + i) * N); | 222 | 1.67k | } | 223 | 558 | } | 224 | 744 | for (size_t i = 0; i < 3; ++i) { | 225 | 558 | const auto neg_bias_cbrt = | 226 | 558 | Set(d, -cbrtf(jxl::cms::kOpsinAbsorbanceBias[i])); | 227 | 558 | Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); | 228 | 558 | } | 229 | 186 | } |
Unexecuted instantiation: jxl::N_SSE2::ComputePremulAbsorb(float, float*) |
230 | | |
231 | | // This is different from Butteraugli's OpsinDynamicsImage() in the sense that |
232 | | // it does not contain a sensitivity multiplier based on the blurred image. |
233 | | Status ToXYB(const ColorEncoding& c_current, float intensity_target, |
234 | | const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image, |
235 | 186 | const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) { |
236 | 186 | JXL_ENSURE(image); |
237 | 186 | if (black) JXL_ENSURE(SameSize(*image, *black)); |
238 | 186 | if (linear) JXL_ENSURE(SameSize(*image, *linear)); |
239 | | |
240 | 186 | JxlMemoryManager* memory_manager = image->memory_manager(); |
241 | 186 | JXL_ENSURE(memory_manager); |
242 | | |
243 | 186 | const HWY_FULL(float) d; |
244 | | // Pre-broadcasted constants |
245 | 186 | JXL_ASSIGN_OR_RETURN( |
246 | 186 | AlignedMemory mem, |
247 | 186 | AlignedMemory::Create(memory_manager, Lanes(d) * 12 * sizeof(float))); |
248 | 186 | float* premul_absorb = mem.address<float>(); |
249 | 186 | ComputePremulAbsorb(intensity_target, premul_absorb); |
250 | | |
251 | 186 | const bool want_linear = (linear != nullptr); |
252 | | |
253 | 186 | const ColorEncoding& c_linear_srgb = |
254 | 186 | ColorEncoding::LinearSRGB(c_current.IsGray()); |
255 | | // Linear sRGB inputs are rare but can be useful for the fastest encoders, for |
256 | | // which undoing the sRGB transfer function would be a large part of the cost. |
257 | 186 | if (c_linear_srgb.SameColorEncoding(c_current)) { |
258 | | // This only happens if kitten or slower, moving ImageBundle might be |
259 | | // possible but the encoder is much slower than this copy. |
260 | 0 | if (want_linear) { |
261 | 0 | JXL_RETURN_IF_ERROR(CopyImageTo(*image, linear)); |
262 | 0 | } |
263 | 0 | JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); |
264 | 0 | return true; |
265 | 0 | } |
266 | | |
267 | | // Common case: already sRGB, can avoid the color transform |
268 | 186 | if (c_current.IsSRGB()) { |
269 | | // Common case: can avoid allocating/copying |
270 | 186 | if (want_linear) { |
271 | | // Slow encoder also wants linear sRGB. |
272 | 0 | JXL_RETURN_IF_ERROR( |
273 | 0 | SRGBToXYBAndLinear(premul_absorb, pool, image, linear)); |
274 | 186 | } else { |
275 | 186 | JXL_RETURN_IF_ERROR(SRGBToXYB(premul_absorb, pool, image)); |
276 | 186 | } |
277 | 186 | return true; |
278 | 186 | } |
279 | | |
280 | 0 | JXL_RETURN_IF_ERROR(ApplyColorTransform( |
281 | 0 | c_current, intensity_target, *image, black, Rect(*image), c_linear_srgb, |
282 | 0 | cms, pool, want_linear ? linear : image)); |
283 | 0 | if (want_linear) { |
284 | 0 | JXL_RETURN_IF_ERROR(CopyImageTo(*linear, image)); |
285 | 0 | } |
286 | 0 | JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); |
287 | 0 | return true; |
288 | 0 | } Unexecuted instantiation: jxl::N_SSE4::ToXYB(jxl::ColorEncoding const&, float, jxl::Plane<float> const*, jxl::ThreadPool*, jxl::Image3<float>*, JxlCmsInterface const&, jxl::Image3<float>*) jxl::N_AVX2::ToXYB(jxl::ColorEncoding const&, float, jxl::Plane<float> const*, jxl::ThreadPool*, jxl::Image3<float>*, JxlCmsInterface const&, jxl::Image3<float>*) Line | Count | Source | 235 | 186 | const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) { | 236 | 186 | JXL_ENSURE(image); | 237 | 186 | if (black) JXL_ENSURE(SameSize(*image, *black)); | 238 | 186 | if (linear) JXL_ENSURE(SameSize(*image, *linear)); | 239 | | | 240 | 186 | JxlMemoryManager* memory_manager = image->memory_manager(); | 241 | 186 | JXL_ENSURE(memory_manager); | 242 | | | 243 | 186 | const HWY_FULL(float) d; | 244 | | // Pre-broadcasted constants | 245 | 186 | JXL_ASSIGN_OR_RETURN( | 246 | 186 | AlignedMemory mem, | 247 | 186 | AlignedMemory::Create(memory_manager, Lanes(d) * 12 * sizeof(float))); | 248 | 186 | float* premul_absorb = mem.address<float>(); | 249 | 186 | ComputePremulAbsorb(intensity_target, premul_absorb); | 250 | | | 251 | 186 | const bool want_linear = (linear != nullptr); | 252 | | | 253 | 186 | const ColorEncoding& c_linear_srgb = | 254 | 186 | ColorEncoding::LinearSRGB(c_current.IsGray()); | 255 | | // Linear sRGB inputs are rare but can be useful for the fastest encoders, for | 256 | | // which undoing the sRGB transfer function would be a large part of the cost. | 257 | 186 | if (c_linear_srgb.SameColorEncoding(c_current)) { | 258 | | // This only happens if kitten or slower, moving ImageBundle might be | 259 | | // possible but the encoder is much slower than this copy. | 260 | 0 | if (want_linear) { | 261 | 0 | JXL_RETURN_IF_ERROR(CopyImageTo(*image, linear)); | 262 | 0 | } | 263 | 0 | JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); | 264 | 0 | return true; | 265 | 0 | } | 266 | | | 267 | | // Common case: already sRGB, can avoid the color transform | 268 | 186 | if (c_current.IsSRGB()) { | 269 | | // Common case: can avoid allocating/copying | 270 | 186 | if (want_linear) { | 271 | | // Slow encoder also wants linear sRGB. | 272 | 0 | JXL_RETURN_IF_ERROR( | 273 | 0 | SRGBToXYBAndLinear(premul_absorb, pool, image, linear)); | 274 | 186 | } else { | 275 | 186 | JXL_RETURN_IF_ERROR(SRGBToXYB(premul_absorb, pool, image)); | 276 | 186 | } | 277 | 186 | return true; | 278 | 186 | } | 279 | | | 280 | 0 | JXL_RETURN_IF_ERROR(ApplyColorTransform( | 281 | 0 | c_current, intensity_target, *image, black, Rect(*image), c_linear_srgb, | 282 | 0 | cms, pool, want_linear ? linear : image)); | 283 | 0 | if (want_linear) { | 284 | 0 | JXL_RETURN_IF_ERROR(CopyImageTo(*linear, image)); | 285 | 0 | } | 286 | 0 | JXL_RETURN_IF_ERROR(LinearSRGBToXYB(premul_absorb, pool, image)); | 287 | 0 | return true; | 288 | 0 | } |
Unexecuted instantiation: jxl::N_SSE2::ToXYB(jxl::ColorEncoding const&, float, jxl::Plane<float> const*, jxl::ThreadPool*, jxl::Image3<float>*, JxlCmsInterface const&, jxl::Image3<float>*) |
289 | | |
290 | | // Transform RGB to YCbCr. |
291 | | // Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). |
292 | | Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, |
293 | | const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, |
294 | 0 | ImageF* cr_plane, ThreadPool* pool) { |
295 | 0 | const HWY_FULL(float) df; |
296 | 0 | const size_t S = Lanes(df); // Step. |
297 | |
|
298 | 0 | const size_t xsize = r_plane.xsize(); |
299 | 0 | const size_t ysize = r_plane.ysize(); |
300 | 0 | if ((xsize == 0) || (ysize == 0)) return true; |
301 | | |
302 | | // Full-range BT.601 as defined by JFIF Clause 7: |
303 | | // https://www.itu.int/rec/T-REC-T.871-201105-I/en |
304 | 0 | const auto k128 = Set(df, 128.0f / 255); |
305 | 0 | const auto kR = Set(df, 0.299f); // NTSC luma |
306 | 0 | const auto kG = Set(df, 0.587f); |
307 | 0 | const auto kB = Set(df, 0.114f); |
308 | 0 | const auto kAmpR = Set(df, 0.701f); |
309 | 0 | const auto kAmpB = Set(df, 0.886f); |
310 | 0 | const auto kDiffR = Add(kAmpR, kR); |
311 | 0 | const auto kDiffB = Add(kAmpB, kB); |
312 | 0 | const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB)))); |
313 | 0 | const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB)))); |
314 | |
|
315 | 0 | constexpr size_t kGroupArea = kGroupDim * kGroupDim; |
316 | 0 | const size_t lines_per_group = DivCeil(kGroupArea, xsize); |
317 | 0 | const size_t num_stripes = DivCeil(ysize, lines_per_group); |
318 | 0 | const auto transform = [&](int idx, int /* thread*/) -> Status { |
319 | 0 | const size_t y0 = idx * lines_per_group; |
320 | 0 | const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize); |
321 | 0 | for (size_t y = y0; y < y1; ++y) { |
322 | 0 | const float* r_row = r_plane.ConstRow(y); |
323 | 0 | const float* g_row = g_plane.ConstRow(y); |
324 | 0 | const float* b_row = b_plane.ConstRow(y); |
325 | 0 | float* y_row = y_plane->Row(y); |
326 | 0 | float* cb_row = cb_plane->Row(y); |
327 | 0 | float* cr_row = cr_plane->Row(y); |
328 | 0 | for (size_t x = 0; x < xsize; x += S) { |
329 | 0 | const auto r = Load(df, r_row + x); |
330 | 0 | const auto g = Load(df, g_row + x); |
331 | 0 | const auto b = Load(df, b_row + x); |
332 | 0 | const auto r_base = Mul(r, kR); |
333 | 0 | const auto r_diff = Mul(r, kDiffR); |
334 | 0 | const auto g_base = Mul(g, kG); |
335 | 0 | const auto b_base = Mul(b, kB); |
336 | 0 | const auto b_diff = Mul(b, kDiffB); |
337 | 0 | const auto y_base = Add(r_base, Add(g_base, b_base)); |
338 | 0 | const auto y_vec = Sub(y_base, k128); |
339 | 0 | const auto cb_vec = Mul(Sub(b_diff, y_base), kNormB); |
340 | 0 | const auto cr_vec = Mul(Sub(r_diff, y_base), kNormR); |
341 | 0 | Store(y_vec, df, y_row + x); |
342 | 0 | Store(cb_vec, df, cb_row + x); |
343 | 0 | Store(cr_vec, df, cr_row + x); |
344 | 0 | } |
345 | 0 | } |
346 | 0 | return true; |
347 | 0 | }; Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE4::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*)::$_0::operator()(int, int) const Unexecuted instantiation: enc_xyb.cc:jxl::N_AVX2::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*)::$_0::operator()(int, int) const Unexecuted instantiation: enc_xyb.cc:jxl::N_SSE2::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*)::$_0::operator()(int, int) const |
348 | 0 | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<int>(num_stripes), |
349 | 0 | ThreadPool::NoInit, transform, "RgbToYcbCr")); |
350 | 0 | return true; |
351 | 0 | } Unexecuted instantiation: jxl::N_SSE4::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_AVX2::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*) Unexecuted instantiation: jxl::N_SSE2::RgbToYcbcr(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::Plane<float>*, jxl::Plane<float>*, jxl::Plane<float>*, jxl::ThreadPool*) |
352 | | |
353 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
354 | | } // namespace HWY_NAMESPACE |
355 | | } // namespace jxl |
356 | | HWY_AFTER_NAMESPACE(); |
357 | | |
358 | | #if HWY_ONCE |
359 | | namespace jxl { |
360 | | HWY_EXPORT(ToXYB); |
361 | | Status ToXYB(const ColorEncoding& c_current, float intensity_target, |
362 | | const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image, |
363 | 186 | const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) { |
364 | 186 | return HWY_DYNAMIC_DISPATCH(ToXYB)(c_current, intensity_target, black, pool, |
365 | 186 | image, cms, linear); |
366 | 186 | } |
367 | | |
368 | | HWY_EXPORT(LinearRGBRowToXYB); |
369 | | void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, |
370 | | float* JXL_RESTRICT row2, |
371 | 0 | const float* JXL_RESTRICT premul_absorb, size_t xsize) { |
372 | 0 | HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB) |
373 | 0 | (row0, row1, row2, premul_absorb, xsize); |
374 | 0 | } |
375 | | |
376 | | HWY_EXPORT(ComputePremulAbsorb); |
377 | 0 | void ComputePremulAbsorb(float intensity_target, float* premul_absorb) { |
378 | 0 | HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb); |
379 | 0 | } |
380 | | |
381 | | void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1, |
382 | 0 | float* JXL_RESTRICT row2, size_t xsize) { |
383 | 0 | for (size_t x = 0; x < xsize; x++) { |
384 | 0 | row2[x] = (row2[x] - row1[x] + jxl::cms::kScaledXYBOffset[2]) * |
385 | 0 | jxl::cms::kScaledXYBScale[2]; |
386 | 0 | row0[x] = (row0[x] + jxl::cms::kScaledXYBOffset[0]) * |
387 | 0 | jxl::cms::kScaledXYBScale[0]; |
388 | 0 | row1[x] = (row1[x] + jxl::cms::kScaledXYBOffset[1]) * |
389 | 0 | jxl::cms::kScaledXYBScale[1]; |
390 | 0 | } |
391 | 0 | } |
392 | | |
393 | 0 | void ScaleXYB(Image3F* opsin) { |
394 | 0 | for (size_t y = 0; y < opsin->ysize(); y++) { |
395 | 0 | float* row0 = opsin->PlaneRow(0, y); |
396 | 0 | float* row1 = opsin->PlaneRow(1, y); |
397 | 0 | float* row2 = opsin->PlaneRow(2, y); |
398 | 0 | ScaleXYBRow(row0, row1, row2, opsin->xsize()); |
399 | 0 | } |
400 | 0 | } |
401 | | |
402 | | HWY_EXPORT(RgbToYcbcr); |
403 | | Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, |
404 | | const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, |
405 | 0 | ImageF* cr_plane, ThreadPool* pool) { |
406 | 0 | return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane, |
407 | 0 | cb_plane, cr_plane, pool); |
408 | 0 | } |
409 | | |
410 | | } // namespace jxl |
411 | | #endif // HWY_ONCE |