/src/libjxl/lib/jxl/dec_noise.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/dec_noise.h" |
7 | | |
8 | | #include <cstdint> |
9 | | #include <cstdlib> |
10 | | #include <cstring> |
11 | | #include <utility> |
12 | | |
13 | | #include "lib/jxl/base/status.h" |
14 | | #include "lib/jxl/dec_bit_reader.h" |
15 | | #include "lib/jxl/dec_cache.h" |
16 | | #include "lib/jxl/frame_header.h" |
17 | | #include "lib/jxl/noise.h" |
18 | | #include "lib/jxl/render_pipeline/render_pipeline.h" |
19 | | |
20 | | #undef HWY_TARGET_INCLUDE |
21 | | #define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc" |
22 | | #include <hwy/foreach_target.h> |
23 | | #include <hwy/highway.h> |
24 | | |
25 | | #include "lib/jxl/base/compiler_specific.h" |
26 | | #include "lib/jxl/base/rect.h" |
27 | | #include "lib/jxl/frame_dimensions.h" |
28 | | #include "lib/jxl/image.h" |
29 | | #include "lib/jxl/xorshift128plus-inl.h" |
30 | | |
31 | | HWY_BEFORE_NAMESPACE(); |
32 | | namespace jxl { |
33 | | namespace HWY_NAMESPACE { |
34 | | |
35 | | // These templates are not found via ADL. |
36 | | using hwy::HWY_NAMESPACE::Or; |
37 | | using hwy::HWY_NAMESPACE::Rebind; |
38 | | using hwy::HWY_NAMESPACE::ShiftRight; |
39 | | using hwy::HWY_NAMESPACE::Vec; |
40 | | |
41 | | using D = HWY_CAPPED(float, kBlockDim); |
42 | | using DI = Rebind<int, D>; |
43 | | |
44 | | // Converts one vector's worth of random bits to floats in [1, 2). |
45 | | // NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in |
46 | | // [0, 1) or in [1, 2). |
47 | | void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits, |
48 | 64.5M | float* JXL_RESTRICT floats) { |
49 | 64.5M | const HWY_FULL(float) df; |
50 | 64.5M | const HWY_FULL(uint32_t) du; |
51 | | |
52 | 64.5M | const auto bits = Load(du, random_bits); |
53 | | // 1.0 + 23 random mantissa bits = [1, 2) |
54 | 64.5M | const auto rand12 = BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000))); |
55 | 64.5M | Store(rand12, df, floats); |
56 | 64.5M | } Unexecuted instantiation: jxl::N_SSE4::BitsToFloat(unsigned int const*, float*) jxl::N_AVX2::BitsToFloat(unsigned int const*, float*) Line | Count | Source | 48 | 64.5M | float* JXL_RESTRICT floats) { | 49 | 64.5M | const HWY_FULL(float) df; | 50 | 64.5M | const HWY_FULL(uint32_t) du; | 51 | | | 52 | 64.5M | const auto bits = Load(du, random_bits); | 53 | | // 1.0 + 23 random mantissa bits = [1, 2) | 54 | 64.5M | const auto rand12 = BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000))); | 55 | 64.5M | Store(rand12, df, floats); | 56 | 64.5M | } |
Unexecuted instantiation: jxl::N_SSE2::BitsToFloat(unsigned int const*, float*) |
57 | | |
58 | | void RandomImage(Xorshift128Plus* rng, const Rect& rect, |
59 | 54.4k | ImageF* JXL_RESTRICT noise) { |
60 | 54.4k | const size_t xsize = rect.xsize(); |
61 | 54.4k | const size_t ysize = rect.ysize(); |
62 | | |
63 | | // May exceed the vector size, hence we have two loops over x below. |
64 | 54.4k | constexpr size_t kFloatsPerBatch = |
65 | 54.4k | Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float); |
66 | 54.4k | HWY_ALIGN uint64_t batch64[Xorshift128Plus::N] = {}; |
67 | 54.4k | HWY_ALIGN uint32_t batch32[2 * Xorshift128Plus::N]; |
68 | | |
69 | 54.4k | const HWY_FULL(float) df; |
70 | 54.4k | const size_t N = Lanes(df); |
71 | | |
72 | 3.60M | for (size_t y = 0; y < ysize; ++y) { |
73 | 3.55M | float* JXL_RESTRICT row = rect.Row(noise, y); |
74 | | |
75 | 3.55M | size_t x = 0; |
76 | | // Only entire batches (avoids exceeding the image padding). |
77 | 33.0M | for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) { |
78 | 29.4M | rng->Fill(batch64); |
79 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 |
80 | 29.4M | memcpy(batch32, batch64, sizeof(batch32)); |
81 | 88.4M | for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) { |
82 | 58.9M | BitsToFloat(batch32 + i, row + x + i); |
83 | 58.9M | } |
84 | 29.4M | } |
85 | | |
86 | | // Any remaining pixels, rounded up to vectors (safe due to padding). |
87 | 3.55M | rng->Fill(batch64); |
88 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 |
89 | 3.55M | memcpy(batch32, batch64, sizeof(batch32)); |
90 | 3.55M | size_t batch_pos = 0; // < kFloatsPerBatch |
91 | 9.12M | for (; x < xsize; x += N) { |
92 | 5.56M | BitsToFloat(batch32 + batch_pos, row + x); |
93 | 5.56M | batch_pos += N; |
94 | 5.56M | } |
95 | 3.55M | } |
96 | 54.4k | } Unexecuted instantiation: dec_noise.cc:jxl::N_SSE4::RandomImage(jxl::N_SSE4::(anonymous namespace)::Xorshift128Plus*, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) dec_noise.cc:jxl::N_AVX2::RandomImage(jxl::N_AVX2::(anonymous namespace)::Xorshift128Plus*, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) Line | Count | Source | 59 | 54.4k | ImageF* JXL_RESTRICT noise) { | 60 | 54.4k | const size_t xsize = rect.xsize(); | 61 | 54.4k | const size_t ysize = rect.ysize(); | 62 | | | 63 | | // May exceed the vector size, hence we have two loops over x below. | 64 | 54.4k | constexpr size_t kFloatsPerBatch = | 65 | 54.4k | Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float); | 66 | 54.4k | HWY_ALIGN uint64_t batch64[Xorshift128Plus::N] = {}; | 67 | 54.4k | HWY_ALIGN uint32_t batch32[2 * Xorshift128Plus::N]; | 68 | | | 69 | 54.4k | const HWY_FULL(float) df; | 70 | 54.4k | const size_t N = Lanes(df); | 71 | | | 72 | 3.60M | for (size_t y = 0; y < ysize; ++y) { | 73 | 3.55M | float* JXL_RESTRICT row = rect.Row(noise, y); | 74 | | | 75 | 3.55M | size_t x = 0; | 76 | | // Only entire batches (avoids exceeding the image padding). | 77 | 33.0M | for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) { | 78 | 29.4M | rng->Fill(batch64); | 79 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 | 80 | 29.4M | memcpy(batch32, batch64, sizeof(batch32)); | 81 | 88.4M | for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) { | 82 | 58.9M | BitsToFloat(batch32 + i, row + x + i); | 83 | 58.9M | } | 84 | 29.4M | } | 85 | | | 86 | | // Any remaining pixels, rounded up to vectors (safe due to padding). | 87 | 3.55M | rng->Fill(batch64); | 88 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 | 89 | 3.55M | memcpy(batch32, batch64, sizeof(batch32)); | 90 | 3.55M | size_t batch_pos = 0; // < kFloatsPerBatch | 91 | 9.12M | for (; x < xsize; x += N) { | 92 | 5.56M | BitsToFloat(batch32 + batch_pos, row + x); | 93 | 5.56M | batch_pos += N; | 94 | 5.56M | } | 95 | 3.55M | } | 96 | 54.4k | } |
Unexecuted instantiation: dec_noise.cc:jxl::N_SSE2::RandomImage(jxl::N_SSE2::(anonymous namespace)::Xorshift128Plus*, jxl::RectT<unsigned long> const&, jxl::Plane<float>*) |
97 | | void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index, |
98 | | size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0, |
99 | | const std::pair<ImageF*, Rect>& plane1, |
100 | 18.1k | const std::pair<ImageF*, Rect>& plane2) { |
101 | 18.1k | HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0, |
102 | 18.1k | y0); |
103 | 18.1k | RandomImage(&rng, plane0.second, plane0.first); |
104 | 18.1k | RandomImage(&rng, plane1.second, plane1.first); |
105 | 18.1k | RandomImage(&rng, plane2.second, plane2.first); |
106 | 18.1k | } Unexecuted instantiation: jxl::N_SSE4::Random3Planes(unsigned long, unsigned long, unsigned long, unsigned long, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&) jxl::N_AVX2::Random3Planes(unsigned long, unsigned long, unsigned long, unsigned long, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&) Line | Count | Source | 100 | 18.1k | const std::pair<ImageF*, Rect>& plane2) { | 101 | 18.1k | HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0, | 102 | 18.1k | y0); | 103 | 18.1k | RandomImage(&rng, plane0.second, plane0.first); | 104 | 18.1k | RandomImage(&rng, plane1.second, plane1.first); | 105 | 18.1k | RandomImage(&rng, plane2.second, plane2.first); | 106 | 18.1k | } |
Unexecuted instantiation: jxl::N_SSE2::Random3Planes(unsigned long, unsigned long, unsigned long, unsigned long, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&, std::__1::pair<jxl::Plane<float>*, jxl::RectT<unsigned long> > const&) |
107 | | |
108 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
109 | | } // namespace HWY_NAMESPACE |
110 | | } // namespace jxl |
111 | | HWY_AFTER_NAMESPACE(); |
112 | | |
113 | | #if HWY_ONCE |
114 | | namespace jxl { |
115 | | |
116 | | namespace { |
117 | | HWY_EXPORT(Random3Planes); |
118 | | } // namespace |
119 | | |
120 | | void PrepareNoiseInput(const PassesDecoderState& dec_state, |
121 | | const FrameDimensions& frame_dim, |
122 | | const FrameHeader& frame_header, size_t group_index, |
123 | 3.18k | size_t thread) { |
124 | 3.18k | size_t group_dim = frame_dim.group_dim; |
125 | 3.18k | const size_t gx = group_index % frame_dim.xsize_groups; |
126 | 3.18k | const size_t gy = group_index / frame_dim.xsize_groups; |
127 | 3.18k | RenderPipelineInput input = |
128 | 3.18k | dec_state.render_pipeline->GetInputBuffers(group_index, thread); |
129 | 3.18k | size_t noise_c_start = |
130 | 3.18k | 3 + frame_header.nonserialized_metadata->m.num_extra_channels; |
131 | | // When the color channels are downsampled, we need to generate more noise |
132 | | // input for the current group than just the group dimensions. |
133 | 3.18k | std::pair<ImageF*, Rect> rects[3]; |
134 | 8.42k | for (size_t iy = 0; iy < frame_header.upsampling; iy++) { |
135 | 23.3k | for (size_t ix = 0; ix < frame_header.upsampling; ix++) { |
136 | 72.5k | for (size_t c = 0; c < 3; c++) { |
137 | 54.4k | auto r = input.GetBuffer(noise_c_start + c); |
138 | 54.4k | rects[c].first = r.first; |
139 | 54.4k | size_t x1 = r.second.x0() + r.second.xsize(); |
140 | 54.4k | size_t y1 = r.second.y0() + r.second.ysize(); |
141 | 54.4k | rects[c].second = |
142 | 54.4k | Rect(r.second.x0() + ix * group_dim, r.second.y0() + iy * group_dim, |
143 | 54.4k | group_dim, group_dim, x1, y1); |
144 | 54.4k | } |
145 | 18.1k | HWY_DYNAMIC_DISPATCH(Random3Planes) |
146 | 18.1k | (dec_state.visible_frame_index, dec_state.nonvisible_frame_index, |
147 | 18.1k | (gx * frame_header.upsampling + ix) * group_dim, |
148 | 18.1k | (gy * frame_header.upsampling + iy) * group_dim, rects[0], rects[1], |
149 | 18.1k | rects[2]); |
150 | 18.1k | } |
151 | 5.24k | } |
152 | 3.18k | } |
153 | | |
154 | 15.1k | void DecodeFloatParam(float precision, float* val, BitReader* br) { |
155 | 15.1k | const int absval_quant = br->ReadFixedBits<10>(); |
156 | 15.1k | *val = absval_quant / precision; |
157 | 15.1k | } |
158 | | |
159 | 1.89k | Status DecodeNoise(BitReader* br, NoiseParams* noise_params) { |
160 | 15.1k | for (float& i : noise_params->lut) { |
161 | 15.1k | DecodeFloatParam(kNoisePrecision, &i, br); |
162 | 15.1k | } |
163 | 1.89k | return true; |
164 | 1.89k | } |
165 | | |
166 | | } // namespace jxl |
167 | | #endif // HWY_ONCE |