/src/libjxl/lib/jxl/dec_noise.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/dec_noise.h" |
7 | | |
8 | | #include <cstdint> |
9 | | #include <cstdlib> |
10 | | #include <cstring> |
11 | | #include <utility> |
12 | | |
13 | | #include "lib/jxl/base/status.h" |
14 | | #include "lib/jxl/dec_bit_reader.h" |
15 | | #include "lib/jxl/dec_cache.h" |
16 | | #include "lib/jxl/frame_header.h" |
17 | | #include "lib/jxl/noise.h" |
18 | | #include "lib/jxl/render_pipeline/render_pipeline.h" |
19 | | |
20 | | #undef HWY_TARGET_INCLUDE |
21 | | #define HWY_TARGET_INCLUDE "lib/jxl/dec_noise.cc" |
22 | | #include <hwy/foreach_target.h> |
23 | | #include <hwy/highway.h> |
24 | | |
25 | | #include "lib/jxl/base/compiler_specific.h" |
26 | | #include "lib/jxl/base/rect.h" |
27 | | #include "lib/jxl/frame_dimensions.h" |
28 | | #include "lib/jxl/image.h" |
29 | | #include "lib/jxl/xorshift128plus-inl.h" |
30 | | |
31 | | HWY_BEFORE_NAMESPACE(); |
32 | | namespace jxl { |
33 | | namespace HWY_NAMESPACE { |
34 | | |
35 | | // These templates are not found via ADL. |
36 | | using hwy::HWY_NAMESPACE::Or; |
37 | | using hwy::HWY_NAMESPACE::Rebind; |
38 | | using hwy::HWY_NAMESPACE::ShiftRight; |
39 | | using hwy::HWY_NAMESPACE::Vec; |
40 | | |
41 | | using D = HWY_CAPPED(float, kBlockDim); |
42 | | using DI = Rebind<int, D>; |
43 | | |
44 | | // Converts one vector's worth of random bits to floats in [1, 2). |
45 | | // NOTE: as the convolution kernel sums to 0, it doesn't matter if inputs are in |
46 | | // [0, 1) or in [1, 2). |
47 | | void BitsToFloat(const uint32_t* JXL_RESTRICT random_bits, |
48 | 67.5M | float* JXL_RESTRICT floats) { |
49 | 67.5M | const HWY_FULL(float) df; |
50 | 67.5M | const HWY_FULL(uint32_t) du; |
51 | | |
52 | 67.5M | const auto bits = Load(du, random_bits); |
53 | | // 1.0 + 23 random mantissa bits = [1, 2) |
54 | 67.5M | const auto rand12 = BitCast(df, Or(ShiftRight<9>(bits), Set(du, 0x3F800000))); |
55 | 67.5M | Store(rand12, df, floats); |
56 | 67.5M | } |
57 | | |
58 | | void RandomImage(Xorshift128Plus* rng, const Rect& rect, |
59 | 14.7k | ImageF* JXL_RESTRICT noise) { |
60 | 14.7k | const size_t xsize = rect.xsize(); |
61 | 14.7k | const size_t ysize = rect.ysize(); |
62 | | |
63 | | // May exceed the vector size, hence we have two loops over x below. |
64 | 14.7k | constexpr size_t kFloatsPerBatch = |
65 | 14.7k | Xorshift128Plus::N * sizeof(uint64_t) / sizeof(float); |
66 | 14.7k | HWY_ALIGN uint64_t batch64[Xorshift128Plus::N] = {}; |
67 | 14.7k | HWY_ALIGN uint32_t batch32[2 * Xorshift128Plus::N]; |
68 | | |
69 | 14.7k | const HWY_FULL(float) df; |
70 | 14.7k | const size_t N = Lanes(df); |
71 | | |
72 | 450k | for (size_t y = 0; y < ysize; ++y) { |
73 | 435k | float* JXL_RESTRICT row = rect.Row(noise, y); |
74 | | |
75 | 435k | size_t x = 0; |
76 | | // Only entire batches (avoids exceeding the image padding). |
77 | 4.55M | for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) { |
78 | 4.12M | rng->Fill(batch64); |
79 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 |
80 | 4.12M | memcpy(batch32, batch64, sizeof(batch32)); |
81 | 67.6M | for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) { |
82 | 63.5M | BitsToFloat(batch32 + i, row + x + i); |
83 | 63.5M | } |
84 | 4.12M | } |
85 | | |
86 | | // Any remaining pixels, rounded up to vectors (safe due to padding). |
87 | 435k | rng->Fill(batch64); |
88 | | // Workaround for https://github.com/llvm/llvm-project/issues/121229 |
89 | 435k | memcpy(batch32, batch64, sizeof(batch32)); |
90 | 435k | size_t batch_pos = 0; // < kFloatsPerBatch |
91 | 6.13M | for (; x < xsize; x += N) { |
92 | 5.69M | BitsToFloat(batch32 + batch_pos, row + x); |
93 | 5.69M | batch_pos += N; |
94 | 5.69M | } |
95 | 435k | } |
96 | 14.7k | } |
97 | | void Random3Planes(size_t visible_frame_index, size_t nonvisible_frame_index, |
98 | | size_t x0, size_t y0, const std::pair<ImageF*, Rect>& plane0, |
99 | | const std::pair<ImageF*, Rect>& plane1, |
100 | 4.87k | const std::pair<ImageF*, Rect>& plane2) { |
101 | 4.87k | HWY_ALIGN Xorshift128Plus rng(visible_frame_index, nonvisible_frame_index, x0, |
102 | 4.87k | y0); |
103 | 4.87k | RandomImage(&rng, plane0.second, plane0.first); |
104 | 4.87k | RandomImage(&rng, plane1.second, plane1.first); |
105 | 4.87k | RandomImage(&rng, plane2.second, plane2.first); |
106 | 4.87k | } |
107 | | |
108 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
109 | | } // namespace HWY_NAMESPACE |
110 | | } // namespace jxl |
111 | | HWY_AFTER_NAMESPACE(); |
112 | | |
113 | | #if HWY_ONCE |
114 | | namespace jxl { |
115 | | |
116 | | namespace { |
117 | | HWY_EXPORT(Random3Planes); |
118 | | } // namespace |
119 | | |
120 | | void PrepareNoiseInput(const PassesDecoderState& dec_state, |
121 | | const FrameDimensions& frame_dim, |
122 | | const FrameHeader& frame_header, size_t group_index, |
123 | 2.35k | size_t thread) { |
124 | 2.35k | size_t group_dim = frame_dim.group_dim; |
125 | 2.35k | const size_t gx = group_index % frame_dim.xsize_groups; |
126 | 2.35k | const size_t gy = group_index / frame_dim.xsize_groups; |
127 | 2.35k | RenderPipelineInput input = |
128 | 2.35k | dec_state.render_pipeline->GetInputBuffers(group_index, thread); |
129 | 2.35k | size_t noise_c_start = |
130 | 2.35k | 3 + frame_header.nonserialized_metadata->m.num_extra_channels; |
131 | | // When the color channels are downsampled, we need to generate more noise |
132 | | // input for the current group than just the group dimensions. |
133 | 2.35k | std::pair<ImageF*, Rect> rects[3]; |
134 | 5.11k | for (size_t iy = 0; iy < frame_header.upsampling; iy++) { |
135 | 7.67k | for (size_t ix = 0; ix < frame_header.upsampling; ix++) { |
136 | 19.5k | for (size_t c = 0; c < 3; c++) { |
137 | 14.6k | auto r = input.GetBuffer(noise_c_start + c); |
138 | 14.6k | rects[c].first = r.first; |
139 | 14.6k | size_t x1 = r.second.x0() + r.second.xsize(); |
140 | 14.6k | size_t y1 = r.second.y0() + r.second.ysize(); |
141 | 14.6k | rects[c].second = |
142 | 14.6k | Rect(r.second.x0() + ix * group_dim, r.second.y0() + iy * group_dim, |
143 | 14.6k | group_dim, group_dim, x1, y1); |
144 | 14.6k | } |
145 | 4.91k | HWY_DYNAMIC_DISPATCH(Random3Planes) |
146 | 4.91k | (dec_state.visible_frame_index, dec_state.nonvisible_frame_index, |
147 | 4.91k | (gx * frame_header.upsampling + ix) * group_dim, |
148 | 4.91k | (gy * frame_header.upsampling + iy) * group_dim, rects[0], rects[1], |
149 | 4.91k | rects[2]); |
150 | 4.91k | } |
151 | 2.76k | } |
152 | 2.35k | } |
153 | | |
154 | 9.66k | void DecodeFloatParam(float precision, float* val, BitReader* br) { |
155 | 9.66k | const int absval_quant = br->ReadFixedBits<10>(); |
156 | 9.66k | *val = absval_quant / precision; |
157 | 9.66k | } |
158 | | |
159 | 1.20k | Status DecodeNoise(BitReader* br, NoiseParams* noise_params) { |
160 | 9.66k | for (float& i : noise_params->lut) { |
161 | 9.66k | DecodeFloatParam(kNoisePrecision, &i, br); |
162 | 9.66k | } |
163 | 1.20k | return true; |
164 | 1.20k | } |
165 | | |
166 | | } // namespace jxl |
167 | | #endif // HWY_ONCE |