/src/libjxl/lib/jxl/modular/transform/squeeze.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jxl/modular/transform/squeeze.h" |
7 | | |
8 | | #include <jxl/memory_manager.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <cstdint> |
12 | | #include <cstdlib> |
13 | | #include <utility> |
14 | | #include <vector> |
15 | | |
16 | | #include "lib/jxl/base/common.h" |
17 | | #include "lib/jxl/base/compiler_specific.h" |
18 | | #include "lib/jxl/base/data_parallel.h" |
19 | | #include "lib/jxl/base/printf_macros.h" |
20 | | #include "lib/jxl/base/status.h" |
21 | | #include "lib/jxl/modular/modular_image.h" |
22 | | #include "lib/jxl/modular/transform/squeeze_params.h" |
23 | | #undef HWY_TARGET_INCLUDE |
24 | | #define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc" |
25 | | #include <hwy/foreach_target.h> |
26 | | #include <hwy/highway.h> |
27 | | |
28 | | #include "lib/jxl/simd_util-inl.h" |
29 | | |
30 | | HWY_BEFORE_NAMESPACE(); |
31 | | namespace jxl { |
32 | | namespace HWY_NAMESPACE { |
33 | | |
34 | | #if HWY_TARGET != HWY_SCALAR |
35 | | |
36 | | // These templates are not found via ADL. |
37 | | using hwy::HWY_NAMESPACE::Abs; |
38 | | using hwy::HWY_NAMESPACE::Add; |
39 | | using hwy::HWY_NAMESPACE::And; |
40 | | using hwy::HWY_NAMESPACE::DupEven; |
41 | | using hwy::HWY_NAMESPACE::DupOdd; |
42 | | using hwy::HWY_NAMESPACE::Gt; |
43 | | using hwy::HWY_NAMESPACE::IfThenElse; |
44 | | using hwy::HWY_NAMESPACE::IfThenZeroElse; |
45 | | using hwy::HWY_NAMESPACE::Lt; |
46 | | using hwy::HWY_NAMESPACE::MulEven; |
47 | | using hwy::HWY_NAMESPACE::MulOdd; |
48 | | using hwy::HWY_NAMESPACE::Ne; |
49 | | using hwy::HWY_NAMESPACE::Neg; |
50 | | using hwy::HWY_NAMESPACE::OddEven; |
51 | | using hwy::HWY_NAMESPACE::RebindToUnsigned; |
52 | | using hwy::HWY_NAMESPACE::ShiftLeft; |
53 | | using hwy::HWY_NAMESPACE::ShiftRight; |
54 | | using hwy::HWY_NAMESPACE::Sub; |
55 | | using hwy::HWY_NAMESPACE::Xor; |
56 | | |
57 | | using D = HWY_CAPPED(pixel_type, 8); |
58 | | using DU = RebindToUnsigned<D>; |
59 | | constexpr D d; |
60 | | constexpr DU du; |
61 | | |
62 | | JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual, |
63 | | const pixel_type *JXL_RESTRICT p_avg, |
64 | | const pixel_type *JXL_RESTRICT p_navg, |
65 | | const pixel_type *p_pout, |
66 | | pixel_type *JXL_RESTRICT p_out, |
67 | 331M | pixel_type *p_nout) { |
68 | 331M | const size_t N = Lanes(d); |
69 | 331M | auto onethird = Set(d, 0x55555556); |
70 | 694M | for (size_t x = 0; x < 8; x += N) { |
71 | 363M | auto avg = Load(d, p_avg + x); |
72 | 363M | auto next_avg = Load(d, p_navg + x); |
73 | 363M | auto top = Load(d, p_pout + x); |
74 | | // Equivalent to SmoothTendency(top,avg,next_avg), but without branches |
75 | | // typo:off |
76 | 363M | auto Ba = Sub(top, avg); |
77 | 363M | auto an = Sub(avg, next_avg); |
78 | 363M | auto nonmono = Xor(Ba, an); |
79 | 363M | auto absBa = Abs(Ba); |
80 | 363M | auto absan = Abs(an); |
81 | 363M | auto absBn = Abs(Sub(top, next_avg)); |
82 | | // Compute a3 = absBa / 3 |
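 | | // 0x55555556 == ceil(2^32 / 3), so the high 32 bits of each 32x32->64-bit
 | | // product below equal absBa / 3 (exact for non-negative 32-bit values);
 | | // the interleave afterwards just gathers those high halves.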
83 | 363M | auto a3eh = MulEven(absBa, onethird); |
84 | 363M | auto a3oh = MulOdd(absBa, onethird); |
85 | | |
86 | 363M | #if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2)) |
87 | 363M | #if HWY_IS_LITTLE_ENDIAN |
88 | 363M | auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
89 | | #else // not little endian |
90 | | auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh)); |
91 | | #endif // endianness |
92 | | #else // hwy < 1.2 |
93 | | #if HWY_IS_LITTLE_ENDIAN |
94 | | auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh))); |
95 | | #else // not little endian |
96 | | auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh)); |
97 | | #endif // endianness |
98 | | #endif // hwy version |
99 | | |
100 | 363M | a3 = Add(a3, Add(absBn, Set(d, 2))); |
101 | 363M | auto absdiff = ShiftRight<2>(a3); |
102 | 363M | auto skipdiff = Ne(Ba, Zero(d)); |
103 | 363M | skipdiff = And(skipdiff, Ne(an, Zero(d))); |
104 | 363M | skipdiff = And(skipdiff, Lt(nonmono, Zero(d))); |
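 | | // skipdiff marks lanes where top/avg/next_avg is not monotone; for those
 | | // lanes the tendency is forced to zero below, as in scalar SmoothTendency.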
105 | 363M | auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1))); |
106 | 363M | absdiff = IfThenElse(Gt(absdiff, absBa2), |
107 | 363M | Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff); |
108 | | // typo:on |
109 | 363M | auto absan2 = ShiftLeft<1>(absan); |
110 | 363M | absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2), |
111 | 363M | absan2, absdiff); |
112 | 363M | auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff); |
113 | 363M | auto tendency = IfThenZeroElse(skipdiff, diff1); |
114 | | |
115 | 363M | auto diff_minus_tendency = Load(d, p_residual + x); |
116 | 363M | auto diff = Add(diff_minus_tendency, tendency); |
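 | | // The next statement computes avg + diff / 2 with truncation toward zero,
 | | // matching the scalar path: the logical shift extracts diff's sign bit and
 | | // adding it before the arithmetic shift rounds negative values up.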
117 | 363M | auto out = |
118 | 363M | Add(avg, ShiftRight<1>( |
119 | 363M | Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff)))))); |
120 | 363M | Store(out, d, p_out + x); |
121 | 363M | Store(Sub(out, diff), d, p_nout + x); |
122 | 363M | } |
123 | 331M | }
124 | | |
125 | | #endif // HWY_TARGET != HWY_SCALAR |
126 | | |
127 | 361k | Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
128 | 361k | JXL_ENSURE(c < input.channel.size()); |
129 | 361k | JXL_ENSURE(rc < input.channel.size()); |
130 | 361k | Channel &chin = input.channel[c]; |
131 | 361k | const Channel &chin_residual = input.channel[rc]; |
132 | | // These must be valid since we ran MetaApply already. |
133 | 361k | JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2)); |
134 | 361k | JXL_ENSURE(chin.h == chin_residual.h); |
135 | 361k | JxlMemoryManager *memory_manager = input.memory_manager(); |
136 | | |
137 | 361k | if (chin_residual.w == 0) { |
138 | | // Short-circuit: output channel has same dimensions as input. |
139 | 9.93k | input.channel[c].hshift--; |
140 | 9.93k | return true; |
141 | 9.93k | } |
142 | | |
143 | | // Note: chin.w >= chin_residual.w and at most 1 different. |
144 | 703k | JXL_ASSIGN_OR_RETURN(Channel chout, |
145 | 703k | Channel::Create(memory_manager, chin.w + chin_residual.w, |
146 | 703k | chin.h, chin.hshift - 1, chin.vshift)); |
147 | 703k | JXL_DEBUG_V(4, |
148 | 703k | "Undoing horizontal squeeze of channel %i using residuals in " |
149 | 703k | "channel %i (going from width %" PRIuS " to %" PRIuS ")", |
150 | 703k | c, rc, chin.w, chout.w); |
151 | | |
152 | 703k | if (chin_residual.h == 0) { |
153 | | // Short-circuit: channel with no pixels. |
154 | 0 | input.channel[c] = std::move(chout); |
155 | 0 | return true; |
156 | 0 | } |
157 | 130M | auto unsqueeze_row = [&](size_t y, size_t x0) { |
158 | 130M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y); |
159 | 130M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y); |
160 | 130M | pixel_type *JXL_RESTRICT p_out = chout.Row(y); |
161 | 763M | for (size_t x = x0; x < chin_residual.w; x++) { |
162 | 633M | pixel_type_w diff_minus_tendency = p_residual[x]; |
163 | 633M | pixel_type_w avg = p_avg[x]; |
164 | 633M | pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg); |
165 | 633M | pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg); |
166 | 633M | pixel_type_w tendency = SmoothTendency(left, avg, next_avg); |
167 | 633M | pixel_type_w diff = diff_minus_tendency + tendency; |
168 | 633M | pixel_type_w A = avg + (diff / 2); |
169 | 633M | p_out[(x << 1)] = A; |
170 | 633M | pixel_type_w B = A - diff; |
171 | 633M | p_out[(x << 1) + 1] = B; |
172 | 633M | } |
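 | | // Odd output width: the last column has no residual pair, so copy the
 | | // final average value directly.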
173 | 130M | if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1]; |
174 | 130M | };
175 | | |
176 | | // somewhat complicated trickery just to be able to SIMD this. |
177 | | // Horizontal unsqueeze has horizontal data dependencies, so we do |
178 | | // 8 rows at a time and treat it as a vertical unsqueeze of a |
179 | | // transposed 8x8 block (or 9x8 for one input). |
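 | | // The ninth row of the transposed b_p_avg block is filled from column x+8
 | | // of p_avg, so the last row still has its next_avg neighbour available.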
180 | 351k | static constexpr const size_t kRowsPerThread = 8; |
181 | 351k | const auto unsqueeze_span = [&](const uint32_t task, |
182 | 16.4M | size_t /* thread */) -> Status { |
183 | 16.4M | const size_t y0 = task * kRowsPerThread; |
184 | 16.4M | const size_t rows = std::min(kRowsPerThread, chin.h - y0); |
185 | 16.4M | size_t x = 0; |
186 | | |
187 | 16.4M | #if HWY_TARGET != HWY_SCALAR |
188 | 16.4M | intptr_t onerow_in = chin.plane.PixelsPerRow(); |
189 | 16.4M | intptr_t onerow_inr = chin_residual.plane.PixelsPerRow(); |
190 | 16.4M | intptr_t onerow_out = chout.plane.PixelsPerRow(); |
191 | 16.4M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0); |
192 | 16.4M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0); |
193 | 16.4M | pixel_type *JXL_RESTRICT p_out = chout.Row(y0); |
194 | 16.4M | HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread]; |
195 | 16.4M | HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread]; |
196 | 16.4M | HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread]; |
197 | 16.4M | HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread]; |
198 | 16.4M | HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread]; |
199 | 16.4M | HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread]; |
200 | 16.4M | const size_t N = Lanes(d); |
201 | 16.4M | if (chin_residual.w > 16 && rows == kRowsPerThread) { |
202 | 19.9M | for (; x < chin_residual.w - 9; x += 8) { |
203 | 17.8M | Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr); |
204 | 17.8M | Transpose8x8Block(p_avg + x, b_p_avg, onerow_in); |
205 | 160M | for (size_t y = 0; y < kRowsPerThread; y++) { |
206 | 142M | b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y]; |
207 | 142M | } |
208 | 156M | for (size_t i = 0; i < 8; i++) { |
209 | 138M | FastUnsqueeze( |
210 | 138M | b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1), |
211 | 138M | (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i), |
212 | 138M | b_p_out_even + 8 * i, b_p_out_odd + 8 * i); |
213 | 138M | } |
214 | | |
215 | 17.8M | Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8); |
216 | 17.8M | Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8); |
217 | 159M | for (size_t y = 0; y < kRowsPerThread; y++) { |
218 | 301M | for (size_t i = 0; i < kRowsPerThread; i += N) { |
219 | 160M | auto even = Load(d, b_p_out_evenT + 8 * y + i); |
220 | 160M | auto odd = Load(d, b_p_out_oddT + 8 * y + i); |
221 | 160M | StoreInterleaved(d, even, odd, |
222 | 160M | p_out + ((x + i) << 1) + onerow_out * y); |
223 | 160M | } |
224 | 141M | } |
225 | 17.8M | } |
226 | 2.05M | } |
227 | 16.4M | #endif // HWY_TARGET != HWY_SCALAR |
228 | 146M | for (size_t y = 0; y < rows; y++) { |
229 | 130M | unsqueeze_row(y0 + y, x); |
230 | 130M | } |
231 | 16.4M | return true; |
232 | 16.4M | };
233 | 351k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread), |
234 | 351k | ThreadPool::NoInit, unsqueeze_span, |
235 | 351k | "InvHorizontalSqueeze")); |
236 | 351k | input.channel[c] = std::move(chout); |
237 | 351k | return true; |
238 | 351k | }
239 | | |
240 | 359k | Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) { |
241 | 359k | JXL_ENSURE(c < input.channel.size()); |
242 | 359k | JXL_ENSURE(rc < input.channel.size()); |
243 | 359k | const Channel &chin = input.channel[c]; |
244 | 359k | const Channel &chin_residual = input.channel[rc]; |
245 | | // These must be valid since we ran MetaApply already. |
246 | 359k | JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2)); |
247 | 359k | JXL_ENSURE(chin.w == chin_residual.w); |
248 | 359k | JxlMemoryManager *memory_manager = input.memory_manager(); |
249 | | |
250 | 359k | if (chin_residual.h == 0) { |
251 | | // Short-circuit: output channel has same dimensions as input. |
252 | 13.6k | input.channel[c].vshift--; |
253 | 13.6k | return true; |
254 | 13.6k | } |
255 | | |
256 | | // Note: chin.h >= chin_residual.h and at most 1 different. |
257 | 690k | JXL_ASSIGN_OR_RETURN( |
258 | 690k | Channel chout, |
259 | 690k | Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h, |
260 | 690k | chin.hshift, chin.vshift - 1)); |
261 | 690k | JXL_DEBUG_V( |
262 | 690k | 4, |
263 | 690k | "Undoing vertical squeeze of channel %i using residuals in channel " |
264 | 690k | "%i (going from height %" PRIuS " to %" PRIuS ")", |
265 | 690k | c, rc, chin.h, chout.h); |
266 | | |
267 | 690k | if (chin_residual.w == 0) { |
268 | | // Short-circuit: channel with no pixels. |
269 | 0 | input.channel[c] = std::move(chout); |
270 | 0 | return true; |
271 | 0 | } |
272 | | |
273 | 345k | static constexpr const int kColsPerThread = 64; |
274 | 345k | const auto unsqueeze_slice = [&](const uint32_t task, |
275 | 569k | size_t /* thread */) -> Status { |
276 | 569k | const size_t x0 = task * kColsPerThread; |
277 | 569k | const size_t x1 = |
278 | 569k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); |
279 | 569k | const size_t w = x1 - x0; |
280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is |
281 | | // always chin_residual.h. |
282 | 138M | for (size_t y = 0; y < chin_residual.h; y++) { |
283 | 138M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; |
284 | 138M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; |
285 | 138M | const pixel_type *JXL_RESTRICT p_navg = |
286 | 138M | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; |
287 | 138M | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; |
288 | 138M | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; |
289 | 138M | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; |
290 | 138M | size_t x = 0; |
291 | 138M | #if HWY_TARGET != HWY_SCALAR |
292 | 335M | for (; x + 7 < w; x += 8) { |
293 | 196M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, |
294 | 196M | p_out + x, p_nout + x); |
295 | 196M | } |
296 | 138M | #endif |
297 | 467M | for (; x < w; x++) { |
298 | 329M | pixel_type_w avg = p_avg[x]; |
299 | 329M | pixel_type_w next_avg = p_navg[x]; |
300 | 329M | pixel_type_w top = p_pout[x]; |
301 | 329M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); |
302 | 329M | pixel_type_w diff_minus_tendency = p_residual[x]; |
303 | 329M | pixel_type_w diff = diff_minus_tendency + tendency; |
304 | 329M | pixel_type_w out = avg + (diff / 2); |
305 | 329M | p_out[x] = out; |
306 | | // If the chin_residual.h == chin.h, the output has an even number |
307 | | // of rows so the next line is fine. Otherwise, this loop won't |
308 | | // write to the last output row which is handled separately. |
309 | 329M | p_nout[x] = out - diff; |
310 | 329M | } |
311 | 138M | } |
312 | 569k | return true; |
313 | 569k | };
squeeze.cc:jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 275 | 491k | size_t /* thread */) -> Status { | 276 | 491k | const size_t x0 = task * kColsPerThread; | 277 | 491k | const size_t x1 = | 278 | 491k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 279 | 491k | const size_t w = x1 - x0; | 280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 281 | | // always chin_residual.h. | 282 | 136M | for (size_t y = 0; y < chin_residual.h; y++) { | 283 | 135M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 284 | 135M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 285 | 135M | const pixel_type *JXL_RESTRICT p_navg = | 286 | 135M | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 287 | 135M | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 288 | 135M | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 289 | 135M | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 290 | 135M | size_t x = 0; | 291 | 135M | #if HWY_TARGET != HWY_SCALAR | 292 | 315M | for (; x + 7 < w; x += 8) { | 293 | 179M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 294 | 179M | p_out + x, p_nout + x); | 295 | 179M | } | 296 | 135M | #endif | 297 | 462M | for (; x < w; x++) { | 298 | 326M | pixel_type_w avg = p_avg[x]; | 299 | 326M | pixel_type_w next_avg = p_navg[x]; | 300 | 326M | pixel_type_w top = p_pout[x]; | 301 | 326M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 302 | 326M | pixel_type_w diff_minus_tendency = p_residual[x]; | 303 | 326M | pixel_type_w diff = diff_minus_tendency + tendency; | 304 | 326M | pixel_type_w out = avg + (diff / 2); | 305 | 326M | p_out[x] = out; | 306 | | // If the chin_residual.h == chin.h, the output has an even number | 307 | | // of rows so the next line is fine. Otherwise, this loop won't | 308 | | // write to the last output row which is handled separately. | 309 | 326M | p_nout[x] = out - diff; | 310 | 326M | } | 311 | 135M | } | 312 | 491k | return true; | 313 | 491k | }; |
squeeze.cc:jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const Line | Count | Source | 275 | 38.6k | size_t /* thread */) -> Status { | 276 | 38.6k | const size_t x0 = task * kColsPerThread; | 277 | 38.6k | const size_t x1 = | 278 | 38.6k | std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w); | 279 | 38.6k | const size_t w = x1 - x0; | 280 | | // We only iterate up to std::min(chin_residual.h, chin.h) which is | 281 | | // always chin_residual.h. | 282 | 1.37M | for (size_t y = 0; y < chin_residual.h; y++) { | 283 | 1.33M | const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0; | 284 | 1.33M | const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0; | 285 | 1.33M | const pixel_type *JXL_RESTRICT p_navg = | 286 | 1.33M | chin.Row(y + 1 < chin.h ? y + 1 : y) + x0; | 287 | 1.33M | pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0; | 288 | 1.33M | pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0; | 289 | 1.33M | const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg; | 290 | 1.33M | size_t x = 0; | 291 | 1.33M | #if HWY_TARGET != HWY_SCALAR | 292 | 10.7M | for (; x + 7 < w; x += 8) { | 293 | 9.44M | FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x, | 294 | 9.44M | p_out + x, p_nout + x); | 295 | 9.44M | } | 296 | 1.33M | #endif | 297 | 2.37M | for (; x < w; x++) { | 298 | 1.03M | pixel_type_w avg = p_avg[x]; | 299 | 1.03M | pixel_type_w next_avg = p_navg[x]; | 300 | 1.03M | pixel_type_w top = p_pout[x]; | 301 | 1.03M | pixel_type_w tendency = SmoothTendency(top, avg, next_avg); | 302 | 1.03M | pixel_type_w diff_minus_tendency = p_residual[x]; | 303 | 1.03M | pixel_type_w diff = diff_minus_tendency + tendency; | 304 | 1.03M | pixel_type_w out = avg + (diff / 2); | 305 | 1.03M | p_out[x] = out; | 306 | | // If the chin_residual.h == chin.h, the output has an even number | 307 | | // of rows so the next line is fine. Otherwise, this loop won't | 308 | | // write to the last output row which is handled separately. | 309 | 1.03M | p_nout[x] = out - diff; | 310 | 1.03M | } | 311 | 1.33M | } | 312 | 38.6k | return true; | 313 | 38.6k | }; |
|
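The scalar tail of the unsqueeze_slice lambda (source lines 297-310 above) turns one average-row value plus one residual-row value into two output rows. Below is a minimal standalone sketch of that per-pixel arithmetic; SmoothTendencyStub is a hypothetical stand-in for jxl's SmoothTendency predictor (defined earlier in this file and not reproduced here), so the printed values only illustrate how diff, out and nout relate, not the real prediction.

#include <cstdint>
#include <cstdio>

using pixel_type_w = int64_t;

// Hypothetical stand-in for jxl's SmoothTendency; the real predictor derives a
// gradient estimate from the pixel above, the current average and the next average.
static pixel_type_w SmoothTendencyStub(pixel_type_w /*top*/, pixel_type_w /*avg*/,
                                       pixel_type_w /*next_avg*/) {
  return 0;
}

int main() {
  // One pixel column: stored average, stored residual, and its vertical neighbours.
  pixel_type_w avg = 100, next_avg = 104, top = 96, residual = 3;
  pixel_type_w tendency = SmoothTendencyStub(top, avg, next_avg);
  pixel_type_w diff = residual + tendency;  // diff_minus_tendency + tendency
  pixel_type_w out = avg + diff / 2;        // goes to output row 2*y
  pixel_type_w nout = out - diff;           // goes to output row 2*y + 1
  std::printf("out=%lld nout=%lld out-nout=%lld\n",
              (long long)out, (long long)nout, (long long)(out - nout));
  return 0;
}

By construction out - nout equals diff again, which is why the pair of output rows carries exactly the information stored in the average and residual rows.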
314 | 345k | JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread), |
315 | 345k | ThreadPool::NoInit, unsqueeze_slice, |
316 | 345k | "InvVertSqueeze")); |
317 | | |
318 | 345k | if (chout.h & 1) { |
319 | 99.1k | size_t y = chin.h - 1; |
320 | 99.1k | const pixel_type *p_avg = chin.Row(y); |
321 | 99.1k | pixel_type *p_out = chout.Row(y << 1); |
322 | 8.55M | for (size_t x = 0; x < chin.w; x++) { |
323 | 8.45M | p_out[x] = p_avg[x]; |
324 | 8.45M | } |
325 | 99.1k | } |
326 | 345k | input.channel[c] = std::move(chout); |
327 | 345k | return true; |
328 | 345k | }
[Per-target instantiation expansions of InvVSqueeze (source lines 240-328), each repeating the rows above with target-specific counts: jxl::N_SSE4 entered 34.3k times, jxl::N_AVX2 292k, jxl::N_SSE2 32.0k.]
|
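For reference, the lambda above is run over column slices rather than rows: each task covers kColsPerThread columns, and RunOnPool is given DivCeil(chin.w, kColsPerThread) tasks (rows 273-279 and 314-316). A small sketch of that partitioning with a made-up width, using a plain loop in place of ThreadPool/RunOnPool:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  constexpr size_t kColsPerThread = 64;  // same constant as in the listing
  const size_t width = 150;              // hypothetical channel width
  const size_t tasks = (width + kColsPerThread - 1) / kColsPerThread;  // DivCeil
  for (size_t task = 0; task < tasks; ++task) {
    const size_t x0 = task * kColsPerThread;
    const size_t x1 = std::min((task + 1) * kColsPerThread, width);
    std::printf("task %zu covers columns [%zu, %zu), width %zu\n",
                task, x0, x1, x1 - x0);
  }
  return 0;
}

With width 150 this yields slices of 64, 64 and 22 columns; in this example only the last slice leaves work for the scalar tail loop after the 8-wide FastUnsqueeze iterations.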
329 | | |
330 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
331 | 44.0k | ThreadPool *pool) { |
332 | 243k | for (int i = parameters.size() - 1; i >= 0; i--) { |
333 | 199k | JXL_RETURN_IF_ERROR( |
334 | 199k | CheckMetaSqueezeParams(parameters[i], input.channel.size())); |
335 | 199k | bool horizontal = parameters[i].horizontal; |
336 | 199k | bool in_place = parameters[i].in_place; |
337 | 199k | uint32_t beginc = parameters[i].begin_c; |
338 | 199k | uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1; |
339 | 199k | uint32_t offset; |
340 | 199k | if (in_place) { |
341 | 128k | offset = endc + 1; |
342 | 128k | } else { |
343 | 71.1k | offset = input.channel.size() + beginc - endc - 1; |
344 | 71.1k | } |
345 | 199k | if (beginc < input.nb_meta_channels) { |
346 | | // This is checked in MetaSqueeze. |
347 | 299 | JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c); |
348 | 299 | input.nb_meta_channels -= parameters[i].num_c; |
349 | 299 | } |
350 | | |
351 | 920k | for (uint32_t c = beginc; c <= endc; c++) { |
352 | 720k | uint32_t rc = offset + c - beginc; |
353 | | // MetaApply should imply that `rc` is within range, otherwise there's a |
354 | | // programming bug. |
355 | 720k | JXL_ENSURE(rc < input.channel.size()); |
356 | 720k | if ((input.channel[c].w < input.channel[rc].w) || |
357 | 720k | (input.channel[c].h < input.channel[rc].h)) { |
358 | 0 | return JXL_FAILURE("Corrupted squeeze transform"); |
359 | 0 | } |
360 | 720k | if (horizontal) { |
361 | 361k | JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool)); |
362 | 361k | } else { |
363 | 359k | JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool)); |
364 | 359k | } |
365 | 720k | } |
366 | 199k | input.channel.erase(input.channel.begin() + offset, |
367 | 199k | input.channel.begin() + offset + (endc - beginc + 1)); |
368 | 199k | } |
369 | 44.0k | return true; |
370 | 44.0k | }
[Per-target instantiation expansions of InvSqueeze (source lines 331-370), each repeating the rows above with target-specific counts: jxl::N_SSE4 entered 5.24k times, jxl::N_AVX2 33.0k, jxl::N_SSE2 5.74k.]
|
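The residual channel for channel c is located at rc = offset + c - beginc (rows 351-352 above): with in_place residuals the offset points just past endc, otherwise it points at the block of residuals appended at the end of the channel list. A tiny sketch of both layouts for a hypothetical image with three original channels, two of which (channels 0-1) were squeezed, giving five channels in total:

#include <cstdio>

int main() {
  const unsigned begin_c = 0, num_c = 2;          // hypothetical squeeze range
  const unsigned endc = begin_c + num_c - 1;
  const unsigned channel_count = 5;               // 3 originals + 2 residuals

  // in_place: residuals were inserted right after the squeezed channels.
  const unsigned offset_in_place = endc + 1;
  // not in_place: residuals were appended, so the block starts num_c from the end.
  const unsigned offset_appended = channel_count + begin_c - endc - 1;

  for (unsigned c = begin_c; c <= endc; ++c) {
    std::printf("c=%u -> rc=%u (in_place) / rc=%u (appended)\n", c,
                offset_in_place + c - begin_c, offset_appended + c - begin_c);
  }
  return 0;
}

Here the in_place layout pairs channel 0 with residual channel 2 and channel 1 with residual channel 3, while the appended layout pairs them with residual channels 3 and 4.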
371 | | |
372 | | } // namespace HWY_NAMESPACE |
373 | | } // namespace jxl |
374 | | HWY_AFTER_NAMESPACE(); |
375 | | |
376 | | #if HWY_ONCE |
377 | | |
378 | | namespace jxl { |
379 | | |
380 | | HWY_EXPORT(InvSqueeze); |
381 | | Status InvSqueeze(Image &input, const std::vector<SqueezeParams> ¶meters, |
382 | 44.0k | ThreadPool *pool) { |
383 | 44.0k | return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool); |
384 | 44.0k | } |
385 | | |
386 | | void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters, |
387 | 43.4k | const Image &image) { |
388 | 43.4k | int nb_channels = image.channel.size() - image.nb_meta_channels; |
389 | | |
390 | 43.4k | parameters->clear(); |
391 | 43.4k | size_t w = image.channel[image.nb_meta_channels].w; |
392 | 43.4k | size_t h = image.channel[image.nb_meta_channels].h; |
393 | 43.4k | JXL_DEBUG_V( |
394 | 43.4k | 7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h); |
395 | | |
396 | | // do horizontal first on wide images; vertical first on tall images |
397 | 43.4k | bool wide = (w > h); |
398 | | |
399 | 43.4k | if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w && |
400 | 43.4k | image.channel[image.nb_meta_channels + 1].h == h) { |
401 | | // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0 |
402 | | // previews |
403 | 30.3k | JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h); |
404 | 30.3k | SqueezeParams params; |
405 | | // horizontal chroma squeeze |
406 | 30.3k | params.horizontal = true; |
407 | 30.3k | params.in_place = false; |
408 | 30.3k | params.begin_c = image.nb_meta_channels + 1; |
409 | 30.3k | params.num_c = 2; |
410 | 30.3k | parameters->push_back(params); |
411 | 30.3k | params.horizontal = false; |
412 | | // vertical chroma squeeze |
413 | 30.3k | parameters->push_back(params); |
414 | 30.3k | } |
415 | 43.4k | SqueezeParams params; |
416 | 43.4k | params.begin_c = image.nb_meta_channels; |
417 | 43.4k | params.num_c = nb_channels; |
418 | 43.4k | params.in_place = true; |
419 | | |
420 | 43.4k | if (!wide) { |
421 | 21.1k | if (h > kMaxFirstPreviewSize) { |
422 | 8.94k | params.horizontal = false; |
423 | 8.94k | parameters->push_back(params); |
424 | 8.94k | h = (h + 1) / 2; |
425 | 8.94k | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
426 | 8.94k | } |
427 | 21.1k | } |
428 | 119k | while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) { |
429 | 76.3k | if (w > kMaxFirstPreviewSize) { |
430 | 69.9k | params.horizontal = true; |
431 | 69.9k | parameters->push_back(params); |
432 | 69.9k | w = (w + 1) / 2; |
433 | 69.9k | JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h); |
434 | 69.9k | } |
435 | 76.3k | if (h > kMaxFirstPreviewSize) { |
436 | 51.7k | params.horizontal = false; |
437 | 51.7k | parameters->push_back(params); |
438 | 51.7k | h = (h + 1) / 2; |
439 | 51.7k | JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h); |
440 | 51.7k | } |
441 | 76.3k | } |
442 | 43.4k | JXL_DEBUG_V(7, "that's it"); |
443 | 43.4k | } |
444 | | |
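DefaultSqueezeParameters keeps halving whichever dimension is still larger than kMaxFirstPreviewSize, starting with the wider one (rows 420-441 above). The sketch below replays that schedule for a hypothetical 1000x600 image; the value 8 used for kMaxFirstPreviewSize is an assumption for illustration only (the real constant is declared in squeeze.h and not shown in this listing).

#include <cstddef>
#include <cstdio>

int main() {
  const size_t kMaxFirstPreviewSize = 8;  // assumed value, for illustration only
  size_t w = 1000, h = 600;               // hypothetical image dimensions
  int horizontal = 0, vertical = 0;
  const bool wide = (w > h);
  if (!wide && h > kMaxFirstPreviewSize) {  // tall images get one vertical squeeze first
    h = (h + 1) / 2;
    ++vertical;
  }
  while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) {
    if (w > kMaxFirstPreviewSize) { w = (w + 1) / 2; ++horizontal; }
    if (h > kMaxFirstPreviewSize) { h = (h + 1) / 2; ++vertical; }
  }
  std::printf("%d horizontal + %d vertical squeezes -> %zux%zu first preview\n",
              horizontal, vertical, w, h);
  return 0;
}

This mirrors only the alternating tail of the function; the optional 4:2:0 chroma squeezes added at rows 399-414 are left out of the sketch.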
445 | | Status CheckMetaSqueezeParams(const SqueezeParams ¶meter, |
446 | 409k | int num_channels) { |
447 | 409k | int c1 = parameter.begin_c; |
448 | 409k | int c2 = parameter.begin_c + parameter.num_c - 1; |
449 | 409k | if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) { |
450 | 120 | return JXL_FAILURE("Invalid channel range"); |
451 | 120 | } |
452 | 409k | return true; |
453 | 409k | } |
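For instance, with four channels, begin_c = 3 and num_c = 2 is rejected because c2 = 4 falls outside the valid index range 0..3.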
454 | | |
455 | 46.7k | Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) { |
456 | 46.7k | JxlMemoryManager *memory_manager = image.memory_manager(); |
457 | 46.7k | if (parameters->empty()) { |
458 | 43.4k | DefaultSqueezeParameters(parameters, image); |
459 | 43.4k | } |
460 | | |
461 | 209k | for (auto ¶meter : *parameters) { |
462 | 209k | JXL_RETURN_IF_ERROR( |
463 | 209k | CheckMetaSqueezeParams(parameter, image.channel.size())); |
464 | 209k | bool horizontal = parameter.horizontal; |
465 | 209k | bool in_place = parameter.in_place; |
466 | 209k | uint32_t beginc = parameter.begin_c; |
467 | 209k | uint32_t endc = parameter.begin_c + parameter.num_c - 1; |
468 | | |
469 | 209k | uint32_t offset; |
470 | 209k | if (beginc < image.nb_meta_channels) { |
471 | 424 | if (endc >= image.nb_meta_channels) { |
472 | 7 | return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels"); |
473 | 7 | } |
474 | 417 | if (!in_place) { |
475 | 7 | return JXL_FAILURE( |
476 | 7 | "Invalid squeeze: meta channels require in-place residuals"); |
477 | 7 | } |
478 | 410 | image.nb_meta_channels += parameter.num_c; |
479 | 410 | } |
480 | 209k | if (in_place) { |
481 | 134k | offset = endc + 1; |
482 | 134k | } else { |
483 | 75.1k | offset = image.channel.size(); |
484 | 75.1k | } |
485 | 1.06M | for (uint32_t c = beginc; c <= endc; c++) { |
486 | 852k | if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) { |
487 | 12 | return JXL_FAILURE("Too many squeezes: shift > 30"); |
488 | 12 | } |
489 | 852k | size_t w = image.channel[c].w; |
490 | 852k | size_t h = image.channel[c].h; |
491 | 852k | if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel"); |
492 | 852k | if (horizontal) { |
493 | 460k | image.channel[c].w = (w + 1) / 2; |
494 | 460k | if (image.channel[c].hshift >= 0) image.channel[c].hshift++; |
495 | 460k | w = w - (w + 1) / 2; |
496 | 460k | } else { |
497 | 391k | image.channel[c].h = (h + 1) / 2; |
498 | 391k | if (image.channel[c].vshift >= 0) image.channel[c].vshift++; |
499 | 391k | h = h - (h + 1) / 2; |
500 | 391k | } |
501 | 852k | JXL_RETURN_IF_ERROR(image.channel[c].shrink()); |
502 | 1.70M | JXL_ASSIGN_OR_RETURN(Channel placeholder, |
503 | 1.70M | Channel::Create(memory_manager, w, h)); |
504 | 1.70M | placeholder.hshift = image.channel[c].hshift; |
505 | 1.70M | placeholder.vshift = image.channel[c].vshift; |
506 | | |
507 | 1.70M | image.channel.insert(image.channel.begin() + offset + (c - beginc), |
508 | 1.70M | std::move(placeholder)); |
509 | 1.70M | JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s", |
510 | 1.70M | image.DebugString().c_str()); |
511 | 1.70M | } |
512 | 209k | } |
513 | 46.6k | return true; |
514 | 46.7k | } |
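MetaSqueeze itself moves no pixels; it shrinks the squeezed channel to (w + 1) / 2 samples along the squeezed dimension and inserts a placeholder residual channel holding the remaining w - (w + 1) / 2 (rows 489-508 above). A quick sketch of that split for a few hypothetical widths:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t widths[] = {1, 7, 8, 1023};  // hypothetical channel widths
  for (size_t w : widths) {
    const size_t squeezed = (w + 1) / 2;    // new width of the squeezed channel
    const size_t residual = w - squeezed;   // width of the placeholder residual channel
    std::printf("w=%4zu -> squeezed=%4zu residual=%4zu\n", w, squeezed, residual);
  }
  return 0;
}

A height-1 channel squeezed vertically therefore produces a residual with zero rows, which is exactly the case the chin_residual.h == 0 short-circuit in InvVSqueeze handles.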
515 | | |
516 | | } // namespace jxl |
517 | | |
518 | | #endif |