Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/modular/transform/squeeze.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/modular/transform/squeeze.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cstddef>
12
#include <cstdint>
13
#include <cstdlib>
14
#include <utility>
15
#include <vector>
16
17
#include "lib/jxl/base/common.h"
18
#include "lib/jxl/base/compiler_specific.h"
19
#include "lib/jxl/base/data_parallel.h"
20
#include "lib/jxl/base/printf_macros.h"
21
#include "lib/jxl/base/status.h"
22
#include "lib/jxl/modular/modular_image.h"
23
#include "lib/jxl/modular/transform/squeeze_params.h"
24
#undef HWY_TARGET_INCLUDE
25
#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc"
26
#include <hwy/foreach_target.h>
27
#include <hwy/highway.h>
28
29
#include "lib/jxl/simd_util-inl.h"
30
31
HWY_BEFORE_NAMESPACE();
32
namespace jxl {
33
namespace HWY_NAMESPACE {
34
35
#if HWY_TARGET != HWY_SCALAR
36
37
// These templates are not found via ADL.
38
using hwy::HWY_NAMESPACE::Abs;
39
using hwy::HWY_NAMESPACE::Add;
40
using hwy::HWY_NAMESPACE::And;
41
using hwy::HWY_NAMESPACE::DupEven;
42
using hwy::HWY_NAMESPACE::DupOdd;
43
using hwy::HWY_NAMESPACE::Gt;
44
using hwy::HWY_NAMESPACE::IfThenElse;
45
using hwy::HWY_NAMESPACE::IfThenZeroElse;
46
using hwy::HWY_NAMESPACE::Lt;
47
using hwy::HWY_NAMESPACE::MulEven;
48
using hwy::HWY_NAMESPACE::MulOdd;
49
using hwy::HWY_NAMESPACE::Ne;
50
using hwy::HWY_NAMESPACE::Neg;
51
using hwy::HWY_NAMESPACE::OddEven;
52
using hwy::HWY_NAMESPACE::RebindToUnsigned;
53
using hwy::HWY_NAMESPACE::ShiftLeft;
54
using hwy::HWY_NAMESPACE::ShiftRight;
55
using hwy::HWY_NAMESPACE::Sub;
56
using hwy::HWY_NAMESPACE::Xor;
57
58
using D = HWY_CAPPED(pixel_type, 8);
59
using DU = RebindToUnsigned<D>;
60
constexpr D d;
61
constexpr DU du;
62
63
JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual,
64
                              const pixel_type *JXL_RESTRICT p_avg,
65
                              const pixel_type *JXL_RESTRICT p_navg,
66
                              const pixel_type *p_pout,
67
                              pixel_type *JXL_RESTRICT p_out,
68
37.3M
                              pixel_type *p_nout) {
69
37.3M
  const size_t N = Lanes(d);
70
37.3M
  auto onethird = Set(d, 0x55555556);
71
74.6M
  for (size_t x = 0; x < 8; x += N) {
72
37.3M
    auto avg = Load(d, p_avg + x);
73
37.3M
    auto next_avg = Load(d, p_navg + x);
74
37.3M
    auto top = Load(d, p_pout + x);
75
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
76
    // typo:off
77
37.3M
    auto Ba = Sub(top, avg);
78
37.3M
    auto an = Sub(avg, next_avg);
79
37.3M
    auto nonmono = Xor(Ba, an);
80
37.3M
    auto absBa = Abs(Ba);
81
37.3M
    auto absan = Abs(an);
82
37.3M
    auto absBn = Abs(Sub(top, next_avg));
83
    // Compute a3 = absBa / 3
84
37.3M
    auto a3eh = MulEven(absBa, onethird);
85
37.3M
    auto a3oh = MulOdd(absBa, onethird);
86
87
37.3M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
88
37.3M
#if HWY_IS_LITTLE_ENDIAN
89
37.3M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
90
#else  // not little endian
91
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
92
#endif  // endianness
93
#else  // hwy < 1.2
94
#if HWY_IS_LITTLE_ENDIAN
95
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
96
#else  // not little endian
97
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
98
#endif  // endianness
99
#endif  // hwy version
100
101
37.3M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
102
37.3M
    auto absdiff = ShiftRight<2>(a3);
103
37.3M
    auto skipdiff = Ne(Ba, Zero(d));
104
37.3M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
105
37.3M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
106
37.3M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
107
37.3M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
108
37.3M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
109
    // typo:on
110
37.3M
    auto absan2 = ShiftLeft<1>(absan);
111
37.3M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
112
37.3M
                         absan2, absdiff);
113
37.3M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
114
37.3M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
115
116
37.3M
    auto diff_minus_tendency = Load(d, p_residual + x);
117
37.3M
    auto diff = Add(diff_minus_tendency, tendency);
118
37.3M
    auto out =
119
37.3M
        Add(avg, ShiftRight<1>(
120
37.3M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
121
37.3M
    Store(out, d, p_out + x);
122
37.3M
    Store(Sub(out, diff), d, p_nout + x);
123
37.3M
  }
124
37.3M
}
Unexecuted instantiation: jxl::N_SSE4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
jxl::N_AVX2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Line
Count
Source
68
37.3M
                              pixel_type *p_nout) {
69
37.3M
  const size_t N = Lanes(d);
70
37.3M
  auto onethird = Set(d, 0x55555556);
71
74.6M
  for (size_t x = 0; x < 8; x += N) {
72
37.3M
    auto avg = Load(d, p_avg + x);
73
37.3M
    auto next_avg = Load(d, p_navg + x);
74
37.3M
    auto top = Load(d, p_pout + x);
75
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
76
    // typo:off
77
37.3M
    auto Ba = Sub(top, avg);
78
37.3M
    auto an = Sub(avg, next_avg);
79
37.3M
    auto nonmono = Xor(Ba, an);
80
37.3M
    auto absBa = Abs(Ba);
81
37.3M
    auto absan = Abs(an);
82
37.3M
    auto absBn = Abs(Sub(top, next_avg));
83
    // Compute a3 = absBa / 3
84
37.3M
    auto a3eh = MulEven(absBa, onethird);
85
37.3M
    auto a3oh = MulOdd(absBa, onethird);
86
87
37.3M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
88
37.3M
#if HWY_IS_LITTLE_ENDIAN
89
37.3M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
90
#else  // not little endian
91
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
92
#endif  // endianness
93
#else  // hwy < 1.2
94
#if HWY_IS_LITTLE_ENDIAN
95
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
96
#else  // not little endian
97
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
98
#endif  // endianness
99
#endif  // hwy version
100
101
37.3M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
102
37.3M
    auto absdiff = ShiftRight<2>(a3);
103
37.3M
    auto skipdiff = Ne(Ba, Zero(d));
104
37.3M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
105
37.3M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
106
37.3M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
107
37.3M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
108
37.3M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
109
    // typo:on
110
37.3M
    auto absan2 = ShiftLeft<1>(absan);
111
37.3M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
112
37.3M
                         absan2, absdiff);
113
37.3M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
114
37.3M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
115
116
37.3M
    auto diff_minus_tendency = Load(d, p_residual + x);
117
37.3M
    auto diff = Add(diff_minus_tendency, tendency);
118
37.3M
    auto out =
119
37.3M
        Add(avg, ShiftRight<1>(
120
37.3M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
121
37.3M
    Store(out, d, p_out + x);
122
37.3M
    Store(Sub(out, diff), d, p_nout + x);
123
37.3M
  }
124
37.3M
}
Unexecuted instantiation: jxl::N_AVX3::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Unexecuted instantiation: jxl::N_AVX3_SPR::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Unexecuted instantiation: jxl::N_SSE2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
125
126
#endif  // HWY_TARGET != HWY_SCALAR
127
128
161k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
129
161k
  JXL_ENSURE(c < input.channel.size());
130
161k
  JXL_ENSURE(rc < input.channel.size());
131
161k
  Channel &chin = input.channel[c];
132
161k
  const Channel &chin_residual = input.channel[rc];
133
  // These must be valid since we ran MetaApply already.
134
161k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
135
161k
  JXL_ENSURE(chin.h == chin_residual.h);
136
161k
  JxlMemoryManager *memory_manager = input.memory_manager();
137
138
161k
  if (chin_residual.w == 0) {
139
    // Short-circuit: output channel has same dimensions as input.
140
6.82k
    input.channel[c].hshift--;
141
6.82k
    return true;
142
6.82k
  }
143
144
  // Note: chin.w >= chin_residual.w and at most 1 different.
145
310k
  JXL_ASSIGN_OR_RETURN(Channel chout,
146
310k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
147
310k
                                       chin.h, chin.hshift - 1, chin.vshift));
148
310k
  JXL_DEBUG_V(4,
149
310k
              "Undoing horizontal squeeze of channel %i using residuals in "
150
310k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
151
310k
              c, rc, chin.w, chout.w);
152
153
310k
  if (chin_residual.h == 0) {
154
    // Short-circuit: channel with no pixels.
155
0
    input.channel[c] = std::move(chout);
156
0
    return true;
157
0
  }
158
5.50M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
159
5.50M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
160
5.50M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
161
5.50M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
162
46.4M
    for (size_t x = x0; x < chin_residual.w; x++) {
163
40.9M
      pixel_type_w diff_minus_tendency = p_residual[x];
164
40.9M
      pixel_type_w avg = p_avg[x];
165
40.9M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
166
40.9M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
167
40.9M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
168
40.9M
      pixel_type_w diff = diff_minus_tendency + tendency;
169
40.9M
      pixel_type_w A = avg + (diff / 2);
170
40.9M
      p_out[(x << 1)] = A;
171
40.9M
      pixel_type_w B = A - diff;
172
40.9M
      p_out[(x << 1) + 1] = B;
173
40.9M
    }
174
5.50M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
175
5.50M
  };
Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Line
Count
Source
158
5.50M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
159
5.50M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
160
5.50M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
161
5.50M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
162
46.4M
    for (size_t x = x0; x < chin_residual.w; x++) {
163
40.9M
      pixel_type_w diff_minus_tendency = p_residual[x];
164
40.9M
      pixel_type_w avg = p_avg[x];
165
40.9M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
166
40.9M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
167
40.9M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
168
40.9M
      pixel_type_w diff = diff_minus_tendency + tendency;
169
40.9M
      pixel_type_w A = avg + (diff / 2);
170
40.9M
      p_out[(x << 1)] = A;
171
40.9M
      pixel_type_w B = A - diff;
172
40.9M
      p_out[(x << 1) + 1] = B;
173
40.9M
    }
174
5.50M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
175
5.50M
  };
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
176
177
  // somewhat complicated trickery just to be able to SIMD this.
178
  // Horizontal unsqueeze has horizontal data dependencies, so we do
179
  // 8 rows at a time and treat it as a vertical unsqueeze of a
180
  // transposed 8x8 block (or 9x8 for one input).
181
155k
  static constexpr const size_t kRowsPerThread = 8;
182
155k
  const auto unsqueeze_span = [&](const uint32_t task,
183
740k
                                  size_t /* thread */) -> Status {
184
740k
    const size_t y0 = task * kRowsPerThread;
185
740k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
186
740k
    size_t x = 0;
187
188
740k
#if HWY_TARGET != HWY_SCALAR
189
740k
    ptrdiff_t onerow_in = chin.plane.PixelsPerRow();
190
740k
    ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow();
191
740k
    ptrdiff_t onerow_out = chout.plane.PixelsPerRow();
192
740k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
193
740k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
194
740k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
195
740k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
196
740k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
197
740k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
198
740k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
199
740k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
200
740k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
201
740k
    const size_t N = Lanes(d);
202
740k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
203
2.73M
      for (; x < chin_residual.w - 9; x += 8) {
204
2.31M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
205
2.31M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
206
20.8M
        for (size_t y = 0; y < kRowsPerThread; y++) {
207
18.4M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
208
18.4M
        }
209
20.8M
        for (size_t i = 0; i < 8; i++) {
210
18.4M
          FastUnsqueeze(
211
18.4M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
212
18.4M
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
213
18.4M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
214
18.4M
        }
215
216
2.31M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
217
2.31M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
218
20.8M
        for (size_t y = 0; y < kRowsPerThread; y++) {
219
36.9M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
220
18.4M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
221
18.4M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
222
18.4M
            StoreInterleaved(d, even, odd,
223
18.4M
                             p_out + ((x + i) << 1) + onerow_out * y);
224
18.4M
          }
225
18.4M
        }
226
2.31M
      }
227
424k
    }
228
740k
#endif  // HWY_TARGET != HWY_SCALAR
229
6.24M
    for (size_t y = 0; y < rows; y++) {
230
5.50M
      unsqueeze_row(y0 + y, x);
231
5.50M
    }
232
740k
    return true;
233
740k
  };
Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Line
Count
Source
183
740k
                                  size_t /* thread */) -> Status {
184
740k
    const size_t y0 = task * kRowsPerThread;
185
740k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
186
740k
    size_t x = 0;
187
188
740k
#if HWY_TARGET != HWY_SCALAR
189
740k
    ptrdiff_t onerow_in = chin.plane.PixelsPerRow();
190
740k
    ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow();
191
740k
    ptrdiff_t onerow_out = chout.plane.PixelsPerRow();
192
740k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
193
740k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
194
740k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
195
740k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
196
740k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
197
740k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
198
740k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
199
740k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
200
740k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
201
740k
    const size_t N = Lanes(d);
202
740k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
203
2.73M
      for (; x < chin_residual.w - 9; x += 8) {
204
2.31M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
205
2.31M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
206
20.8M
        for (size_t y = 0; y < kRowsPerThread; y++) {
207
18.4M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
208
18.4M
        }
209
20.8M
        for (size_t i = 0; i < 8; i++) {
210
18.4M
          FastUnsqueeze(
211
18.4M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
212
18.4M
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
213
18.4M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
214
18.4M
        }
215
216
2.31M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
217
2.31M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
218
20.8M
        for (size_t y = 0; y < kRowsPerThread; y++) {
219
36.9M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
220
18.4M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
221
18.4M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
222
18.4M
            StoreInterleaved(d, even, odd,
223
18.4M
                             p_out + ((x + i) << 1) + onerow_out * y);
224
18.4M
          }
225
18.4M
        }
226
2.31M
      }
227
424k
    }
228
740k
#endif  // HWY_TARGET != HWY_SCALAR
229
6.24M
    for (size_t y = 0; y < rows; y++) {
230
5.50M
      unsqueeze_row(y0 + y, x);
231
5.50M
    }
232
740k
    return true;
233
740k
  };
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
234
155k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
235
155k
                                ThreadPool::NoInit, unsqueeze_span,
236
155k
                                "InvHorizontalSqueeze"));
237
155k
  input.channel[c] = std::move(chout);
238
155k
  return true;
239
155k
}
Unexecuted instantiation: jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
128
161k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
129
161k
  JXL_ENSURE(c < input.channel.size());
130
161k
  JXL_ENSURE(rc < input.channel.size());
131
161k
  Channel &chin = input.channel[c];
132
161k
  const Channel &chin_residual = input.channel[rc];
133
  // These must be valid since we ran MetaApply already.
134
161k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
135
161k
  JXL_ENSURE(chin.h == chin_residual.h);
136
161k
  JxlMemoryManager *memory_manager = input.memory_manager();
137
138
161k
  if (chin_residual.w == 0) {
139
    // Short-circuit: output channel has same dimensions as input.
140
6.82k
    input.channel[c].hshift--;
141
6.82k
    return true;
142
6.82k
  }
143
144
  // Note: chin.w >= chin_residual.w and at most 1 different.
145
310k
  JXL_ASSIGN_OR_RETURN(Channel chout,
146
310k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
147
310k
                                       chin.h, chin.hshift - 1, chin.vshift));
148
310k
  JXL_DEBUG_V(4,
149
310k
              "Undoing horizontal squeeze of channel %i using residuals in "
150
310k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
151
310k
              c, rc, chin.w, chout.w);
152
153
310k
  if (chin_residual.h == 0) {
154
    // Short-circuit: channel with no pixels.
155
0
    input.channel[c] = std::move(chout);
156
0
    return true;
157
0
  }
158
155k
  auto unsqueeze_row = [&](size_t y, size_t x0) {
159
155k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
160
155k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
161
155k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
162
155k
    for (size_t x = x0; x < chin_residual.w; x++) {
163
155k
      pixel_type_w diff_minus_tendency = p_residual[x];
164
155k
      pixel_type_w avg = p_avg[x];
165
155k
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
166
155k
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
167
155k
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
168
155k
      pixel_type_w diff = diff_minus_tendency + tendency;
169
155k
      pixel_type_w A = avg + (diff / 2);
170
155k
      p_out[(x << 1)] = A;
171
155k
      pixel_type_w B = A - diff;
172
155k
      p_out[(x << 1) + 1] = B;
173
155k
    }
174
155k
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
175
155k
  };
176
177
  // somewhat complicated trickery just to be able to SIMD this.
178
  // Horizontal unsqueeze has horizontal data dependencies, so we do
179
  // 8 rows at a time and treat it as a vertical unsqueeze of a
180
  // transposed 8x8 block (or 9x8 for one input).
181
155k
  static constexpr const size_t kRowsPerThread = 8;
182
155k
  const auto unsqueeze_span = [&](const uint32_t task,
183
155k
                                  size_t /* thread */) -> Status {
184
155k
    const size_t y0 = task * kRowsPerThread;
185
155k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
186
155k
    size_t x = 0;
187
188
155k
#if HWY_TARGET != HWY_SCALAR
189
155k
    ptrdiff_t onerow_in = chin.plane.PixelsPerRow();
190
155k
    ptrdiff_t onerow_inr = chin_residual.plane.PixelsPerRow();
191
155k
    ptrdiff_t onerow_out = chout.plane.PixelsPerRow();
192
155k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
193
155k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
194
155k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
195
155k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
196
155k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
197
155k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
198
155k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
199
155k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
200
155k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
201
155k
    const size_t N = Lanes(d);
202
155k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
203
155k
      for (; x < chin_residual.w - 9; x += 8) {
204
155k
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
205
155k
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
206
155k
        for (size_t y = 0; y < kRowsPerThread; y++) {
207
155k
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
208
155k
        }
209
155k
        for (size_t i = 0; i < 8; i++) {
210
155k
          FastUnsqueeze(
211
155k
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
212
155k
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
213
155k
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
214
155k
        }
215
216
155k
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
217
155k
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
218
155k
        for (size_t y = 0; y < kRowsPerThread; y++) {
219
155k
          for (size_t i = 0; i < kRowsPerThread; i += N) {
220
155k
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
221
155k
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
222
155k
            StoreInterleaved(d, even, odd,
223
155k
                             p_out + ((x + i) << 1) + onerow_out * y);
224
155k
          }
225
155k
        }
226
155k
      }
227
155k
    }
228
155k
#endif  // HWY_TARGET != HWY_SCALAR
229
155k
    for (size_t y = 0; y < rows; y++) {
230
155k
      unsqueeze_row(y0 + y, x);
231
155k
    }
232
155k
    return true;
233
155k
  };
234
155k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
235
155k
                                ThreadPool::NoInit, unsqueeze_span,
236
155k
                                "InvHorizontalSqueeze"));
237
155k
  input.channel[c] = std::move(chout);
238
155k
  return true;
239
155k
}
Unexecuted instantiation: jxl::N_AVX3::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_SPR::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
240
241
206k
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
242
206k
  JXL_ENSURE(c < input.channel.size());
243
206k
  JXL_ENSURE(rc < input.channel.size());
244
206k
  const Channel &chin = input.channel[c];
245
206k
  const Channel &chin_residual = input.channel[rc];
246
  // These must be valid since we ran MetaApply already.
247
206k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
248
206k
  JXL_ENSURE(chin.w == chin_residual.w);
249
206k
  JxlMemoryManager *memory_manager = input.memory_manager();
250
251
206k
  if (chin_residual.h == 0) {
252
    // Short-circuit: output channel has same dimensions as input.
253
25.4k
    input.channel[c].vshift--;
254
25.4k
    return true;
255
25.4k
  }
256
257
  // Note: chin.h >= chin_residual.h and at most 1 different.
258
362k
  JXL_ASSIGN_OR_RETURN(
259
362k
      Channel chout,
260
362k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
261
362k
                      chin.hshift, chin.vshift - 1));
262
362k
  JXL_DEBUG_V(
263
362k
      4,
264
362k
      "Undoing vertical squeeze of channel %i using residuals in channel "
265
362k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
266
362k
      c, rc, chin.h, chout.h);
267
268
362k
  if (chin_residual.w == 0) {
269
    // Short-circuit: channel with no pixels.
270
0
    input.channel[c] = std::move(chout);
271
0
    return true;
272
0
  }
273
274
181k
  static constexpr const int kColsPerThread = 64;
275
181k
  const auto unsqueeze_slice = [&](const uint32_t task,
276
192k
                                   size_t /* thread */) -> Status {
277
192k
    const size_t x0 = task * kColsPerThread;
278
192k
    const size_t x1 =
279
192k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
280
192k
    const size_t w = x1 - x0;
281
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
282
    // always chin_residual.h.
283
4.91M
    for (size_t y = 0; y < chin_residual.h; y++) {
284
4.72M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
285
4.72M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
286
4.72M
      const pixel_type *JXL_RESTRICT p_navg =
287
4.72M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
288
4.72M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
289
4.72M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
290
4.72M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
291
4.72M
      size_t x = 0;
292
4.72M
#if HWY_TARGET != HWY_SCALAR
293
23.5M
      for (; x + 7 < w; x += 8) {
294
18.8M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
295
18.8M
                      p_out + x, p_nout + x);
296
18.8M
      }
297
4.72M
#endif
298
16.1M
      for (; x < w; x++) {
299
11.4M
        pixel_type_w avg = p_avg[x];
300
11.4M
        pixel_type_w next_avg = p_navg[x];
301
11.4M
        pixel_type_w top = p_pout[x];
302
11.4M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
303
11.4M
        pixel_type_w diff_minus_tendency = p_residual[x];
304
11.4M
        pixel_type_w diff = diff_minus_tendency + tendency;
305
11.4M
        pixel_type_w out = avg + (diff / 2);
306
11.4M
        p_out[x] = out;
307
        // If the chin_residual.h == chin.h, the output has an even number
308
        // of rows so the next line is fine. Otherwise, this loop won't
309
        // write to the last output row which is handled separately.
310
11.4M
        p_nout[x] = out - diff;
311
11.4M
      }
312
4.72M
    }
313
192k
    return true;
314
192k
  };
Unexecuted instantiation: squeeze.cc:jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
squeeze.cc:jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
276
192k
                                   size_t /* thread */) -> Status {
277
192k
    const size_t x0 = task * kColsPerThread;
278
192k
    const size_t x1 =
279
192k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
280
192k
    const size_t w = x1 - x0;
281
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
282
    // always chin_residual.h.
283
4.91M
    for (size_t y = 0; y < chin_residual.h; y++) {
284
4.72M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
285
4.72M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
286
4.72M
      const pixel_type *JXL_RESTRICT p_navg =
287
4.72M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
288
4.72M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
289
4.72M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
290
4.72M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
291
4.72M
      size_t x = 0;
292
4.72M
#if HWY_TARGET != HWY_SCALAR
293
23.5M
      for (; x + 7 < w; x += 8) {
294
18.8M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
295
18.8M
                      p_out + x, p_nout + x);
296
18.8M
      }
297
4.72M
#endif
298
16.1M
      for (; x < w; x++) {
299
11.4M
        pixel_type_w avg = p_avg[x];
300
11.4M
        pixel_type_w next_avg = p_navg[x];
301
11.4M
        pixel_type_w top = p_pout[x];
302
11.4M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
303
11.4M
        pixel_type_w diff_minus_tendency = p_residual[x];
304
11.4M
        pixel_type_w diff = diff_minus_tendency + tendency;
305
11.4M
        pixel_type_w out = avg + (diff / 2);
306
11.4M
        p_out[x] = out;
307
        // If the chin_residual.h == chin.h, the output has an even number
308
        // of rows so the next line is fine. Otherwise, this loop won't
309
        // write to the last output row which is handled separately.
310
11.4M
        p_nout[x] = out - diff;
311
11.4M
      }
312
4.72M
    }
313
192k
    return true;
314
192k
  };
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_ZEN4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_AVX3_SPR::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Unexecuted instantiation: squeeze.cc:jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
315
181k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
316
181k
                                ThreadPool::NoInit, unsqueeze_slice,
317
181k
                                "InvVertSqueeze"));
318
319
181k
  if (chout.h & 1) {
320
57.8k
    size_t y = chin.h - 1;
321
57.8k
    const pixel_type *p_avg = chin.Row(y);
322
57.8k
    pixel_type *p_out = chout.Row(y << 1);
323
1.53M
    for (size_t x = 0; x < chin.w; x++) {
324
1.47M
      p_out[x] = p_avg[x];
325
1.47M
    }
326
57.8k
  }
327
181k
  input.channel[c] = std::move(chout);
328
181k
  return true;
329
181k
}
Unexecuted instantiation: jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
241
206k
// Undoes one vertical squeeze step for a single channel.
//
// `c` indexes the channel holding the averages and `rc` the channel holding
// the residuals. The two are merged into a new channel of height
// chin.h + chin_residual.h (same width, vshift reduced by one) that replaces
// input.channel[c]. The residual channel itself is left in place here.
//
// Fails (via JXL_ENSURE) when `c` or `rc` is out of range, when the widths
// differ, or when chin.h != DivCeil(chin.h + chin_residual.h, 2). Errors from
// Channel::Create and RunOnPool are propagated.
//
// The work is handed to `pool` in strips of kColsPerThread columns. Each task
// walks all residual rows of its strip top to bottom, because row y reads the
// odd output row written for row y - 1 (p_pout).
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
242
206k
  JXL_ENSURE(c < input.channel.size());
243
206k
  JXL_ENSURE(rc < input.channel.size());
244
206k
  const Channel &chin = input.channel[c];
245
206k
  const Channel &chin_residual = input.channel[rc];
246
  // These must be valid since we ran MetaApply already.
247
206k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
248
206k
  JXL_ENSURE(chin.w == chin_residual.w);
249
206k
  JxlMemoryManager *memory_manager = input.memory_manager();
250
251
206k
  if (chin_residual.h == 0) {
252
    // Short-circuit: output channel has same dimensions as input.
253
25.4k
    input.channel[c].vshift--;
254
25.4k
    return true;
255
25.4k
  }
256
257
  // Note: chin.h >= chin_residual.h and at most 1 different.
258
362k
  JXL_ASSIGN_OR_RETURN(
259
362k
      Channel chout,
260
362k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
261
362k
                      chin.hshift, chin.vshift - 1));
262
362k
  JXL_DEBUG_V(
263
362k
      4,
264
362k
      "Undoing vertical squeeze of channel %i using residuals in channel "
265
362k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
266
362k
      c, rc, chin.h, chout.h);
267
268
362k
  if (chin_residual.w == 0) {
269
    // Short-circuit: channel with no pixels.
270
0
    input.channel[c] = std::move(chout);
271
0
    return true;
272
0
  }
273
274
181k
  static constexpr const int kColsPerThread = 64;  // strip width per task
275
181k
  const auto unsqueeze_slice = [&](const uint32_t task,
276
181k
                                   size_t /* thread */) -> Status {
277
181k
    const size_t x0 = task * kColsPerThread;
278
181k
    const size_t x1 =
279
181k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
280
181k
    const size_t w = x1 - x0;  // columns in this strip
281
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
282
    // always chin_residual.h.
283
181k
    for (size_t y = 0; y < chin_residual.h; y++) {
284
181k
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
285
181k
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
286
181k
      const pixel_type *JXL_RESTRICT p_navg =
287
181k
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;  // next row, clamped
288
181k
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
289
181k
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
290
181k
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;  // row above p_out; p_avg when y == 0
291
181k
      size_t x = 0;
292
181k
#if HWY_TARGET != HWY_SCALAR
293
181k
      for (; x + 7 < w; x += 8) {  // 8 columns per FastUnsqueeze call
294
181k
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
295
181k
                      p_out + x, p_nout + x);
296
181k
      }
297
181k
#endif
298
181k
      for (; x < w; x++) {  // remaining columns, one at a time
299
181k
        pixel_type_w avg = p_avg[x];
300
181k
        pixel_type_w next_avg = p_navg[x];
301
181k
        pixel_type_w top = p_pout[x];
302
181k
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
303
181k
        pixel_type_w diff_minus_tendency = p_residual[x];
304
181k
        pixel_type_w diff = diff_minus_tendency + tendency;
305
181k
        pixel_type_w out = avg + (diff / 2);
306
181k
        p_out[x] = out;
307
        // If the chin_residual.h == chin.h, the output has an even number
308
        // of rows so the next line is fine. Otherwise, this loop won't
309
        // write to the last output row which is handled separately.
310
181k
        p_nout[x] = out - diff;
311
181k
      }
312
181k
    }
313
181k
    return true;
314
181k
  };
315
181k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
316
181k
                                ThreadPool::NoInit, unsqueeze_slice,
317
181k
                                "InvVertSqueeze"));
318
319
181k
  if (chout.h & 1) {  // odd output height: copy the last average row as-is
320
57.8k
    size_t y = chin.h - 1;
321
57.8k
    const pixel_type *p_avg = chin.Row(y);
322
57.8k
    pixel_type *p_out = chout.Row(y << 1);
323
1.53M
    for (size_t x = 0; x < chin.w; x++) {
324
1.47M
      p_out[x] = p_avg[x];
325
1.47M
    }
326
57.8k
  }
327
181k
  input.channel[c] = std::move(chout);
328
181k
  return true;
329
181k
}
Unexecuted instantiation: jxl::N_AVX3::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_SPR::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
330
331
// Per-target implementation: undoes every squeeze step listed in
// `parameters`, iterating from the last entry to the first.
//
// For each step the channel range is re-validated with
// CheckMetaSqueezeParams. Each channel c in [beginc, endc] is paired with its
// residual channel rc = offset + (c - beginc) and passed to InvHSqueeze or
// InvVSqueeze, after which the residual channels of the step are erased from
// input.channel. If the range starts inside the meta channels,
// input.nb_meta_channels is reduced by num_c.
//
// Returns an error if a step is invalid, if a channel is smaller than its
// residual channel in either dimension ("Corrupted squeeze transform"), or if
// one of the per-channel inverse transforms fails.
Status InvSqueeze(Image &input, const std::vector<SqueezeParams> &parameters,
332
22.2k
                  ThreadPool *pool) {
333
152k
  for (int i = parameters.size() - 1; i >= 0; i--) {  // back to front
334
130k
    JXL_RETURN_IF_ERROR(
335
130k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
336
130k
    bool horizontal = parameters[i].horizontal;
337
130k
    bool in_place = parameters[i].in_place;
338
130k
    uint32_t beginc = parameters[i].begin_c;
339
130k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;  // inclusive
340
130k
    uint32_t offset;
341
130k
    if (in_place) {
342
71.8k
      offset = endc + 1;  // residuals directly follow the squeezed range
343
71.8k
    } else {
344
58.2k
      offset = input.channel.size() + beginc - endc - 1;  // residuals are the last num_c channels
345
58.2k
    }
346
130k
    if (beginc < input.nb_meta_channels) {
347
      // This is checked in MetaSqueeze.
348
4
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
349
4
      input.nb_meta_channels -= parameters[i].num_c;
350
4
    }
351
352
498k
    for (uint32_t c = beginc; c <= endc; c++) {
353
368k
      uint32_t rc = offset + c - beginc;
354
      // MetaApply should imply that `rc` is within range, otherwise there's a
355
      // programming bug.
356
368k
      JXL_ENSURE(rc < input.channel.size());
357
368k
      if ((input.channel[c].w < input.channel[rc].w) ||
358
368k
          (input.channel[c].h < input.channel[rc].h)) {
359
0
        return JXL_FAILURE("Corrupted squeeze transform");
360
0
      }
361
368k
      if (horizontal) {
362
161k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
363
206k
      } else {
364
206k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
365
206k
      }
366
368k
    }
367
130k
    input.channel.erase(input.channel.begin() + offset,  // drop this step's residual channels
368
130k
                        input.channel.begin() + offset + (endc - beginc + 1));
369
130k
  }
370
22.2k
  return true;
371
22.2k
}
Unexecuted instantiation: jxl::N_SSE4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
jxl::N_AVX2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Line
Count
Source
332
22.2k
                  ThreadPool *pool) {
333
152k
  for (int i = parameters.size() - 1; i >= 0; i--) {
334
130k
    JXL_RETURN_IF_ERROR(
335
130k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
336
130k
    bool horizontal = parameters[i].horizontal;
337
130k
    bool in_place = parameters[i].in_place;
338
130k
    uint32_t beginc = parameters[i].begin_c;
339
130k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
340
130k
    uint32_t offset;
341
130k
    if (in_place) {
342
71.8k
      offset = endc + 1;
343
71.8k
    } else {
344
58.2k
      offset = input.channel.size() + beginc - endc - 1;
345
58.2k
    }
346
130k
    if (beginc < input.nb_meta_channels) {
347
      // This is checked in MetaSqueeze.
348
4
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
349
4
      input.nb_meta_channels -= parameters[i].num_c;
350
4
    }
351
352
498k
    for (uint32_t c = beginc; c <= endc; c++) {
353
368k
      uint32_t rc = offset + c - beginc;
354
      // MetaApply should imply that `rc` is within range, otherwise there's a
355
      // programming bug.
356
368k
      JXL_ENSURE(rc < input.channel.size());
357
368k
      if ((input.channel[c].w < input.channel[rc].w) ||
358
368k
          (input.channel[c].h < input.channel[rc].h)) {
359
0
        return JXL_FAILURE("Corrupted squeeze transform");
360
0
      }
361
368k
      if (horizontal) {
362
161k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
363
206k
      } else {
364
206k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
365
206k
      }
366
368k
    }
367
130k
    input.channel.erase(input.channel.begin() + offset,
368
130k
                        input.channel.begin() + offset + (endc - beginc + 1));
369
130k
  }
370
22.2k
  return true;
371
22.2k
}
Unexecuted instantiation: jxl::N_AVX3::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_ZEN4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_AVX3_SPR::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Unexecuted instantiation: jxl::N_SSE2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
372
373
}  // namespace HWY_NAMESPACE
374
}  // namespace jxl
375
HWY_AFTER_NAMESPACE();
376
377
#if HWY_ONCE
378
379
namespace jxl {
380
381
HWY_EXPORT(InvSqueeze);
382
// Public entry point for the inverse squeeze transform. Forwards all
// arguments unchanged to the per-target InvSqueeze selected by
// HWY_DYNAMIC_DISPATCH and returns its status.
Status InvSqueeze(Image &input, const std::vector<SqueezeParams> &parameters,
383
22.2k
                  ThreadPool *pool) {
384
22.2k
  return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool);
385
22.2k
}
386
387
// Replaces the contents of `*parameters` with a default squeeze sequence
// derived from the size of the first non-meta channel of `image`.
//
// 1. If there are more than two non-meta channels and the second one has the
//    same width and height as the first, a horizontal and then a vertical
//    squeeze (not in place) of the two channels starting at
//    nb_meta_channels + 1 is added.
// 2. In-place squeezes over all non-meta channels follow. When the image is
//    not wider than tall and h > kMaxFirstPreviewSize, one vertical step
//    comes first. After that, horizontal and vertical steps are added in
//    turn, each only while the corresponding dimension is still larger than
//    kMaxFirstPreviewSize, halving (rounding up) the tracked size each time.
//
// NOTE(review): image.channel[image.nb_meta_channels] is read without a
// bounds check in this function; presumably callers guarantee at least one
// non-meta channel. Confirm against the call sites.
void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
388
19.8k
                              const Image &image) {
389
19.8k
  int nb_channels = image.channel.size() - image.nb_meta_channels;  // non-meta channel count
390
391
19.8k
  parameters->clear();
392
19.8k
  size_t w = image.channel[image.nb_meta_channels].w;
393
19.8k
  size_t h = image.channel[image.nb_meta_channels].h;
394
19.8k
  JXL_DEBUG_V(
395
19.8k
      7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h);
396
397
  // do horizontal first on wide images; vertical first on tall images
398
19.8k
  bool wide = (w > h);
399
400
19.8k
  if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w &&
401
12.7k
      image.channel[image.nb_meta_channels + 1].h == h) {
402
    // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0
403
    // previews
404
12.4k
    JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h);
405
12.4k
    SqueezeParams params;
406
    // horizontal chroma squeeze
407
12.4k
    params.horizontal = true;
408
12.4k
    params.in_place = false;
409
12.4k
    params.begin_c = image.nb_meta_channels + 1;
410
12.4k
    params.num_c = 2;
411
12.4k
    parameters->push_back(params);
412
12.4k
    params.horizontal = false;
413
    // vertical chroma squeeze
414
12.4k
    parameters->push_back(params);
415
12.4k
  }
416
19.8k
  SqueezeParams params;  // template for the in-place steps; only `horizontal` changes below
417
19.8k
  params.begin_c = image.nb_meta_channels;
418
19.8k
  params.num_c = nb_channels;
419
19.8k
  params.in_place = true;
420
421
19.8k
  if (!wide) {
422
14.3k
    if (h > kMaxFirstPreviewSize) {
423
10.0k
      params.horizontal = false;
424
10.0k
      parameters->push_back(params);
425
10.0k
      h = (h + 1) / 2;  // height after the squeeze, rounded up
426
10.0k
      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
427
10.0k
    }
428
14.3k
  }
429
57.2k
  while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) {
430
37.3k
    if (w > kMaxFirstPreviewSize) {
431
34.5k
      params.horizontal = true;
432
34.5k
      parameters->push_back(params);
433
34.5k
      w = (w + 1) / 2;  // width after the squeeze, rounded up
434
34.5k
      JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h);
435
34.5k
    }
436
37.3k
    if (h > kMaxFirstPreviewSize) {
437
30.7k
      params.horizontal = false;
438
30.7k
      parameters->push_back(params);
439
30.7k
      h = (h + 1) / 2;
440
30.7k
      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
441
30.7k
    }
442
37.3k
  }
443
19.8k
  JXL_DEBUG_V(7, "that's it");
444
19.8k
}
445
446
// Validates the channel range of one squeeze step against `num_channels`.
// Succeeds only when 0 <= begin_c <= begin_c + num_c - 1 < num_channels
// (evaluated after conversion to int); otherwise returns the failure
// "Invalid channel range".
Status CheckMetaSqueezeParams(const SqueezeParams &parameter,
447
269k
                              int num_channels) {
448
269k
  int c1 = parameter.begin_c;  // first channel of the range
449
269k
  int c2 = parameter.begin_c + parameter.num_c - 1;  // last channel of the range (inclusive)
450
269k
  if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) {
451
267
    return JXL_FAILURE("Invalid channel range");
452
267
  }
453
268k
  return true;
454
269k
}
455
456
24.8k
// Applies the squeeze steps in `*parameters` to the channel list of `image`:
// channel sizes and shifts are updated and one placeholder residual channel
// is inserted per squeezed channel. If `*parameters` is empty it is first
// filled by DefaultSqueezeParameters.
//
// Per step:
//  - the channel range is validated with CheckMetaSqueezeParams;
//  - a range that starts in the meta channels must lie entirely within them
//    and be in place; nb_meta_channels then grows by num_c;
//  - residuals are inserted right after the range (in place) or at the
//    position that was the end of the channel list when the step started.
// Per channel in the range:
//  - fails if hshift or vshift exceeds 30, or if the channel is empty;
//  - the squeezed dimension becomes (size + 1) / 2 and the matching shift is
//    incremented when it is non-negative;
//  - the placeholder gets the remaining size in that dimension and copies
//    hshift, vshift and component from the squeezed channel.
//
// Returns an error on any of the failures above or if shrink() or
// Channel::Create fails. Channels modified before the failure are not
// restored by this function.
Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) {
457
24.8k
  JxlMemoryManager *memory_manager = image.memory_manager();
458
24.8k
  if (parameters->empty()) {
459
19.8k
    DefaultSqueezeParameters(parameters, image);
460
19.8k
  }
461
462
139k
  for (auto &parameter : *parameters) {
463
139k
    JXL_RETURN_IF_ERROR(
464
139k
        CheckMetaSqueezeParams(parameter, image.channel.size()));
465
138k
    bool horizontal = parameter.horizontal;
466
138k
    bool in_place = parameter.in_place;
467
138k
    uint32_t beginc = parameter.begin_c;
468
138k
    uint32_t endc = parameter.begin_c + parameter.num_c - 1;  // inclusive
469
470
138k
    uint32_t offset;
471
138k
    if (beginc < image.nb_meta_channels) {
472
36
      if (endc >= image.nb_meta_channels) {
473
4
        return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels");
474
4
      }
475
32
      if (!in_place) {
476
9
        return JXL_FAILURE(
477
9
            "Invalid squeeze: meta channels require in-place residuals");
478
9
      }
479
23
      image.nb_meta_channels += parameter.num_c;
480
23
    }
481
138k
    if (in_place) {
482
75.6k
      offset = endc + 1;  // residuals directly follow the squeezed range
483
75.6k
    } else {
484
63.1k
      offset = image.channel.size();  // residuals go after the current last channel
485
63.1k
    }
486
526k
    for (uint32_t c = beginc; c <= endc; c++) {
487
387k
      if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) {
488
5
        return JXL_FAILURE("Too many squeezes: shift > 30");
489
5
      }
490
387k
      size_t w = image.channel[c].w;
491
387k
      size_t h = image.channel[c].h;
492
387k
      if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel");
493
387k
      if (horizontal) {
494
170k
        image.channel[c].w = (w + 1) / 2;
495
170k
        if (image.channel[c].hshift >= 0) image.channel[c].hshift++;
496
170k
        w = w - (w + 1) / 2;  // width left for the residual channel
497
217k
      } else {
498
217k
        image.channel[c].h = (h + 1) / 2;
499
217k
        if (image.channel[c].vshift >= 0) image.channel[c].vshift++;
500
217k
        h = h - (h + 1) / 2;  // height left for the residual channel
501
217k
      }
502
387k
      JXL_RETURN_IF_ERROR(image.channel[c].shrink());
503
775k
      JXL_ASSIGN_OR_RETURN(Channel placeholder,
504
775k
                           Channel::Create(memory_manager, w, h));
505
775k
      placeholder.hshift = image.channel[c].hshift;
506
775k
      placeholder.vshift = image.channel[c].vshift;
507
775k
      placeholder.component = image.channel[c].component;
508
775k
      image.channel.insert(image.channel.begin() + offset + (c - beginc),
509
775k
                           std::move(placeholder));
510
775k
      JXL_DEBUG_V(0, "MetaSqueeze applied, current image: %s",
511
775k
                  image.DebugString().c_str());
512
775k
    }
513
138k
  }
514
24.4k
  return true;
515
24.8k
}
516
517
}  // namespace jxl
518
519
#endif