Coverage Report

Created: 2025-07-23 07:47

/src/libjxl/lib/jxl/modular/transform/squeeze.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/modular/transform/squeeze.h"
7
8
#include <jxl/memory_manager.h>
9
10
#include <algorithm>
11
#include <cstdint>
12
#include <cstdlib>
13
#include <utility>
14
#include <vector>
15
16
#include "lib/jxl/base/common.h"
17
#include "lib/jxl/base/compiler_specific.h"
18
#include "lib/jxl/base/data_parallel.h"
19
#include "lib/jxl/base/printf_macros.h"
20
#include "lib/jxl/base/status.h"
21
#include "lib/jxl/modular/modular_image.h"
22
#include "lib/jxl/modular/transform/squeeze_params.h"
23
#undef HWY_TARGET_INCLUDE
24
#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/squeeze.cc"
25
#include <hwy/foreach_target.h>
26
#include <hwy/highway.h>
27
28
#include "lib/jxl/simd_util-inl.h"
29
30
HWY_BEFORE_NAMESPACE();
31
namespace jxl {
32
namespace HWY_NAMESPACE {
33
34
#if HWY_TARGET != HWY_SCALAR
35
36
// These templates are not found via ADL.
37
using hwy::HWY_NAMESPACE::Abs;
38
using hwy::HWY_NAMESPACE::Add;
39
using hwy::HWY_NAMESPACE::And;
40
using hwy::HWY_NAMESPACE::DupEven;
41
using hwy::HWY_NAMESPACE::DupOdd;
42
using hwy::HWY_NAMESPACE::Gt;
43
using hwy::HWY_NAMESPACE::IfThenElse;
44
using hwy::HWY_NAMESPACE::IfThenZeroElse;
45
using hwy::HWY_NAMESPACE::Lt;
46
using hwy::HWY_NAMESPACE::MulEven;
47
using hwy::HWY_NAMESPACE::MulOdd;
48
using hwy::HWY_NAMESPACE::Ne;
49
using hwy::HWY_NAMESPACE::Neg;
50
using hwy::HWY_NAMESPACE::OddEven;
51
using hwy::HWY_NAMESPACE::RebindToUnsigned;
52
using hwy::HWY_NAMESPACE::ShiftLeft;
53
using hwy::HWY_NAMESPACE::ShiftRight;
54
using hwy::HWY_NAMESPACE::Sub;
55
using hwy::HWY_NAMESPACE::Xor;
56
57
using D = HWY_CAPPED(pixel_type, 8);
58
using DU = RebindToUnsigned<D>;
59
constexpr D d;
60
constexpr DU du;
61
62
JXL_INLINE void FastUnsqueeze(const pixel_type *JXL_RESTRICT p_residual,
63
                              const pixel_type *JXL_RESTRICT p_avg,
64
                              const pixel_type *JXL_RESTRICT p_navg,
65
                              const pixel_type *p_pout,
66
                              pixel_type *JXL_RESTRICT p_out,
67
331M
                              pixel_type *p_nout) {
68
331M
  const size_t N = Lanes(d);
69
331M
  auto onethird = Set(d, 0x55555556);
70
694M
  for (size_t x = 0; x < 8; x += N) {
71
363M
    auto avg = Load(d, p_avg + x);
72
363M
    auto next_avg = Load(d, p_navg + x);
73
363M
    auto top = Load(d, p_pout + x);
74
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
75
    // typo:off
76
363M
    auto Ba = Sub(top, avg);
77
363M
    auto an = Sub(avg, next_avg);
78
363M
    auto nonmono = Xor(Ba, an);
79
363M
    auto absBa = Abs(Ba);
80
363M
    auto absan = Abs(an);
81
363M
    auto absBn = Abs(Sub(top, next_avg));
82
    // Compute a3 = absBa / 3
83
363M
    auto a3eh = MulEven(absBa, onethird);
84
363M
    auto a3oh = MulOdd(absBa, onethird);
85
86
363M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
87
363M
#if HWY_IS_LITTLE_ENDIAN
88
363M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
89
#else  // not little endian
90
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
91
#endif  // endianness
92
#else  // hwy < 1.2
93
#if HWY_IS_LITTLE_ENDIAN
94
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
95
#else  // not little endian
96
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
97
#endif  // endianness
98
#endif  // hwy version
99
100
363M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
101
363M
    auto absdiff = ShiftRight<2>(a3);
102
363M
    auto skipdiff = Ne(Ba, Zero(d));
103
363M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
104
363M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
105
363M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
106
363M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
107
363M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
108
    // typo:on
109
363M
    auto absan2 = ShiftLeft<1>(absan);
110
363M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
111
363M
                         absan2, absdiff);
112
363M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
113
363M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
114
115
363M
    auto diff_minus_tendency = Load(d, p_residual + x);
116
363M
    auto diff = Add(diff_minus_tendency, tendency);
117
363M
    auto out =
118
363M
        Add(avg, ShiftRight<1>(
119
363M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
120
363M
    Store(out, d, p_out + x);
121
363M
    Store(Sub(out, diff), d, p_nout + x);
122
363M
  }
123
331M
}
jxl::N_SSE4::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Line
Count
Source
67
16.6M
                              pixel_type *p_nout) {
68
16.6M
  const size_t N = Lanes(d);
69
16.6M
  auto onethird = Set(d, 0x55555556);
70
49.8M
  for (size_t x = 0; x < 8; x += N) {
71
33.1M
    auto avg = Load(d, p_avg + x);
72
33.1M
    auto next_avg = Load(d, p_navg + x);
73
33.1M
    auto top = Load(d, p_pout + x);
74
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
75
    // typo:off
76
33.1M
    auto Ba = Sub(top, avg);
77
33.1M
    auto an = Sub(avg, next_avg);
78
33.1M
    auto nonmono = Xor(Ba, an);
79
33.1M
    auto absBa = Abs(Ba);
80
33.1M
    auto absan = Abs(an);
81
33.1M
    auto absBn = Abs(Sub(top, next_avg));
82
    // Compute a3 = absBa / 3
83
33.1M
    auto a3eh = MulEven(absBa, onethird);
84
33.1M
    auto a3oh = MulOdd(absBa, onethird);
85
86
33.1M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
87
33.1M
#if HWY_IS_LITTLE_ENDIAN
88
33.1M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
89
#else  // not little endian
90
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
91
#endif  // endianness
92
#else  // hwy < 1.2
93
#if HWY_IS_LITTLE_ENDIAN
94
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
95
#else  // not little endian
96
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
97
#endif  // endianness
98
#endif  // hwy version
99
100
33.1M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
101
33.1M
    auto absdiff = ShiftRight<2>(a3);
102
33.1M
    auto skipdiff = Ne(Ba, Zero(d));
103
33.1M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
104
33.1M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
105
33.1M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
106
33.1M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
107
33.1M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
108
    // typo:on
109
33.1M
    auto absan2 = ShiftLeft<1>(absan);
110
33.1M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
111
33.1M
                         absan2, absdiff);
112
33.1M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
113
33.1M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
114
115
33.1M
    auto diff_minus_tendency = Load(d, p_residual + x);
116
33.1M
    auto diff = Add(diff_minus_tendency, tendency);
117
33.1M
    auto out =
118
33.1M
        Add(avg, ShiftRight<1>(
119
33.1M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
120
33.1M
    Store(out, d, p_out + x);
121
33.1M
    Store(Sub(out, diff), d, p_nout + x);
122
33.1M
  }
123
16.6M
}
jxl::N_AVX2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Line
Count
Source
67
298M
                              pixel_type *p_nout) {
68
298M
  const size_t N = Lanes(d);
69
298M
  auto onethird = Set(d, 0x55555556);
70
596M
  for (size_t x = 0; x < 8; x += N) {
71
298M
    auto avg = Load(d, p_avg + x);
72
298M
    auto next_avg = Load(d, p_navg + x);
73
298M
    auto top = Load(d, p_pout + x);
74
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
75
    // typo:off
76
298M
    auto Ba = Sub(top, avg);
77
298M
    auto an = Sub(avg, next_avg);
78
298M
    auto nonmono = Xor(Ba, an);
79
298M
    auto absBa = Abs(Ba);
80
298M
    auto absan = Abs(an);
81
298M
    auto absBn = Abs(Sub(top, next_avg));
82
    // Compute a3 = absBa / 3
83
298M
    auto a3eh = MulEven(absBa, onethird);
84
298M
    auto a3oh = MulOdd(absBa, onethird);
85
86
298M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
87
298M
#if HWY_IS_LITTLE_ENDIAN
88
298M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
89
#else  // not little endian
90
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
91
#endif  // endianness
92
#else  // hwy < 1.2
93
#if HWY_IS_LITTLE_ENDIAN
94
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
95
#else  // not little endian
96
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
97
#endif  // endianness
98
#endif  // hwy version
99
100
298M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
101
298M
    auto absdiff = ShiftRight<2>(a3);
102
298M
    auto skipdiff = Ne(Ba, Zero(d));
103
298M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
104
298M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
105
298M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
106
298M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
107
298M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
108
    // typo:on
109
298M
    auto absan2 = ShiftLeft<1>(absan);
110
298M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
111
298M
                         absan2, absdiff);
112
298M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
113
298M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
114
115
298M
    auto diff_minus_tendency = Load(d, p_residual + x);
116
298M
    auto diff = Add(diff_minus_tendency, tendency);
117
298M
    auto out =
118
298M
        Add(avg, ShiftRight<1>(
119
298M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
120
298M
    Store(out, d, p_out + x);
121
298M
    Store(Sub(out, diff), d, p_nout + x);
122
298M
  }
123
298M
}
jxl::N_SSE2::FastUnsqueeze(int const*, int const*, int const*, int const*, int*, int*)
Line
Count
Source
67
16.3M
                              pixel_type *p_nout) {
68
16.3M
  const size_t N = Lanes(d);
69
16.3M
  auto onethird = Set(d, 0x55555556);
70
48.4M
  for (size_t x = 0; x < 8; x += N) {
71
32.1M
    auto avg = Load(d, p_avg + x);
72
32.1M
    auto next_avg = Load(d, p_navg + x);
73
32.1M
    auto top = Load(d, p_pout + x);
74
    // Equivalent to SmoothTendency(top,avg,next_avg), but without branches
75
    // typo:off
76
32.1M
    auto Ba = Sub(top, avg);
77
32.1M
    auto an = Sub(avg, next_avg);
78
32.1M
    auto nonmono = Xor(Ba, an);
79
32.1M
    auto absBa = Abs(Ba);
80
32.1M
    auto absan = Abs(an);
81
32.1M
    auto absBn = Abs(Sub(top, next_avg));
82
    // Compute a3 = absBa / 3
83
32.1M
    auto a3eh = MulEven(absBa, onethird);
84
32.1M
    auto a3oh = MulOdd(absBa, onethird);
85
86
32.1M
#if (HWY_MAJOR > 1 || (HWY_MAJOR == 1 && HWY_MINOR >= 2))
87
32.1M
#if HWY_IS_LITTLE_ENDIAN
88
32.1M
    auto a3 = InterleaveOdd(d, BitCast(d, a3eh), BitCast(d, a3oh));
89
#else  // not little endian
90
    auto a3 = InterleaveEven(d, BitCast(d, a3eh), BitCast(d, a3oh));
91
#endif  // endianness
92
#else  // hwy < 1.2
93
#if HWY_IS_LITTLE_ENDIAN
94
    auto a3 = OddEven(BitCast(d, a3oh), DupOdd(BitCast(d, a3eh)));
95
#else  // not little endian
96
    auto a3 = OddEven(DupEven(BitCast(d, a3oh)), BitCast(d, a3eh))
97
#endif  // endianness
98
#endif  // hwy version
99
100
32.1M
    a3 = Add(a3, Add(absBn, Set(d, 2)));
101
32.1M
    auto absdiff = ShiftRight<2>(a3);
102
32.1M
    auto skipdiff = Ne(Ba, Zero(d));
103
32.1M
    skipdiff = And(skipdiff, Ne(an, Zero(d)));
104
32.1M
    skipdiff = And(skipdiff, Lt(nonmono, Zero(d)));
105
32.1M
    auto absBa2 = Add(ShiftLeft<1>(absBa), And(absdiff, Set(d, 1)));
106
32.1M
    absdiff = IfThenElse(Gt(absdiff, absBa2),
107
32.1M
                         Add(ShiftLeft<1>(absBa), Set(d, 1)), absdiff);
108
    // typo:on
109
32.1M
    auto absan2 = ShiftLeft<1>(absan);
110
32.1M
    absdiff = IfThenElse(Gt(Add(absdiff, And(absdiff, Set(d, 1))), absan2),
111
32.1M
                         absan2, absdiff);
112
32.1M
    auto diff1 = IfThenElse(Lt(top, next_avg), Neg(absdiff), absdiff);
113
32.1M
    auto tendency = IfThenZeroElse(skipdiff, diff1);
114
115
32.1M
    auto diff_minus_tendency = Load(d, p_residual + x);
116
32.1M
    auto diff = Add(diff_minus_tendency, tendency);
117
32.1M
    auto out =
118
32.1M
        Add(avg, ShiftRight<1>(
119
32.1M
                     Add(diff, BitCast(d, ShiftRight<31>(BitCast(du, diff))))));
120
32.1M
    Store(out, d, p_out + x);
121
32.1M
    Store(Sub(out, diff), d, p_nout + x);
122
32.1M
  }
123
16.3M
}
124
125
#endif  // HWY_TARGET != HWY_SCALAR
126
127
361k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
128
361k
  JXL_ENSURE(c < input.channel.size());
129
361k
  JXL_ENSURE(rc < input.channel.size());
130
361k
  Channel &chin = input.channel[c];
131
361k
  const Channel &chin_residual = input.channel[rc];
132
  // These must be valid since we ran MetaApply already.
133
361k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
134
361k
  JXL_ENSURE(chin.h == chin_residual.h);
135
361k
  JxlMemoryManager *memory_manager = input.memory_manager();
136
137
361k
  if (chin_residual.w == 0) {
138
    // Short-circuit: output channel has same dimensions as input.
139
9.93k
    input.channel[c].hshift--;
140
9.93k
    return true;
141
9.93k
  }
142
143
  // Note: chin.w >= chin_residual.w and at most 1 different.
144
703k
  JXL_ASSIGN_OR_RETURN(Channel chout,
145
703k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
146
703k
                                       chin.h, chin.hshift - 1, chin.vshift));
147
703k
  JXL_DEBUG_V(4,
148
703k
              "Undoing horizontal squeeze of channel %i using residuals in "
149
703k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
150
703k
              c, rc, chin.w, chout.w);
151
152
703k
  if (chin_residual.h == 0) {
153
    // Short-circuit: channel with no pixels.
154
0
    input.channel[c] = std::move(chout);
155
0
    return true;
156
0
  }
157
130M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
130M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
130M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
130M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
763M
    for (size_t x = x0; x < chin_residual.w; x++) {
162
633M
      pixel_type_w diff_minus_tendency = p_residual[x];
163
633M
      pixel_type_w avg = p_avg[x];
164
633M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
633M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
633M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
633M
      pixel_type_w diff = diff_minus_tendency + tendency;
168
633M
      pixel_type_w A = avg + (diff / 2);
169
633M
      p_out[(x << 1)] = A;
170
633M
      pixel_type_w B = A - diff;
171
633M
      p_out[(x << 1) + 1] = B;
172
633M
    }
173
130M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
130M
  };
squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Line
Count
Source
157
1.24M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
1.24M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
1.24M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
1.24M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
11.6M
    for (size_t x = x0; x < chin_residual.w; x++) {
162
10.3M
      pixel_type_w diff_minus_tendency = p_residual[x];
163
10.3M
      pixel_type_w avg = p_avg[x];
164
10.3M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
10.3M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
10.3M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
10.3M
      pixel_type_w diff = diff_minus_tendency + tendency;
168
10.3M
      pixel_type_w A = avg + (diff / 2);
169
10.3M
      p_out[(x << 1)] = A;
170
10.3M
      pixel_type_w B = A - diff;
171
10.3M
      p_out[(x << 1) + 1] = B;
172
10.3M
    }
173
1.24M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
1.24M
  };
squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Line
Count
Source
157
128M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
128M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
128M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
128M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
742M
    for (size_t x = x0; x < chin_residual.w; x++) {
162
614M
      pixel_type_w diff_minus_tendency = p_residual[x];
163
614M
      pixel_type_w avg = p_avg[x];
164
614M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
614M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
614M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
614M
      pixel_type_w diff = diff_minus_tendency + tendency;
168
614M
      pixel_type_w A = avg + (diff / 2);
169
614M
      p_out[(x << 1)] = A;
170
614M
      pixel_type_w B = A - diff;
171
614M
      p_out[(x << 1) + 1] = B;
172
614M
    }
173
128M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
128M
  };
squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned long, unsigned long) const
Line
Count
Source
157
1.05M
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
1.05M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
1.05M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
1.05M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
9.95M
    for (size_t x = x0; x < chin_residual.w; x++) {
162
8.90M
      pixel_type_w diff_minus_tendency = p_residual[x];
163
8.90M
      pixel_type_w avg = p_avg[x];
164
8.90M
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
8.90M
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
8.90M
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
8.90M
      pixel_type_w diff = diff_minus_tendency + tendency;
168
8.90M
      pixel_type_w A = avg + (diff / 2);
169
8.90M
      p_out[(x << 1)] = A;
170
8.90M
      pixel_type_w B = A - diff;
171
8.90M
      p_out[(x << 1) + 1] = B;
172
8.90M
    }
173
1.05M
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
1.05M
  };
175
176
  // somewhat complicated trickery just to be able to SIMD this.
177
  // Horizontal unsqueeze has horizontal data dependencies, so we do
178
  // 8 rows at a time and treat it as a vertical unsqueeze of a
179
  // transposed 8x8 block (or 9x8 for one input).
180
351k
  static constexpr const size_t kRowsPerThread = 8;
181
351k
  const auto unsqueeze_span = [&](const uint32_t task,
182
16.4M
                                  size_t /* thread */) -> Status {
183
16.4M
    const size_t y0 = task * kRowsPerThread;
184
16.4M
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
16.4M
    size_t x = 0;
186
187
16.4M
#if HWY_TARGET != HWY_SCALAR
188
16.4M
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
16.4M
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
16.4M
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
16.4M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
16.4M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
16.4M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
16.4M
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
16.4M
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
16.4M
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
16.4M
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
16.4M
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
16.4M
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
16.4M
    const size_t N = Lanes(d);
201
16.4M
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
19.9M
      for (; x < chin_residual.w - 9; x += 8) {
203
17.8M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
17.8M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
160M
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
142M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
142M
        }
208
156M
        for (size_t i = 0; i < 8; i++) {
209
138M
          FastUnsqueeze(
210
138M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
138M
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
138M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
138M
        }
214
215
17.8M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
17.8M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
159M
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
301M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
160M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
160M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
160M
            StoreInterleaved(d, even, odd,
222
160M
                             p_out + ((x + i) << 1) + onerow_out * y);
223
160M
          }
224
141M
        }
225
17.8M
      }
226
2.05M
    }
227
16.4M
#endif  // HWY_TARGET != HWY_SCALAR
228
146M
    for (size_t y = 0; y < rows; y++) {
229
130M
      unsqueeze_row(y0 + y, x);
230
130M
    }
231
16.4M
    return true;
232
16.4M
  };
squeeze.cc:jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Line
Count
Source
182
172k
                                  size_t /* thread */) -> Status {
183
172k
    const size_t y0 = task * kRowsPerThread;
184
172k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
172k
    size_t x = 0;
186
187
172k
#if HWY_TARGET != HWY_SCALAR
188
172k
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
172k
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
172k
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
172k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
172k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
172k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
172k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
172k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
172k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
172k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
172k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
172k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
172k
    const size_t N = Lanes(d);
201
172k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
1.42M
      for (; x < chin_residual.w - 9; x += 8) {
203
1.29M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
1.29M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
11.6M
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
10.3M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
10.3M
        }
208
10.1M
        for (size_t i = 0; i < 8; i++) {
209
8.82M
          FastUnsqueeze(
210
8.82M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
18.4E
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
8.82M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
8.82M
        }
214
215
1.29M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
1.29M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
11.5M
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
30.8M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
20.5M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
20.5M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
20.5M
            StoreInterleaved(d, even, odd,
222
20.5M
                             p_out + ((x + i) << 1) + onerow_out * y);
223
20.5M
          }
224
10.2M
        }
225
1.29M
      }
226
126k
    }
227
172k
#endif  // HWY_TARGET != HWY_SCALAR
228
1.42M
    for (size_t y = 0; y < rows; y++) {
229
1.24M
      unsqueeze_row(y0 + y, x);
230
1.24M
    }
231
172k
    return true;
232
172k
  };
squeeze.cc:jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Line
Count
Source
182
16.1M
                                  size_t /* thread */) -> Status {
183
16.1M
    const size_t y0 = task * kRowsPerThread;
184
16.1M
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
16.1M
    size_t x = 0;
186
187
16.1M
#if HWY_TARGET != HWY_SCALAR
188
16.1M
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
16.1M
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
16.1M
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
16.1M
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
16.1M
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
16.1M
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
16.1M
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
16.1M
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
16.1M
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
16.1M
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
16.1M
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
16.1M
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
16.1M
    const size_t N = Lanes(d);
201
16.1M
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
17.2M
      for (; x < chin_residual.w - 9; x += 8) {
203
15.4M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
15.4M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
138M
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
122M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
122M
        }
208
137M
        for (size_t i = 0; i < 8; i++) {
209
122M
          FastUnsqueeze(
210
122M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
122M
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
122M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
122M
        }
214
215
15.4M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
15.4M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
137M
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
245M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
122M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
122M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
122M
            StoreInterleaved(d, even, odd,
222
122M
                             p_out + ((x + i) << 1) + onerow_out * y);
223
122M
          }
224
122M
        }
225
15.4M
      }
226
1.81M
    }
227
16.1M
#endif  // HWY_TARGET != HWY_SCALAR
228
144M
    for (size_t y = 0; y < rows; y++) {
229
128M
      unsqueeze_row(y0 + y, x);
230
128M
    }
231
16.1M
    return true;
232
16.1M
  };
squeeze.cc:jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_1::operator()(unsigned int, unsigned long) const
Line
Count
Source
182
147k
                                  size_t /* thread */) -> Status {
183
147k
    const size_t y0 = task * kRowsPerThread;
184
147k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
147k
    size_t x = 0;
186
187
147k
#if HWY_TARGET != HWY_SCALAR
188
147k
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
147k
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
147k
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
147k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
147k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
147k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
147k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
147k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
147k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
147k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
147k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
147k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
147k
    const size_t N = Lanes(d);
201
147k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
1.24M
      for (; x < chin_residual.w - 9; x += 8) {
203
1.13M
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
1.13M
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
10.0M
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
8.92M
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
8.92M
        }
208
8.14M
        for (size_t i = 0; i < 8; i++) {
209
7.01M
          FastUnsqueeze(
210
7.01M
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
18.4E
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
7.01M
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
7.01M
        }
214
215
1.13M
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
1.13M
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
9.66M
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
25.5M
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
17.0M
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
17.0M
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
17.0M
            StoreInterleaved(d, even, odd,
222
17.0M
                             p_out + ((x + i) << 1) + onerow_out * y);
223
17.0M
          }
224
8.53M
        }
225
1.13M
      }
226
110k
    }
227
147k
#endif  // HWY_TARGET != HWY_SCALAR
228
1.20M
    for (size_t y = 0; y < rows; y++) {
229
1.05M
      unsqueeze_row(y0 + y, x);
230
1.05M
    }
231
147k
    return true;
232
147k
  };
233
351k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
234
351k
                                ThreadPool::NoInit, unsqueeze_span,
235
351k
                                "InvHorizontalSqueeze"));
236
351k
  input.channel[c] = std::move(chout);
237
351k
  return true;
238
351k
}
jxl::N_SSE4::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
127
37.7k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
128
37.7k
  JXL_ENSURE(c < input.channel.size());
129
37.7k
  JXL_ENSURE(rc < input.channel.size());
130
37.7k
  Channel &chin = input.channel[c];
131
37.7k
  const Channel &chin_residual = input.channel[rc];
132
  // These must be valid since we ran MetaApply already.
133
37.7k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
134
37.7k
  JXL_ENSURE(chin.h == chin_residual.h);
135
37.7k
  JxlMemoryManager *memory_manager = input.memory_manager();
136
137
37.7k
  if (chin_residual.w == 0) {
138
    // Short-circuit: output channel has same dimensions as input.
139
362
    input.channel[c].hshift--;
140
362
    return true;
141
362
  }
142
143
  // Note: chin.w >= chin_residual.w and at most 1 different.
144
74.7k
  JXL_ASSIGN_OR_RETURN(Channel chout,
145
74.7k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
146
74.7k
                                       chin.h, chin.hshift - 1, chin.vshift));
147
74.7k
  JXL_DEBUG_V(4,
148
74.7k
              "Undoing horizontal squeeze of channel %i using residuals in "
149
74.7k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
150
74.7k
              c, rc, chin.w, chout.w);
151
152
74.7k
  if (chin_residual.h == 0) {
153
    // Short-circuit: channel with no pixels.
154
0
    input.channel[c] = std::move(chout);
155
0
    return true;
156
0
  }
157
37.3k
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
37.3k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
37.3k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
37.3k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
37.3k
    for (size_t x = x0; x < chin_residual.w; x++) {
162
37.3k
      pixel_type_w diff_minus_tendency = p_residual[x];
163
37.3k
      pixel_type_w avg = p_avg[x];
164
37.3k
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
37.3k
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
37.3k
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
37.3k
      pixel_type_w diff = diff_minus_tendency + tendency;
168
37.3k
      pixel_type_w A = avg + (diff / 2);
169
37.3k
      p_out[(x << 1)] = A;
170
37.3k
      pixel_type_w B = A - diff;
171
37.3k
      p_out[(x << 1) + 1] = B;
172
37.3k
    }
173
37.3k
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
37.3k
  };
175
176
  // somewhat complicated trickery just to be able to SIMD this.
177
  // Horizontal unsqueeze has horizontal data dependencies, so we do
178
  // 8 rows at a time and treat it as a vertical unsqueeze of a
179
  // transposed 8x8 block (or 9x8 for one input).
180
37.3k
  static constexpr const size_t kRowsPerThread = 8;
181
37.3k
  const auto unsqueeze_span = [&](const uint32_t task,
182
37.3k
                                  size_t /* thread */) -> Status {
183
37.3k
    const size_t y0 = task * kRowsPerThread;
184
37.3k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
37.3k
    size_t x = 0;
186
187
37.3k
#if HWY_TARGET != HWY_SCALAR
188
37.3k
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
37.3k
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
37.3k
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
37.3k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
37.3k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
37.3k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
37.3k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
37.3k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
37.3k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
37.3k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
37.3k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
37.3k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
37.3k
    const size_t N = Lanes(d);
201
37.3k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
37.3k
      for (; x < chin_residual.w - 9; x += 8) {
203
37.3k
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
37.3k
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
37.3k
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
37.3k
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
37.3k
        }
208
37.3k
        for (size_t i = 0; i < 8; i++) {
209
37.3k
          FastUnsqueeze(
210
37.3k
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
37.3k
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
37.3k
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
37.3k
        }
214
215
37.3k
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
37.3k
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
37.3k
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
37.3k
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
37.3k
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
37.3k
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
37.3k
            StoreInterleaved(d, even, odd,
222
37.3k
                             p_out + ((x + i) << 1) + onerow_out * y);
223
37.3k
          }
224
37.3k
        }
225
37.3k
      }
226
37.3k
    }
227
37.3k
#endif  // HWY_TARGET != HWY_SCALAR
228
37.3k
    for (size_t y = 0; y < rows; y++) {
229
37.3k
      unsqueeze_row(y0 + y, x);
230
37.3k
    }
231
37.3k
    return true;
232
37.3k
  };
233
37.3k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
234
37.3k
                                ThreadPool::NoInit, unsqueeze_span,
235
37.3k
                                "InvHorizontalSqueeze"));
236
37.3k
  input.channel[c] = std::move(chout);
237
37.3k
  return true;
238
37.3k
}
jxl::N_AVX2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
127
288k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
128
288k
  JXL_ENSURE(c < input.channel.size());
129
288k
  JXL_ENSURE(rc < input.channel.size());
130
288k
  Channel &chin = input.channel[c];
131
288k
  const Channel &chin_residual = input.channel[rc];
132
  // These must be valid since we ran MetaApply already.
133
288k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
134
288k
  JXL_ENSURE(chin.h == chin_residual.h);
135
288k
  JxlMemoryManager *memory_manager = input.memory_manager();
136
137
288k
  if (chin_residual.w == 0) {
138
    // Short-circuit: output channel has same dimensions as input.
139
8.51k
    input.channel[c].hshift--;
140
8.51k
    return true;
141
8.51k
  }
142
143
  // Note: chin.w >= chin_residual.w and at most 1 different.
144
560k
  JXL_ASSIGN_OR_RETURN(Channel chout,
145
560k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
146
560k
                                       chin.h, chin.hshift - 1, chin.vshift));
147
560k
  JXL_DEBUG_V(4,
148
560k
              "Undoing horizontal squeeze of channel %i using residuals in "
149
560k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
150
560k
              c, rc, chin.w, chout.w);
151
152
560k
  if (chin_residual.h == 0) {
153
    // Short-circuit: channel with no pixels.
154
0
    input.channel[c] = std::move(chout);
155
0
    return true;
156
0
  }
157
280k
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
280k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
280k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
280k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
280k
    for (size_t x = x0; x < chin_residual.w; x++) {
162
280k
      pixel_type_w diff_minus_tendency = p_residual[x];
163
280k
      pixel_type_w avg = p_avg[x];
164
280k
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
280k
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
280k
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
280k
      pixel_type_w diff = diff_minus_tendency + tendency;
168
280k
      pixel_type_w A = avg + (diff / 2);
169
280k
      p_out[(x << 1)] = A;
170
280k
      pixel_type_w B = A - diff;
171
280k
      p_out[(x << 1) + 1] = B;
172
280k
    }
173
280k
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
280k
  };
175
176
  // somewhat complicated trickery just to be able to SIMD this.
177
  // Horizontal unsqueeze has horizontal data dependencies, so we do
178
  // 8 rows at a time and treat it as a vertical unsqueeze of a
179
  // transposed 8x8 block (or 9x8 for one input).
180
280k
  static constexpr const size_t kRowsPerThread = 8;
181
280k
  const auto unsqueeze_span = [&](const uint32_t task,
182
280k
                                  size_t /* thread */) -> Status {
183
280k
    const size_t y0 = task * kRowsPerThread;
184
280k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
280k
    size_t x = 0;
186
187
280k
#if HWY_TARGET != HWY_SCALAR
188
280k
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
280k
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
280k
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
280k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
280k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
280k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
280k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
280k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
280k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
280k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
280k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
280k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
280k
    const size_t N = Lanes(d);
201
280k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
280k
      for (; x < chin_residual.w - 9; x += 8) {
203
280k
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
280k
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
280k
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
280k
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
280k
        }
208
280k
        for (size_t i = 0; i < 8; i++) {
209
280k
          FastUnsqueeze(
210
280k
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
280k
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
280k
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
280k
        }
214
215
280k
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
280k
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
280k
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
280k
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
280k
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
280k
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
280k
            StoreInterleaved(d, even, odd,
222
280k
                             p_out + ((x + i) << 1) + onerow_out * y);
223
280k
          }
224
280k
        }
225
280k
      }
226
280k
    }
227
280k
#endif  // HWY_TARGET != HWY_SCALAR
228
280k
    for (size_t y = 0; y < rows; y++) {
229
280k
      unsqueeze_row(y0 + y, x);
230
280k
    }
231
280k
    return true;
232
280k
  };
233
280k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
234
280k
                                ThreadPool::NoInit, unsqueeze_span,
235
280k
                                "InvHorizontalSqueeze"));
236
280k
  input.channel[c] = std::move(chout);
237
280k
  return true;
238
280k
}
jxl::N_SSE2::InvHSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
127
35.1k
Status InvHSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
128
35.1k
  JXL_ENSURE(c < input.channel.size());
129
35.1k
  JXL_ENSURE(rc < input.channel.size());
130
35.1k
  Channel &chin = input.channel[c];
131
35.1k
  const Channel &chin_residual = input.channel[rc];
132
  // These must be valid since we ran MetaApply already.
133
35.1k
  JXL_ENSURE(chin.w == DivCeil(chin.w + chin_residual.w, 2));
134
35.1k
  JXL_ENSURE(chin.h == chin_residual.h);
135
35.1k
  JxlMemoryManager *memory_manager = input.memory_manager();
136
137
35.1k
  if (chin_residual.w == 0) {
138
    // Short-circuit: output channel has same dimensions as input.
139
1.05k
    input.channel[c].hshift--;
140
1.05k
    return true;
141
1.05k
  }
142
143
  // Note: chin.w >= chin_residual.w and at most 1 different.
144
68.1k
  JXL_ASSIGN_OR_RETURN(Channel chout,
145
68.1k
                       Channel::Create(memory_manager, chin.w + chin_residual.w,
146
68.1k
                                       chin.h, chin.hshift - 1, chin.vshift));
147
68.1k
  JXL_DEBUG_V(4,
148
68.1k
              "Undoing horizontal squeeze of channel %i using residuals in "
149
68.1k
              "channel %i (going from width %" PRIuS " to %" PRIuS ")",
150
68.1k
              c, rc, chin.w, chout.w);
151
152
68.1k
  if (chin_residual.h == 0) {
153
    // Short-circuit: channel with no pixels.
154
0
    input.channel[c] = std::move(chout);
155
0
    return true;
156
0
  }
157
34.0k
  auto unsqueeze_row = [&](size_t y, size_t x0) {
158
34.0k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y);
159
34.0k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y);
160
34.0k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y);
161
34.0k
    for (size_t x = x0; x < chin_residual.w; x++) {
162
34.0k
      pixel_type_w diff_minus_tendency = p_residual[x];
163
34.0k
      pixel_type_w avg = p_avg[x];
164
34.0k
      pixel_type_w next_avg = (x + 1 < chin.w ? p_avg[x + 1] : avg);
165
34.0k
      pixel_type_w left = (x ? p_out[(x << 1) - 1] : avg);
166
34.0k
      pixel_type_w tendency = SmoothTendency(left, avg, next_avg);
167
34.0k
      pixel_type_w diff = diff_minus_tendency + tendency;
168
34.0k
      pixel_type_w A = avg + (diff / 2);
169
34.0k
      p_out[(x << 1)] = A;
170
34.0k
      pixel_type_w B = A - diff;
171
34.0k
      p_out[(x << 1) + 1] = B;
172
34.0k
    }
173
34.0k
    if (chout.w & 1) p_out[chout.w - 1] = p_avg[chin.w - 1];
174
34.0k
  };
175
176
  // somewhat complicated trickery just to be able to SIMD this.
177
  // Horizontal unsqueeze has horizontal data dependencies, so we do
178
  // 8 rows at a time and treat it as a vertical unsqueeze of a
179
  // transposed 8x8 block (or 9x8 for one input).
180
34.0k
  static constexpr const size_t kRowsPerThread = 8;
181
34.0k
  const auto unsqueeze_span = [&](const uint32_t task,
182
34.0k
                                  size_t /* thread */) -> Status {
183
34.0k
    const size_t y0 = task * kRowsPerThread;
184
34.0k
    const size_t rows = std::min(kRowsPerThread, chin.h - y0);
185
34.0k
    size_t x = 0;
186
187
34.0k
#if HWY_TARGET != HWY_SCALAR
188
34.0k
    intptr_t onerow_in = chin.plane.PixelsPerRow();
189
34.0k
    intptr_t onerow_inr = chin_residual.plane.PixelsPerRow();
190
34.0k
    intptr_t onerow_out = chout.plane.PixelsPerRow();
191
34.0k
    const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y0);
192
34.0k
    const pixel_type *JXL_RESTRICT p_avg = chin.Row(y0);
193
34.0k
    pixel_type *JXL_RESTRICT p_out = chout.Row(y0);
194
34.0k
    HWY_ALIGN pixel_type b_p_avg[9 * kRowsPerThread];
195
34.0k
    HWY_ALIGN pixel_type b_p_residual[8 * kRowsPerThread];
196
34.0k
    HWY_ALIGN pixel_type b_p_out_even[8 * kRowsPerThread];
197
34.0k
    HWY_ALIGN pixel_type b_p_out_odd[8 * kRowsPerThread];
198
34.0k
    HWY_ALIGN pixel_type b_p_out_evenT[8 * kRowsPerThread];
199
34.0k
    HWY_ALIGN pixel_type b_p_out_oddT[8 * kRowsPerThread];
200
34.0k
    const size_t N = Lanes(d);
201
34.0k
    if (chin_residual.w > 16 && rows == kRowsPerThread) {
202
34.0k
      for (; x < chin_residual.w - 9; x += 8) {
203
34.0k
        Transpose8x8Block(p_residual + x, b_p_residual, onerow_inr);
204
34.0k
        Transpose8x8Block(p_avg + x, b_p_avg, onerow_in);
205
34.0k
        for (size_t y = 0; y < kRowsPerThread; y++) {
206
34.0k
          b_p_avg[8 * 8 + y] = p_avg[x + 8 + onerow_in * y];
207
34.0k
        }
208
34.0k
        for (size_t i = 0; i < 8; i++) {
209
34.0k
          FastUnsqueeze(
210
34.0k
              b_p_residual + 8 * i, b_p_avg + 8 * i, b_p_avg + 8 * (i + 1),
211
34.0k
              (x + i ? b_p_out_odd + 8 * ((x + i - 1) & 7) : b_p_avg + 8 * i),
212
34.0k
              b_p_out_even + 8 * i, b_p_out_odd + 8 * i);
213
34.0k
        }
214
215
34.0k
        Transpose8x8Block(b_p_out_even, b_p_out_evenT, 8);
216
34.0k
        Transpose8x8Block(b_p_out_odd, b_p_out_oddT, 8);
217
34.0k
        for (size_t y = 0; y < kRowsPerThread; y++) {
218
34.0k
          for (size_t i = 0; i < kRowsPerThread; i += N) {
219
34.0k
            auto even = Load(d, b_p_out_evenT + 8 * y + i);
220
34.0k
            auto odd = Load(d, b_p_out_oddT + 8 * y + i);
221
34.0k
            StoreInterleaved(d, even, odd,
222
34.0k
                             p_out + ((x + i) << 1) + onerow_out * y);
223
34.0k
          }
224
34.0k
        }
225
34.0k
      }
226
34.0k
    }
227
34.0k
#endif  // HWY_TARGET != HWY_SCALAR
228
34.0k
    for (size_t y = 0; y < rows; y++) {
229
34.0k
      unsqueeze_row(y0 + y, x);
230
34.0k
    }
231
34.0k
    return true;
232
34.0k
  };
233
34.0k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.h, kRowsPerThread),
234
34.0k
                                ThreadPool::NoInit, unsqueeze_span,
235
34.0k
                                "InvHorizontalSqueeze"));
236
34.0k
  input.channel[c] = std::move(chout);
237
34.0k
  return true;
238
34.0k
}
239
240
359k
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
241
359k
  JXL_ENSURE(c < input.channel.size());
242
359k
  JXL_ENSURE(rc < input.channel.size());
243
359k
  const Channel &chin = input.channel[c];
244
359k
  const Channel &chin_residual = input.channel[rc];
245
  // These must be valid since we ran MetaApply already.
246
359k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
247
359k
  JXL_ENSURE(chin.w == chin_residual.w);
248
359k
  JxlMemoryManager *memory_manager = input.memory_manager();
249
250
359k
  if (chin_residual.h == 0) {
251
    // Short-circuit: output channel has same dimensions as input.
252
13.6k
    input.channel[c].vshift--;
253
13.6k
    return true;
254
13.6k
  }
255
256
  // Note: chin.h >= chin_residual.h and at most 1 different.
257
690k
  JXL_ASSIGN_OR_RETURN(
258
690k
      Channel chout,
259
690k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
260
690k
                      chin.hshift, chin.vshift - 1));
261
690k
  JXL_DEBUG_V(
262
690k
      4,
263
690k
      "Undoing vertical squeeze of channel %i using residuals in channel "
264
690k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
265
690k
      c, rc, chin.h, chout.h);
266
267
690k
  if (chin_residual.w == 0) {
268
    // Short-circuit: channel with no pixels.
269
0
    input.channel[c] = std::move(chout);
270
0
    return true;
271
0
  }
272
273
345k
  static constexpr const int kColsPerThread = 64;
274
345k
  const auto unsqueeze_slice = [&](const uint32_t task,
275
569k
                                   size_t /* thread */) -> Status {
276
569k
    const size_t x0 = task * kColsPerThread;
277
569k
    const size_t x1 =
278
569k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
569k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
138M
    for (size_t y = 0; y < chin_residual.h; y++) {
283
138M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
138M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
138M
      const pixel_type *JXL_RESTRICT p_navg =
286
138M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
138M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
138M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
138M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
138M
      size_t x = 0;
291
138M
#if HWY_TARGET != HWY_SCALAR
292
335M
      for (; x + 7 < w; x += 8) {
293
196M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
196M
                      p_out + x, p_nout + x);
295
196M
      }
296
138M
#endif
297
467M
      for (; x < w; x++) {
298
329M
        pixel_type_w avg = p_avg[x];
299
329M
        pixel_type_w next_avg = p_navg[x];
300
329M
        pixel_type_w top = p_pout[x];
301
329M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
329M
        pixel_type_w diff_minus_tendency = p_residual[x];
303
329M
        pixel_type_w diff = diff_minus_tendency + tendency;
304
329M
        pixel_type_w out = avg + (diff / 2);
305
329M
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
329M
        p_nout[x] = out - diff;
310
329M
      }
311
138M
    }
312
569k
    return true;
313
569k
  };
squeeze.cc:jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
275
39.7k
                                   size_t /* thread */) -> Status {
276
39.7k
    const size_t x0 = task * kColsPerThread;
277
39.7k
    const size_t x1 =
278
39.7k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
39.7k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
1.26M
    for (size_t y = 0; y < chin_residual.h; y++) {
283
1.22M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
1.22M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
1.22M
      const pixel_type *JXL_RESTRICT p_navg =
286
1.22M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
1.22M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
1.22M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
1.22M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
1.22M
      size_t x = 0;
291
1.22M
#if HWY_TARGET != HWY_SCALAR
292
9.22M
      for (; x + 7 < w; x += 8) {
293
8.00M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
8.00M
                      p_out + x, p_nout + x);
295
8.00M
      }
296
1.22M
#endif
297
2.81M
      for (; x < w; x++) {
298
1.59M
        pixel_type_w avg = p_avg[x];
299
1.59M
        pixel_type_w next_avg = p_navg[x];
300
1.59M
        pixel_type_w top = p_pout[x];
301
1.59M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
1.59M
        pixel_type_w diff_minus_tendency = p_residual[x];
303
1.59M
        pixel_type_w diff = diff_minus_tendency + tendency;
304
1.59M
        pixel_type_w out = avg + (diff / 2);
305
1.59M
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
1.59M
        p_nout[x] = out - diff;
310
1.59M
      }
311
1.22M
    }
312
39.7k
    return true;
313
39.7k
  };
squeeze.cc:jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
275
491k
                                   size_t /* thread */) -> Status {
276
491k
    const size_t x0 = task * kColsPerThread;
277
491k
    const size_t x1 =
278
491k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
491k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
136M
    for (size_t y = 0; y < chin_residual.h; y++) {
283
135M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
135M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
135M
      const pixel_type *JXL_RESTRICT p_navg =
286
135M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
135M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
135M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
135M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
135M
      size_t x = 0;
291
135M
#if HWY_TARGET != HWY_SCALAR
292
315M
      for (; x + 7 < w; x += 8) {
293
179M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
179M
                      p_out + x, p_nout + x);
295
179M
      }
296
135M
#endif
297
462M
      for (; x < w; x++) {
298
326M
        pixel_type_w avg = p_avg[x];
299
326M
        pixel_type_w next_avg = p_navg[x];
300
326M
        pixel_type_w top = p_pout[x];
301
326M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
326M
        pixel_type_w diff_minus_tendency = p_residual[x];
303
326M
        pixel_type_w diff = diff_minus_tendency + tendency;
304
326M
        pixel_type_w out = avg + (diff / 2);
305
326M
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
326M
        p_nout[x] = out - diff;
310
326M
      }
311
135M
    }
312
491k
    return true;
313
491k
  };
squeeze.cc:jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
275
38.6k
                                   size_t /* thread */) -> Status {
276
38.6k
    const size_t x0 = task * kColsPerThread;
277
38.6k
    const size_t x1 =
278
38.6k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
38.6k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
1.37M
    for (size_t y = 0; y < chin_residual.h; y++) {
283
1.33M
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
1.33M
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
1.33M
      const pixel_type *JXL_RESTRICT p_navg =
286
1.33M
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
1.33M
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
1.33M
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
1.33M
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
1.33M
      size_t x = 0;
291
1.33M
#if HWY_TARGET != HWY_SCALAR
292
10.7M
      for (; x + 7 < w; x += 8) {
293
9.44M
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
9.44M
                      p_out + x, p_nout + x);
295
9.44M
      }
296
1.33M
#endif
297
2.37M
      for (; x < w; x++) {
298
1.03M
        pixel_type_w avg = p_avg[x];
299
1.03M
        pixel_type_w next_avg = p_navg[x];
300
1.03M
        pixel_type_w top = p_pout[x];
301
1.03M
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
1.03M
        pixel_type_w diff_minus_tendency = p_residual[x];
303
1.03M
        pixel_type_w diff = diff_minus_tendency + tendency;
304
1.03M
        pixel_type_w out = avg + (diff / 2);
305
1.03M
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
1.03M
        p_nout[x] = out - diff;
310
1.03M
      }
311
1.33M
    }
312
38.6k
    return true;
313
38.6k
  };
314
345k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
315
345k
                                ThreadPool::NoInit, unsqueeze_slice,
316
345k
                                "InvVertSqueeze"));
317
318
345k
  if (chout.h & 1) {
319
99.1k
    size_t y = chin.h - 1;
320
99.1k
    const pixel_type *p_avg = chin.Row(y);
321
99.1k
    pixel_type *p_out = chout.Row(y << 1);
322
8.55M
    for (size_t x = 0; x < chin.w; x++) {
323
8.45M
      p_out[x] = p_avg[x];
324
8.45M
    }
325
99.1k
  }
326
345k
  input.channel[c] = std::move(chout);
327
345k
  return true;
328
345k
}
jxl::N_SSE4::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
240
34.3k
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
241
34.3k
  JXL_ENSURE(c < input.channel.size());
242
34.3k
  JXL_ENSURE(rc < input.channel.size());
243
34.3k
  const Channel &chin = input.channel[c];
244
34.3k
  const Channel &chin_residual = input.channel[rc];
245
  // These must be valid since we ran MetaApply already.
246
34.3k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
247
34.3k
  JXL_ENSURE(chin.w == chin_residual.w);
248
34.3k
  JxlMemoryManager *memory_manager = input.memory_manager();
249
250
34.3k
  if (chin_residual.h == 0) {
251
    // Short-circuit: output channel has same dimensions as input.
252
1.51k
    input.channel[c].vshift--;
253
1.51k
    return true;
254
1.51k
  }
255
256
  // Note: chin.h >= chin_residual.h and at most 1 different.
257
65.7k
  JXL_ASSIGN_OR_RETURN(
258
65.7k
      Channel chout,
259
65.7k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
260
65.7k
                      chin.hshift, chin.vshift - 1));
261
65.7k
  JXL_DEBUG_V(
262
65.7k
      4,
263
65.7k
      "Undoing vertical squeeze of channel %i using residuals in channel "
264
65.7k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
265
65.7k
      c, rc, chin.h, chout.h);
266
267
65.7k
  if (chin_residual.w == 0) {
268
    // Short-circuit: channel with no pixels.
269
0
    input.channel[c] = std::move(chout);
270
0
    return true;
271
0
  }
272
273
32.8k
  static constexpr const int kColsPerThread = 64;
274
32.8k
  const auto unsqueeze_slice = [&](const uint32_t task,
275
32.8k
                                   size_t /* thread */) -> Status {
276
32.8k
    const size_t x0 = task * kColsPerThread;
277
32.8k
    const size_t x1 =
278
32.8k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
32.8k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
32.8k
    for (size_t y = 0; y < chin_residual.h; y++) {
283
32.8k
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
32.8k
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
32.8k
      const pixel_type *JXL_RESTRICT p_navg =
286
32.8k
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
32.8k
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
32.8k
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
32.8k
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
32.8k
      size_t x = 0;
291
32.8k
#if HWY_TARGET != HWY_SCALAR
292
32.8k
      for (; x + 7 < w; x += 8) {
293
32.8k
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
32.8k
                      p_out + x, p_nout + x);
295
32.8k
      }
296
32.8k
#endif
297
32.8k
      for (; x < w; x++) {
298
32.8k
        pixel_type_w avg = p_avg[x];
299
32.8k
        pixel_type_w next_avg = p_navg[x];
300
32.8k
        pixel_type_w top = p_pout[x];
301
32.8k
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
32.8k
        pixel_type_w diff_minus_tendency = p_residual[x];
303
32.8k
        pixel_type_w diff = diff_minus_tendency + tendency;
304
32.8k
        pixel_type_w out = avg + (diff / 2);
305
32.8k
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
32.8k
        p_nout[x] = out - diff;
310
32.8k
      }
311
32.8k
    }
312
32.8k
    return true;
313
32.8k
  };
314
32.8k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
315
32.8k
                                ThreadPool::NoInit, unsqueeze_slice,
316
32.8k
                                "InvVertSqueeze"));
317
318
32.8k
  if (chout.h & 1) {
319
12.8k
    size_t y = chin.h - 1;
320
12.8k
    const pixel_type *p_avg = chin.Row(y);
321
12.8k
    pixel_type *p_out = chout.Row(y << 1);
322
481k
    for (size_t x = 0; x < chin.w; x++) {
323
468k
      p_out[x] = p_avg[x];
324
468k
    }
325
12.8k
  }
326
32.8k
  input.channel[c] = std::move(chout);
327
32.8k
  return true;
328
32.8k
}
jxl::N_AVX2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
240
292k
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
241
292k
  JXL_ENSURE(c < input.channel.size());
242
292k
  JXL_ENSURE(rc < input.channel.size());
243
292k
  const Channel &chin = input.channel[c];
244
292k
  const Channel &chin_residual = input.channel[rc];
245
  // These must be valid since we ran MetaApply already.
246
292k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
247
292k
  JXL_ENSURE(chin.w == chin_residual.w);
248
292k
  JxlMemoryManager *memory_manager = input.memory_manager();
249
250
292k
  if (chin_residual.h == 0) {
251
    // Short-circuit: output channel has same dimensions as input.
252
10.4k
    input.channel[c].vshift--;
253
10.4k
    return true;
254
10.4k
  }
255
256
  // Note: chin.h >= chin_residual.h and at most 1 different.
257
564k
  JXL_ASSIGN_OR_RETURN(
258
564k
      Channel chout,
259
564k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
260
564k
                      chin.hshift, chin.vshift - 1));
261
564k
  JXL_DEBUG_V(
262
564k
      4,
263
564k
      "Undoing vertical squeeze of channel %i using residuals in channel "
264
564k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
265
564k
      c, rc, chin.h, chout.h);
266
267
564k
  if (chin_residual.w == 0) {
268
    // Short-circuit: channel with no pixels.
269
0
    input.channel[c] = std::move(chout);
270
0
    return true;
271
0
  }
272
273
282k
  static constexpr const int kColsPerThread = 64;
274
282k
  const auto unsqueeze_slice = [&](const uint32_t task,
275
282k
                                   size_t /* thread */) -> Status {
276
282k
    const size_t x0 = task * kColsPerThread;
277
282k
    const size_t x1 =
278
282k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
282k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
282k
    for (size_t y = 0; y < chin_residual.h; y++) {
283
282k
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
282k
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
282k
      const pixel_type *JXL_RESTRICT p_navg =
286
282k
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
282k
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
282k
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
282k
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
282k
      size_t x = 0;
291
282k
#if HWY_TARGET != HWY_SCALAR
292
282k
      for (; x + 7 < w; x += 8) {
293
282k
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
282k
                      p_out + x, p_nout + x);
295
282k
      }
296
282k
#endif
297
282k
      for (; x < w; x++) {
298
282k
        pixel_type_w avg = p_avg[x];
299
282k
        pixel_type_w next_avg = p_navg[x];
300
282k
        pixel_type_w top = p_pout[x];
301
282k
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
282k
        pixel_type_w diff_minus_tendency = p_residual[x];
303
282k
        pixel_type_w diff = diff_minus_tendency + tendency;
304
282k
        pixel_type_w out = avg + (diff / 2);
305
282k
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
282k
        p_nout[x] = out - diff;
310
282k
      }
311
282k
    }
312
282k
    return true;
313
282k
  };
314
282k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
315
282k
                                ThreadPool::NoInit, unsqueeze_slice,
316
282k
                                "InvVertSqueeze"));
317
318
282k
  if (chout.h & 1) {
319
76.7k
    size_t y = chin.h - 1;
320
76.7k
    const pixel_type *p_avg = chin.Row(y);
321
76.7k
    pixel_type *p_out = chout.Row(y << 1);
322
7.73M
    for (size_t x = 0; x < chin.w; x++) {
323
7.65M
      p_out[x] = p_avg[x];
324
7.65M
    }
325
76.7k
  }
326
282k
  input.channel[c] = std::move(chout);
327
282k
  return true;
328
282k
}
jxl::N_SSE2::InvVSqueeze(jxl::Image&, unsigned int, unsigned int, jxl::ThreadPool*)
Line
Count
Source
240
32.0k
Status InvVSqueeze(Image &input, uint32_t c, uint32_t rc, ThreadPool *pool) {
241
32.0k
  JXL_ENSURE(c < input.channel.size());
242
32.0k
  JXL_ENSURE(rc < input.channel.size());
243
32.0k
  const Channel &chin = input.channel[c];
244
32.0k
  const Channel &chin_residual = input.channel[rc];
245
  // These must be valid since we ran MetaApply already.
246
32.0k
  JXL_ENSURE(chin.h == DivCeil(chin.h + chin_residual.h, 2));
247
32.0k
  JXL_ENSURE(chin.w == chin_residual.w);
248
32.0k
  JxlMemoryManager *memory_manager = input.memory_manager();
249
250
32.0k
  if (chin_residual.h == 0) {
251
    // Short-circuit: output channel has same dimensions as input.
252
1.75k
    input.channel[c].vshift--;
253
1.75k
    return true;
254
1.75k
  }
255
256
  // Note: chin.h >= chin_residual.h and at most 1 different.
257
60.6k
  JXL_ASSIGN_OR_RETURN(
258
60.6k
      Channel chout,
259
60.6k
      Channel::Create(memory_manager, chin.w, chin.h + chin_residual.h,
260
60.6k
                      chin.hshift, chin.vshift - 1));
261
60.6k
  JXL_DEBUG_V(
262
60.6k
      4,
263
60.6k
      "Undoing vertical squeeze of channel %i using residuals in channel "
264
60.6k
      "%i (going from height %" PRIuS " to %" PRIuS ")",
265
60.6k
      c, rc, chin.h, chout.h);
266
267
60.6k
  if (chin_residual.w == 0) {
268
    // Short-circuit: channel with no pixels.
269
0
    input.channel[c] = std::move(chout);
270
0
    return true;
271
0
  }
272
273
30.3k
  static constexpr const int kColsPerThread = 64;
274
30.3k
  const auto unsqueeze_slice = [&](const uint32_t task,
275
30.3k
                                   size_t /* thread */) -> Status {
276
30.3k
    const size_t x0 = task * kColsPerThread;
277
30.3k
    const size_t x1 =
278
30.3k
        std::min(static_cast<size_t>(task + 1) * kColsPerThread, chin.w);
279
30.3k
    const size_t w = x1 - x0;
280
    // We only iterate up to std::min(chin_residual.h, chin.h) which is
281
    // always chin_residual.h.
282
30.3k
    for (size_t y = 0; y < chin_residual.h; y++) {
283
30.3k
      const pixel_type *JXL_RESTRICT p_residual = chin_residual.Row(y) + x0;
284
30.3k
      const pixel_type *JXL_RESTRICT p_avg = chin.Row(y) + x0;
285
30.3k
      const pixel_type *JXL_RESTRICT p_navg =
286
30.3k
          chin.Row(y + 1 < chin.h ? y + 1 : y) + x0;
287
30.3k
      pixel_type *JXL_RESTRICT p_out = chout.Row(y << 1) + x0;
288
30.3k
      pixel_type *JXL_RESTRICT p_nout = chout.Row((y << 1) + 1) + x0;
289
30.3k
      const pixel_type *p_pout = y > 0 ? chout.Row((y << 1) - 1) + x0 : p_avg;
290
30.3k
      size_t x = 0;
291
30.3k
#if HWY_TARGET != HWY_SCALAR
292
30.3k
      for (; x + 7 < w; x += 8) {
293
30.3k
        FastUnsqueeze(p_residual + x, p_avg + x, p_navg + x, p_pout + x,
294
30.3k
                      p_out + x, p_nout + x);
295
30.3k
      }
296
30.3k
#endif
297
30.3k
      for (; x < w; x++) {
298
30.3k
        pixel_type_w avg = p_avg[x];
299
30.3k
        pixel_type_w next_avg = p_navg[x];
300
30.3k
        pixel_type_w top = p_pout[x];
301
30.3k
        pixel_type_w tendency = SmoothTendency(top, avg, next_avg);
302
30.3k
        pixel_type_w diff_minus_tendency = p_residual[x];
303
30.3k
        pixel_type_w diff = diff_minus_tendency + tendency;
304
30.3k
        pixel_type_w out = avg + (diff / 2);
305
30.3k
        p_out[x] = out;
306
        // If the chin_residual.h == chin.h, the output has an even number
307
        // of rows so the next line is fine. Otherwise, this loop won't
308
        // write to the last output row which is handled separately.
309
30.3k
        p_nout[x] = out - diff;
310
30.3k
      }
311
30.3k
    }
312
30.3k
    return true;
313
30.3k
  };
314
30.3k
  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, DivCeil(chin.w, kColsPerThread),
315
30.3k
                                ThreadPool::NoInit, unsqueeze_slice,
316
30.3k
                                "InvVertSqueeze"));
317
318
30.3k
  if (chout.h & 1) {
319
9.50k
    size_t y = chin.h - 1;
320
9.50k
    const pixel_type *p_avg = chin.Row(y);
321
9.50k
    pixel_type *p_out = chout.Row(y << 1);
322
338k
    for (size_t x = 0; x < chin.w; x++) {
323
329k
      p_out[x] = p_avg[x];
324
329k
    }
325
9.50k
  }
326
30.3k
  input.channel[c] = std::move(chout);
327
30.3k
  return true;
328
30.3k
}
329
330
Status InvSqueeze(Image &input, const std::vector<SqueezeParams> &parameters,
331
44.0k
                  ThreadPool *pool) {
332
243k
  for (int i = parameters.size() - 1; i >= 0; i--) {
333
199k
    JXL_RETURN_IF_ERROR(
334
199k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
335
199k
    bool horizontal = parameters[i].horizontal;
336
199k
    bool in_place = parameters[i].in_place;
337
199k
    uint32_t beginc = parameters[i].begin_c;
338
199k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
339
199k
    uint32_t offset;
340
199k
    if (in_place) {
341
128k
      offset = endc + 1;
342
128k
    } else {
343
71.1k
      offset = input.channel.size() + beginc - endc - 1;
344
71.1k
    }
345
199k
    if (beginc < input.nb_meta_channels) {
346
      // This is checked in MetaSqueeze.
347
299
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
348
299
      input.nb_meta_channels -= parameters[i].num_c;
349
299
    }
350
351
920k
    for (uint32_t c = beginc; c <= endc; c++) {
352
720k
      uint32_t rc = offset + c - beginc;
353
      // MetaApply should imply that `rc` is within range, otherwise there's a
354
      // programming bug.
355
720k
      JXL_ENSURE(rc < input.channel.size());
356
720k
      if ((input.channel[c].w < input.channel[rc].w) ||
357
720k
          (input.channel[c].h < input.channel[rc].h)) {
358
0
        return JXL_FAILURE("Corrupted squeeze transform");
359
0
      }
360
720k
      if (horizontal) {
361
361k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
362
361k
      } else {
363
359k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
364
359k
      }
365
720k
    }
366
199k
    input.channel.erase(input.channel.begin() + offset,
367
199k
                        input.channel.begin() + offset + (endc - beginc + 1));
368
199k
  }
369
44.0k
  return true;
370
44.0k
}
jxl::N_SSE4::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Line
Count
Source
331
5.24k
                  ThreadPool *pool) {
332
28.7k
  for (int i = parameters.size() - 1; i >= 0; i--) {
333
23.5k
    JXL_RETURN_IF_ERROR(
334
23.5k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
335
23.5k
    bool horizontal = parameters[i].horizontal;
336
23.5k
    bool in_place = parameters[i].in_place;
337
23.5k
    uint32_t beginc = parameters[i].begin_c;
338
23.5k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
339
23.5k
    uint32_t offset;
340
23.5k
    if (in_place) {
341
14.8k
      offset = endc + 1;
342
14.8k
    } else {
343
8.64k
      offset = input.channel.size() + beginc - endc - 1;
344
8.64k
    }
345
23.5k
    if (beginc < input.nb_meta_channels) {
346
      // This is checked in MetaSqueeze.
347
54
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
348
54
      input.nb_meta_channels -= parameters[i].num_c;
349
54
    }
350
351
95.6k
    for (uint32_t c = beginc; c <= endc; c++) {
352
72.1k
      uint32_t rc = offset + c - beginc;
353
      // MetaApply should imply that `rc` is within range, otherwise there's a
354
      // programming bug.
355
72.1k
      JXL_ENSURE(rc < input.channel.size());
356
72.1k
      if ((input.channel[c].w < input.channel[rc].w) ||
357
72.1k
          (input.channel[c].h < input.channel[rc].h)) {
358
0
        return JXL_FAILURE("Corrupted squeeze transform");
359
0
      }
360
72.1k
      if (horizontal) {
361
37.7k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
362
37.7k
      } else {
363
34.3k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
364
34.3k
      }
365
72.1k
    }
366
23.5k
    input.channel.erase(input.channel.begin() + offset,
367
23.5k
                        input.channel.begin() + offset + (endc - beginc + 1));
368
23.5k
  }
369
5.24k
  return true;
370
5.24k
}
jxl::N_AVX2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Line
Count
Source
331
33.0k
                  ThreadPool *pool) {
332
187k
  for (int i = parameters.size() - 1; i >= 0; i--) {
333
154k
    JXL_RETURN_IF_ERROR(
334
154k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
335
154k
    bool horizontal = parameters[i].horizontal;
336
154k
    bool in_place = parameters[i].in_place;
337
154k
    uint32_t beginc = parameters[i].begin_c;
338
154k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
339
154k
    uint32_t offset;
340
154k
    if (in_place) {
341
102k
      offset = endc + 1;
342
102k
    } else {
343
52.5k
      offset = input.channel.size() + beginc - endc - 1;
344
52.5k
    }
345
154k
    if (beginc < input.nb_meta_channels) {
346
      // This is checked in MetaSqueeze.
347
207
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
348
207
      input.nb_meta_channels -= parameters[i].num_c;
349
207
    }
350
351
735k
    for (uint32_t c = beginc; c <= endc; c++) {
352
581k
      uint32_t rc = offset + c - beginc;
353
      // MetaApply should imply that `rc` is within range, otherwise there's a
354
      // programming bug.
355
581k
      JXL_ENSURE(rc < input.channel.size());
356
581k
      if ((input.channel[c].w < input.channel[rc].w) ||
357
581k
          (input.channel[c].h < input.channel[rc].h)) {
358
0
        return JXL_FAILURE("Corrupted squeeze transform");
359
0
      }
360
581k
      if (horizontal) {
361
288k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
362
292k
      } else {
363
292k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
364
292k
      }
365
581k
    }
366
154k
    input.channel.erase(input.channel.begin() + offset,
367
154k
                        input.channel.begin() + offset + (endc - beginc + 1));
368
154k
  }
369
33.0k
  return true;
370
33.0k
}
jxl::N_SSE2::InvSqueeze(jxl::Image&, std::__1::vector<jxl::SqueezeParams, std::__1::allocator<jxl::SqueezeParams> > const&, jxl::ThreadPool*)
Line
Count
Source
331
5.74k
                  ThreadPool *pool) {
332
27.4k
  for (int i = parameters.size() - 1; i >= 0; i--) {
333
21.6k
    JXL_RETURN_IF_ERROR(
334
21.6k
        CheckMetaSqueezeParams(parameters[i], input.channel.size()));
335
21.6k
    bool horizontal = parameters[i].horizontal;
336
21.6k
    bool in_place = parameters[i].in_place;
337
21.6k
    uint32_t beginc = parameters[i].begin_c;
338
21.6k
    uint32_t endc = parameters[i].begin_c + parameters[i].num_c - 1;
339
21.6k
    uint32_t offset;
340
21.6k
    if (in_place) {
341
11.7k
      offset = endc + 1;
342
11.7k
    } else {
343
9.95k
      offset = input.channel.size() + beginc - endc - 1;
344
9.95k
    }
345
21.6k
    if (beginc < input.nb_meta_channels) {
346
      // This is checked in MetaSqueeze.
347
38
      JXL_ENSURE(input.nb_meta_channels > parameters[i].num_c);
348
38
      input.nb_meta_channels -= parameters[i].num_c;
349
38
    }
350
351
88.8k
    for (uint32_t c = beginc; c <= endc; c++) {
352
67.2k
      uint32_t rc = offset + c - beginc;
353
      // MetaApply should imply that `rc` is within range, otherwise there's a
354
      // programming bug.
355
67.2k
      JXL_ENSURE(rc < input.channel.size());
356
67.2k
      if ((input.channel[c].w < input.channel[rc].w) ||
357
67.2k
          (input.channel[c].h < input.channel[rc].h)) {
358
0
        return JXL_FAILURE("Corrupted squeeze transform");
359
0
      }
360
67.2k
      if (horizontal) {
361
35.1k
        JXL_RETURN_IF_ERROR(InvHSqueeze(input, c, rc, pool));
362
35.1k
      } else {
363
32.0k
        JXL_RETURN_IF_ERROR(InvVSqueeze(input, c, rc, pool));
364
32.0k
      }
365
67.2k
    }
366
21.6k
    input.channel.erase(input.channel.begin() + offset,
367
21.6k
                        input.channel.begin() + offset + (endc - beginc + 1));
368
21.6k
  }
369
5.74k
  return true;
370
5.74k
}
371
372
}  // namespace HWY_NAMESPACE
373
}  // namespace jxl
374
HWY_AFTER_NAMESPACE();
375
376
#if HWY_ONCE
377
378
namespace jxl {
379
380
HWY_EXPORT(InvSqueeze);
381
Status InvSqueeze(Image &input, const std::vector<SqueezeParams> &parameters,
382
44.0k
                  ThreadPool *pool) {
383
44.0k
  return HWY_DYNAMIC_DISPATCH(InvSqueeze)(input, parameters, pool);
384
44.0k
}
385
386
void DefaultSqueezeParameters(std::vector<SqueezeParams> *parameters,
387
43.4k
                              const Image &image) {
388
43.4k
  int nb_channels = image.channel.size() - image.nb_meta_channels;
389
390
43.4k
  parameters->clear();
391
43.4k
  size_t w = image.channel[image.nb_meta_channels].w;
392
43.4k
  size_t h = image.channel[image.nb_meta_channels].h;
393
43.4k
  JXL_DEBUG_V(
394
43.4k
      7, "Default squeeze parameters for %" PRIuS "x%" PRIuS " image: ", w, h);
395
396
  // do horizontal first on wide images; vertical first on tall images
397
43.4k
  bool wide = (w > h);
398
399
43.4k
  if (nb_channels > 2 && image.channel[image.nb_meta_channels + 1].w == w &&
400
43.4k
      image.channel[image.nb_meta_channels + 1].h == h) {
401
    // assume channels 1 and 2 are chroma, and can be squeezed first for 4:2:0
402
    // previews
403
30.3k
    JXL_DEBUG_V(7, "(4:2:0 chroma), %" PRIuS "x%" PRIuS " image", w, h);
404
30.3k
    SqueezeParams params;
405
    // horizontal chroma squeeze
406
30.3k
    params.horizontal = true;
407
30.3k
    params.in_place = false;
408
30.3k
    params.begin_c = image.nb_meta_channels + 1;
409
30.3k
    params.num_c = 2;
410
30.3k
    parameters->push_back(params);
411
30.3k
    params.horizontal = false;
412
    // vertical chroma squeeze
413
30.3k
    parameters->push_back(params);
414
30.3k
  }
415
43.4k
  SqueezeParams params;
416
43.4k
  params.begin_c = image.nb_meta_channels;
417
43.4k
  params.num_c = nb_channels;
418
43.4k
  params.in_place = true;
419
420
43.4k
  if (!wide) {
421
21.1k
    if (h > kMaxFirstPreviewSize) {
422
8.94k
      params.horizontal = false;
423
8.94k
      parameters->push_back(params);
424
8.94k
      h = (h + 1) / 2;
425
8.94k
      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
426
8.94k
    }
427
21.1k
  }
428
119k
  while (w > kMaxFirstPreviewSize || h > kMaxFirstPreviewSize) {
429
76.3k
    if (w > kMaxFirstPreviewSize) {
430
69.9k
      params.horizontal = true;
431
69.9k
      parameters->push_back(params);
432
69.9k
      w = (w + 1) / 2;
433
69.9k
      JXL_DEBUG_V(7, "Horizontal (%" PRIuS "x%" PRIuS "), ", w, h);
434
69.9k
    }
435
76.3k
    if (h > kMaxFirstPreviewSize) {
436
51.7k
      params.horizontal = false;
437
51.7k
      parameters->push_back(params);
438
51.7k
      h = (h + 1) / 2;
439
51.7k
      JXL_DEBUG_V(7, "Vertical (%" PRIuS "x%" PRIuS "), ", w, h);
440
51.7k
    }
441
76.3k
  }
442
43.4k
  JXL_DEBUG_V(7, "that's it");
443
43.4k
}
444
445
Status CheckMetaSqueezeParams(const SqueezeParams &parameter,
446
409k
                              int num_channels) {
447
409k
  int c1 = parameter.begin_c;
448
409k
  int c2 = parameter.begin_c + parameter.num_c - 1;
449
409k
  if (c1 < 0 || c1 >= num_channels || c2 < 0 || c2 >= num_channels || c2 < c1) {
450
120
    return JXL_FAILURE("Invalid channel range");
451
120
  }
452
409k
  return true;
453
409k
}
454
455
46.7k
Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) {
456
46.7k
  JxlMemoryManager *memory_manager = image.memory_manager();
457
46.7k
  if (parameters->empty()) {
458
43.4k
    DefaultSqueezeParameters(parameters, image);
459
43.4k
  }
460
461
209k
  for (auto &parameter : *parameters) {
462
209k
    JXL_RETURN_IF_ERROR(
463
209k
        CheckMetaSqueezeParams(parameter, image.channel.size()));
464
209k
    bool horizontal = parameter.horizontal;
465
209k
    bool in_place = parameter.in_place;
466
209k
    uint32_t beginc = parameter.begin_c;
467
209k
    uint32_t endc = parameter.begin_c + parameter.num_c - 1;
468
469
209k
    uint32_t offset;
470
209k
    if (beginc < image.nb_meta_channels) {
471
424
      if (endc >= image.nb_meta_channels) {
472
7
        return JXL_FAILURE("Invalid squeeze: mix of meta and nonmeta channels");
473
7
      }
474
417
      if (!in_place) {
475
7
        return JXL_FAILURE(
476
7
            "Invalid squeeze: meta channels require in-place residuals");
477
7
      }
478
410
      image.nb_meta_channels += parameter.num_c;
479
410
    }
480
209k
    if (in_place) {
481
134k
      offset = endc + 1;
482
134k
    } else {
483
75.1k
      offset = image.channel.size();
484
75.1k
    }
485
1.06M
    for (uint32_t c = beginc; c <= endc; c++) {
486
852k
      if (image.channel[c].hshift > 30 || image.channel[c].vshift > 30) {
487
12
        return JXL_FAILURE("Too many squeezes: shift > 30");
488
12
      }
489
852k
      size_t w = image.channel[c].w;
490
852k
      size_t h = image.channel[c].h;
491
852k
      if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel");
492
852k
      if (horizontal) {
493
460k
        image.channel[c].w = (w + 1) / 2;
494
460k
        if (image.channel[c].hshift >= 0) image.channel[c].hshift++;
495
460k
        w = w - (w + 1) / 2;
496
460k
      } else {
497
391k
        image.channel[c].h = (h + 1) / 2;
498
391k
        if (image.channel[c].vshift >= 0) image.channel[c].vshift++;
499
391k
        h = h - (h + 1) / 2;
500
391k
      }
501
852k
      JXL_RETURN_IF_ERROR(image.channel[c].shrink());
502
1.70M
      JXL_ASSIGN_OR_RETURN(Channel placeholder,
503
1.70M
                           Channel::Create(memory_manager, w, h));
504
1.70M
      placeholder.hshift = image.channel[c].hshift;
505
1.70M
      placeholder.vshift = image.channel[c].vshift;
506
507
1.70M
      image.channel.insert(image.channel.begin() + offset + (c - beginc),
508
1.70M
                           std::move(placeholder));
509
1.70M
      JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s",
510
1.70M
                  image.DebugString().c_str());
511
1.70M
    }
512
209k
  }
513
46.6k
  return true;
514
46.7k
}
515
516
}  // namespace jxl
517
518
#endif