Coverage Report

Created: 2026-02-14 07:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jpegli/render.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jpegli/render.h"
7
8
#include <jxl/types.h>
9
10
#include <algorithm>
11
#include <array>
12
#include <cmath>
13
#include <cstddef>
14
#include <cstdint>
15
#include <cstring>
16
#include <vector>
17
18
#include "lib/jpegli/color_quantize.h"
19
#include "lib/jpegli/color_transform.h"
20
#include "lib/jpegli/common.h"
21
#include "lib/jpegli/common_internal.h"
22
#include "lib/jpegli/decode_internal.h"
23
#include "lib/jpegli/error.h"
24
#include "lib/jpegli/idct.h"
25
#include "lib/jpegli/types.h"
26
#include "lib/jpegli/upsample.h"
27
#include "lib/jxl/base/byte_order.h"
28
#include "lib/jxl/base/compiler_specific.h"
29
30
#ifdef MEMORY_SANITIZER
31
#define JXL_MEMORY_SANITIZER 1
32
#elif defined(__has_feature)
33
#if __has_feature(memory_sanitizer)
34
#define JXL_MEMORY_SANITIZER 1
35
#else
36
#define JXL_MEMORY_SANITIZER 0
37
#endif
38
#else
39
#define JXL_MEMORY_SANITIZER 0
40
#endif
41
42
#if JXL_MEMORY_SANITIZER
43
#include "sanitizer/msan_interface.h"
44
#endif
45
46
#undef HWY_TARGET_INCLUDE
47
#define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
48
#include <hwy/foreach_target.h>
49
#include <hwy/highway.h>
50
51
HWY_BEFORE_NAMESPACE();
52
namespace jpegli {
53
namespace HWY_NAMESPACE {
54
55
// These templates are not found via ADL.
56
using hwy::HWY_NAMESPACE::Abs;
57
using hwy::HWY_NAMESPACE::Add;
58
using hwy::HWY_NAMESPACE::Clamp;
59
using hwy::HWY_NAMESPACE::Gt;
60
using hwy::HWY_NAMESPACE::IfThenElseZero;
61
using hwy::HWY_NAMESPACE::Mul;
62
using hwy::HWY_NAMESPACE::NearestInt;
63
using hwy::HWY_NAMESPACE::Or;
64
using hwy::HWY_NAMESPACE::Rebind;
65
using hwy::HWY_NAMESPACE::ShiftLeftSame;
66
using hwy::HWY_NAMESPACE::ShiftRightSame;
67
using hwy::HWY_NAMESPACE::Vec;
68
using D = HWY_FULL(float);
69
using DI = HWY_FULL(int32_t);
70
constexpr D d;
71
constexpr DI di;
72
73
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
74
                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
75
2.15M
                      int32_t* JXL_RESTRICT sumabs) {
76
222M
  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
77
220M
    size_t k = i % DCTSIZE2;
78
220M
    const Rebind<int16_t, DI> di16;
79
220M
    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
80
220M
    const auto abs_coeff = Abs(coeff);
81
220M
    const auto not_0 = Gt(abs_coeff, Zero(di));
82
220M
    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
83
220M
    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
84
220M
    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
85
220M
  }
86
2.15M
}
jpegli::N_SSE4::GatherBlockStats(short const*, unsigned long, int*, int*)
Line
Count
Source
75
520k
                      int32_t* JXL_RESTRICT sumabs) {
76
72.2M
  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
77
71.7M
    size_t k = i % DCTSIZE2;
78
71.7M
    const Rebind<int16_t, DI> di16;
79
71.7M
    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
80
71.7M
    const auto abs_coeff = Abs(coeff);
81
71.7M
    const auto not_0 = Gt(abs_coeff, Zero(di));
82
71.7M
    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
83
71.7M
    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
84
71.7M
    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
85
71.7M
  }
86
520k
}
jpegli::N_AVX2::GatherBlockStats(short const*, unsigned long, int*, int*)
Line
Count
Source
75
1.03M
                      int32_t* JXL_RESTRICT sumabs) {
76
80.3M
  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
77
79.2M
    size_t k = i % DCTSIZE2;
78
79.2M
    const Rebind<int16_t, DI> di16;
79
79.2M
    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
80
79.2M
    const auto abs_coeff = Abs(coeff);
81
79.2M
    const auto not_0 = Gt(abs_coeff, Zero(di));
82
79.2M
    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
83
79.2M
    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
84
79.2M
    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
85
79.2M
  }
86
1.03M
}
jpegli::N_SSE2::GatherBlockStats(short const*, unsigned long, int*, int*)
Line
Count
Source
75
599k
                      int32_t* JXL_RESTRICT sumabs) {
76
70.0M
  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
77
69.5M
    size_t k = i % DCTSIZE2;
78
69.5M
    const Rebind<int16_t, DI> di16;
79
69.5M
    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
80
69.5M
    const auto abs_coeff = Abs(coeff);
81
69.5M
    const auto not_0 = Gt(abs_coeff, Zero(di));
82
69.5M
    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
83
69.5M
    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
84
69.5M
    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
85
69.5M
  }
86
599k
}
87
88
29.9M
void DecenterRow(float* row, size_t xsize) {
89
29.9M
  const HWY_CAPPED(float, 8) df;
90
29.9M
  const auto c128 = Set(df, 128.0f / 255);
91
535M
  for (size_t x = 0; x < xsize; x += Lanes(df)) {
92
505M
    Store(Add(Load(df, row + x), c128), df, row + x);
93
505M
  }
94
29.9M
}
jpegli::N_SSE4::DecenterRow(float*, unsigned long)
Line
Count
Source
88
7.58M
void DecenterRow(float* row, size_t xsize) {
89
7.58M
  const HWY_CAPPED(float, 8) df;
90
7.58M
  const auto c128 = Set(df, 128.0f / 255);
91
172M
  for (size_t x = 0; x < xsize; x += Lanes(df)) {
92
165M
    Store(Add(Load(df, row + x), c128), df, row + x);
93
165M
  }
94
7.58M
}
jpegli::N_AVX2::DecenterRow(float*, unsigned long)
Line
Count
Source
88
13.3M
void DecenterRow(float* row, size_t xsize) {
89
13.3M
  const HWY_CAPPED(float, 8) df;
90
13.3M
  const auto c128 = Set(df, 128.0f / 255);
91
176M
  for (size_t x = 0; x < xsize; x += Lanes(df)) {
92
163M
    Store(Add(Load(df, row + x), c128), df, row + x);
93
163M
  }
94
13.3M
}
jpegli::N_SSE2::DecenterRow(float*, unsigned long)
Line
Count
Source
88
8.99M
void DecenterRow(float* row, size_t xsize) {
89
8.99M
  const HWY_CAPPED(float, 8) df;
90
8.99M
  const auto c128 = Set(df, 128.0f / 255);
91
185M
  for (size_t x = 0; x < xsize; x += Lanes(df)) {
92
176M
    Store(Add(Load(df, row + x), c128), df, row + x);
93
176M
  }
94
8.99M
}
95
96
void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
97
0
               size_t xsize) {
98
0
  jpeg_decomp_master* m = cinfo->master;
99
0
  if (!m->dither_[c]) return;
100
0
  const float* dither_row =
101
0
      &m->dither_[c][(y & m->dither_mask_) * m->dither_size_];
102
0
  for (size_t x = 0; x < xsize; ++x) {
103
0
    row[x] += dither_row[x & m->dither_mask_];
104
0
  }
105
0
}
Unexecuted instantiation: jpegli::N_SSE4::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long)
Unexecuted instantiation: jpegli::N_AVX2::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long)
Unexecuted instantiation: jpegli::N_SSE2::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long)
106
107
template <typename T>
108
void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
109
10.8M
                      size_t num_channels, float multiplier, T* output) {
110
10.8M
  const HWY_CAPPED(float, 8) cd;
111
10.8M
  auto zero = Zero(cd);
112
10.8M
  auto mul = Set(cd, multiplier);
113
10.8M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
10.8M
  if (num_channels == 1) {
121
51.9M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
46.4M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
46.4M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
46.4M
    }
125
5.51M
  } else if (num_channels == 2) {
126
37.1M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
33.3M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
33.3M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
33.3M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
33.3M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
33.3M
    }
132
3.84M
  } else if (num_channels == 3) {
133
17.0M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
16.3M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
16.3M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
16.3M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
16.3M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
16.3M
                        DemoteTo(cdu, NearestInt(v1)),
139
16.3M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
16.3M
    }
141
785k
  } else if (num_channels == 4) {
142
12.0M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
11.2M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
11.2M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
11.2M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
11.2M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
11.2M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
11.2M
                        DemoteTo(cdu, NearestInt(v1)),
149
11.2M
                        DemoteTo(cdu, NearestInt(v2)),
150
11.2M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
11.2M
    }
152
785k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
10.8M
}
void jpegli::N_SSE4::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*)
Line
Count
Source
109
1.10M
                      size_t num_channels, float multiplier, T* output) {
110
1.10M
  const HWY_CAPPED(float, 8) cd;
111
1.10M
  auto zero = Zero(cd);
112
1.10M
  auto mul = Set(cd, multiplier);
113
1.10M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
1.10M
  if (num_channels == 1) {
121
5.81M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
5.51M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
5.51M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
5.51M
    }
125
805k
  } else if (num_channels == 2) {
126
5.94M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
5.35M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
5.35M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
5.35M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
5.35M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
5.35M
    }
132
588k
  } else if (num_channels == 3) {
133
3.31M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
3.16M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
3.16M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
3.16M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
3.16M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
3.16M
                        DemoteTo(cdu, NearestInt(v1)),
139
3.16M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
3.16M
    }
141
148k
  } else if (num_channels == 4) {
142
1.17M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
1.11M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
1.11M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
1.11M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
1.11M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
1.11M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
1.11M
                        DemoteTo(cdu, NearestInt(v1)),
149
1.11M
                        DemoteTo(cdu, NearestInt(v2)),
150
1.11M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
1.11M
    }
152
67.8k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
1.10M
}
void jpegli::N_SSE4::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*)
Line
Count
Source
109
1.60M
                      size_t num_channels, float multiplier, T* output) {
110
1.60M
  const HWY_CAPPED(float, 8) cd;
111
1.60M
  auto zero = Zero(cd);
112
1.60M
  auto mul = Set(cd, multiplier);
113
1.60M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
1.60M
  if (num_channels == 1) {
121
12.4M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
11.6M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
11.6M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
11.6M
    }
125
812k
  } else if (num_channels == 2) {
126
6.14M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
5.60M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
5.60M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
5.60M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
5.60M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
5.60M
    }
132
537k
  } else if (num_channels == 3) {
133
2.59M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
2.39M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
2.39M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
2.39M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
2.39M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
2.39M
                        DemoteTo(cdu, NearestInt(v1)),
139
2.39M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
2.39M
    }
141
200k
  } else if (num_channels == 4) {
142
3.38M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
3.31M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
3.31M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
3.31M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
3.31M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
3.31M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
3.31M
                        DemoteTo(cdu, NearestInt(v1)),
149
3.31M
                        DemoteTo(cdu, NearestInt(v2)),
150
3.31M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
3.31M
    }
152
74.5k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
1.60M
}
void jpegli::N_AVX2::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*)
Line
Count
Source
109
1.68M
                      size_t num_channels, float multiplier, T* output) {
110
1.68M
  const HWY_CAPPED(float, 8) cd;
111
1.68M
  auto zero = Zero(cd);
112
1.68M
  auto mul = Set(cd, multiplier);
113
1.68M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
1.68M
  if (num_channels == 1) {
121
5.30M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
4.61M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
4.61M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
4.61M
    }
125
993k
  } else if (num_channels == 2) {
126
4.82M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
3.99M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
3.99M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
3.99M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
3.99M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
3.99M
    }
132
829k
  } else if (num_channels == 3) {
133
2.62M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
2.59M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
2.59M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
2.59M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
2.59M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
2.59M
                        DemoteTo(cdu, NearestInt(v1)),
139
2.59M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
2.59M
    }
141
131k
  } else if (num_channels == 4) {
142
659k
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
528k
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
528k
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
528k
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
528k
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
528k
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
528k
                        DemoteTo(cdu, NearestInt(v1)),
149
528k
                        DemoteTo(cdu, NearestInt(v2)),
150
528k
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
528k
    }
152
131k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
1.68M
}
void jpegli::N_AVX2::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*)
Line
Count
Source
109
3.50M
                      size_t num_channels, float multiplier, T* output) {
110
3.50M
  const HWY_CAPPED(float, 8) cd;
111
3.50M
  auto zero = Zero(cd);
112
3.50M
  auto mul = Set(cd, multiplier);
113
3.50M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
3.50M
  if (num_channels == 1) {
121
12.0M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
9.89M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
9.89M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
9.89M
    }
125
2.11M
  } else if (num_channels == 2) {
126
5.55M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
4.64M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
4.64M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
4.64M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
4.64M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
4.64M
    }
132
906k
  } else if (num_channels == 3) {
133
2.00M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
1.95M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
1.95M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
1.95M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
1.95M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
1.95M
                        DemoteTo(cdu, NearestInt(v1)),
139
1.95M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
1.95M
    }
141
438k
  } else if (num_channels == 4) {
142
2.43M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
1.99M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
1.99M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
1.99M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
1.99M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
1.99M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
1.99M
                        DemoteTo(cdu, NearestInt(v1)),
149
1.99M
                        DemoteTo(cdu, NearestInt(v2)),
150
1.99M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
1.99M
    }
152
438k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
3.50M
}
void jpegli::N_SSE2::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*)
Line
Count
Source
109
979k
                      size_t num_channels, float multiplier, T* output) {
110
979k
  const HWY_CAPPED(float, 8) cd;
111
979k
  auto zero = Zero(cd);
112
979k
  auto mul = Set(cd, multiplier);
113
979k
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
979k
  if (num_channels == 1) {
121
6.89M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
6.39M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
6.39M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
6.39M
    }
125
495k
  } else if (num_channels == 2) {
126
6.12M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
5.75M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
5.75M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
5.75M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
5.75M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
5.75M
    }
132
370k
  } else if (num_channels == 3) {
133
2.91M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
2.81M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
2.81M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
2.81M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
2.81M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
2.81M
                        DemoteTo(cdu, NearestInt(v1)),
139
2.81M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
2.81M
    }
141
105k
  } else if (num_channels == 4) {
142
2.12M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
2.11M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
2.11M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
2.11M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
2.11M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
2.11M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
2.11M
                        DemoteTo(cdu, NearestInt(v1)),
149
2.11M
                        DemoteTo(cdu, NearestInt(v2)),
150
2.11M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
2.11M
    }
152
6.86k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
979k
}
void jpegli::N_SSE2::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*)
Line
Count
Source
109
1.95M
                      size_t num_channels, float multiplier, T* output) {
110
1.95M
  const HWY_CAPPED(float, 8) cd;
111
1.95M
  auto zero = Zero(cd);
112
1.95M
  auto mul = Set(cd, multiplier);
113
1.95M
  const Rebind<T, decltype(cd)> cdu;
114
#if JXL_MEMORY_SANITIZER
115
  const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len;
116
  for (size_t c = 0; c < num_channels; ++c) {
117
    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
118
  }
119
#endif
120
1.95M
  if (num_channels == 1) {
121
9.45M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
122
8.32M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
123
8.32M
      StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]);
124
8.32M
    }
125
1.13M
  } else if (num_channels == 2) {
126
8.58M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
127
7.97M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
128
7.97M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
129
7.97M
      StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)),
130
7.97M
                        DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]);
131
7.97M
    }
132
611k
  } else if (num_channels == 3) {
133
3.61M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
134
3.46M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
135
3.46M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
136
3.46M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
137
3.46M
      StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)),
138
3.46M
                        DemoteTo(cdu, NearestInt(v1)),
139
3.46M
                        DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]);
140
3.46M
    }
141
149k
  } else if (num_channels == 4) {
142
2.25M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
143
2.18M
      auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul);
144
2.18M
      auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul);
145
2.18M
      auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul);
146
2.18M
      auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul);
147
2.18M
      StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)),
148
2.18M
                        DemoteTo(cdu, NearestInt(v1)),
149
2.18M
                        DemoteTo(cdu, NearestInt(v2)),
150
2.18M
                        DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]);
151
2.18M
    }
152
66.5k
  }
153
#if JXL_MEMORY_SANITIZER
154
  __msan_poison(output + num_channels * len,
155
                sizeof(output[0]) * num_channels * padding);
156
#endif
157
1.95M
}
158
159
void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
160
7.53M
                   size_t num_channels, float* output) {
161
7.53M
  const HWY_CAPPED(float, 8) cd;
162
7.53M
  if (num_channels == 1) {
163
4.61M
    memcpy(output, input[0] + x0, len * sizeof(output[0]));
164
4.61M
  } else if (num_channels == 2) {
165
16.4M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
166
14.5M
      StoreInterleaved2(LoadU(cd, &input[0][x0 + i]),
167
14.5M
                        LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]);
168
14.5M
    }
169
1.96M
  } else if (num_channels == 3) {
170
13.3M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
171
12.5M
      StoreInterleaved3(LoadU(cd, &input[0][x0 + i]),
172
12.5M
                        LoadU(cd, &input[1][x0 + i]),
173
12.5M
                        LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]);
174
12.5M
    }
175
837k
  } else if (num_channels == 4) {
176
15.2M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
177
15.1M
      StoreInterleaved4(LoadU(cd, &input[0][x0 + i]),
178
15.1M
                        LoadU(cd, &input[1][x0 + i]),
179
15.1M
                        LoadU(cd, &input[2][x0 + i]),
180
15.1M
                        LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]);
181
15.1M
    }
182
122k
  }
183
7.53M
}
jpegli::N_SSE4::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*)
Line
Count
Source
160
1.71M
                   size_t num_channels, float* output) {
161
1.71M
  const HWY_CAPPED(float, 8) cd;
162
1.71M
  if (num_channels == 1) {
163
1.02M
    memcpy(output, input[0] + x0, len * sizeof(output[0]));
164
1.02M
  } else if (num_channels == 2) {
165
4.77M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
166
4.25M
      StoreInterleaved2(LoadU(cd, &input[0][x0 + i]),
167
4.25M
                        LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]);
168
4.25M
    }
169
523k
  } else if (num_channels == 3) {
170
2.90M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
171
2.80M
      StoreInterleaved3(LoadU(cd, &input[0][x0 + i]),
172
2.80M
                        LoadU(cd, &input[1][x0 + i]),
173
2.80M
                        LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]);
174
2.80M
    }
175
94.1k
  } else if (num_channels == 4) {
176
3.27M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
177
3.20M
      StoreInterleaved4(LoadU(cd, &input[0][x0 + i]),
178
3.20M
                        LoadU(cd, &input[1][x0 + i]),
179
3.20M
                        LoadU(cd, &input[2][x0 + i]),
180
3.20M
                        LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]);
181
3.20M
    }
182
68.7k
  }
183
1.71M
}
jpegli::N_AVX2::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*)
Line
Count
Source
160
3.35M
                   size_t num_channels, float* output) {
161
3.35M
  const HWY_CAPPED(float, 8) cd;
162
3.35M
  if (num_channels == 1) {
163
2.39M
    memcpy(output, input[0] + x0, len * sizeof(output[0]));
164
2.39M
  } else if (num_channels == 2) {
165
4.82M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
166
4.09M
      StoreInterleaved2(LoadU(cd, &input[0][x0 + i]),
167
4.09M
                        LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]);
168
4.09M
    }
169
729k
  } else if (num_channels == 3) {
170
4.98M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
171
4.78M
      StoreInterleaved3(LoadU(cd, &input[0][x0 + i]),
172
4.78M
                        LoadU(cd, &input[1][x0 + i]),
173
4.78M
                        LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]);
174
4.78M
    }
175
200k
  } else if (num_channels == 4) {
176
10.5M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
177
10.4M
      StoreInterleaved4(LoadU(cd, &input[0][x0 + i]),
178
10.4M
                        LoadU(cd, &input[1][x0 + i]),
179
10.4M
                        LoadU(cd, &input[2][x0 + i]),
180
10.4M
                        LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]);
181
10.4M
    }
182
28.9k
  }
183
3.35M
}
jpegli::N_SSE2::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*)
Line
Count
Source
160
2.47M
                   size_t num_channels, float* output) {
161
2.47M
  const HWY_CAPPED(float, 8) cd;
162
2.47M
  if (num_channels == 1) {
163
1.19M
    memcpy(output, input[0] + x0, len * sizeof(output[0]));
164
1.27M
  } else if (num_channels == 2) {
165
6.88M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
166
6.17M
      StoreInterleaved2(LoadU(cd, &input[0][x0 + i]),
167
6.17M
                        LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]);
168
6.17M
    }
169
708k
  } else if (num_channels == 3) {
170
5.46M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
171
4.92M
      StoreInterleaved3(LoadU(cd, &input[0][x0 + i]),
172
4.92M
                        LoadU(cd, &input[1][x0 + i]),
173
4.92M
                        LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]);
174
4.92M
    }
175
542k
  } else if (num_channels == 4) {
176
1.51M
    for (size_t i = 0; i < len; i += Lanes(cd)) {
177
1.49M
      StoreInterleaved4(LoadU(cd, &input[0][x0 + i]),
178
1.49M
                        LoadU(cd, &input[1][x0 + i]),
179
1.49M
                        LoadU(cd, &input[2][x0 + i]),
180
1.49M
                        LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]);
181
1.49M
    }
182
24.8k
  }
183
2.47M
}
184
185
// Error-diffusion weights used by the JDITHER_FS path of WriteToOutput. The
// quantization error of the pixel at column i is distributed as follows:
//   MR -> current error row, column i + 1
//   BL -> next error row,    column i - 1
//   BM -> next error row,    column i
//   BR -> next error row,    column i + 1
// The four weights sum to 1.
static constexpr float kFSWeightMR = 7.0f / 16.0f;
186
static constexpr float kFSWeightBL = 3.0f / 16.0f;
187
static constexpr float kFSWeightBM = 5.0f / 16.0f;
188
static constexpr float kFSWeightBR = 1.0f / 16.0f;
189
190
0
// Compresses the magnitude of a diffused dithering error while keeping its
// sign:
//   |error| <= 16       -> unchanged
//   16 < |error| <= 48  -> 0.5 * |error| + 8
//   |error| > 48        -> 32
// Inputs that are not strictly positive take the negated branch.
float LimitError(float error) {
  const float magnitude = std::abs(error);
  float limited = magnitude;
  if (magnitude > 48.0f) {
    limited = 32.0f;
  } else if (magnitude > 16.0f) {
    limited = 8.0f + 0.5f * magnitude;
  }
  if (error > 0.0f) {
    return limited;
  }
  return -limited;
}
Unexecuted instantiation: jpegli::N_SSE4::LimitError(float)
Unexecuted instantiation: jpegli::N_AVX2::LimitError(float)
Unexecuted instantiation: jpegli::N_SSE2::LimitError(float)
199
200
void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
201
                   size_t xoffset, size_t len, size_t num_channels,
202
18.3M
                   uint8_t* JXL_RESTRICT output) {
203
18.3M
  jpeg_decomp_master* m = cinfo->master;
204
18.3M
  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
205
18.3M
  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
206
0
    float* error_row[kMaxComponents];
207
0
    float* next_error_row[kMaxComponents];
208
0
    J_DITHER_MODE dither_mode = cinfo->dither_mode;
209
0
    if (dither_mode == JDITHER_ORDERED) {
210
0
      for (size_t c = 0; c < num_channels; ++c) {
211
0
        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
212
0
                  cinfo->output_width);
213
0
      }
214
0
    } else if (dither_mode == JDITHER_FS) {
215
0
      for (size_t c = 0; c < num_channels; ++c) {
216
0
        if (cinfo->output_scanline % 2 == 0) {
217
0
          error_row[c] = m->error_row_[c];
218
0
          next_error_row[c] = m->error_row_[c + kMaxComponents];
219
0
        } else {
220
0
          error_row[c] = m->error_row_[c + kMaxComponents];
221
0
          next_error_row[c] = m->error_row_[c];
222
0
        }
223
0
        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
224
0
      }
225
0
    }
226
0
    const float mul = 255.0f;
227
0
    if (dither_mode != JDITHER_FS) {
228
0
      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
229
0
    }
230
0
    for (size_t i = 0; i < len; ++i) {
231
0
      uint8_t* pixel = &scratch_space[num_channels * i];
232
0
      if (dither_mode == JDITHER_FS) {
233
0
        for (size_t c = 0; c < num_channels; ++c) {
234
0
          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
235
0
          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
236
0
        }
237
0
      }
238
0
      int index = LookupColorIndex(cinfo, pixel);
239
0
      output[i] = index;
240
0
      if (dither_mode == JDITHER_FS) {
241
0
        size_t prev_i = i > 0 ? i - 1 : 0;
242
0
        size_t next_i = i + 1 < len ? i + 1 : len - 1;
243
0
        for (size_t c = 0; c < num_channels; ++c) {
244
0
          float error = pixel[c] - cinfo->colormap[c][index];
245
0
          error_row[c][next_i] += kFSWeightMR * error;
246
0
          next_error_row[c][prev_i] += kFSWeightBL * error;
247
0
          next_error_row[c][i] += kFSWeightBM * error;
248
0
          next_error_row[c][next_i] += kFSWeightBR * error;
249
0
        }
250
0
      }
251
0
    }
252
18.3M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
253
3.76M
    const float mul = 255.0;
254
3.76M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
255
3.76M
    memcpy(output, scratch_space, len * num_channels);
256
14.6M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
257
7.07M
    const float mul = 65535.0;
258
7.07M
    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
259
7.07M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
260
7.07M
    if (m->swap_endianness_) {
261
4.01M
      const HWY_CAPPED(uint16_t, 8) du;
262
4.01M
      size_t output_len = len * num_channels;
263
35.3M
      for (size_t j = 0; j < output_len; j += Lanes(du)) {
264
31.3M
        auto v = LoadU(du, tmp + j);
265
31.3M
        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
266
31.3M
        StoreU(vswap, du, tmp + j);
267
31.3M
      }
268
4.01M
    }
269
7.07M
    memcpy(output, tmp, len * num_channels * 2);
270
7.53M
  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
271
7.53M
    float* tmp = reinterpret_cast<float*>(scratch_space);
272
7.53M
    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
273
7.53M
    if (m->swap_endianness_) {
274
3.19M
      size_t output_len = len * num_channels;
275
179M
      for (size_t j = 0; j < output_len; ++j) {
276
176M
        tmp[j] = BSwapFloat(tmp[j]);
277
176M
      }
278
3.19M
    }
279
7.53M
    memcpy(output, tmp, len * num_channels * 4);
280
7.53M
  }
281
18.3M
}
jpegli::N_SSE4::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*)
Line
Count
Source
202
4.41M
                   uint8_t* JXL_RESTRICT output) {
203
4.41M
  jpeg_decomp_master* m = cinfo->master;
204
4.41M
  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
205
4.41M
  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
206
0
    float* error_row[kMaxComponents];
207
0
    float* next_error_row[kMaxComponents];
208
0
    J_DITHER_MODE dither_mode = cinfo->dither_mode;
209
0
    if (dither_mode == JDITHER_ORDERED) {
210
0
      for (size_t c = 0; c < num_channels; ++c) {
211
0
        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
212
0
                  cinfo->output_width);
213
0
      }
214
0
    } else if (dither_mode == JDITHER_FS) {
215
0
      for (size_t c = 0; c < num_channels; ++c) {
216
0
        if (cinfo->output_scanline % 2 == 0) {
217
0
          error_row[c] = m->error_row_[c];
218
0
          next_error_row[c] = m->error_row_[c + kMaxComponents];
219
0
        } else {
220
0
          error_row[c] = m->error_row_[c + kMaxComponents];
221
0
          next_error_row[c] = m->error_row_[c];
222
0
        }
223
0
        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
224
0
      }
225
0
    }
226
0
    const float mul = 255.0f;
227
0
    if (dither_mode != JDITHER_FS) {
228
0
      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
229
0
    }
230
0
    for (size_t i = 0; i < len; ++i) {
231
0
      uint8_t* pixel = &scratch_space[num_channels * i];
232
0
      if (dither_mode == JDITHER_FS) {
233
0
        for (size_t c = 0; c < num_channels; ++c) {
234
0
          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
235
0
          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
236
0
        }
237
0
      }
238
0
      int index = LookupColorIndex(cinfo, pixel);
239
0
      output[i] = index;
240
0
      if (dither_mode == JDITHER_FS) {
241
0
        size_t prev_i = i > 0 ? i - 1 : 0;
242
0
        size_t next_i = i + 1 < len ? i + 1 : len - 1;
243
0
        for (size_t c = 0; c < num_channels; ++c) {
244
0
          float error = pixel[c] - cinfo->colormap[c][index];
245
0
          error_row[c][next_i] += kFSWeightMR * error;
246
0
          next_error_row[c][prev_i] += kFSWeightBL * error;
247
0
          next_error_row[c][i] += kFSWeightBM * error;
248
0
          next_error_row[c][next_i] += kFSWeightBR * error;
249
0
        }
250
0
      }
251
0
    }
252
4.41M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
253
1.10M
    const float mul = 255.0;
254
1.10M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
255
1.10M
    memcpy(output, scratch_space, len * num_channels);
256
3.31M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
257
1.60M
    const float mul = 65535.0;
258
1.60M
    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
259
1.60M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
260
1.60M
    if (m->swap_endianness_) {
261
1.08M
      const HWY_CAPPED(uint16_t, 8) du;
262
1.08M
      size_t output_len = len * num_channels;
263
10.1M
      for (size_t j = 0; j < output_len; j += Lanes(du)) {
264
9.03M
        auto v = LoadU(du, tmp + j);
265
9.03M
        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
266
9.03M
        StoreU(vswap, du, tmp + j);
267
9.03M
      }
268
1.08M
    }
269
1.60M
    memcpy(output, tmp, len * num_channels * 2);
270
1.71M
  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
271
1.71M
    float* tmp = reinterpret_cast<float*>(scratch_space);
272
1.71M
    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
273
1.71M
    if (m->swap_endianness_) {
274
973k
      size_t output_len = len * num_channels;
275
48.4M
      for (size_t j = 0; j < output_len; ++j) {
276
47.4M
        tmp[j] = BSwapFloat(tmp[j]);
277
47.4M
      }
278
973k
    }
279
1.71M
    memcpy(output, tmp, len * num_channels * 4);
280
1.71M
  }
281
4.41M
}
jpegli::N_AVX2::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*)
Line
Count
Source
202
8.54M
                   uint8_t* JXL_RESTRICT output) {
203
8.54M
  jpeg_decomp_master* m = cinfo->master;
204
8.54M
  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
205
8.54M
  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
206
0
    float* error_row[kMaxComponents];
207
0
    float* next_error_row[kMaxComponents];
208
0
    J_DITHER_MODE dither_mode = cinfo->dither_mode;
209
0
    if (dither_mode == JDITHER_ORDERED) {
210
0
      for (size_t c = 0; c < num_channels; ++c) {
211
0
        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
212
0
                  cinfo->output_width);
213
0
      }
214
0
    } else if (dither_mode == JDITHER_FS) {
215
0
      for (size_t c = 0; c < num_channels; ++c) {
216
0
        if (cinfo->output_scanline % 2 == 0) {
217
0
          error_row[c] = m->error_row_[c];
218
0
          next_error_row[c] = m->error_row_[c + kMaxComponents];
219
0
        } else {
220
0
          error_row[c] = m->error_row_[c + kMaxComponents];
221
0
          next_error_row[c] = m->error_row_[c];
222
0
        }
223
0
        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
224
0
      }
225
0
    }
226
0
    const float mul = 255.0f;
227
0
    if (dither_mode != JDITHER_FS) {
228
0
      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
229
0
    }
230
0
    for (size_t i = 0; i < len; ++i) {
231
0
      uint8_t* pixel = &scratch_space[num_channels * i];
232
0
      if (dither_mode == JDITHER_FS) {
233
0
        for (size_t c = 0; c < num_channels; ++c) {
234
0
          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
235
0
          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
236
0
        }
237
0
      }
238
0
      int index = LookupColorIndex(cinfo, pixel);
239
0
      output[i] = index;
240
0
      if (dither_mode == JDITHER_FS) {
241
0
        size_t prev_i = i > 0 ? i - 1 : 0;
242
0
        size_t next_i = i + 1 < len ? i + 1 : len - 1;
243
0
        for (size_t c = 0; c < num_channels; ++c) {
244
0
          float error = pixel[c] - cinfo->colormap[c][index];
245
0
          error_row[c][next_i] += kFSWeightMR * error;
246
0
          next_error_row[c][prev_i] += kFSWeightBL * error;
247
0
          next_error_row[c][i] += kFSWeightBM * error;
248
0
          next_error_row[c][next_i] += kFSWeightBR * error;
249
0
        }
250
0
      }
251
0
    }
252
8.54M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
253
1.68M
    const float mul = 255.0;
254
1.68M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
255
1.68M
    memcpy(output, scratch_space, len * num_channels);
256
6.86M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
257
3.50M
    const float mul = 65535.0;
258
3.50M
    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
259
3.50M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
260
3.50M
    if (m->swap_endianness_) {
261
1.87M
      const HWY_CAPPED(uint16_t, 8) du;
262
1.87M
      size_t output_len = len * num_channels;
263
14.0M
      for (size_t j = 0; j < output_len; j += Lanes(du)) {
264
12.1M
        auto v = LoadU(du, tmp + j);
265
12.1M
        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
266
12.1M
        StoreU(vswap, du, tmp + j);
267
12.1M
      }
268
1.87M
    }
269
3.50M
    memcpy(output, tmp, len * num_channels * 2);
270
3.50M
  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
271
3.35M
    float* tmp = reinterpret_cast<float*>(scratch_space);
272
3.35M
    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
273
3.35M
    if (m->swap_endianness_) {
274
934k
      size_t output_len = len * num_channels;
275
59.0M
      for (size_t j = 0; j < output_len; ++j) {
276
58.1M
        tmp[j] = BSwapFloat(tmp[j]);
277
58.1M
      }
278
934k
    }
279
3.35M
    memcpy(output, tmp, len * num_channels * 4);
280
3.35M
  }
281
8.54M
}
jpegli::N_SSE2::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*)
Line
Count
Source
202
5.41M
                   uint8_t* JXL_RESTRICT output) {
203
5.41M
  jpeg_decomp_master* m = cinfo->master;
204
5.41M
  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
205
5.41M
  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
206
0
    float* error_row[kMaxComponents];
207
0
    float* next_error_row[kMaxComponents];
208
0
    J_DITHER_MODE dither_mode = cinfo->dither_mode;
209
0
    if (dither_mode == JDITHER_ORDERED) {
210
0
      for (size_t c = 0; c < num_channels; ++c) {
211
0
        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
212
0
                  cinfo->output_width);
213
0
      }
214
0
    } else if (dither_mode == JDITHER_FS) {
215
0
      for (size_t c = 0; c < num_channels; ++c) {
216
0
        if (cinfo->output_scanline % 2 == 0) {
217
0
          error_row[c] = m->error_row_[c];
218
0
          next_error_row[c] = m->error_row_[c + kMaxComponents];
219
0
        } else {
220
0
          error_row[c] = m->error_row_[c + kMaxComponents];
221
0
          next_error_row[c] = m->error_row_[c];
222
0
        }
223
0
        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
224
0
      }
225
0
    }
226
0
    const float mul = 255.0f;
227
0
    if (dither_mode != JDITHER_FS) {
228
0
      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
229
0
    }
230
0
    for (size_t i = 0; i < len; ++i) {
231
0
      uint8_t* pixel = &scratch_space[num_channels * i];
232
0
      if (dither_mode == JDITHER_FS) {
233
0
        for (size_t c = 0; c < num_channels; ++c) {
234
0
          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
235
0
          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
236
0
        }
237
0
      }
238
0
      int index = LookupColorIndex(cinfo, pixel);
239
0
      output[i] = index;
240
0
      if (dither_mode == JDITHER_FS) {
241
0
        size_t prev_i = i > 0 ? i - 1 : 0;
242
0
        size_t next_i = i + 1 < len ? i + 1 : len - 1;
243
0
        for (size_t c = 0; c < num_channels; ++c) {
244
0
          float error = pixel[c] - cinfo->colormap[c][index];
245
0
          error_row[c][next_i] += kFSWeightMR * error;
246
0
          next_error_row[c][prev_i] += kFSWeightBL * error;
247
0
          next_error_row[c][i] += kFSWeightBM * error;
248
0
          next_error_row[c][next_i] += kFSWeightBR * error;
249
0
        }
250
0
      }
251
0
    }
252
5.41M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
253
979k
    const float mul = 255.0;
254
979k
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
255
979k
    memcpy(output, scratch_space, len * num_channels);
256
4.43M
  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
257
1.95M
    const float mul = 65535.0;
258
1.95M
    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
259
1.95M
    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
260
1.95M
    if (m->swap_endianness_) {
261
1.05M
      const HWY_CAPPED(uint16_t, 8) du;
262
1.05M
      size_t output_len = len * num_channels;
263
11.1M
      for (size_t j = 0; j < output_len; j += Lanes(du)) {
264
10.1M
        auto v = LoadU(du, tmp + j);
265
10.1M
        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
266
10.1M
        StoreU(vswap, du, tmp + j);
267
10.1M
      }
268
1.05M
    }
269
1.95M
    memcpy(output, tmp, len * num_channels * 2);
270
2.47M
  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
271
2.47M
    float* tmp = reinterpret_cast<float*>(scratch_space);
272
2.47M
    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
273
2.47M
    if (m->swap_endianness_) {
274
1.28M
      size_t output_len = len * num_channels;
275
71.8M
      for (size_t j = 0; j < output_len; ++j) {
276
70.5M
        tmp[j] = BSwapFloat(tmp[j]);
277
70.5M
      }
278
1.28M
    }
279
2.47M
    memcpy(output, tmp, len * num_channels * 4);
280
2.47M
  }
281
5.41M
}
282
283
// NOLINTNEXTLINE(google-readability-namespace-comments)
284
}  // namespace HWY_NAMESPACE
285
}  // namespace jpegli
286
HWY_AFTER_NAMESPACE();
287
288
#if HWY_ONCE
289
290
namespace jpegli {
291
292
HWY_EXPORT(GatherBlockStats);
293
HWY_EXPORT(WriteToOutput);
294
HWY_EXPORT(DecenterRow);
295
296
// Public entry point: forwards all arguments unchanged to the GatherBlockStats
// implementation that HWY_DYNAMIC_DISPATCH selects at run time.
// NOTE(review): the per-target implementation is outside this view; from the
// call site it is presumably accumulating per-coefficient nonzero counts and
// absolute sums over `coeffs_size` coefficients -- confirm there.
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
297
                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
298
2.15M
                      int32_t* JXL_RESTRICT sumabs) {
299
2.15M
  HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros, sumabs);
300
2.15M
}
301
302
// Public entry point: forwards all arguments unchanged to the per-target
// WriteToOutput implementation that HWY_DYNAMIC_DISPATCH selects at run time.
void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
303
                   size_t xoffset, size_t len, size_t num_channels,
304
18.3M
                   uint8_t* JXL_RESTRICT output) {
305
18.3M
  HWY_DYNAMIC_DISPATCH(WriteToOutput)
306
18.3M
  (cinfo, rows, xoffset, len, num_channels, output);
307
18.3M
}
308
309
29.9M
// Public entry point: forwards `row` and `xsize` unchanged to the per-target
// DecenterRow implementation that HWY_DYNAMIC_DISPATCH selects at run time.
void DecenterRow(float* row, size_t xsize) {
310
29.9M
  HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
311
29.9M
}
312
313
2.11M
// Returns true only when component `ci` is sampled at the maximum horizontal
// and vertical sampling factors of the image.
bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
  const jpeg_component_info* comp = &cinfo->comp_info[ci];
  if (comp->h_samp_factor != cinfo->max_h_samp_factor) {
    return false;
  }
  return comp->v_samp_factor == cinfo->max_v_samp_factor;
}
318
319
// See the following article for the details:
320
// J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
321
// Proceedings International Conference on Information Technology: Coding and
322
// Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
323
void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
324
383k
                                   const int* sumabs, float* biases) {
325
24.5M
  for (size_t k = 1; k < DCTSIZE2; ++k) {
326
24.1M
    if (nonzeros[k] == 0) {
327
23.8M
      biases[k] = 0.5f;
328
23.8M
      continue;
329
23.8M
    }
330
    // Notation adapted from the article
331
253k
    float N = num_blocks;
332
253k
    float N1 = nonzeros[k];
333
253k
    float N0 = num_blocks - N1;
334
253k
    float S = sumabs[k];
335
    // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just
336
    // temporary grouping of terms.
337
253k
    float A = 4.0 * S + 2.0 * N;
338
253k
    float B = 4.0 * S - 2.0 * N1;
339
253k
    float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A;
340
253k
    float gamma2 = gamma * gamma;
341
    // The bias is computed from gamma with (eq. 5), where the quantization
342
    // multiplier Q can be factored out and thus the bias can be applied
343
    // directly on the quantized coefficient.
344
253k
    biases[k] =
345
253k
        0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma));
346
253k
  }
347
383k
}
348
349
// Positions, within a coefficient block / quantization table, of the
// SAVED_COEFS coefficients used by block smoothing. Entry 0 is the DC
// coefficient; PredictSmooth's comments label entries 1/2 as Q01/Q10,
// 3/5 as Q20/Q02, 4 as Q11, 6/9 as Q03/Q30 and 7/8 as Q12/Q21.
constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
350
                                                2, 3, 10, 17, 24};
351
352
2.27k
bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
353
2.27k
  return std::all_of(Q_POS.begin(), Q_POS.end(),
354
22.7k
                     [&](int pos) { return qtable->quantval[pos] != 0; });
355
2.27k
}
356
357
// Determine whether smoothing should be applied during decompression
358
2.56k
bool do_smoothing(j_decompress_ptr cinfo) {
359
2.56k
  jpeg_decomp_master* m = cinfo->master;
360
2.56k
  bool smoothing_useful = false;
361
362
2.56k
  if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
363
654
    return false;
364
654
  }
365
1.90k
  auto* coef_bits_latch = m->coef_bits_latch;
366
1.90k
  auto* prev_coef_bits_latch = m->prev_coef_bits_latch;
367
368
3.45k
  for (int ci = 0; ci < cinfo->num_components; ci++) {
369
2.40k
    jpeg_component_info* compptr = &cinfo->comp_info[ci];
370
2.40k
    JQUANT_TBL* qtable = compptr->quant_table;
371
2.40k
    int* coef_bits = cinfo->coef_bits[ci];
372
2.40k
    int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
373
374
    // Return early if conditions for smoothing are not met
375
2.40k
    if (qtable == nullptr || !is_nonzero_quantizers(qtable) ||
376
2.27k
        coef_bits[0] < 0) {
377
856
      return false;
378
856
    }
379
380
1.55k
    coef_bits_latch[ci][0] = coef_bits[0];
381
382
15.5k
    for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
383
13.9k
      prev_coef_bits_latch[ci][coefi] =
384
13.9k
          cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1;
385
13.9k
      if (coef_bits[coefi] != 0) {
386
11.2k
        smoothing_useful = true;
387
11.2k
      }
388
13.9k
      coef_bits_latch[ci][coefi] = coef_bits[coefi];
389
13.9k
    }
390
1.55k
  }
391
392
1.05k
  return smoothing_useful;
393
1.90k
}
394
395
void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
396
7.48M
                   size_t bx, int iy) {
397
7.48M
  const size_t imcu_row = cinfo->output_iMCU_row;
398
7.48M
  int16_t* scratch = cinfo->master->smoothing_scratch_;
399
7.48M
  std::vector<int> Q_VAL(SAVED_COEFS);
400
7.48M
  int* coef_bits;
401
402
7.48M
  std::array<std::array<int, 5>, 5> dc_values;
403
7.48M
  auto& compinfo = cinfo->comp_info[component];
404
7.48M
  const size_t by0 = imcu_row * compinfo.v_samp_factor;
405
7.48M
  const size_t by = by0 + iy;
406
407
7.48M
  int prev_iy = by > 0 ? iy - 1 : 0;
408
7.48M
  int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
409
7.48M
  int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
410
7.48M
  int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
411
412
7.48M
  const int16_t* cur_row = blocks[iy][bx];
413
7.48M
  const int16_t* prev_row = blocks[prev_iy][bx];
414
7.48M
  const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
415
7.48M
  const int16_t* next_row = blocks[next_iy][bx];
416
7.48M
  const int16_t* next_next_row = blocks[next_next_iy][bx];
417
418
7.48M
  int prev_block_ind = bx ? -DCTSIZE2 : 0;
419
7.48M
  int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
420
7.48M
  int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
421
7.48M
  int next_next_block_ind =
422
7.48M
      bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
423
424
7.48M
  std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
425
7.48M
                                            next_row, next_next_row};
426
7.48M
  std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
427
7.48M
                                   next_block_ind, next_next_block_ind};
428
429
7.48M
  memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
430
431
44.9M
  for (int r = 0; r < 5; ++r) {
432
224M
    for (int c = 0; c < 5; ++c) {
433
187M
      dc_values[r][c] = row_ptrs[r][block_inds[c]];
434
187M
    }
435
37.4M
  }
436
  // Get the correct coef_bits: In case of an incomplete scan, we use the
437
  // prev coefficients.
438
7.48M
  if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
439
7.38M
    coef_bits = cinfo->master->prev_coef_bits_latch[component];
440
7.38M
  } else {
441
102k
    coef_bits = cinfo->master->coef_bits_latch[component];
442
102k
  }
443
444
7.48M
  bool change_dc = true;
445
70.6M
  for (int i = 1; i < SAVED_COEFS; i++) {
446
63.7M
    if (coef_bits[i] != -1) {
447
586k
      change_dc = false;
448
586k
      break;
449
586k
    }
450
63.7M
  }
451
452
7.48M
  JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
453
52.3M
  for (size_t i = 0; i < 6; ++i) {
454
44.9M
    Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
455
44.9M
  }
456
7.48M
  if (change_dc) {
457
34.4M
    for (size_t i = 6; i < SAVED_COEFS; ++i) {
458
27.5M
      Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
459
27.5M
    }
460
6.89M
  }
461
71.3M
  auto calculate_dct_value = [&](int coef_index) {
462
71.3M
    int64_t num = 0;
463
71.3M
    int pred;
464
71.3M
    int Al;
465
    // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
466
    // matrix in that case.
467
71.3M
    bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
468
49.7M
                        coef_index == 9;
469
834M
    auto dc = [&](int i, int j) {
470
834M
      return swap_indices ? dc_values[j][i] : dc_values[i][j];
471
834M
    };
472
71.3M
    JPEGLI_CHECK(coef_index >= 0 && coef_index < 10);
473
71.3M
    Al = coef_bits[coef_index];
474
71.3M
    switch (coef_index) {
475
6.89M
      case 0:
476
        // set the DC
477
6.89M
        num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
478
6.89M
               2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
479
6.89M
               6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
480
6.89M
               152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
481
6.89M
               6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
482
6.89M
               2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
483
6.89M
               2 * dc(4, 4));
484
        // special case: for the DC the dequantization is different
485
6.89M
        Al = 0;
486
6.89M
        break;
487
7.36M
      case 1:
488
14.7M
      case 2:
489
        // set Q01 or Q10
490
14.7M
        num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
491
13.7M
                            3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
492
13.7M
                            3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
493
13.7M
                            38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
494
13.7M
                            13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
495
13.7M
                            dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
496
14.7M
                         : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
497
990k
                            7 * dc(2, 4)));
498
14.7M
        break;
499
7.43M
      case 3:
500
14.8M
      case 5:
501
        // set Q02 or Q20
502
14.8M
        num = (change_dc
503
14.8M
                   ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
504
13.7M
                         5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
505
13.7M
                         2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
506
14.8M
                   : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
507
1.10M
                      13 * dc(3, 2) - dc(4, 2)));
508
14.8M
        break;
509
7.43M
      case 4:
510
        // set Q11
511
7.43M
        num =
512
7.43M
            (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
513
6.87M
                             9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
514
7.43M
                       : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
515
563k
                          dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
516
563k
                          dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
517
7.43M
        break;
518
6.87M
      case 6:
519
13.7M
      case 9:
520
        // set Q03 or Q30
521
13.7M
        num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
522
13.7M
               dc(3, 3));
523
13.7M
        break;
524
6.87M
      case 7:
525
13.7M
      case 8:
526
13.7M
      default:
527
        // set Q12 and Q21
528
13.7M
        num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
529
13.7M
               dc(3, 3));
530
13.7M
        break;
531
71.3M
    }
532
71.3M
    num = Q_VAL[0] * num;
533
71.3M
    if (num >= 0) {
534
70.7M
      pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
535
70.7M
      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
536
70.7M
    } else {
537
599k
      pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
538
599k
      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
539
599k
      pred = -pred;
540
599k
    }
541
71.3M
    return static_cast<int16_t>(pred);
542
71.3M
  };
543
544
7.48M
  int loop_end = change_dc ? SAVED_COEFS : 6;
545
72.5M
  for (int i = 1; i < loop_end; ++i) {
546
65.0M
    if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
547
64.4M
      scratch[Q_POS[i]] = calculate_dct_value(i);
548
64.4M
    }
549
65.0M
  }
550
7.48M
  if (change_dc) {
551
6.89M
    scratch[0] = calculate_dct_value(0);
552
6.89M
  }
553
7.48M
}
554
555
2.56k
// Resets the per-pass output state of the decoder:
//  - decides whether block smoothing is applied (do_smoothing() and the
//    caller's do_block_smoothing flag must both allow it),
//  - clears the coefficient statistics, processed-block counters and biases,
//  - rewinds output_iMCU_row and output_scanline to 0,
//  - fills the dequantization multipliers of every component that has a
//    quantization table,
//  - selects the inverse transform and the color transform.
void PrepareForOutput(j_decompress_ptr cinfo) {
  jpeg_decomp_master* master = cinfo->master;

  const bool smoothing_useful = do_smoothing(cinfo);
  master->apply_smoothing =
      smoothing_useful && FROM_JXL_BOOL(cinfo->do_block_smoothing);

  const size_t num_stats = cinfo->num_components * DCTSIZE2;
  memset(master->nonzeros_, 0, num_stats * sizeof(master->nonzeros_[0]));
  memset(master->sumabs_, 0, num_stats * sizeof(master->sumabs_[0]));
  memset(master->num_processed_blocks_, 0,
         sizeof(master->num_processed_blocks_));
  memset(master->biases_, 0, num_stats * sizeof(master->biases_[0]));

  cinfo->output_iMCU_row = 0;
  cinfo->output_scanline = 0;

  // Each quantizer value is stored pre-multiplied by this common scale.
  const float kDequantScale = 1.0f / (8 * 255);
  for (int c = 0; c < cinfo->num_components; c++) {
    JQUANT_TBL* table = cinfo->comp_info[c].quant_table;
    if (table != nullptr) {
      const size_t base = c * DCTSIZE2;
      for (size_t k = 0; k < DCTSIZE2; ++k) {
        master->dequant_[base + k] = table->quantval[k] * kDequantScale;
      }
    }
  }

  JPEGLI_CHECK(ChooseInverseTransform(cinfo));
  ChooseColorTransform(cinfo);
}
578
579
1.54M
// Runs the inverse transform on every block row of the iMCU row selected by
// cinfo->output_iMCU_row and writes the samples into master->raw_output_.
// For components where ShouldApplyDequantBiases() holds, coefficient
// statistics are gathered first and the biases are recomputed on every
// fourth iMCU row. cinfo->output_iMCU_row itself is not advanced here.
void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
  jpeg_decomp_master* const master = cinfo->master;
  const size_t imcu_row = cinfo->output_iMCU_row;

  // Fetch the coefficient block rows of this iMCU row for all components.
  JBLOCKARRAY blocks[kMaxComponents];
  for (int c = 0; c < cinfo->num_components; ++c) {
    const jpeg_component_info& ci = cinfo->comp_info[c];
    const int first_block_row = imcu_row * ci.v_samp_factor;
    const int rows_remaining = ci.height_in_blocks - first_block_row;
    const int rows_to_access = std::min(ci.v_samp_factor, rows_remaining);
    // In streaming mode the rows are requested from offset 0.
    const int start_row = master->streaming_mode_ ? 0 : first_block_row;
    blocks[c] = (*cinfo->mem->access_virt_barray)(
        reinterpret_cast<j_common_ptr>(cinfo), master->coef_arrays[c],
        start_row, rows_to_access, FALSE);
  }

  for (int c = 0; c < cinfo->num_components; ++c) {
    auto& ci = cinfo->comp_info[c];
    const size_t coef_offset = c * DCTSIZE2;
    const size_t first_block_row = imcu_row * ci.v_samp_factor;

    if (ShouldApplyDequantBiases(cinfo, c)) {
      // Accumulate statistics over the block rows that lie inside the image.
      for (int iy = 0; iy < ci.v_samp_factor; ++iy) {
        if (first_block_row + iy < ci.height_in_blocks) {
          int16_t* JXL_RESTRICT coeffs = &blocks[c][iy][0][0];
          const size_t num_coeffs = ci.width_in_blocks * DCTSIZE2;
          GatherBlockStats(coeffs, num_coeffs,
                           &master->nonzeros_[coef_offset],
                           &master->sumabs_[coef_offset]);
          master->num_processed_blocks_[c] += ci.width_in_blocks;
        }
      }
      // Refresh the biases on iMCU rows 3, 7, 11, ...
      if (imcu_row % 4 == 3) {
        ComputeOptimalLaplacianBiases(master->num_processed_blocks_[c],
                                      &master->nonzeros_[coef_offset],
                                      &master->sumabs_[coef_offset],
                                      &master->biases_[coef_offset]);
      }
    }

    RowBuffer<float>* raw_out = &master->raw_output_[c];
    auto* dequant = &master->dequant_[coef_offset];
    auto* biases = &master->biases_[coef_offset];
    for (int iy = 0; iy < ci.v_samp_factor; ++iy) {
      const size_t by = first_block_row + iy;
      if (by < ci.height_in_blocks) {
        const size_t dctsize = master->scaled_dct_size[c];
        int16_t* JXL_RESTRICT row_in = &blocks[c][iy][0][0];
        float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
        for (size_t bx = 0; bx < ci.width_in_blocks; ++bx) {
          float* block_out = &row_out[bx * dctsize];
          if (master->apply_smoothing) {
            // The transform reads master->smoothing_scratch_ right after
            // PredictSmooth() instead of the stored block.
            PredictSmooth(cinfo, blocks[c], c, bx, iy);
            (*master->inverse_transform[c])(
                master->smoothing_scratch_, dequant, biases,
                master->idct_scratch_, block_out, raw_out->stride(), dctsize);
          } else {
            (*master->inverse_transform[c])(
                &row_in[bx * DCTSIZE2], dequant, biases,
                master->idct_scratch_, block_out, raw_out->stride(), dctsize);
          }
        }
        // Streaming mode zeroes the block row once it has been transformed.
        if (master->streaming_mode_) {
          memset(row_in, 0, ci.width_in_blocks * sizeof(JBLOCK));
        }
      }
    }
  }
}
645
646
0
// Decodes the current iMCU row and writes each component's rows, without
// upsampling or color conversion, to data[c][0..]. Afterwards advances
// cinfo->output_iMCU_row by one and cinfo->output_scanline by
// max_v_samp_factor * DCTSIZE, and counts the pass as done once
// output_scanline reaches output_height.
void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
  jpegli::DecodeCurrentiMCURow(cinfo);
  jpeg_decomp_master* const master = cinfo->master;

  for (int c = 0; c < cinfo->num_components; ++c) {
    const auto& ci = cinfo->comp_info[c];
    const size_t row_width = ci.width_in_blocks * DCTSIZE;
    const size_t total_rows = ci.height_in_blocks * DCTSIZE;
    const size_t rows_per_imcu = ci.v_samp_factor * DCTSIZE;
    const size_t first_row = static_cast<size_t>(cinfo->output_iMCU_row) *
                             ci.v_samp_factor * DCTSIZE;
    // The last iMCU row may be cut short by the component height.
    const size_t end_row = std::min(first_row + rows_per_imcu, total_rows);

    for (size_t dy = 0; first_row + dy < end_row; ++dy) {
      float* rows[1] = {master->raw_output_[c].Row(first_row + dy)};
      uint8_t* output = data[c][dy];
      DecenterRow(rows[0], row_width);
      WriteToOutput(cinfo, rows, 0, row_width, 1, output);
    }
  }

  ++cinfo->output_iMCU_row;
  cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
  if (cinfo->output_scanline >= cinfo->output_height) {
    ++master->output_passes_done_;
  }
}
670
671
void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
672
19.9M
                   JSAMPARRAY scanlines, size_t max_output_rows) {
673
19.9M
  jpeg_decomp_master* m = cinfo->master;
674
19.9M
  const size_t vfactor = cinfo->max_v_samp_factor;
675
19.9M
  const size_t hfactor = cinfo->max_h_samp_factor;
676
19.9M
  const size_t context = m->need_context_rows_ ? 1 : 0;
677
19.9M
  const size_t imcu_row = cinfo->output_iMCU_row;
678
19.9M
  const size_t imcu_height = vfactor * m->min_scaled_dct_size;
679
19.9M
  const size_t imcu_width = hfactor * m->min_scaled_dct_size;
680
19.9M
  const size_t output_width = m->iMCU_cols_ * imcu_width;
681
19.9M
  if (imcu_row == cinfo->total_iMCU_rows ||
682
19.8M
      (imcu_row > context &&
683
19.8M
       cinfo->output_scanline < (imcu_row - context) * imcu_height)) {
684
    // We are ready to output some scanlines.
685
18.3M
    size_t ybegin = cinfo->output_scanline;
686
18.3M
    size_t yend = (imcu_row == cinfo->total_iMCU_rows
687
18.3M
                       ? cinfo->output_height
688
18.3M
                       : (imcu_row - context) * imcu_height);
689
18.3M
    yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
690
18.3M
    size_t yb = (ybegin / vfactor) * vfactor;
691
18.3M
    size_t ye = DivCeil(yend, vfactor) * vfactor;
692
36.7M
    for (size_t y = yb; y < ye; y += vfactor) {
693
48.3M
      for (int c = 0; c < cinfo->num_components; ++c) {
694
29.9M
        RowBuffer<float>* raw_out = &m->raw_output_[c];
695
29.9M
        RowBuffer<float>* render_out = &m->render_output_[c];
696
29.9M
        int line_groups = vfactor / m->v_factor[c];
697
29.9M
        int downsampled_width = output_width / m->h_factor[c];
698
29.9M
        size_t yc = y / m->v_factor[c];
699
81.4M
        for (int dy = 0; dy < line_groups; ++dy) {
700
51.5M
          size_t ymid = yc + dy;
701
51.5M
          const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
702
51.5M
          if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
703
3.72M
            const float* JXL_RESTRICT row_top =
704
3.72M
                ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
705
3.72M
            const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
706
3.72M
                                                    ? row_mid
707
3.72M
                                                    : raw_out->Row(ymid + 1);
708
3.72M
            Upsample2Vertical(row_top, row_mid, row_bot,
709
3.72M
                              render_out->Row(2 * dy),
710
3.72M
                              render_out->Row(2 * dy + 1), downsampled_width);
711
47.7M
          } else {
712
114M
            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
713
66.3M
              memcpy(render_out->Row(m->v_factor[c] * dy + yix), row_mid,
714
66.3M
                     downsampled_width * sizeof(float));
715
66.3M
            }
716
47.7M
          }
717
51.5M
          if (m->h_factor[c] > 1) {
718
39.6M
            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
719
25.9M
              int row_ix = m->v_factor[c] * dy + yix;
720
25.9M
              float* JXL_RESTRICT row = render_out->Row(row_ix);
721
25.9M
              float* JXL_RESTRICT tmp = m->upsample_scratch_;
722
25.9M
              if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
723
10.9M
                Upsample2Horizontal(row, tmp, output_width);
724
14.9M
              } else {
725
                // TODO(szabadka) SIMDify this.
726
910M
                for (size_t x = 0; x < output_width; ++x) {
727
895M
                  tmp[x] = row[x / m->h_factor[c]];
728
895M
                }
729
14.9M
                memcpy(row, tmp, output_width * sizeof(tmp[0]));
730
14.9M
              }
731
25.9M
            }
732
13.6M
          }
733
51.5M
        }
734
29.9M
      }
735
56.8M
      for (size_t yix = 0; yix < vfactor; ++yix) {
736
38.4M
        if (y + yix < ybegin || y + yix >= yend) continue;
737
18.3M
        float* rows[kMaxComponents];
738
18.3M
        int num_all_components =
739
18.3M
            std::max(cinfo->out_color_components, cinfo->num_components);
740
48.3M
        for (int c = 0; c < num_all_components; ++c) {
741
29.9M
          rows[c] = m->render_output_[c].Row(yix);
742
29.9M
        }
743
18.3M
        (*m->color_transform)(rows, output_width);
744
48.3M
        for (int c = 0; c < cinfo->out_color_components; ++c) {
745
          // Undo the centering of the sample values around zero.
746
29.9M
          DecenterRow(rows[c], output_width);
747
29.9M
        }
748
18.3M
        if (scanlines) {
749
18.3M
          uint8_t* output = scanlines[*num_output_rows];
750
18.3M
          WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
751
18.3M
                        cinfo->out_color_components, output);
752
18.3M
        }
753
18.3M
        JPEGLI_CHECK(cinfo->output_scanline == y + yix);
754
18.3M
        ++cinfo->output_scanline;
755
18.3M
        ++(*num_output_rows);
756
18.3M
        if (cinfo->output_scanline == cinfo->output_height) {
757
2.51k
          ++m->output_passes_done_;
758
2.51k
        }
759
18.3M
      }
760
18.3M
    }
761
18.3M
  } else {
762
1.54M
    DecodeCurrentiMCURow(cinfo);
763
1.54M
    ++cinfo->output_iMCU_row;
764
1.54M
  }
765
19.9M
}
766
767
}  // namespace jpegli
768
#endif  // HWY_ONCE