/src/libjxl/lib/jpegli/render.cc
Line | Count | Source |
1 | | // Copyright (c) the JPEG XL Project Authors. All rights reserved. |
2 | | // |
3 | | // Use of this source code is governed by a BSD-style |
4 | | // license that can be found in the LICENSE file. |
5 | | |
6 | | #include "lib/jpegli/render.h" |
7 | | |
8 | | #include <jxl/types.h> |
9 | | |
10 | | #include <algorithm> |
11 | | #include <array> |
12 | | #include <cmath> |
13 | | #include <cstddef> |
14 | | #include <cstdint> |
15 | | #include <cstring> |
16 | | #include <vector> |
17 | | |
18 | | #include "lib/jpegli/color_quantize.h" |
19 | | #include "lib/jpegli/color_transform.h" |
20 | | #include "lib/jpegli/common.h" |
21 | | #include "lib/jpegli/common_internal.h" |
22 | | #include "lib/jpegli/decode_internal.h" |
23 | | #include "lib/jpegli/error.h" |
24 | | #include "lib/jpegli/idct.h" |
25 | | #include "lib/jpegli/types.h" |
26 | | #include "lib/jpegli/upsample.h" |
27 | | #include "lib/jxl/base/byte_order.h" |
28 | | #include "lib/jxl/base/compiler_specific.h" |
29 | | |
30 | | #ifdef MEMORY_SANITIZER |
31 | | #define JXL_MEMORY_SANITIZER 1 |
32 | | #elif defined(__has_feature) |
33 | | #if __has_feature(memory_sanitizer) |
34 | | #define JXL_MEMORY_SANITIZER 1 |
35 | | #else |
36 | | #define JXL_MEMORY_SANITIZER 0 |
37 | | #endif |
38 | | #else |
39 | | #define JXL_MEMORY_SANITIZER 0 |
40 | | #endif |
41 | | |
42 | | #if JXL_MEMORY_SANITIZER |
43 | | #include "sanitizer/msan_interface.h" |
44 | | #endif |
45 | | |
46 | | #undef HWY_TARGET_INCLUDE |
47 | | #define HWY_TARGET_INCLUDE "lib/jpegli/render.cc" |
48 | | #include <hwy/foreach_target.h> |
49 | | #include <hwy/highway.h> |
50 | | |
51 | | HWY_BEFORE_NAMESPACE(); |
52 | | namespace jpegli { |
53 | | namespace HWY_NAMESPACE { |
54 | | |
55 | | // These templates are not found via ADL. |
56 | | using hwy::HWY_NAMESPACE::Abs; |
57 | | using hwy::HWY_NAMESPACE::Add; |
58 | | using hwy::HWY_NAMESPACE::Clamp; |
59 | | using hwy::HWY_NAMESPACE::Gt; |
60 | | using hwy::HWY_NAMESPACE::IfThenElseZero; |
61 | | using hwy::HWY_NAMESPACE::Mul; |
62 | | using hwy::HWY_NAMESPACE::NearestInt; |
63 | | using hwy::HWY_NAMESPACE::Or; |
64 | | using hwy::HWY_NAMESPACE::Rebind; |
65 | | using hwy::HWY_NAMESPACE::ShiftLeftSame; |
66 | | using hwy::HWY_NAMESPACE::ShiftRightSame; |
67 | | using hwy::HWY_NAMESPACE::Vec; |
68 | | using D = HWY_FULL(float); |
69 | | using DI = HWY_FULL(int32_t); |
70 | | constexpr D d; |
71 | | constexpr DI di; |
72 | | |
73 | | void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs, |
74 | | const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros, |
75 | 2.15M | int32_t* JXL_RESTRICT sumabs) { |
76 | 222M | for (size_t i = 0; i < coeffs_size; i += Lanes(d)) { |
77 | 220M | size_t k = i % DCTSIZE2; |
78 | 220M | const Rebind<int16_t, DI> di16; |
79 | 220M | const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i)); |
80 | 220M | const auto abs_coeff = Abs(coeff); |
81 | 220M | const auto not_0 = Gt(abs_coeff, Zero(di)); |
82 | 220M | const auto nzero = IfThenElseZero(not_0, Set(di, 1)); |
83 | 220M | Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k); |
84 | 220M | Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k); |
85 | 220M | } |
86 | 2.15M | } jpegli::N_SSE4::GatherBlockStats(short const*, unsigned long, int*, int*) Line | Count | Source | 75 | 520k | int32_t* JXL_RESTRICT sumabs) { | 76 | 72.2M | for (size_t i = 0; i < coeffs_size; i += Lanes(d)) { | 77 | 71.7M | size_t k = i % DCTSIZE2; | 78 | 71.7M | const Rebind<int16_t, DI> di16; | 79 | 71.7M | const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i)); | 80 | 71.7M | const auto abs_coeff = Abs(coeff); | 81 | 71.7M | const auto not_0 = Gt(abs_coeff, Zero(di)); | 82 | 71.7M | const auto nzero = IfThenElseZero(not_0, Set(di, 1)); | 83 | 71.7M | Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k); | 84 | 71.7M | Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k); | 85 | 71.7M | } | 86 | 520k | } |
jpegli::N_AVX2::GatherBlockStats(short const*, unsigned long, int*, int*) Line | Count | Source | 75 | 1.03M | int32_t* JXL_RESTRICT sumabs) { | 76 | 80.3M | for (size_t i = 0; i < coeffs_size; i += Lanes(d)) { | 77 | 79.2M | size_t k = i % DCTSIZE2; | 78 | 79.2M | const Rebind<int16_t, DI> di16; | 79 | 79.2M | const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i)); | 80 | 79.2M | const auto abs_coeff = Abs(coeff); | 81 | 79.2M | const auto not_0 = Gt(abs_coeff, Zero(di)); | 82 | 79.2M | const auto nzero = IfThenElseZero(not_0, Set(di, 1)); | 83 | 79.2M | Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k); | 84 | 79.2M | Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k); | 85 | 79.2M | } | 86 | 1.03M | } |
jpegli::N_SSE2::GatherBlockStats(short const*, unsigned long, int*, int*) Line | Count | Source | 75 | 599k | int32_t* JXL_RESTRICT sumabs) { | 76 | 70.0M | for (size_t i = 0; i < coeffs_size; i += Lanes(d)) { | 77 | 69.5M | size_t k = i % DCTSIZE2; | 78 | 69.5M | const Rebind<int16_t, DI> di16; | 79 | 69.5M | const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i)); | 80 | 69.5M | const auto abs_coeff = Abs(coeff); | 81 | 69.5M | const auto not_0 = Gt(abs_coeff, Zero(di)); | 82 | 69.5M | const auto nzero = IfThenElseZero(not_0, Set(di, 1)); | 83 | 69.5M | Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k); | 84 | 69.5M | Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k); | 85 | 69.5M | } | 86 | 599k | } |
|
87 | | |
88 | 29.9M | void DecenterRow(float* row, size_t xsize) { |
89 | 29.9M | const HWY_CAPPED(float, 8) df; |
90 | 29.9M | const auto c128 = Set(df, 128.0f / 255); |
91 | 535M | for (size_t x = 0; x < xsize; x += Lanes(df)) { |
92 | 505M | Store(Add(Load(df, row + x), c128), df, row + x); |
93 | 505M | } |
94 | 29.9M | } jpegli::N_SSE4::DecenterRow(float*, unsigned long) Line | Count | Source | 88 | 7.58M | void DecenterRow(float* row, size_t xsize) { | 89 | 7.58M | const HWY_CAPPED(float, 8) df; | 90 | 7.58M | const auto c128 = Set(df, 128.0f / 255); | 91 | 172M | for (size_t x = 0; x < xsize; x += Lanes(df)) { | 92 | 165M | Store(Add(Load(df, row + x), c128), df, row + x); | 93 | 165M | } | 94 | 7.58M | } |
jpegli::N_AVX2::DecenterRow(float*, unsigned long) Line | Count | Source | 88 | 13.3M | void DecenterRow(float* row, size_t xsize) { | 89 | 13.3M | const HWY_CAPPED(float, 8) df; | 90 | 13.3M | const auto c128 = Set(df, 128.0f / 255); | 91 | 176M | for (size_t x = 0; x < xsize; x += Lanes(df)) { | 92 | 163M | Store(Add(Load(df, row + x), c128), df, row + x); | 93 | 163M | } | 94 | 13.3M | } |
jpegli::N_SSE2::DecenterRow(float*, unsigned long) Line | Count | Source | 88 | 8.99M | void DecenterRow(float* row, size_t xsize) { | 89 | 8.99M | const HWY_CAPPED(float, 8) df; | 90 | 8.99M | const auto c128 = Set(df, 128.0f / 255); | 91 | 185M | for (size_t x = 0; x < xsize; x += Lanes(df)) { | 92 | 176M | Store(Add(Load(df, row + x), c128), df, row + x); | 93 | 176M | } | 94 | 8.99M | } |
|
95 | | |
96 | | void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y, |
97 | 0 | size_t xsize) { |
98 | 0 | jpeg_decomp_master* m = cinfo->master; |
99 | 0 | if (!m->dither_[c]) return; |
100 | 0 | const float* dither_row = |
101 | 0 | &m->dither_[c][(y & m->dither_mask_) * m->dither_size_]; |
102 | 0 | for (size_t x = 0; x < xsize; ++x) { |
103 | 0 | row[x] += dither_row[x & m->dither_mask_]; |
104 | 0 | } |
105 | 0 | } Unexecuted instantiation: jpegli::N_SSE4::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long) Unexecuted instantiation: jpegli::N_AVX2::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long) Unexecuted instantiation: jpegli::N_SSE2::DitherRow(jpeg_decompress_struct*, float*, int, unsigned long, unsigned long) |
106 | | |
107 | | template <typename T> |
108 | | void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len, |
109 | 10.8M | size_t num_channels, float multiplier, T* output) { |
110 | 10.8M | const HWY_CAPPED(float, 8) cd; |
111 | 10.8M | auto zero = Zero(cd); |
112 | 10.8M | auto mul = Set(cd, multiplier); |
113 | 10.8M | const Rebind<T, decltype(cd)> cdu; |
114 | | #if JXL_MEMORY_SANITIZER |
115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; |
116 | | for (size_t c = 0; c < num_channels; ++c) { |
117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); |
118 | | } |
119 | | #endif |
120 | 10.8M | if (num_channels == 1) { |
121 | 51.9M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
122 | 46.4M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); |
123 | 46.4M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); |
124 | 46.4M | } |
125 | 5.51M | } else if (num_channels == 2) { |
126 | 37.1M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
127 | 33.3M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); |
128 | 33.3M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); |
129 | 33.3M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), |
130 | 33.3M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); |
131 | 33.3M | } |
132 | 3.84M | } else if (num_channels == 3) { |
133 | 17.0M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
134 | 16.3M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); |
135 | 16.3M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); |
136 | 16.3M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); |
137 | 16.3M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), |
138 | 16.3M | DemoteTo(cdu, NearestInt(v1)), |
139 | 16.3M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); |
140 | 16.3M | } |
141 | 785k | } else if (num_channels == 4) { |
142 | 12.0M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
143 | 11.2M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); |
144 | 11.2M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); |
145 | 11.2M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); |
146 | 11.2M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); |
147 | 11.2M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), |
148 | 11.2M | DemoteTo(cdu, NearestInt(v1)), |
149 | 11.2M | DemoteTo(cdu, NearestInt(v2)), |
150 | 11.2M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); |
151 | 11.2M | } |
152 | 785k | } |
153 | | #if JXL_MEMORY_SANITIZER |
154 | | __msan_poison(output + num_channels * len, |
155 | | sizeof(output[0]) * num_channels * padding); |
156 | | #endif |
157 | 10.8M | } void jpegli::N_SSE4::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*) Line | Count | Source | 109 | 1.10M | size_t num_channels, float multiplier, T* output) { | 110 | 1.10M | const HWY_CAPPED(float, 8) cd; | 111 | 1.10M | auto zero = Zero(cd); | 112 | 1.10M | auto mul = Set(cd, multiplier); | 113 | 1.10M | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 1.10M | if (num_channels == 1) { | 121 | 5.81M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 5.51M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 5.51M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 5.51M | } | 125 | 805k | } else if (num_channels == 2) { | 126 | 5.94M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 5.35M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 5.35M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 5.35M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 5.35M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 5.35M | } | 132 | 588k | } else if (num_channels == 3) { | 133 | 3.31M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 3.16M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 3.16M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 3.16M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 3.16M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 3.16M | DemoteTo(cdu, NearestInt(v1)), | 139 | 3.16M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 3.16M | } | 141 | 148k | } else if 
(num_channels == 4) { | 142 | 1.17M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 1.11M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 1.11M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 1.11M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 1.11M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 1.11M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 1.11M | DemoteTo(cdu, NearestInt(v1)), | 149 | 1.11M | DemoteTo(cdu, NearestInt(v2)), | 150 | 1.11M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 1.11M | } | 152 | 67.8k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 1.10M | } |
void jpegli::N_SSE4::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*) Line | Count | Source | 109 | 1.60M | size_t num_channels, float multiplier, T* output) { | 110 | 1.60M | const HWY_CAPPED(float, 8) cd; | 111 | 1.60M | auto zero = Zero(cd); | 112 | 1.60M | auto mul = Set(cd, multiplier); | 113 | 1.60M | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 1.60M | if (num_channels == 1) { | 121 | 12.4M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 11.6M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 11.6M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 11.6M | } | 125 | 812k | } else if (num_channels == 2) { | 126 | 6.14M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 5.60M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 5.60M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 5.60M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 5.60M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 5.60M | } | 132 | 537k | } else if (num_channels == 3) { | 133 | 2.59M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 2.39M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 2.39M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 2.39M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 2.39M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 2.39M | DemoteTo(cdu, NearestInt(v1)), | 139 | 2.39M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 2.39M | } | 141 | 200k | } else if (num_channels == 4) { | 
142 | 3.38M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 3.31M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 3.31M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 3.31M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 3.31M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 3.31M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 3.31M | DemoteTo(cdu, NearestInt(v1)), | 149 | 3.31M | DemoteTo(cdu, NearestInt(v2)), | 150 | 3.31M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 3.31M | } | 152 | 74.5k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 1.60M | } |
void jpegli::N_AVX2::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*) Line | Count | Source | 109 | 1.68M | size_t num_channels, float multiplier, T* output) { | 110 | 1.68M | const HWY_CAPPED(float, 8) cd; | 111 | 1.68M | auto zero = Zero(cd); | 112 | 1.68M | auto mul = Set(cd, multiplier); | 113 | 1.68M | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 1.68M | if (num_channels == 1) { | 121 | 5.30M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 4.61M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 4.61M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 4.61M | } | 125 | 993k | } else if (num_channels == 2) { | 126 | 4.82M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 3.99M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 3.99M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 3.99M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 3.99M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 3.99M | } | 132 | 829k | } else if (num_channels == 3) { | 133 | 2.62M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 2.59M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 2.59M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 2.59M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 2.59M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 2.59M | DemoteTo(cdu, NearestInt(v1)), | 139 | 2.59M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 2.59M | } | 141 | 131k | } else if (num_channels == 4) { | 142 
| 659k | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 528k | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 528k | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 528k | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 528k | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 528k | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 528k | DemoteTo(cdu, NearestInt(v1)), | 149 | 528k | DemoteTo(cdu, NearestInt(v2)), | 150 | 528k | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 528k | } | 152 | 131k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 1.68M | } |
void jpegli::N_AVX2::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*) Line | Count | Source | 109 | 3.50M | size_t num_channels, float multiplier, T* output) { | 110 | 3.50M | const HWY_CAPPED(float, 8) cd; | 111 | 3.50M | auto zero = Zero(cd); | 112 | 3.50M | auto mul = Set(cd, multiplier); | 113 | 3.50M | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 3.50M | if (num_channels == 1) { | 121 | 12.0M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 9.89M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 9.89M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 9.89M | } | 125 | 2.11M | } else if (num_channels == 2) { | 126 | 5.55M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 4.64M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 4.64M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 4.64M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 4.64M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 4.64M | } | 132 | 906k | } else if (num_channels == 3) { | 133 | 2.00M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 1.95M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 1.95M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 1.95M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 1.95M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 1.95M | DemoteTo(cdu, NearestInt(v1)), | 139 | 1.95M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 1.95M | } | 141 | 438k | } else if (num_channels == 4) { | 
142 | 2.43M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 1.99M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 1.99M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 1.99M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 1.99M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 1.99M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 1.99M | DemoteTo(cdu, NearestInt(v1)), | 149 | 1.99M | DemoteTo(cdu, NearestInt(v2)), | 150 | 1.99M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 1.99M | } | 152 | 438k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 3.50M | } |
void jpegli::N_SSE2::StoreUnsignedRow<unsigned char>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned char*) Line | Count | Source | 109 | 979k | size_t num_channels, float multiplier, T* output) { | 110 | 979k | const HWY_CAPPED(float, 8) cd; | 111 | 979k | auto zero = Zero(cd); | 112 | 979k | auto mul = Set(cd, multiplier); | 113 | 979k | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 979k | if (num_channels == 1) { | 121 | 6.89M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 6.39M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 6.39M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 6.39M | } | 125 | 495k | } else if (num_channels == 2) { | 126 | 6.12M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 5.75M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 5.75M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 5.75M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 5.75M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 5.75M | } | 132 | 370k | } else if (num_channels == 3) { | 133 | 2.91M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 2.81M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 2.81M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 2.81M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 2.81M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 2.81M | DemoteTo(cdu, NearestInt(v1)), | 139 | 2.81M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 2.81M | } | 141 | 105k | } else if (num_channels == 4) { | 142 | 
2.12M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 2.11M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 2.11M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 2.11M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 2.11M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 2.11M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 2.11M | DemoteTo(cdu, NearestInt(v1)), | 149 | 2.11M | DemoteTo(cdu, NearestInt(v2)), | 150 | 2.11M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 2.11M | } | 152 | 6.86k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 979k | } |
void jpegli::N_SSE2::StoreUnsignedRow<unsigned short>(float* restrict*, unsigned long, unsigned long, unsigned long, float, unsigned short*) Line | Count | Source | 109 | 1.95M | size_t num_channels, float multiplier, T* output) { | 110 | 1.95M | const HWY_CAPPED(float, 8) cd; | 111 | 1.95M | auto zero = Zero(cd); | 112 | 1.95M | auto mul = Set(cd, multiplier); | 113 | 1.95M | const Rebind<T, decltype(cd)> cdu; | 114 | | #if JXL_MEMORY_SANITIZER | 115 | | const size_t padding = hwy::RoundUpTo(len, Lanes(cd)) - len; | 116 | | for (size_t c = 0; c < num_channels; ++c) { | 117 | | __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding); | 118 | | } | 119 | | #endif | 120 | 1.95M | if (num_channels == 1) { | 121 | 9.45M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 122 | 8.32M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 123 | 8.32M | StoreU(DemoteTo(cdu, NearestInt(v0)), cdu, &output[i]); | 124 | 8.32M | } | 125 | 1.13M | } else if (num_channels == 2) { | 126 | 8.58M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 127 | 7.97M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 128 | 7.97M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 129 | 7.97M | StoreInterleaved2(DemoteTo(cdu, NearestInt(v0)), | 130 | 7.97M | DemoteTo(cdu, NearestInt(v1)), cdu, &output[2 * i]); | 131 | 7.97M | } | 132 | 611k | } else if (num_channels == 3) { | 133 | 3.61M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 134 | 3.46M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 135 | 3.46M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 136 | 3.46M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 137 | 3.46M | StoreInterleaved3(DemoteTo(cdu, NearestInt(v0)), | 138 | 3.46M | DemoteTo(cdu, NearestInt(v1)), | 139 | 3.46M | DemoteTo(cdu, NearestInt(v2)), cdu, &output[3 * i]); | 140 | 3.46M | } | 141 | 149k | } else if (num_channels == 4) { | 
142 | 2.25M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 143 | 2.18M | auto v0 = Clamp(zero, Mul(LoadU(cd, &input[0][x0 + i]), mul), mul); | 144 | 2.18M | auto v1 = Clamp(zero, Mul(LoadU(cd, &input[1][x0 + i]), mul), mul); | 145 | 2.18M | auto v2 = Clamp(zero, Mul(LoadU(cd, &input[2][x0 + i]), mul), mul); | 146 | 2.18M | auto v3 = Clamp(zero, Mul(LoadU(cd, &input[3][x0 + i]), mul), mul); | 147 | 2.18M | StoreInterleaved4(DemoteTo(cdu, NearestInt(v0)), | 148 | 2.18M | DemoteTo(cdu, NearestInt(v1)), | 149 | 2.18M | DemoteTo(cdu, NearestInt(v2)), | 150 | 2.18M | DemoteTo(cdu, NearestInt(v3)), cdu, &output[4 * i]); | 151 | 2.18M | } | 152 | 66.5k | } | 153 | | #if JXL_MEMORY_SANITIZER | 154 | | __msan_poison(output + num_channels * len, | 155 | | sizeof(output[0]) * num_channels * padding); | 156 | | #endif | 157 | 1.95M | } |
|
158 | | |
159 | | void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len, |
160 | 7.53M | size_t num_channels, float* output) { |
161 | 7.53M | const HWY_CAPPED(float, 8) cd; |
162 | 7.53M | if (num_channels == 1) { |
163 | 4.61M | memcpy(output, input[0] + x0, len * sizeof(output[0])); |
164 | 4.61M | } else if (num_channels == 2) { |
165 | 16.4M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
166 | 14.5M | StoreInterleaved2(LoadU(cd, &input[0][x0 + i]), |
167 | 14.5M | LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]); |
168 | 14.5M | } |
169 | 1.96M | } else if (num_channels == 3) { |
170 | 13.3M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
171 | 12.5M | StoreInterleaved3(LoadU(cd, &input[0][x0 + i]), |
172 | 12.5M | LoadU(cd, &input[1][x0 + i]), |
173 | 12.5M | LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]); |
174 | 12.5M | } |
175 | 837k | } else if (num_channels == 4) { |
176 | 15.2M | for (size_t i = 0; i < len; i += Lanes(cd)) { |
177 | 15.1M | StoreInterleaved4(LoadU(cd, &input[0][x0 + i]), |
178 | 15.1M | LoadU(cd, &input[1][x0 + i]), |
179 | 15.1M | LoadU(cd, &input[2][x0 + i]), |
180 | 15.1M | LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]); |
181 | 15.1M | } |
182 | 122k | } |
183 | 7.53M | } jpegli::N_SSE4::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*) Line | Count | Source | 160 | 1.71M | size_t num_channels, float* output) { | 161 | 1.71M | const HWY_CAPPED(float, 8) cd; | 162 | 1.71M | if (num_channels == 1) { | 163 | 1.02M | memcpy(output, input[0] + x0, len * sizeof(output[0])); | 164 | 1.02M | } else if (num_channels == 2) { | 165 | 4.77M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 166 | 4.25M | StoreInterleaved2(LoadU(cd, &input[0][x0 + i]), | 167 | 4.25M | LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]); | 168 | 4.25M | } | 169 | 523k | } else if (num_channels == 3) { | 170 | 2.90M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 171 | 2.80M | StoreInterleaved3(LoadU(cd, &input[0][x0 + i]), | 172 | 2.80M | LoadU(cd, &input[1][x0 + i]), | 173 | 2.80M | LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]); | 174 | 2.80M | } | 175 | 94.1k | } else if (num_channels == 4) { | 176 | 3.27M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 177 | 3.20M | StoreInterleaved4(LoadU(cd, &input[0][x0 + i]), | 178 | 3.20M | LoadU(cd, &input[1][x0 + i]), | 179 | 3.20M | LoadU(cd, &input[2][x0 + i]), | 180 | 3.20M | LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]); | 181 | 3.20M | } | 182 | 68.7k | } | 183 | 1.71M | } |
jpegli::N_AVX2::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*) Line | Count | Source | 160 | 3.35M | size_t num_channels, float* output) { | 161 | 3.35M | const HWY_CAPPED(float, 8) cd; | 162 | 3.35M | if (num_channels == 1) { | 163 | 2.39M | memcpy(output, input[0] + x0, len * sizeof(output[0])); | 164 | 2.39M | } else if (num_channels == 2) { | 165 | 4.82M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 166 | 4.09M | StoreInterleaved2(LoadU(cd, &input[0][x0 + i]), | 167 | 4.09M | LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]); | 168 | 4.09M | } | 169 | 729k | } else if (num_channels == 3) { | 170 | 4.98M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 171 | 4.78M | StoreInterleaved3(LoadU(cd, &input[0][x0 + i]), | 172 | 4.78M | LoadU(cd, &input[1][x0 + i]), | 173 | 4.78M | LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]); | 174 | 4.78M | } | 175 | 200k | } else if (num_channels == 4) { | 176 | 10.5M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 177 | 10.4M | StoreInterleaved4(LoadU(cd, &input[0][x0 + i]), | 178 | 10.4M | LoadU(cd, &input[1][x0 + i]), | 179 | 10.4M | LoadU(cd, &input[2][x0 + i]), | 180 | 10.4M | LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]); | 181 | 10.4M | } | 182 | 28.9k | } | 183 | 3.35M | } |
jpegli::N_SSE2::StoreFloatRow(float* restrict*, unsigned long, unsigned long, unsigned long, float*) Line | Count | Source | 160 | 2.47M | size_t num_channels, float* output) { | 161 | 2.47M | const HWY_CAPPED(float, 8) cd; | 162 | 2.47M | if (num_channels == 1) { | 163 | 1.19M | memcpy(output, input[0] + x0, len * sizeof(output[0])); | 164 | 1.27M | } else if (num_channels == 2) { | 165 | 6.88M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 166 | 6.17M | StoreInterleaved2(LoadU(cd, &input[0][x0 + i]), | 167 | 6.17M | LoadU(cd, &input[1][x0 + i]), cd, &output[2 * i]); | 168 | 6.17M | } | 169 | 708k | } else if (num_channels == 3) { | 170 | 5.46M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 171 | 4.92M | StoreInterleaved3(LoadU(cd, &input[0][x0 + i]), | 172 | 4.92M | LoadU(cd, &input[1][x0 + i]), | 173 | 4.92M | LoadU(cd, &input[2][x0 + i]), cd, &output[3 * i]); | 174 | 4.92M | } | 175 | 542k | } else if (num_channels == 4) { | 176 | 1.51M | for (size_t i = 0; i < len; i += Lanes(cd)) { | 177 | 1.49M | StoreInterleaved4(LoadU(cd, &input[0][x0 + i]), | 178 | 1.49M | LoadU(cd, &input[1][x0 + i]), | 179 | 1.49M | LoadU(cd, &input[2][x0 + i]), | 180 | 1.49M | LoadU(cd, &input[3][x0 + i]), cd, &output[4 * i]); | 181 | 1.49M | } | 182 | 24.8k | } | 183 | 2.47M | } |
|
184 | | |
// Floyd-Steinberg error-diffusion weights (they sum to 1). The suffix names
// the neighbour that receives the share of the quantization error:
//   MR = same row, next pixel      BL = next row, previous pixel
//   BM = next row, same pixel      BR = next row, next pixel
static constexpr float kFSWeightMR = 7.0f / 16.0f;
static constexpr float kFSWeightBL = 3.0f / 16.0f;
static constexpr float kFSWeightBM = 5.0f / 16.0f;
static constexpr float kFSWeightBR = 1.0f / 16.0f;
189 | | |
// Soft-limits an accumulated dithering error so that it cannot grow without
// bound.
//
// Working on the magnitude of `error`:
//   - up to 16 it is passed through unchanged,
//   - in (16, 48] it is compressed to 0.5 * |error| + 8 (reaching 32 at 48),
//   - above 48 it saturates at 32.
// The result carries the sign of `error`; anything that is not strictly
// positive (including zero) is returned negated, so an input of 0 yields -0.
float LimitError(float error) {
  const bool is_positive = error > 0.0f;
  float magnitude = std::abs(error);
  if (magnitude > 48.0f) {
    magnitude = 32.0f;
  } else if (magnitude > 16.0f) {
    magnitude = 0.5f * magnitude + 8.0f;
  }
  if (is_positive) {
    return magnitude;
  }
  return -magnitude;
}
199 | | |
200 | | void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[], |
201 | | size_t xoffset, size_t len, size_t num_channels, |
202 | 18.3M | uint8_t* JXL_RESTRICT output) { |
203 | 18.3M | jpeg_decomp_master* m = cinfo->master; |
204 | 18.3M | uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_; |
205 | 18.3M | if (cinfo->quantize_colors && m->quant_pass_ == 1) { |
206 | 0 | float* error_row[kMaxComponents]; |
207 | 0 | float* next_error_row[kMaxComponents]; |
208 | 0 | J_DITHER_MODE dither_mode = cinfo->dither_mode; |
209 | 0 | if (dither_mode == JDITHER_ORDERED) { |
210 | 0 | for (size_t c = 0; c < num_channels; ++c) { |
211 | 0 | DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline, |
212 | 0 | cinfo->output_width); |
213 | 0 | } |
214 | 0 | } else if (dither_mode == JDITHER_FS) { |
215 | 0 | for (size_t c = 0; c < num_channels; ++c) { |
216 | 0 | if (cinfo->output_scanline % 2 == 0) { |
217 | 0 | error_row[c] = m->error_row_[c]; |
218 | 0 | next_error_row[c] = m->error_row_[c + kMaxComponents]; |
219 | 0 | } else { |
220 | 0 | error_row[c] = m->error_row_[c + kMaxComponents]; |
221 | 0 | next_error_row[c] = m->error_row_[c]; |
222 | 0 | } |
223 | 0 | memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float)); |
224 | 0 | } |
225 | 0 | } |
226 | 0 | const float mul = 255.0f; |
227 | 0 | if (dither_mode != JDITHER_FS) { |
228 | 0 | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); |
229 | 0 | } |
230 | 0 | for (size_t i = 0; i < len; ++i) { |
231 | 0 | uint8_t* pixel = &scratch_space[num_channels * i]; |
232 | 0 | if (dither_mode == JDITHER_FS) { |
233 | 0 | for (size_t c = 0; c < num_channels; ++c) { |
234 | 0 | float val = rows[c][i] * mul + LimitError(error_row[c][i]); |
235 | 0 | pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val))); |
236 | 0 | } |
237 | 0 | } |
238 | 0 | int index = LookupColorIndex(cinfo, pixel); |
239 | 0 | output[i] = index; |
240 | 0 | if (dither_mode == JDITHER_FS) { |
241 | 0 | size_t prev_i = i > 0 ? i - 1 : 0; |
242 | 0 | size_t next_i = i + 1 < len ? i + 1 : len - 1; |
243 | 0 | for (size_t c = 0; c < num_channels; ++c) { |
244 | 0 | float error = pixel[c] - cinfo->colormap[c][index]; |
245 | 0 | error_row[c][next_i] += kFSWeightMR * error; |
246 | 0 | next_error_row[c][prev_i] += kFSWeightBL * error; |
247 | 0 | next_error_row[c][i] += kFSWeightBM * error; |
248 | 0 | next_error_row[c][next_i] += kFSWeightBR * error; |
249 | 0 | } |
250 | 0 | } |
251 | 0 | } |
252 | 18.3M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) { |
253 | 3.76M | const float mul = 255.0; |
254 | 3.76M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); |
255 | 3.76M | memcpy(output, scratch_space, len * num_channels); |
256 | 14.6M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) { |
257 | 7.07M | const float mul = 65535.0; |
258 | 7.07M | uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space); |
259 | 7.07M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp); |
260 | 7.07M | if (m->swap_endianness_) { |
261 | 4.01M | const HWY_CAPPED(uint16_t, 8) du; |
262 | 4.01M | size_t output_len = len * num_channels; |
263 | 35.3M | for (size_t j = 0; j < output_len; j += Lanes(du)) { |
264 | 31.3M | auto v = LoadU(du, tmp + j); |
265 | 31.3M | auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); |
266 | 31.3M | StoreU(vswap, du, tmp + j); |
267 | 31.3M | } |
268 | 4.01M | } |
269 | 7.07M | memcpy(output, tmp, len * num_channels * 2); |
270 | 7.53M | } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) { |
271 | 7.53M | float* tmp = reinterpret_cast<float*>(scratch_space); |
272 | 7.53M | StoreFloatRow(rows, xoffset, len, num_channels, tmp); |
273 | 7.53M | if (m->swap_endianness_) { |
274 | 3.19M | size_t output_len = len * num_channels; |
275 | 179M | for (size_t j = 0; j < output_len; ++j) { |
276 | 176M | tmp[j] = BSwapFloat(tmp[j]); |
277 | 176M | } |
278 | 3.19M | } |
279 | 7.53M | memcpy(output, tmp, len * num_channels * 4); |
280 | 7.53M | } |
281 | 18.3M | } jpegli::N_SSE4::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*) Line | Count | Source | 202 | 4.41M | uint8_t* JXL_RESTRICT output) { | 203 | 4.41M | jpeg_decomp_master* m = cinfo->master; | 204 | 4.41M | uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_; | 205 | 4.41M | if (cinfo->quantize_colors && m->quant_pass_ == 1) { | 206 | 0 | float* error_row[kMaxComponents]; | 207 | 0 | float* next_error_row[kMaxComponents]; | 208 | 0 | J_DITHER_MODE dither_mode = cinfo->dither_mode; | 209 | 0 | if (dither_mode == JDITHER_ORDERED) { | 210 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 211 | 0 | DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline, | 212 | 0 | cinfo->output_width); | 213 | 0 | } | 214 | 0 | } else if (dither_mode == JDITHER_FS) { | 215 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 216 | 0 | if (cinfo->output_scanline % 2 == 0) { | 217 | 0 | error_row[c] = m->error_row_[c]; | 218 | 0 | next_error_row[c] = m->error_row_[c + kMaxComponents]; | 219 | 0 | } else { | 220 | 0 | error_row[c] = m->error_row_[c + kMaxComponents]; | 221 | 0 | next_error_row[c] = m->error_row_[c]; | 222 | 0 | } | 223 | 0 | memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float)); | 224 | 0 | } | 225 | 0 | } | 226 | 0 | const float mul = 255.0f; | 227 | 0 | if (dither_mode != JDITHER_FS) { | 228 | 0 | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 229 | 0 | } | 230 | 0 | for (size_t i = 0; i < len; ++i) { | 231 | 0 | uint8_t* pixel = &scratch_space[num_channels * i]; | 232 | 0 | if (dither_mode == JDITHER_FS) { | 233 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 234 | 0 | float val = rows[c][i] * mul + LimitError(error_row[c][i]); | 235 | 0 | pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val))); | 236 | 0 | } | 237 | 0 | } | 238 | 0 | int index = LookupColorIndex(cinfo, pixel); | 239 | 0 | output[i] = index; | 240 | 0 | if 
(dither_mode == JDITHER_FS) { | 241 | 0 | size_t prev_i = i > 0 ? i - 1 : 0; | 242 | 0 | size_t next_i = i + 1 < len ? i + 1 : len - 1; | 243 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 244 | 0 | float error = pixel[c] - cinfo->colormap[c][index]; | 245 | 0 | error_row[c][next_i] += kFSWeightMR * error; | 246 | 0 | next_error_row[c][prev_i] += kFSWeightBL * error; | 247 | 0 | next_error_row[c][i] += kFSWeightBM * error; | 248 | 0 | next_error_row[c][next_i] += kFSWeightBR * error; | 249 | 0 | } | 250 | 0 | } | 251 | 0 | } | 252 | 4.41M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) { | 253 | 1.10M | const float mul = 255.0; | 254 | 1.10M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 255 | 1.10M | memcpy(output, scratch_space, len * num_channels); | 256 | 3.31M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) { | 257 | 1.60M | const float mul = 65535.0; | 258 | 1.60M | uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space); | 259 | 1.60M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp); | 260 | 1.60M | if (m->swap_endianness_) { | 261 | 1.08M | const HWY_CAPPED(uint16_t, 8) du; | 262 | 1.08M | size_t output_len = len * num_channels; | 263 | 10.1M | for (size_t j = 0; j < output_len; j += Lanes(du)) { | 264 | 9.03M | auto v = LoadU(du, tmp + j); | 265 | 9.03M | auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); | 266 | 9.03M | StoreU(vswap, du, tmp + j); | 267 | 9.03M | } | 268 | 1.08M | } | 269 | 1.60M | memcpy(output, tmp, len * num_channels * 2); | 270 | 1.71M | } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) { | 271 | 1.71M | float* tmp = reinterpret_cast<float*>(scratch_space); | 272 | 1.71M | StoreFloatRow(rows, xoffset, len, num_channels, tmp); | 273 | 1.71M | if (m->swap_endianness_) { | 274 | 973k | size_t output_len = len * num_channels; | 275 | 48.4M | for (size_t j = 0; j < output_len; ++j) { | 276 | 47.4M | tmp[j] = BSwapFloat(tmp[j]); | 277 | 47.4M | } | 278 | 973k | 
} | 279 | 1.71M | memcpy(output, tmp, len * num_channels * 4); | 280 | 1.71M | } | 281 | 4.41M | } |
jpegli::N_AVX2::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*) Line | Count | Source | 202 | 8.54M | uint8_t* JXL_RESTRICT output) { | 203 | 8.54M | jpeg_decomp_master* m = cinfo->master; | 204 | 8.54M | uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_; | 205 | 8.54M | if (cinfo->quantize_colors && m->quant_pass_ == 1) { | 206 | 0 | float* error_row[kMaxComponents]; | 207 | 0 | float* next_error_row[kMaxComponents]; | 208 | 0 | J_DITHER_MODE dither_mode = cinfo->dither_mode; | 209 | 0 | if (dither_mode == JDITHER_ORDERED) { | 210 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 211 | 0 | DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline, | 212 | 0 | cinfo->output_width); | 213 | 0 | } | 214 | 0 | } else if (dither_mode == JDITHER_FS) { | 215 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 216 | 0 | if (cinfo->output_scanline % 2 == 0) { | 217 | 0 | error_row[c] = m->error_row_[c]; | 218 | 0 | next_error_row[c] = m->error_row_[c + kMaxComponents]; | 219 | 0 | } else { | 220 | 0 | error_row[c] = m->error_row_[c + kMaxComponents]; | 221 | 0 | next_error_row[c] = m->error_row_[c]; | 222 | 0 | } | 223 | 0 | memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float)); | 224 | 0 | } | 225 | 0 | } | 226 | 0 | const float mul = 255.0f; | 227 | 0 | if (dither_mode != JDITHER_FS) { | 228 | 0 | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 229 | 0 | } | 230 | 0 | for (size_t i = 0; i < len; ++i) { | 231 | 0 | uint8_t* pixel = &scratch_space[num_channels * i]; | 232 | 0 | if (dither_mode == JDITHER_FS) { | 233 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 234 | 0 | float val = rows[c][i] * mul + LimitError(error_row[c][i]); | 235 | 0 | pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val))); | 236 | 0 | } | 237 | 0 | } | 238 | 0 | int index = LookupColorIndex(cinfo, pixel); | 239 | 0 | output[i] = index; | 240 | 0 | if (dither_mode == 
JDITHER_FS) { | 241 | 0 | size_t prev_i = i > 0 ? i - 1 : 0; | 242 | 0 | size_t next_i = i + 1 < len ? i + 1 : len - 1; | 243 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 244 | 0 | float error = pixel[c] - cinfo->colormap[c][index]; | 245 | 0 | error_row[c][next_i] += kFSWeightMR * error; | 246 | 0 | next_error_row[c][prev_i] += kFSWeightBL * error; | 247 | 0 | next_error_row[c][i] += kFSWeightBM * error; | 248 | 0 | next_error_row[c][next_i] += kFSWeightBR * error; | 249 | 0 | } | 250 | 0 | } | 251 | 0 | } | 252 | 8.54M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) { | 253 | 1.68M | const float mul = 255.0; | 254 | 1.68M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 255 | 1.68M | memcpy(output, scratch_space, len * num_channels); | 256 | 6.86M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) { | 257 | 3.50M | const float mul = 65535.0; | 258 | 3.50M | uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space); | 259 | 3.50M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp); | 260 | 3.50M | if (m->swap_endianness_) { | 261 | 1.87M | const HWY_CAPPED(uint16_t, 8) du; | 262 | 1.87M | size_t output_len = len * num_channels; | 263 | 14.0M | for (size_t j = 0; j < output_len; j += Lanes(du)) { | 264 | 12.1M | auto v = LoadU(du, tmp + j); | 265 | 12.1M | auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); | 266 | 12.1M | StoreU(vswap, du, tmp + j); | 267 | 12.1M | } | 268 | 1.87M | } | 269 | 3.50M | memcpy(output, tmp, len * num_channels * 2); | 270 | 3.50M | } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) { | 271 | 3.35M | float* tmp = reinterpret_cast<float*>(scratch_space); | 272 | 3.35M | StoreFloatRow(rows, xoffset, len, num_channels, tmp); | 273 | 3.35M | if (m->swap_endianness_) { | 274 | 934k | size_t output_len = len * num_channels; | 275 | 59.0M | for (size_t j = 0; j < output_len; ++j) { | 276 | 58.1M | tmp[j] = BSwapFloat(tmp[j]); | 277 | 58.1M | } | 278 | 934k | } | 279 | 3.35M 
| memcpy(output, tmp, len * num_channels * 4); | 280 | 3.35M | } | 281 | 8.54M | } |
jpegli::N_SSE2::WriteToOutput(jpeg_decompress_struct*, float* restrict*, unsigned long, unsigned long, unsigned long, unsigned char*) Line | Count | Source | 202 | 5.41M | uint8_t* JXL_RESTRICT output) { | 203 | 5.41M | jpeg_decomp_master* m = cinfo->master; | 204 | 5.41M | uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_; | 205 | 5.41M | if (cinfo->quantize_colors && m->quant_pass_ == 1) { | 206 | 0 | float* error_row[kMaxComponents]; | 207 | 0 | float* next_error_row[kMaxComponents]; | 208 | 0 | J_DITHER_MODE dither_mode = cinfo->dither_mode; | 209 | 0 | if (dither_mode == JDITHER_ORDERED) { | 210 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 211 | 0 | DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline, | 212 | 0 | cinfo->output_width); | 213 | 0 | } | 214 | 0 | } else if (dither_mode == JDITHER_FS) { | 215 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 216 | 0 | if (cinfo->output_scanline % 2 == 0) { | 217 | 0 | error_row[c] = m->error_row_[c]; | 218 | 0 | next_error_row[c] = m->error_row_[c + kMaxComponents]; | 219 | 0 | } else { | 220 | 0 | error_row[c] = m->error_row_[c + kMaxComponents]; | 221 | 0 | next_error_row[c] = m->error_row_[c]; | 222 | 0 | } | 223 | 0 | memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float)); | 224 | 0 | } | 225 | 0 | } | 226 | 0 | const float mul = 255.0f; | 227 | 0 | if (dither_mode != JDITHER_FS) { | 228 | 0 | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 229 | 0 | } | 230 | 0 | for (size_t i = 0; i < len; ++i) { | 231 | 0 | uint8_t* pixel = &scratch_space[num_channels * i]; | 232 | 0 | if (dither_mode == JDITHER_FS) { | 233 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 234 | 0 | float val = rows[c][i] * mul + LimitError(error_row[c][i]); | 235 | 0 | pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val))); | 236 | 0 | } | 237 | 0 | } | 238 | 0 | int index = LookupColorIndex(cinfo, pixel); | 239 | 0 | output[i] = index; | 240 | 0 | if (dither_mode == 
JDITHER_FS) { | 241 | 0 | size_t prev_i = i > 0 ? i - 1 : 0; | 242 | 0 | size_t next_i = i + 1 < len ? i + 1 : len - 1; | 243 | 0 | for (size_t c = 0; c < num_channels; ++c) { | 244 | 0 | float error = pixel[c] - cinfo->colormap[c][index]; | 245 | 0 | error_row[c][next_i] += kFSWeightMR * error; | 246 | 0 | next_error_row[c][prev_i] += kFSWeightBL * error; | 247 | 0 | next_error_row[c][i] += kFSWeightBM * error; | 248 | 0 | next_error_row[c][next_i] += kFSWeightBR * error; | 249 | 0 | } | 250 | 0 | } | 251 | 0 | } | 252 | 5.41M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) { | 253 | 979k | const float mul = 255.0; | 254 | 979k | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space); | 255 | 979k | memcpy(output, scratch_space, len * num_channels); | 256 | 4.43M | } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) { | 257 | 1.95M | const float mul = 65535.0; | 258 | 1.95M | uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space); | 259 | 1.95M | StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp); | 260 | 1.95M | if (m->swap_endianness_) { | 261 | 1.05M | const HWY_CAPPED(uint16_t, 8) du; | 262 | 1.05M | size_t output_len = len * num_channels; | 263 | 11.1M | for (size_t j = 0; j < output_len; j += Lanes(du)) { | 264 | 10.1M | auto v = LoadU(du, tmp + j); | 265 | 10.1M | auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8)); | 266 | 10.1M | StoreU(vswap, du, tmp + j); | 267 | 10.1M | } | 268 | 1.05M | } | 269 | 1.95M | memcpy(output, tmp, len * num_channels * 2); | 270 | 2.47M | } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) { | 271 | 2.47M | float* tmp = reinterpret_cast<float*>(scratch_space); | 272 | 2.47M | StoreFloatRow(rows, xoffset, len, num_channels, tmp); | 273 | 2.47M | if (m->swap_endianness_) { | 274 | 1.28M | size_t output_len = len * num_channels; | 275 | 71.8M | for (size_t j = 0; j < output_len; ++j) { | 276 | 70.5M | tmp[j] = BSwapFloat(tmp[j]); | 277 | 70.5M | } | 278 | 1.28M | } | 279 | 2.47M 
| memcpy(output, tmp, len * num_channels * 4); | 280 | 2.47M | } | 281 | 5.41M | } |
|
282 | | |
283 | | // NOLINTNEXTLINE(google-readability-namespace-comments) |
284 | | } // namespace HWY_NAMESPACE |
285 | | } // namespace jpegli |
286 | | HWY_AFTER_NAMESPACE(); |
287 | | |
288 | | #if HWY_ONCE |
289 | | |
290 | | namespace jpegli { |
291 | | |
292 | | HWY_EXPORT(GatherBlockStats); |
293 | | HWY_EXPORT(WriteToOutput); |
294 | | HWY_EXPORT(DecenterRow); |
295 | | |
296 | | void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs, |
297 | | const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros, |
298 | 2.15M | int32_t* JXL_RESTRICT sumabs) { |
299 | 2.15M | HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros, sumabs); |
300 | 2.15M | } |
301 | | |
302 | | void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[], |
303 | | size_t xoffset, size_t len, size_t num_channels, |
304 | 18.3M | uint8_t* JXL_RESTRICT output) { |
305 | 18.3M | HWY_DYNAMIC_DISPATCH(WriteToOutput) |
306 | 18.3M | (cinfo, rows, xoffset, len, num_channels, output); |
307 | 18.3M | } |
308 | | |
309 | 29.9M | void DecenterRow(float* row, size_t xsize) { |
310 | 29.9M | HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize); |
311 | 29.9M | } |
312 | | |
313 | 2.11M | bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) { |
314 | 2.11M | const auto& compinfo = cinfo->comp_info[ci]; |
315 | 2.11M | return (compinfo.h_samp_factor == cinfo->max_h_samp_factor && |
316 | 1.69M | compinfo.v_samp_factor == cinfo->max_v_samp_factor); |
317 | 2.11M | } |
318 | | |
319 | | // See the following article for the details: |
320 | | // J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression" |
321 | | // Proceedings International Conference on Information Technology: Coding and |
322 | | // Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179. |
323 | | void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros, |
324 | 383k | const int* sumabs, float* biases) { |
325 | 24.5M | for (size_t k = 1; k < DCTSIZE2; ++k) { |
326 | 24.1M | if (nonzeros[k] == 0) { |
327 | 23.8M | biases[k] = 0.5f; |
328 | 23.8M | continue; |
329 | 23.8M | } |
330 | | // Notation adapted from the article |
331 | 253k | float N = num_blocks; |
332 | 253k | float N1 = nonzeros[k]; |
333 | 253k | float N0 = num_blocks - N1; |
334 | 253k | float S = sumabs[k]; |
335 | | // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just |
336 | | // temporary grouping of terms. |
337 | 253k | float A = 4.0 * S + 2.0 * N; |
338 | 253k | float B = 4.0 * S - 2.0 * N1; |
339 | 253k | float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A; |
340 | 253k | float gamma2 = gamma * gamma; |
341 | | // The bias is computed from gamma with (eq. 5), where the quantization |
342 | | // multiplier Q can be factored out and thus the bias can be applied |
343 | | // directly on the quantized coefficient. |
344 | 253k | biases[k] = |
345 | 253k | 0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma)); |
346 | 253k | } |
347 | 383k | } |
348 | | |
349 | | constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8, 16, 9, |
350 | | 2, 3, 10, 17, 24}; |
351 | | |
352 | 2.27k | bool is_nonzero_quantizers(const JQUANT_TBL* qtable) { |
353 | 2.27k | return std::all_of(Q_POS.begin(), Q_POS.end(), |
354 | 22.7k | [&](int pos) { return qtable->quantval[pos] != 0; }); |
355 | 2.27k | } |
356 | | |
357 | | // Determine whether smoothing should be applied during decompression |
358 | 2.56k | bool do_smoothing(j_decompress_ptr cinfo) { |
359 | 2.56k | jpeg_decomp_master* m = cinfo->master; |
360 | 2.56k | bool smoothing_useful = false; |
361 | | |
362 | 2.56k | if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) { |
363 | 654 | return false; |
364 | 654 | } |
365 | 1.90k | auto* coef_bits_latch = m->coef_bits_latch; |
366 | 1.90k | auto* prev_coef_bits_latch = m->prev_coef_bits_latch; |
367 | | |
368 | 3.45k | for (int ci = 0; ci < cinfo->num_components; ci++) { |
369 | 2.40k | jpeg_component_info* compptr = &cinfo->comp_info[ci]; |
370 | 2.40k | JQUANT_TBL* qtable = compptr->quant_table; |
371 | 2.40k | int* coef_bits = cinfo->coef_bits[ci]; |
372 | 2.40k | int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components]; |
373 | | |
374 | | // Return early if conditions for smoothing are not met |
375 | 2.40k | if (qtable == nullptr || !is_nonzero_quantizers(qtable) || |
376 | 2.27k | coef_bits[0] < 0) { |
377 | 856 | return false; |
378 | 856 | } |
379 | | |
380 | 1.55k | coef_bits_latch[ci][0] = coef_bits[0]; |
381 | | |
382 | 15.5k | for (int coefi = 1; coefi < SAVED_COEFS; coefi++) { |
383 | 13.9k | prev_coef_bits_latch[ci][coefi] = |
384 | 13.9k | cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1; |
385 | 13.9k | if (coef_bits[coefi] != 0) { |
386 | 11.2k | smoothing_useful = true; |
387 | 11.2k | } |
388 | 13.9k | coef_bits_latch[ci][coefi] = coef_bits[coefi]; |
389 | 13.9k | } |
390 | 1.55k | } |
391 | | |
392 | 1.05k | return smoothing_useful; |
393 | 1.90k | } |
394 | | |
395 | | void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component, |
396 | 7.48M | size_t bx, int iy) { |
397 | 7.48M | const size_t imcu_row = cinfo->output_iMCU_row; |
398 | 7.48M | int16_t* scratch = cinfo->master->smoothing_scratch_; |
399 | 7.48M | std::vector<int> Q_VAL(SAVED_COEFS); |
400 | 7.48M | int* coef_bits; |
401 | | |
402 | 7.48M | std::array<std::array<int, 5>, 5> dc_values; |
403 | 7.48M | auto& compinfo = cinfo->comp_info[component]; |
404 | 7.48M | const size_t by0 = imcu_row * compinfo.v_samp_factor; |
405 | 7.48M | const size_t by = by0 + iy; |
406 | | |
407 | 7.48M | int prev_iy = by > 0 ? iy - 1 : 0; |
408 | 7.48M | int prev_prev_iy = by > 1 ? iy - 2 : prev_iy; |
409 | 7.48M | int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy; |
410 | 7.48M | int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy; |
411 | | |
412 | 7.48M | const int16_t* cur_row = blocks[iy][bx]; |
413 | 7.48M | const int16_t* prev_row = blocks[prev_iy][bx]; |
414 | 7.48M | const int16_t* prev_prev_row = blocks[prev_prev_iy][bx]; |
415 | 7.48M | const int16_t* next_row = blocks[next_iy][bx]; |
416 | 7.48M | const int16_t* next_next_row = blocks[next_next_iy][bx]; |
417 | | |
418 | 7.48M | int prev_block_ind = bx ? -DCTSIZE2 : 0; |
419 | 7.48M | int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind; |
420 | 7.48M | int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0; |
421 | 7.48M | int next_next_block_ind = |
422 | 7.48M | bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind; |
423 | | |
424 | 7.48M | std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row, |
425 | 7.48M | next_row, next_next_row}; |
426 | 7.48M | std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0, |
427 | 7.48M | next_block_ind, next_next_block_ind}; |
428 | | |
429 | 7.48M | memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0])); |
430 | | |
431 | 44.9M | for (int r = 0; r < 5; ++r) { |
432 | 224M | for (int c = 0; c < 5; ++c) { |
433 | 187M | dc_values[r][c] = row_ptrs[r][block_inds[c]]; |
434 | 187M | } |
435 | 37.4M | } |
436 | | // Get the correct coef_bits: In case of an incomplete scan, we use the |
437 | | // prev coefficients. |
438 | 7.48M | if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) { |
439 | 7.38M | coef_bits = cinfo->master->prev_coef_bits_latch[component]; |
440 | 7.38M | } else { |
441 | 102k | coef_bits = cinfo->master->coef_bits_latch[component]; |
442 | 102k | } |
443 | | |
444 | 7.48M | bool change_dc = true; |
445 | 70.6M | for (int i = 1; i < SAVED_COEFS; i++) { |
446 | 63.7M | if (coef_bits[i] != -1) { |
447 | 586k | change_dc = false; |
448 | 586k | break; |
449 | 586k | } |
450 | 63.7M | } |
451 | | |
452 | 7.48M | JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no]; |
453 | 52.3M | for (size_t i = 0; i < 6; ++i) { |
454 | 44.9M | Q_VAL[i] = quanttbl->quantval[Q_POS[i]]; |
455 | 44.9M | } |
456 | 7.48M | if (change_dc) { |
457 | 34.4M | for (size_t i = 6; i < SAVED_COEFS; ++i) { |
458 | 27.5M | Q_VAL[i] = quanttbl->quantval[Q_POS[i]]; |
459 | 27.5M | } |
460 | 6.89M | } |
461 | 71.3M | auto calculate_dct_value = [&](int coef_index) { |
462 | 71.3M | int64_t num = 0; |
463 | 71.3M | int pred; |
464 | 71.3M | int Al; |
465 | | // we use the symmetry of the smoothing matrices by transposing the 5x5 dc |
466 | | // matrix in that case. |
467 | 71.3M | bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 || |
468 | 49.7M | coef_index == 9; |
469 | 834M | auto dc = [&](int i, int j) { |
470 | 834M | return swap_indices ? dc_values[j][i] : dc_values[i][j]; |
471 | 834M | }; |
472 | 71.3M | JPEGLI_CHECK(coef_index >= 0 && coef_index < 10); |
473 | 71.3M | Al = coef_bits[coef_index]; |
474 | 71.3M | switch (coef_index) { |
475 | 6.89M | case 0: |
476 | | // set the DC |
477 | 6.89M | num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) - |
478 | 6.89M | 2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) + |
479 | 6.89M | 6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) + |
480 | 6.89M | 152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) + |
481 | 6.89M | 6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) - |
482 | 6.89M | 2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) - |
483 | 6.89M | 2 * dc(4, 4)); |
484 | | // special case: for the DC the dequantization is different |
485 | 6.89M | Al = 0; |
486 | 6.89M | break; |
487 | 7.36M | case 1: |
488 | 14.7M | case 2: |
489 | | // set Q01 or Q10 |
490 | 14.7M | num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) - |
491 | 13.7M | 3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) + |
492 | 13.7M | 3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) - |
493 | 13.7M | 38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) + |
494 | 13.7M | 13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) - |
495 | 13.7M | dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4)) |
496 | 14.7M | : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) + |
497 | 990k | 7 * dc(2, 4))); |
498 | 14.7M | break; |
499 | 7.43M | case 3: |
500 | 14.8M | case 5: |
501 | | // set Q02 or Q20 |
502 | 14.8M | num = (change_dc |
503 | 14.8M | ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) - |
504 | 13.7M | 5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) + |
505 | 13.7M | 2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2) |
506 | 14.8M | : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) + |
507 | 1.10M | 13 * dc(3, 2) - dc(4, 2))); |
508 | 14.8M | break; |
509 | 7.43M | case 4: |
510 | | // set Q11 |
511 | 7.43M | num = |
512 | 7.43M | (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) - |
513 | 6.87M | 9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4) |
514 | 7.43M | : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) - |
515 | 563k | dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) - |
516 | 563k | dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3))); |
517 | 7.43M | break; |
518 | 6.87M | case 6: |
519 | 13.7M | case 9: |
520 | | // set Q03 or Q30 |
521 | 13.7M | num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) - |
522 | 13.7M | dc(3, 3)); |
523 | 13.7M | break; |
524 | 6.87M | case 7: |
525 | 13.7M | case 8: |
526 | 13.7M | default: |
527 | | // set Q12 and Q21 |
528 | 13.7M | num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) - |
529 | 13.7M | dc(3, 3)); |
530 | 13.7M | break; |
531 | 71.3M | } |
532 | 71.3M | num = Q_VAL[0] * num; |
533 | 71.3M | if (num >= 0) { |
534 | 70.7M | pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8); |
535 | 70.7M | if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1; |
536 | 70.7M | } else { |
537 | 599k | pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8); |
538 | 599k | if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1; |
539 | 599k | pred = -pred; |
540 | 599k | } |
541 | 71.3M | return static_cast<int16_t>(pred); |
542 | 71.3M | }; |
543 | | |
544 | 7.48M | int loop_end = change_dc ? SAVED_COEFS : 6; |
545 | 72.5M | for (int i = 1; i < loop_end; ++i) { |
546 | 65.0M | if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) { |
547 | 64.4M | scratch[Q_POS[i]] = calculate_dct_value(i); |
548 | 64.4M | } |
549 | 65.0M | } |
550 | 7.48M | if (change_dc) { |
551 | 6.89M | scratch[0] = calculate_dct_value(0); |
552 | 6.89M | } |
553 | 7.48M | } |
554 | | |
555 | 2.56k | void PrepareForOutput(j_decompress_ptr cinfo) { |
556 | 2.56k | jpeg_decomp_master* m = cinfo->master; |
557 | 2.56k | bool smoothing = do_smoothing(cinfo); |
558 | 2.56k | m->apply_smoothing = smoothing && FROM_JXL_BOOL(cinfo->do_block_smoothing); |
559 | 2.56k | size_t coeffs_per_block = cinfo->num_components * DCTSIZE2; |
560 | 2.56k | memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0])); |
561 | 2.56k | memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0])); |
562 | 2.56k | memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_)); |
563 | 2.56k | memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0])); |
564 | 2.56k | cinfo->output_iMCU_row = 0; |
565 | 2.56k | cinfo->output_scanline = 0; |
566 | 2.56k | const float kDequantScale = 1.0f / (8 * 255); |
567 | 7.45k | for (int c = 0; c < cinfo->num_components; c++) { |
568 | 4.89k | const auto& comp = cinfo->comp_info[c]; |
569 | 4.89k | JQUANT_TBL* table = comp.quant_table; |
570 | 4.89k | if (table == nullptr) continue; |
571 | 240k | for (size_t k = 0; k < DCTSIZE2; ++k) { |
572 | 237k | m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale; |
573 | 237k | } |
574 | 3.70k | } |
575 | 2.56k | JPEGLI_CHECK(ChooseInverseTransform(cinfo)); |
576 | 2.56k | ChooseColorTransform(cinfo); |
577 | 2.56k | } |
578 | | |
579 | 1.54M | void DecodeCurrentiMCURow(j_decompress_ptr cinfo) { |
580 | 1.54M | jpeg_decomp_master* m = cinfo->master; |
581 | 1.54M | const size_t imcu_row = cinfo->output_iMCU_row; |
582 | 1.54M | JBLOCKARRAY blocks[kMaxComponents]; |
583 | 3.65M | for (int c = 0; c < cinfo->num_components; ++c) { |
584 | 2.11M | const jpeg_component_info* comp = &cinfo->comp_info[c]; |
585 | 2.11M | int by0 = imcu_row * comp->v_samp_factor; |
586 | 2.11M | int block_rows_left = comp->height_in_blocks - by0; |
587 | 2.11M | int max_block_rows = std::min(comp->v_samp_factor, block_rows_left); |
588 | 2.11M | int offset = m->streaming_mode_ ? 0 : by0; |
589 | 2.11M | blocks[c] = (*cinfo->mem->access_virt_barray)( |
590 | 2.11M | reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset, |
591 | 2.11M | max_block_rows, FALSE); |
592 | 2.11M | } |
593 | 3.65M | for (int c = 0; c < cinfo->num_components; ++c) { |
594 | 2.11M | size_t k0 = c * DCTSIZE2; |
595 | 2.11M | auto& compinfo = cinfo->comp_info[c]; |
596 | 2.11M | size_t block_row = imcu_row * compinfo.v_samp_factor; |
597 | 2.11M | if (ShouldApplyDequantBiases(cinfo, c)) { |
598 | | // Update statistics for this iMCU row. |
599 | 3.69M | for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) { |
600 | 2.16M | size_t by = block_row + iy; |
601 | 2.16M | if (by >= compinfo.height_in_blocks) { |
602 | 1.46k | continue; |
603 | 1.46k | } |
604 | 2.15M | int16_t* JXL_RESTRICT coeffs = &blocks[c][iy][0][0]; |
605 | 2.15M | size_t num = compinfo.width_in_blocks * DCTSIZE2; |
606 | 2.15M | GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]); |
607 | 2.15M | m->num_processed_blocks_[c] += compinfo.width_in_blocks; |
608 | 2.15M | } |
609 | 1.53M | if (imcu_row % 4 == 3) { |
610 | | // Re-compute optimal biases every few iMCU-rows. |
611 | 383k | ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c], |
612 | 383k | &m->nonzeros_[k0], &m->sumabs_[k0], |
613 | 383k | &m->biases_[k0]); |
614 | 383k | } |
615 | 1.53M | } |
616 | 2.11M | RowBuffer<float>* raw_out = &m->raw_output_[c]; |
617 | 5.03M | for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) { |
618 | 2.92M | size_t by = block_row + iy; |
619 | 2.92M | if (by >= compinfo.height_in_blocks) { |
620 | 2.16k | continue; |
621 | 2.16k | } |
622 | 2.91M | size_t dctsize = m->scaled_dct_size[c]; |
623 | 2.91M | int16_t* JXL_RESTRICT row_in = &blocks[c][iy][0][0]; |
624 | 2.91M | float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize); |
625 | 32.4M | for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) { |
626 | 29.5M | if (m->apply_smoothing) { |
627 | 7.48M | PredictSmooth(cinfo, blocks[c], c, bx, iy); |
628 | 7.48M | (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0], |
629 | 7.48M | &m->biases_[k0], m->idct_scratch_, |
630 | 7.48M | &row_out[bx * dctsize], raw_out->stride(), |
631 | 7.48M | dctsize); |
632 | 22.0M | } else { |
633 | 22.0M | (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0], |
634 | 22.0M | &m->biases_[k0], m->idct_scratch_, |
635 | 22.0M | &row_out[bx * dctsize], raw_out->stride(), |
636 | 22.0M | dctsize); |
637 | 22.0M | } |
638 | 29.5M | } |
639 | 2.91M | if (m->streaming_mode_) { |
640 | 373k | memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK)); |
641 | 373k | } |
642 | 2.91M | } |
643 | 2.11M | } |
644 | 1.54M | } |
645 | | |
646 | 0 | void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) { |
647 | 0 | jpegli::DecodeCurrentiMCURow(cinfo); |
648 | 0 | jpeg_decomp_master* m = cinfo->master; |
649 | 0 | for (int c = 0; c < cinfo->num_components; ++c) { |
650 | 0 | const auto& compinfo = cinfo->comp_info[c]; |
651 | 0 | size_t comp_width = compinfo.width_in_blocks * DCTSIZE; |
652 | 0 | size_t comp_height = compinfo.height_in_blocks * DCTSIZE; |
653 | 0 | size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE; |
654 | 0 | size_t y0 = static_cast<size_t>(cinfo->output_iMCU_row) * |
655 | 0 | compinfo.v_samp_factor * DCTSIZE; |
656 | 0 | size_t y1 = std::min(y0 + comp_nrows, comp_height); |
657 | 0 | for (size_t y = y0; y < y1; ++y) { |
658 | 0 | float* rows[1] = {m->raw_output_[c].Row(y)}; |
659 | 0 | uint8_t* output = data[c][y - y0]; |
660 | 0 | DecenterRow(rows[0], comp_width); |
661 | 0 | WriteToOutput(cinfo, rows, 0, comp_width, 1, output); |
662 | 0 | } |
663 | 0 | } |
664 | 0 | ++cinfo->output_iMCU_row; |
665 | 0 | cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE; |
666 | 0 | if (cinfo->output_scanline >= cinfo->output_height) { |
667 | 0 | ++m->output_passes_done_; |
668 | 0 | } |
669 | 0 | } |
670 | | |
671 | | void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows, |
672 | 19.9M | JSAMPARRAY scanlines, size_t max_output_rows) { |
673 | 19.9M | jpeg_decomp_master* m = cinfo->master; |
674 | 19.9M | const size_t vfactor = cinfo->max_v_samp_factor; |
675 | 19.9M | const size_t hfactor = cinfo->max_h_samp_factor; |
676 | 19.9M | const size_t context = m->need_context_rows_ ? 1 : 0; |
677 | 19.9M | const size_t imcu_row = cinfo->output_iMCU_row; |
678 | 19.9M | const size_t imcu_height = vfactor * m->min_scaled_dct_size; |
679 | 19.9M | const size_t imcu_width = hfactor * m->min_scaled_dct_size; |
680 | 19.9M | const size_t output_width = m->iMCU_cols_ * imcu_width; |
681 | 19.9M | if (imcu_row == cinfo->total_iMCU_rows || |
682 | 19.8M | (imcu_row > context && |
683 | 19.8M | cinfo->output_scanline < (imcu_row - context) * imcu_height)) { |
684 | | // We are ready to output some scanlines. |
685 | 18.3M | size_t ybegin = cinfo->output_scanline; |
686 | 18.3M | size_t yend = (imcu_row == cinfo->total_iMCU_rows |
687 | 18.3M | ? cinfo->output_height |
688 | 18.3M | : (imcu_row - context) * imcu_height); |
689 | 18.3M | yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows); |
690 | 18.3M | size_t yb = (ybegin / vfactor) * vfactor; |
691 | 18.3M | size_t ye = DivCeil(yend, vfactor) * vfactor; |
692 | 36.7M | for (size_t y = yb; y < ye; y += vfactor) { |
693 | 48.3M | for (int c = 0; c < cinfo->num_components; ++c) { |
694 | 29.9M | RowBuffer<float>* raw_out = &m->raw_output_[c]; |
695 | 29.9M | RowBuffer<float>* render_out = &m->render_output_[c]; |
696 | 29.9M | int line_groups = vfactor / m->v_factor[c]; |
697 | 29.9M | int downsampled_width = output_width / m->h_factor[c]; |
698 | 29.9M | size_t yc = y / m->v_factor[c]; |
699 | 81.4M | for (int dy = 0; dy < line_groups; ++dy) { |
700 | 51.5M | size_t ymid = yc + dy; |
701 | 51.5M | const float* JXL_RESTRICT row_mid = raw_out->Row(ymid); |
702 | 51.5M | if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) { |
703 | 3.72M | const float* JXL_RESTRICT row_top = |
704 | 3.72M | ymid == 0 ? row_mid : raw_out->Row(ymid - 1); |
705 | 3.72M | const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c] |
706 | 3.72M | ? row_mid |
707 | 3.72M | : raw_out->Row(ymid + 1); |
708 | 3.72M | Upsample2Vertical(row_top, row_mid, row_bot, |
709 | 3.72M | render_out->Row(2 * dy), |
710 | 3.72M | render_out->Row(2 * dy + 1), downsampled_width); |
711 | 47.7M | } else { |
712 | 114M | for (int yix = 0; yix < m->v_factor[c]; ++yix) { |
713 | 66.3M | memcpy(render_out->Row(m->v_factor[c] * dy + yix), row_mid, |
714 | 66.3M | downsampled_width * sizeof(float)); |
715 | 66.3M | } |
716 | 47.7M | } |
717 | 51.5M | if (m->h_factor[c] > 1) { |
718 | 39.6M | for (int yix = 0; yix < m->v_factor[c]; ++yix) { |
719 | 25.9M | int row_ix = m->v_factor[c] * dy + yix; |
720 | 25.9M | float* JXL_RESTRICT row = render_out->Row(row_ix); |
721 | 25.9M | float* JXL_RESTRICT tmp = m->upsample_scratch_; |
722 | 25.9M | if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) { |
723 | 10.9M | Upsample2Horizontal(row, tmp, output_width); |
724 | 14.9M | } else { |
725 | | // TODO(szabadka) SIMDify this. |
726 | 910M | for (size_t x = 0; x < output_width; ++x) { |
727 | 895M | tmp[x] = row[x / m->h_factor[c]]; |
728 | 895M | } |
729 | 14.9M | memcpy(row, tmp, output_width * sizeof(tmp[0])); |
730 | 14.9M | } |
731 | 25.9M | } |
732 | 13.6M | } |
733 | 51.5M | } |
734 | 29.9M | } |
735 | 56.8M | for (size_t yix = 0; yix < vfactor; ++yix) { |
736 | 38.4M | if (y + yix < ybegin || y + yix >= yend) continue; |
737 | 18.3M | float* rows[kMaxComponents]; |
738 | 18.3M | int num_all_components = |
739 | 18.3M | std::max(cinfo->out_color_components, cinfo->num_components); |
740 | 48.3M | for (int c = 0; c < num_all_components; ++c) { |
741 | 29.9M | rows[c] = m->render_output_[c].Row(yix); |
742 | 29.9M | } |
743 | 18.3M | (*m->color_transform)(rows, output_width); |
744 | 48.3M | for (int c = 0; c < cinfo->out_color_components; ++c) { |
745 | | // Undo the centering of the sample values around zero. |
746 | 29.9M | DecenterRow(rows[c], output_width); |
747 | 29.9M | } |
748 | 18.3M | if (scanlines) { |
749 | 18.3M | uint8_t* output = scanlines[*num_output_rows]; |
750 | 18.3M | WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width, |
751 | 18.3M | cinfo->out_color_components, output); |
752 | 18.3M | } |
753 | 18.3M | JPEGLI_CHECK(cinfo->output_scanline == y + yix); |
754 | 18.3M | ++cinfo->output_scanline; |
755 | 18.3M | ++(*num_output_rows); |
756 | 18.3M | if (cinfo->output_scanline == cinfo->output_height) { |
757 | 2.51k | ++m->output_passes_done_; |
758 | 2.51k | } |
759 | 18.3M | } |
760 | 18.3M | } |
761 | 18.3M | } else { |
762 | 1.54M | DecodeCurrentiMCURow(cinfo); |
763 | 1.54M | ++cinfo->output_iMCU_row; |
764 | 1.54M | } |
765 | 19.9M | } |
766 | | |
767 | | } // namespace jpegli |
768 | | #endif // HWY_ONCE |