/src/libjxl/lib/jxl/compressed_dc.cc

Source
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/compressed_dc.h"

#include <jxl/memory_manager.h>

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <vector>

#include "lib/jxl/ac_context.h"
#include "lib/jxl/frame_header.h"
#include "lib/jxl/modular/modular_image.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/rect.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/image.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {

// Float vector descriptor spanning a full vector of the target currently
// being compiled; used for the vectorized interior columns.
using D = HWY_FULL(float);
// Float descriptor capped to a single lane; AdaptiveDCSmoothing uses it for
// the columns before and after the full-vector loop.
using DScalar = HWY_CAPPED(float, 1);

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::Div;
using hwy::HWY_NAMESPACE::Max;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;
using hwy::HWY_NAMESPACE::ZeroIfNegative;

// TODO(veluca): optimize constants.
// Weights of the 3x3 smoothing kernel evaluated in ComputePixelChannel:
//   w1 - applied to the sum of the four edge-adjacent neighbours,
//   w2 - applied to the sum of the four diagonal neighbours,
//   w0 - applied to the centre sample; defined so the nine weights sum to 1.
// AdaptiveDCSmoothing checks w1 + w2 < 0.25, which is equivalent to w0 > 0.
const float w1 = 0.20345139757231578f;
const float w2 = 0.0334829185968739f;
const float w0 = 1.0f - 4.0f * (w1 + w2);

// Returns the lane-wise larger of `a` and `b`.
// Normally this is just Max(). When compiling the AVX3 target with
// HWY_COMPILER_CLANG <= 800, an explicit compare-and-select is used instead
// because Max() fails to compile there (see the message quoted below).
template <class V>
V MaxWorkaround(V a, V b) {
#if !((HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800)
  return Max(a, b);
#else
  // Prevents "Do not know how to split the result of this operator" error
  return IfThenElse(a > b, a, b);
#endif
}

// Evaluates the 3x3 smoothing kernel for one channel at column `x`.
//
//   row_top / row / row_bottom  the row above, the current row and the row
//                               below; columns x - 1 .. x + Lanes(d) of each
//                               are read.
//   *mc   receives the unsmoothed samples of `row` starting at `x`.
//   *sm   receives w0 * centre + w1 * (edge neighbours) + w2 * (diagonals).
//   *gap  is updated to max(*gap, |*mc - *sm| / dc_factor), so the caller can
//         accumulate the largest normalized deviation across channels.
//
// The loads at column `x` itself use the aligned Load(); the x - 1 and x + 1
// loads are unaligned.
template <typename D>
JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
                                    const float* JXL_RESTRICT row_top,
                                    const float* JXL_RESTRICT row,
                                    const float* JXL_RESTRICT row_bottom,
                                    Vec<D>* JXL_RESTRICT mc,
                                    Vec<D>* JXL_RESTRICT sm,
                                    Vec<D>* JXL_RESTRICT gap, size_t x) {
  const float* above = row_top + x;
  const float* here = row + x;
  const float* below = row_bottom + x;

  // Centre column of the three rows.
  const auto up = Load(d, above);
  *mc = Load(d, here);
  const auto down = Load(d, below);

  // Left and right neighbours of the current row.
  const auto left = LoadU(d, here - 1);
  const auto right = LoadU(d, here + 1);

  // The four diagonal neighbours.
  const auto up_left = LoadU(d, above - 1);
  const auto up_right = LoadU(d, above + 1);
  const auto down_left = LoadU(d, below - 1);
  const auto down_right = LoadU(d, below + 1);

  const auto diagonals =
      Add(Add(up_left, up_right), Add(down_left, down_right));
  const auto edges = Add(Add(left, right), Add(up, down));

  // Accumulate centre first, then edges, then diagonals.
  const auto centre_term = Mul(*mc, Set(d, w0));
  const auto centre_and_edges = MulAdd(edges, Set(d, w1), centre_term);
  *sm = MulAdd(diagonals, Set(d, w2), centre_and_edges);

  // Deviation of the sample from its smoothed value, in units of dc_factor.
  const auto deviation = Div(Sub(*mc, *sm), Set(d, dc_factor));
  *gap = MaxWorkaround(*gap, Abs(deviation));
}

// Smooths the samples starting at column `x` (one vector of descriptor `D`)
// in all three channels and stores them to out_rows[c] + x.
//
// Every array argument has one entry per channel (indices 0, 1, 2).
//
// A single blend factor is shared by the three channels:
//   gap    = max(0.5, max over channels of |original - smoothed| / dc_factor)
//   factor = max(0, 3 - 4 * gap)
// so factor is 1 while no channel deviates by more than 0.5 and reaches 0
// once some channel deviates by 0.75 or more. The stored value is
//   original + (smoothed - original) * factor.
template <typename D>
JXL_INLINE void ComputePixel(
    const float* JXL_RESTRICT dc_factors,
    const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
    const float* JXL_RESTRICT* JXL_RESTRICT rows,
    const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
    float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
  const D d;
  // Running maximum of the normalized deviation; starts at the 0.5 floor.
  auto max_gap = Set(d, 0.5f);

  auto orig_x = Undefined(d);
  auto smooth_x = Undefined(d);
  ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
                      &orig_x, &smooth_x, &max_gap, x);

  auto orig_y = Undefined(d);
  auto smooth_y = Undefined(d);
  ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
                      &orig_y, &smooth_y, &max_gap, x);

  auto orig_b = Undefined(d);
  auto smooth_b = Undefined(d);
  ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
                      &orig_b, &smooth_b, &max_gap, x);

  // 3 - 4 * gap, clamped below at zero.
  const auto blend =
      ZeroIfNegative(MulAdd(Set(d, -4.0f), max_gap, Set(d, 3.0f)));

  Store(MulAdd(Sub(smooth_x, orig_x), blend, orig_x), d, out_rows[0] + x);
  Store(MulAdd(Sub(smooth_y, orig_y), blend, orig_y), d, out_rows[1] + x);
  Store(MulAdd(Sub(smooth_b, orig_b), blend, orig_b), d, out_rows[2] + x);
}

// Applies the adaptive 3x3 smoothing of ComputePixel to the three planes of
// `dc`, replacing its contents with the smoothed image.
//
//   memory_manager  used to allocate the temporary output image.
//   dc_factors      three per-channel factors forwarded to ComputePixel.
//   dc              image to smooth; left untouched when it has two or fewer
//                   rows or columns.
//   pool            rows 1 .. ysize - 2 are processed via RunOnPool.
//
// Samples on the outer one-pixel border are copied unchanged. Returns an
// error if the allocation or the pool run fails, otherwise true.
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
                           const float* dc_factors, Image3F* dc,
                           ThreadPool* pool) {
  const size_t xsize = dc->xsize();
  const size_t ysize = dc->ysize();
  // Without at least one interior sample there is nothing to smooth.
  if (xsize <= 2 || ysize <= 2) return true;

  // TODO(veluca): use tile-based processing?
  // TODO(veluca): decide if changes to the y channel should be propagated to
  // the x and b channels through color correlation.
  JXL_ENSURE(w1 + w2 < 0.25f);

  JXL_ASSIGN_OR_RETURN(Image3F result,
                       Image3F::Create(memory_manager, xsize, ysize));

  // The per-row worker only handles rows 1 .. ysize - 2, so the first and the
  // last row are copied verbatim here.
  const size_t last_row = ysize - 1;
  const size_t row_bytes = xsize * sizeof(float);
  for (size_t c = 0; c < 3; c++) {
    memcpy(result.PlaneRow(c, 0), dc->PlaneRow(c, 0), row_bytes);
    memcpy(result.PlaneRow(c, last_row), dc->PlaneRow(c, last_row),
           row_bytes);
  }

  auto smooth_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
    const float* JXL_RESTRICT above[3];
    const float* JXL_RESTRICT current[3];
    const float* JXL_RESTRICT below[3];
    float* JXL_RESTRICT out[3];
    const size_t last_col = xsize - 1;
    for (size_t c = 0; c < 3; c++) {
      above[c] = dc->ConstPlaneRow(c, y - 1);
      current[c] = dc->ConstPlaneRow(c, y);
      below[c] = dc->ConstPlaneRow(c, y + 1);
      out[c] = result.PlaneRow(c, y);
      // The first and last column are never smoothed.
      out[c][0] = current[c][0];
      out[c][last_col] = current[c][last_col];
    }

    const size_t N = Lanes(D());
    size_t x = 1;

    // Single-lane steps up to column N, where the full-vector loop takes
    // over. NOTE(review): presumably this makes the aligned loads in
    // ComputePixelChannel valid, assuming rows start vector-aligned - confirm.
    const size_t head_end = std::min(N, last_col);
    while (x < head_end) {
      ComputePixel<DScalar>(dc_factors, above, current, below, out, x);
      ++x;
    }
    // Whole vectors that end before the last column.
    while (x + N <= last_col) {
      ComputePixel<D>(dc_factors, above, current, below, out, x);
      x += N;
    }
    // Remaining interior columns, one lane at a time.
    while (x < last_col) {
      ComputePixel<DScalar>(dc_factors, above, current, below, out, x);
      ++x;
    }
    return true;
  };

  JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
                                smooth_row, "DCSmoothingRow"));
  dc->Swap(result);
  return true;
}

// DC dequantization.
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
               const float* dc_factors, float mul, const float* cfl_factors,
               const YCbCrChromaSubsampling& chroma_subsampling,
               const BlockCtxMap& bctx) {
  const HWY_FULL(float) df;
  const Rebind<pixel_type, HWY_FULL(float)> di;  // assumes pixel_type <= float
  if (chroma_subsampling.Is444()) {
    const auto fac_x = Set(df, dc_factors[0] * mul);
    const auto fac_y = Set(df, dc_factors[1] * mul);
    const auto fac_b = Set(df, dc_factors[2] * mul);
    const auto cfl_fac_x = Set(df, cfl_factors[0]);
    const auto cfl_fac_b = Set(df, cfl_factors[2]);
    for (size_t y = 0; y < r.ysize(); y++) {
      float* dec_row_x = r.PlaneRow(dc, 0, y);
      float* dec_row_y = r.PlaneRow(dc, 1, y);
      float* dec_row_b = r.PlaneRow(dc, 2, y);
      const int32_t* quant_row_x = in.channel[1].plane.Row(y);
      const int32_t* quant_row_y = in.channel[0].plane.Row(y);
      const int32_t* quant_row_b = in.channel[2].plane.Row(y);
      for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
        const auto in_q_x = Load(di, quant_row_x + x);
        const auto in_q_y = Load(di, quant_row_y + x);
        const auto in_q_b = Load(di, quant_row_b + x);
        const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
        const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
        const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
        Store(in_y, df, dec_row_y + x);
        Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
        Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
      }
    }
  } else {
    for (size_t c : {1, 0, 2}) {
      Rect rect(r.x0() >> chroma_subsampling.HShift(c),
                r.y0() >> chroma_subsampling.VShift(c),
                r.xsize() >> chroma_subsampling.HShift(c),
                r.ysize() >> chroma_subsampling.VShift(c));
      const auto fac = Set(df, dc_factors[c] * mul);
      const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
      for (size_t y = 0; y < rect.ysize(); y++) {
        const int32_t* quant_row = ch.plane.Row(y);
        float* row = rect.PlaneRow(dc, c, y);
        for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
          const auto in_q = Load(di, quant_row + x);
          const auto out = Mul(ConvertTo(df, in_q), fac);
          Store(out, df, row + x);
        }
      }
    }
  }
  if (bctx.num_dc_ctxs <= 1) {
    for (size_t y = 0; y < r.ysize(); y++) {
      uint8_t* qdc_row = r.Row(quant_dc, y);
      memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
    }
  } else {
    JXL_DASSERT(r.ysize() == 0 ||
                (r.ysize() - 1) >> chroma_subsampling.VShift(0) <
                    in.channel[1].plane.ysize());
    JXL_DASSERT(r.ysize() == 0 ||
                (r.ysize() - 1) >> chroma_subsampling.VShift(1) <
                    in.channel[0].plane.ysize());
    JXL_DASSERT(r.ysize() == 0 ||
                (r.ysize() - 1) >> chroma_subsampling.VShift(2) <
                    in.channel[2].plane.ysize());
    for (size_t y = 0; y < r.ysize(); y++) {
      uint8_t* qdc_row_val = r.Row(quant_dc, y);
      const int32_t* quant_row_x =
          in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
      const int32_t* quant_row_y =
          in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
      const int32_t* quant_row_b =
          in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
      for (size_t x = 0; x < r.xsize(); x++) {
        int bucket_x = 0;
        int bucket_y = 0;
        int bucket_b = 0;
        for (int t : bctx.dc_thresholds[0]) {
          if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
        }
        for (int t : bctx.dc_thresholds[1]) {
          if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
        }
        for (int t : bctx.dc_thresholds[2]) {
          if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
        }
        int bucket = bucket_x;
        bucket *= bctx.dc_thresholds[2].size() + 1;
        bucket += bucket_b;
        bucket *= bctx.dc_thresholds[1].size() + 1;
        bucket += bucket_y;
        qdc_row_val[x] = bucket;
      }
    }
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace jxl {

// Register the per-target implementations compiled in the HWY_NAMESPACE
// section so that HWY_DYNAMIC_DISPATCH in the wrappers below can select one.
HWY_EXPORT(DequantDC);
HWY_EXPORT(AdaptiveDCSmoothing);
// Target-independent entry point: forwards all arguments unchanged to the
// AdaptiveDCSmoothing implementation chosen by HWY_DYNAMIC_DISPATCH and
// returns that implementation's Status.
Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
                           const float* dc_factors, Image3F* dc,
                           ThreadPool* pool) {
  return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(memory_manager, dc_factors,
                                                   dc, pool);
}

// Target-independent entry point: forwards all arguments unchanged to the
// DequantDC implementation chosen by HWY_DYNAMIC_DISPATCH.
void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
               const float* dc_factors, float mul, const float* cfl_factors,
               const YCbCrChromaSubsampling& chroma_subsampling,
               const BlockCtxMap& bctx) {
  HWY_DYNAMIC_DISPATCH(DequantDC)
  (r, dc, quant_dc, in, dc_factors, mul, cfl_factors, chroma_subsampling, bctx);
}

}  // namespace jxl
#endif  // HWY_ONCE

Coverage Report

Created: 2026-06-14 06:57

Line	Count	Source
1		// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2		//
3		// Use of this source code is governed by a BSD-style
4		// license that can be found in the LICENSE file.
5
6		#include "lib/jxl/compressed_dc.h"
7
8		#include <jxl/memory_manager.h>
9
10		#include <algorithm>
11		#include <cstdint>
12		#include <cstdlib>
13		#include <cstring>
14		#include <vector>
15
16		#include "lib/jxl/ac_context.h"
17		#include "lib/jxl/frame_header.h"
18		#include "lib/jxl/modular/modular_image.h"
19
20		#undef HWY_TARGET_INCLUDE
21		#define HWY_TARGET_INCLUDE "lib/jxl/compressed_dc.cc"
22		#include <hwy/foreach_target.h>
23		#include <hwy/highway.h>
24
25		#include "lib/jxl/base/compiler_specific.h"
26		#include "lib/jxl/base/data_parallel.h"
27		#include "lib/jxl/base/rect.h"
28		#include "lib/jxl/base/status.h"
29		#include "lib/jxl/image.h"
30		HWY_BEFORE_NAMESPACE();
31		namespace jxl {
32		namespace HWY_NAMESPACE {
33
34		using D = HWY_FULL(float);
35		using DScalar = HWY_CAPPED(float, 1);
36
37		// These templates are not found via ADL.
38		using hwy::HWY_NAMESPACE::Abs;
39		using hwy::HWY_NAMESPACE::Add;
40		using hwy::HWY_NAMESPACE::Div;
41		using hwy::HWY_NAMESPACE::Max;
42		using hwy::HWY_NAMESPACE::Mul;
43		using hwy::HWY_NAMESPACE::MulAdd;
44		using hwy::HWY_NAMESPACE::Rebind;
45		using hwy::HWY_NAMESPACE::Sub;
46		using hwy::HWY_NAMESPACE::Vec;
47		using hwy::HWY_NAMESPACE::ZeroIfNegative;
48
49		// TODO(veluca): optimize constants.
50		const float w1 = 0.20345139757231578f;
51		const float w2 = 0.0334829185968739f;
52		const float w0 = 1.0f - 4.0f * (w1 + w2);
53
54		template <class V>
55	72.2k	V MaxWorkaround(V a, V b) {
56		#if (HWY_TARGET == HWY_AVX3) && HWY_COMPILER_CLANG <= 800
57		// Prevents "Do not know how to split the result of this operator" error
58		return IfThenElse(a > b, a, b);
59		#else
60	72.2k	return Max(a, b);
61	72.2k	#endif
62	72.2k	}
63
64		template <typename D>
65		JXL_INLINE void ComputePixelChannel(const D d, const float dc_factor,
66		const float* JXL_RESTRICT row_top,
67		const float* JXL_RESTRICT row,
68		const float* JXL_RESTRICT row_bottom,
69		Vec<D>* JXL_RESTRICT mc,
70		Vec<D>* JXL_RESTRICT sm,
71	76.9k	Vec<D>* JXL_RESTRICT gap, size_t x) {
72	76.9k	const auto tl = LoadU(d, row_top + x - 1);
73	76.9k	const auto tc = Load(d, row_top + x);
74	76.9k	const auto tr = LoadU(d, row_top + x + 1);
75
76	76.9k	const auto ml = LoadU(d, row + x - 1);
77	76.9k	*mc = Load(d, row + x);
78	76.9k	const auto mr = LoadU(d, row + x + 1);
79
80	76.9k	const auto bl = LoadU(d, row_bottom + x - 1);
81	76.9k	const auto bc = Load(d, row_bottom + x);
82	76.9k	const auto br = LoadU(d, row_bottom + x + 1);
83
84	76.9k	const auto w_center = Set(d, w0);
85	76.9k	const auto w_side = Set(d, w1);
86	76.9k	const auto w_corner = Set(d, w2);
87
88	76.9k	const auto corner = Add(Add(tl, tr), Add(bl, br));
89	76.9k	const auto side = Add(Add(ml, mr), Add(tc, bc));
90	76.9k	*sm = MulAdd(corner, w_corner, MulAdd(side, w_side, Mul(*mc, w_center)));
91
92	76.9k	const auto dc_quant = Set(d, dc_factor);
93	76.9k	*gap = MaxWorkaround(*gap, Abs(Div(Sub(*mc, *sm), dc_quant)));
94	76.9k	}
95
96		template <typename D>
97		JXL_INLINE void ComputePixel(
98		const float* JXL_RESTRICT dc_factors,
99		const float* JXL_RESTRICT* JXL_RESTRICT rows_top,
100		const float* JXL_RESTRICT* JXL_RESTRICT rows,
101		const float* JXL_RESTRICT* JXL_RESTRICT rows_bottom,
102	38.2k	float* JXL_RESTRICT* JXL_RESTRICT out_rows, size_t x) {
103	38.2k	const D d;
104	38.2k	auto mc_x = Undefined(d);
105	38.2k	auto mc_y = Undefined(d);
106	38.2k	auto mc_b = Undefined(d);
107	38.2k	auto sm_x = Undefined(d);
108	38.2k	auto sm_y = Undefined(d);
109	38.2k	auto sm_b = Undefined(d);
110	38.2k	auto gap = Set(d, 0.5f);
111	38.2k	ComputePixelChannel(d, dc_factors[0], rows_top[0], rows[0], rows_bottom[0],
112	38.2k	&mc_x, &sm_x, &gap, x);
113	38.2k	ComputePixelChannel(d, dc_factors[1], rows_top[1], rows[1], rows_bottom[1],
114	38.2k	&mc_y, &sm_y, &gap, x);
115	38.2k	ComputePixelChannel(d, dc_factors[2], rows_top[2], rows[2], rows_bottom[2],
116	38.2k	&mc_b, &sm_b, &gap, x);
117	38.2k	auto factor = MulAdd(Set(d, -4.0f), gap, Set(d, 3.0f));
118	38.2k	factor = ZeroIfNegative(factor);
119
120	38.2k	auto out = MulAdd(Sub(sm_x, mc_x), factor, mc_x);
121	38.2k	Store(out, d, out_rows[0] + x);
122	38.2k	out = MulAdd(Sub(sm_y, mc_y), factor, mc_y);
123	38.2k	Store(out, d, out_rows[1] + x);
124	38.2k	out = MulAdd(Sub(sm_b, mc_b), factor, mc_b);
125	38.2k	Store(out, d, out_rows[2] + x);
126	38.2k	}
127
128		Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
129		const float* dc_factors, Image3F* dc,
130	5.20k	ThreadPool* pool) {
131	5.20k	const size_t xsize = dc->xsize();
132	5.20k	const size_t ysize = dc->ysize();
133	5.20k	if (ysize <= 2 || xsize <= 2) return true;
134
135		// TODO(veluca): use tile-based processing?
136		// TODO(veluca): decide if changes to the y channel should be propagated to
137		// the x and b channels through color correlation.
138	42	JXL_ENSURE(w1 + w2 < 0.25f);
139
140	84	JXL_ASSIGN_OR_RETURN(Image3F smoothed,
141	84	Image3F::Create(memory_manager, xsize, ysize));
142		// Fill in borders that the loop below will not. First and last are unused.
143	168	for (size_t c = 0; c < 3; c++) {
144	252	for (size_t y : {static_cast<size_t>(0), ysize - 1}) {
145	252	memcpy(smoothed.PlaneRow(c, y), dc->PlaneRow(c, y),
146	252	xsize * sizeof(float));
147	252	}
148	126	}
149	1.62k	auto process_row = [&](const uint32_t y, size_t /*thread*/) -> Status {
150	1.62k	const float* JXL_RESTRICT rows_top[3]{
151	1.62k	dc->ConstPlaneRow(0, y - 1),
152	1.62k	dc->ConstPlaneRow(1, y - 1),
153	1.62k	dc->ConstPlaneRow(2, y - 1),
154	1.62k	};
155	1.62k	const float* JXL_RESTRICT rows[3] = {
156	1.62k	dc->ConstPlaneRow(0, y),
157	1.62k	dc->ConstPlaneRow(1, y),
158	1.62k	dc->ConstPlaneRow(2, y),
159	1.62k	};
160	1.62k	const float* JXL_RESTRICT rows_bottom[3] = {
161	1.62k	dc->ConstPlaneRow(0, y + 1),
162	1.62k	dc->ConstPlaneRow(1, y + 1),
163	1.62k	dc->ConstPlaneRow(2, y + 1),
164	1.62k	};
165	1.62k	float* JXL_RESTRICT rows_out[3] = {
166	1.62k	smoothed.PlaneRow(0, y),
167	1.62k	smoothed.PlaneRow(1, y),
168	1.62k	smoothed.PlaneRow(2, y),
169	1.62k	};
170	3.21k	for (size_t x : {static_cast<size_t>(0), xsize - 1}) {
171	12.8k	for (size_t c = 0; c < 3; c++) {
172	9.62k	rows_out[c][x] = rows[c][x];
173	9.62k	}
174	3.21k	}
175
176	1.62k	size_t x = 1;
177		// First pixels
178	1.62k	const size_t N = Lanes(D());
179	1.62k	for (; x < std::min(N, xsize - 1); x++) {
180	0	ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
181	0	x);
182	0	}
183		// Full vectors.
184	39.7k	for (; x + N <= xsize - 1; x += N) {
185	38.0k	ComputePixel<D>(dc_factors, rows_top, rows, rows_bottom, rows_out, x);
186	38.0k	}
187		// Last pixels.
188	1.62k	for (; x < xsize - 1; x++) {
189	0	ComputePixel<DScalar>(dc_factors, rows_top, rows, rows_bottom, rows_out,
190	0	x);
191	0	}
192	1.62k	return true;
193	1.62k	};
194	84	JXL_RETURN_IF_ERROR(RunOnPool(pool, 1, ysize - 1, ThreadPool::NoInit,
195	84	process_row, "DCSmoothingRow"));
196	42	dc->Swap(smoothed);
197	42	return true;
198	84	}
199
200		// DC dequantization.
201		void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
202		const float* dc_factors, float mul, const float* cfl_factors,
203		const YCbCrChromaSubsampling& chroma_subsampling,
204	5.90k	const BlockCtxMap& bctx) {
205	5.90k	const HWY_FULL(float) df;
206	5.90k	const Rebind<pixel_type, HWY_FULL(float)> di; // assumes pixel_type <= float
207	5.90k	if (chroma_subsampling.Is444()) {
208	5.90k	const auto fac_x = Set(df, dc_factors[0] * mul);
209	5.90k	const auto fac_y = Set(df, dc_factors[1] * mul);
210	5.90k	const auto fac_b = Set(df, dc_factors[2] * mul);
211	5.90k	const auto cfl_fac_x = Set(df, cfl_factors[0]);
212	5.90k	const auto cfl_fac_b = Set(df, cfl_factors[2]);
213	18.7k	for (size_t y = 0; y < r.ysize(); y++) {
214	12.8k	float* dec_row_x = r.PlaneRow(dc, 0, y);
215	12.8k	float* dec_row_y = r.PlaneRow(dc, 1, y);
216	12.8k	float* dec_row_b = r.PlaneRow(dc, 2, y);
217	12.8k	const int32_t* quant_row_x = in.channel[1].plane.Row(y);
218	12.8k	const int32_t* quant_row_y = in.channel[0].plane.Row(y);
219	12.8k	const int32_t* quant_row_b = in.channel[2].plane.Row(y);
220	142k	for (size_t x = 0; x < r.xsize(); x += Lanes(di)) {
221	129k	const auto in_q_x = Load(di, quant_row_x + x);
222	129k	const auto in_q_y = Load(di, quant_row_y + x);
223	129k	const auto in_q_b = Load(di, quant_row_b + x);
224	129k	const auto in_x = Mul(ConvertTo(df, in_q_x), fac_x);
225	129k	const auto in_y = Mul(ConvertTo(df, in_q_y), fac_y);
226	129k	const auto in_b = Mul(ConvertTo(df, in_q_b), fac_b);
227	129k	Store(in_y, df, dec_row_y + x);
228	129k	Store(MulAdd(in_y, cfl_fac_x, in_x), df, dec_row_x + x);
229	129k	Store(MulAdd(in_y, cfl_fac_b, in_b), df, dec_row_b + x);
230	129k	}
231	12.8k	}
232	5.90k	} else {
233	6	for (size_t c : {1, 0, 2}) {
234	6	Rect rect(r.x0() >> chroma_subsampling.HShift(c),
235	6	r.y0() >> chroma_subsampling.VShift(c),
236	6	r.xsize() >> chroma_subsampling.HShift(c),
237	6	r.ysize() >> chroma_subsampling.VShift(c));
238	6	const auto fac = Set(df, dc_factors[c] * mul);
239	6	const Channel& ch = in.channel[c < 2 ? c ^ 1 : c];
240	16	for (size_t y = 0; y < rect.ysize(); y++) {
241	10	const int32_t* quant_row = ch.plane.Row(y);
242	10	float* row = rect.PlaneRow(dc, c, y);
243	30	for (size_t x = 0; x < rect.xsize(); x += Lanes(di)) {
244	20	const auto in_q = Load(di, quant_row + x);
245	20	const auto out = Mul(ConvertTo(df, in_q), fac);
246	20	Store(out, df, row + x);
247	20	}
248	10	}
249	6	}
250	2	}
251	5.90k	if (bctx.num_dc_ctxs <= 1) {
252	17.3k	for (size_t y = 0; y < r.ysize(); y++) {
253	11.9k	uint8_t* qdc_row = r.Row(quant_dc, y);
254	11.9k	memset(qdc_row, 0, sizeof(*qdc_row) * r.xsize());
255	11.9k	}
256	5.39k	} else {
257	513	JXL_DASSERT(r.ysize() == 0 ||
258	513	(r.ysize() - 1) >> chroma_subsampling.VShift(0) <
259	513	in.channel[1].plane.ysize());
260	513	JXL_DASSERT(r.ysize() == 0 ||
261	513	(r.ysize() - 1) >> chroma_subsampling.VShift(1) <
262	513	in.channel[0].plane.ysize());
263	513	JXL_DASSERT(r.ysize() == 0 ||
264	513	(r.ysize() - 1) >> chroma_subsampling.VShift(2) <
265	513	in.channel[2].plane.ysize());
266	1.46k	for (size_t y = 0; y < r.ysize(); y++) {
267	953	uint8_t* qdc_row_val = r.Row(quant_dc, y);
268	953	const int32_t* quant_row_x =
269	953	in.channel[1].plane.Row(y >> chroma_subsampling.VShift(0));
270	953	const int32_t* quant_row_y =
271	953	in.channel[0].plane.Row(y >> chroma_subsampling.VShift(1));
272	953	const int32_t* quant_row_b =
273	953	in.channel[2].plane.Row(y >> chroma_subsampling.VShift(2));
274	10.7k	for (size_t x = 0; x < r.xsize(); x++) {
275	9.78k	int bucket_x = 0;
276	9.78k	int bucket_y = 0;
277	9.78k	int bucket_b = 0;
278	106k	for (int t : bctx.dc_thresholds[0]) {
279	106k	if (quant_row_x[x >> chroma_subsampling.HShift(0)] > t) bucket_x++;
280	106k	}
281	9.78k	for (int t : bctx.dc_thresholds[1]) {
282	706	if (quant_row_y[x >> chroma_subsampling.HShift(1)] > t) bucket_y++;
283	706	}
284	9.78k	for (int t : bctx.dc_thresholds[2]) {
285	410	if (quant_row_b[x >> chroma_subsampling.HShift(2)] > t) bucket_b++;
286	410	}
287	9.78k	int bucket = bucket_x;
288	9.78k	bucket *= bctx.dc_thresholds[2].size() + 1;
289	9.78k	bucket += bucket_b;
290	9.78k	bucket *= bctx.dc_thresholds[1].size() + 1;
291	9.78k	bucket += bucket_y;
292	9.78k	qdc_row_val[x] = bucket;
293	9.78k	}
294	953	}
295	513	}
296	5.90k	}
297
298		// NOLINTNEXTLINE(google-readability-namespace-comments)
299		} // namespace HWY_NAMESPACE
300		} // namespace jxl
301		HWY_AFTER_NAMESPACE();
302
303		#if HWY_ONCE
304		namespace jxl {
305
306		HWY_EXPORT(DequantDC);
307		HWY_EXPORT(AdaptiveDCSmoothing);
308		Status AdaptiveDCSmoothing(JxlMemoryManager* memory_manager,
309		const float* dc_factors, Image3F* dc,
310	5.20k	ThreadPool* pool) {
311	5.20k	return HWY_DYNAMIC_DISPATCH(AdaptiveDCSmoothing)(memory_manager, dc_factors,
312	5.20k	dc, pool);
313	5.20k	}
314
315		void DequantDC(const Rect& r, Image3F* dc, ImageB* quant_dc, const Image& in,
316		const float* dc_factors, float mul, const float* cfl_factors,
317		const YCbCrChromaSubsampling& chroma_subsampling,
318	5.90k	const BlockCtxMap& bctx) {
319	5.90k	HWY_DYNAMIC_DISPATCH(DequantDC)
320	5.90k	(r, dc, quant_dc, in, dc_factors, mul, cfl_factors, chroma_subsampling, bctx);
321	5.90k	}
322
323		} // namespace jxl
324		#endif // HWY_ONCE