Coverage Report

Created: 2022-08-24 06:04

/src/libjxl/lib/jxl/butteraugli/butteraugli.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
//
6
// Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
7
//
8
// The physical architecture of butteraugli is based on the following naming
9
// convention:
10
//   * Opsin - dynamics of the photosensitive chemicals in the retina
11
//             with their immediate electrical processing
12
//   * Xyb - hybrid opponent/trichromatic color space
13
//     x is roughly red-subtract-green.
14
//     y is yellow.
15
//     b is blue.
16
//     Xyb values are computed from Opsin mixing, not directly from rgb.
17
//   * Mask - for visual masking
18
//   * Hf - color modeling for spatially high-frequency features
19
//   * Lf - color modeling for spatially low-frequency features
20
//   * Diffmap - to cluster and build an image of error between the images
21
//   * Blur - to hold the smoothing code
22
23
#include "lib/jxl/butteraugli/butteraugli.h"
24
25
#include <stdint.h>
26
#include <stdio.h>
27
#include <stdlib.h>
28
#include <string.h>
29
30
#include <algorithm>
31
#include <array>
32
#include <cmath>
33
#include <new>
34
#include <vector>
35
36
#if PROFILER_ENABLED
37
#include <chrono>
38
#endif  // PROFILER_ENABLED
39
40
#undef HWY_TARGET_INCLUDE
41
#define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
42
#include <hwy/foreach_target.h>
43
44
#include "lib/jxl/base/printf_macros.h"
45
#include "lib/jxl/base/profiler.h"
46
#include "lib/jxl/base/status.h"
47
#include "lib/jxl/convolve.h"
48
#include "lib/jxl/fast_math-inl.h"
49
#include "lib/jxl/gauss_blur.h"
50
#include "lib/jxl/image_ops.h"
51
52
#ifndef JXL_BUTTERAUGLI_ONCE
53
#define JXL_BUTTERAUGLI_ONCE
54
55
namespace jxl {
56
57
0
std::vector<float> ComputeKernel(float sigma) {
58
0
  const float m = 2.25;  // Accuracy increases when m is increased.
59
0
  const double scaler = -1.0 / (2.0 * sigma * sigma);
60
0
  const int diff = std::max<int>(1, m * std::fabs(sigma));
61
0
  std::vector<float> kernel(2 * diff + 1);
62
0
  for (int i = -diff; i <= diff; ++i) {
63
0
    kernel[i + diff] = std::exp(scaler * i * i);
64
0
  }
65
0
  return kernel;
66
0
}
67
68
void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
69
0
                          const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
70
0
  const size_t offset = kernel.size() / 2;
71
0
  int minx = x < offset ? 0 : x - offset;
72
0
  int maxx = std::min<int>(in.xsize() - 1, x + offset);
73
0
  float weight = 0.0f;
74
0
  for (int j = minx; j <= maxx; ++j) {
75
0
    weight += kernel[j - x + offset];
76
0
  }
77
0
  float scale = 1.0f / weight;
78
0
  for (size_t y = 0; y < in.ysize(); ++y) {
79
0
    const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
80
0
    float sum = 0.0f;
81
0
    for (int j = minx; j <= maxx; ++j) {
82
0
      sum += row_in[j] * kernel[j - x + offset];
83
0
    }
84
0
    row_out[y] = sum * scale;
85
0
  }
86
0
}
87
88
// Computes a horizontal convolution and transposes the result.
89
void ConvolutionWithTranspose(const ImageF& in,
90
                              const std::vector<float>& kernel,
91
0
                              ImageF* BUTTERAUGLI_RESTRICT out) {
92
0
  PROFILER_FUNC;
93
0
  JXL_CHECK(out->xsize() == in.ysize());
94
0
  JXL_CHECK(out->ysize() == in.xsize());
95
0
  const size_t len = kernel.size();
96
0
  const size_t offset = len / 2;
97
0
  float weight_no_border = 0.0f;
98
0
  for (size_t j = 0; j < len; ++j) {
99
0
    weight_no_border += kernel[j];
100
0
  }
101
0
  const float scale_no_border = 1.0f / weight_no_border;
102
0
  const size_t border1 = std::min(in.xsize(), offset);
103
0
  const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
104
0
  std::vector<float> scaled_kernel(len / 2 + 1);
105
0
  for (size_t i = 0; i <= len / 2; ++i) {
106
0
    scaled_kernel[i] = kernel[i] * scale_no_border;
107
0
  }
108
109
  // middle
110
0
  switch (len) {
111
0
    case 7: {
112
0
      PROFILER_ZONE("conv7");
113
0
      const float sk0 = scaled_kernel[0];
114
0
      const float sk1 = scaled_kernel[1];
115
0
      const float sk2 = scaled_kernel[2];
116
0
      const float sk3 = scaled_kernel[3];
117
0
      for (size_t y = 0; y < in.ysize(); ++y) {
118
0
        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
119
0
        for (size_t x = border1; x < border2; ++x, ++row_in) {
120
0
          const float sum0 = (row_in[0] + row_in[6]) * sk0;
121
0
          const float sum1 = (row_in[1] + row_in[5]) * sk1;
122
0
          const float sum2 = (row_in[2] + row_in[4]) * sk2;
123
0
          const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
124
0
          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
125
0
          row_out[y] = sum;
126
0
        }
127
0
      }
128
0
    } break;
129
0
    case 13: {
130
0
      PROFILER_ZONE("conv15");
131
0
      for (size_t y = 0; y < in.ysize(); ++y) {
132
0
        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
133
0
        for (size_t x = border1; x < border2; ++x, ++row_in) {
134
0
          float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
135
0
          float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
136
0
          float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
137
0
          float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
138
0
          sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
139
0
          sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
140
0
          const float sum = (row_in[6]) * scaled_kernel[6];
141
0
          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
142
0
          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
143
0
        }
144
0
      }
145
0
      break;
146
0
    }
147
0
    case 15: {
148
0
      PROFILER_ZONE("conv15");
149
0
      for (size_t y = 0; y < in.ysize(); ++y) {
150
0
        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
151
0
        for (size_t x = border1; x < border2; ++x, ++row_in) {
152
0
          float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
153
0
          float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
154
0
          float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
155
0
          float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
156
0
          sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
157
0
          sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
158
0
          sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
159
0
          const float sum = (row_in[7]) * scaled_kernel[7];
160
0
          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
161
0
          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
162
0
        }
163
0
      }
164
0
      break;
165
0
    }
166
0
    case 33: {
167
0
      PROFILER_ZONE("conv33");
168
0
      for (size_t y = 0; y < in.ysize(); ++y) {
169
0
        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
170
0
        for (size_t x = border1; x < border2; ++x, ++row_in) {
171
0
          float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
172
0
          float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
173
0
          float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
174
0
          float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
175
0
          sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
176
0
          sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
177
0
          sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
178
0
          sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
179
0
          sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
180
0
          sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
181
0
          sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
182
0
          sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
183
0
          sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
184
0
          sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
185
0
          sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
186
0
          sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
187
0
          const float sum = (row_in[16]) * scaled_kernel[16];
188
0
          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
189
0
          row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
190
0
        }
191
0
      }
192
0
      break;
193
0
    }
194
0
    default:
195
0
      printf("Warning: Unexpected kernel size! %" PRIuS "\n", len);
196
0
      for (size_t y = 0; y < in.ysize(); ++y) {
197
0
        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
198
0
        for (size_t x = border1; x < border2; ++x) {
199
0
          const int d = x - offset;
200
0
          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
201
0
          float sum = 0.0f;
202
0
          size_t j;
203
0
          for (j = 0; j <= len / 2; ++j) {
204
0
            sum += row_in[d + j] * scaled_kernel[j];
205
0
          }
206
0
          for (; j < len; ++j) {
207
0
            sum += row_in[d + j] * scaled_kernel[len - 1 - j];
208
0
          }
209
0
          row_out[y] = sum;
210
0
        }
211
0
      }
212
0
  }
213
  // left border
214
0
  for (size_t x = 0; x < border1; ++x) {
215
0
    ConvolveBorderColumn(in, kernel, x, out->Row(x));
216
0
  }
217
218
  // right border
219
0
  for (size_t x = border2; x < in.xsize(); ++x) {
220
0
    ConvolveBorderColumn(in, kernel, x, out->Row(x));
221
0
  }
222
0
}
223
224
// A blur somewhat similar to a 2D Gaussian blur.
225
// See: https://en.wikipedia.org/wiki/Gaussian_blur
226
//
227
// This is a bottleneck because the sigma can be quite large (>7). We can use
228
// gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated
229
// Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are
230
// zero-valued. This leads to noticeable differences at the edges of diffmaps.
231
// We retain a special case for 5x5 kernels (even faster than gauss_blur),
232
// optionally use gauss_blur followed by fixup of the borders for large images,
233
// or fall back to the previous truncated FIR followed by a transpose.
234
void Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
235
0
          BlurTemp* temp, ImageF* out) {
236
0
  std::vector<float> kernel = ComputeKernel(sigma);
237
  // Separable5 does an in-place convolution, so this fast path is not safe if
238
  // in aliases out.
239
0
  if (kernel.size() == 5 && &in != out) {
240
0
    float sum_weights = 0.0f;
241
0
    for (const float w : kernel) {
242
0
      sum_weights += w;
243
0
    }
244
0
    const float scale = 1.0f / sum_weights;
245
0
    const float w0 = kernel[2] * scale;
246
0
    const float w1 = kernel[1] * scale;
247
0
    const float w2 = kernel[0] * scale;
248
0
    const WeightsSeparable5 weights = {
249
0
        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
250
0
        {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
251
0
    };
252
0
    Separable5(in, Rect(in), weights, /*pool=*/nullptr, out);
253
0
    return;
254
0
  }
255
256
0
  ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in);
257
0
  ConvolutionWithTranspose(in, kernel, temp_t);
258
0
  ConvolutionWithTranspose(*temp_t, kernel, out);
259
0
}
260
261
// Allows PaddedMaltaUnit to call either function via overloading.
262
struct MaltaTagLF {};
263
struct MaltaTag {};
264
265
}  // namespace jxl
266
267
#endif  // JXL_BUTTERAUGLI_ONCE
268
269
#include <hwy/highway.h>
270
HWY_BEFORE_NAMESPACE();
271
namespace jxl {
272
namespace HWY_NAMESPACE {
273
274
// These templates are not found via ADL.
275
using hwy::HWY_NAMESPACE::Abs;
276
using hwy::HWY_NAMESPACE::Div;
277
using hwy::HWY_NAMESPACE::Gt;
278
using hwy::HWY_NAMESPACE::IfThenElse;
279
using hwy::HWY_NAMESPACE::IfThenElseZero;
280
using hwy::HWY_NAMESPACE::Lt;
281
using hwy::HWY_NAMESPACE::Max;
282
using hwy::HWY_NAMESPACE::Mul;
283
using hwy::HWY_NAMESPACE::MulAdd;
284
using hwy::HWY_NAMESPACE::MulSub;
285
using hwy::HWY_NAMESPACE::Neg;
286
using hwy::HWY_NAMESPACE::Sub;
287
using hwy::HWY_NAMESPACE::Vec;
288
using hwy::HWY_NAMESPACE::ZeroIfNegative;
289
290
template <class D, class V>
291
0
HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
292
0
  static const double kMul = 0.724216145665;
293
0
  const V mul = Set(d, kMul);
294
0
  const V maxval = Set(d, kMaxVal);
295
  // If greater than maxval or less than -maxval, replace with if_*.
296
0
  const V if_pos = MulAdd(Sub(v, maxval), mul, maxval);
297
0
  const V if_neg = MulSub(Add(v, maxval), mul, maxval);
298
0
  const V pos_or_v = IfThenElse(Ge(v, maxval), if_pos, v);
299
0
  return IfThenElse(Lt(v, Neg(maxval)), if_neg, pos_or_v);
300
0
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::MaximumClamp<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, double)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::MaximumClamp<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, double)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::MaximumClamp<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, double)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::MaximumClamp<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>, double)
301
302
// Make area around zero less important (remove it).
303
template <class D, class V>
304
0
HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
305
0
  const auto w = Set(d, kw);
306
0
  return IfThenElse(Gt(x, w), Sub(x, w),
307
0
                    IfThenElseZero(Lt(x, Neg(w)), Add(x, w)));
308
0
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::RemoveRangeAroundZero<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, double, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::RemoveRangeAroundZero<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, double, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::RemoveRangeAroundZero<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, double, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::RemoveRangeAroundZero<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, double, hwy::N_EMU128::Vec128<float, 4ul>)
309
310
// Make area around zero more important (2x it until the limit).
311
template <class D, class V>
312
0
HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
313
0
  const auto w = Set(d, kw);
314
0
  return IfThenElse(Gt(x, w), Add(x, w),
315
0
                    IfThenElse(Lt(x, Neg(w)), Sub(x, w), Add(x, x)));
316
0
}
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::AmplifyRangeAroundZero<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, double, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::AmplifyRangeAroundZero<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, double, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::AmplifyRangeAroundZero<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, double, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::AmplifyRangeAroundZero<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, double, hwy::N_EMU128::Vec128<float, 4ul>)
317
318
// XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
319
// Vals space can be converted to L2-norm space (Euclidean and normalized)
320
// through visual masking.
321
template <class D, class V>
322
HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
323
                                 const V& b_arg, V* HWY_RESTRICT valx,
324
0
                                 V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
325
0
  static const double xmuli = 32.2217497012;
326
0
  static const double ymuli = 13.7697791434;
327
0
  static const double bmuli = 47.504615728;
328
0
  static const double y_to_b_muli = -0.362267051518;
329
0
  const V xmul = Set(d, xmuli);
330
0
  const V ymul = Set(d, ymuli);
331
0
  const V bmul = Set(d, bmuli);
332
0
  const V y_to_b_mul = Set(d, y_to_b_muli);
333
0
  const V b = MulAdd(y_to_b_mul, y, b_arg);
334
0
  *valb = Mul(b, bmul);
335
0
  *valx = Mul(x, xmul);
336
0
  *valy = Mul(y, ymul);
337
0
}
Unexecuted instantiation: void jxl::N_AVX3::XybLowFreqToVals<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*)
Unexecuted instantiation: void jxl::N_AVX2::XybLowFreqToVals<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*)
Unexecuted instantiation: void jxl::N_SSE4::XybLowFreqToVals<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*)
Unexecuted instantiation: void jxl::N_EMU128::XybLowFreqToVals<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*)
338
339
void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw,
340
0
                  ImageF* HWY_RESTRICT out) {
341
0
  JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out));
342
0
  const size_t xsize = in_x.xsize();
343
0
  const size_t ysize = in_x.ysize();
344
345
0
  const HWY_FULL(float) d;
346
0
  static const double s = 0.653020556257;
347
0
  const auto sv = Set(d, s);
348
0
  const auto one_minus_s = Set(d, 1.0 - s);
349
0
  const auto ywv = Set(d, yw);
350
351
0
  for (size_t y = 0; y < ysize; ++y) {
352
0
    const float* HWY_RESTRICT row_x = in_x.ConstRow(y);
353
0
    const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
354
0
    float* HWY_RESTRICT row_out = out->Row(y);
355
356
0
    for (size_t x = 0; x < xsize; x += Lanes(d)) {
357
0
      const auto vx = Load(d, row_x + x);
358
0
      const auto vy = Load(d, row_y + x);
359
0
      const auto scaler =
360
0
          MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
361
0
      Store(Mul(scaler, vx), d, row_out + x);
362
0
    }
363
0
  }
364
0
}
Unexecuted instantiation: jxl::N_SSE4::SuppressXByY(jxl::Plane<float> const&, jxl::Plane<float> const&, double, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::SuppressXByY(jxl::Plane<float> const&, jxl::Plane<float> const&, double, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::SuppressXByY(jxl::Plane<float> const&, jxl::Plane<float> const&, double, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::SuppressXByY(jxl::Plane<float> const&, jxl::Plane<float> const&, double, jxl::Plane<float>*)
365
366
static void SeparateFrequencies(size_t xsize, size_t ysize,
367
                                const ButteraugliParams& params,
368
                                BlurTemp* blur_temp, const Image3F& xyb,
369
0
                                PsychoImage& ps) {
370
0
  PROFILER_FUNC;
371
0
  const HWY_FULL(float) d;
372
373
  // Extract lf ...
374
0
  static const double kSigmaLf = 7.15593339443;
375
0
  static const double kSigmaHf = 3.22489901262;
376
0
  static const double kSigmaUhf = 1.56416327805;
377
0
  ps.mf = Image3F(xsize, ysize);
378
0
  ps.hf[0] = ImageF(xsize, ysize);
379
0
  ps.hf[1] = ImageF(xsize, ysize);
380
0
  ps.lf = Image3F(xyb.xsize(), xyb.ysize());
381
0
  ps.mf = Image3F(xyb.xsize(), xyb.ysize());
382
0
  for (int i = 0; i < 3; ++i) {
383
0
    Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i));
384
385
    // ... and keep everything else in mf.
386
0
    for (size_t y = 0; y < ysize; ++y) {
387
0
      const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y);
388
0
      const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y);
389
0
      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
390
0
      for (size_t x = 0; x < xsize; x += Lanes(d)) {
391
0
        const auto mf = Sub(Load(d, row_xyb + x), Load(d, row_lf + x));
392
0
        Store(mf, d, row_mf + x);
393
0
      }
394
0
    }
395
0
    if (i == 2) {
396
0
      Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
397
0
      break;
398
0
    }
399
    // Divide mf into mf and hf.
400
0
    for (size_t y = 0; y < ysize; ++y) {
401
0
      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
402
0
      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
403
0
      for (size_t x = 0; x < xsize; x += Lanes(d)) {
404
0
        Store(Load(d, row_mf + x), d, row_hf + x);
405
0
      }
406
0
    }
407
0
    Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
408
0
    static const double kRemoveMfRange = 0.29;
409
0
    static const double kAddMfRange = 0.1;
410
0
    if (i == 0) {
411
0
      for (size_t y = 0; y < ysize; ++y) {
412
0
        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y);
413
0
        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
414
0
        for (size_t x = 0; x < xsize; x += Lanes(d)) {
415
0
          auto mf = Load(d, row_mf + x);
416
0
          auto hf = Sub(Load(d, row_hf + x), mf);
417
0
          mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
418
0
          Store(mf, d, row_mf + x);
419
0
          Store(hf, d, row_hf + x);
420
0
        }
421
0
      }
422
0
    } else {
423
0
      for (size_t y = 0; y < ysize; ++y) {
424
0
        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y);
425
0
        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
426
0
        for (size_t x = 0; x < xsize; x += Lanes(d)) {
427
0
          auto mf = Load(d, row_mf + x);
428
0
          auto hf = Sub(Load(d, row_hf + x), mf);
429
430
0
          mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
431
0
          Store(mf, d, row_mf + x);
432
0
          Store(hf, d, row_hf + x);
433
0
        }
434
0
      }
435
0
    }
436
0
  }
437
438
  // Temporarily used as output of SuppressXByY
439
0
  ps.uhf[0] = ImageF(xsize, ysize);
440
0
  ps.uhf[1] = ImageF(xsize, ysize);
441
442
  // Suppress red-green by intensity change in the high freq channels.
443
0
  static const double suppress = 46.0;
444
0
  SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]);
445
  // hf is the SuppressXByY output, uhf will be written below.
446
0
  ps.hf[0].Swap(ps.uhf[0]);
447
448
0
  for (int i = 0; i < 2; ++i) {
449
    // Divide hf into hf and uhf.
450
0
    for (size_t y = 0; y < ysize; ++y) {
451
0
      float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y);
452
0
      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
453
0
      for (size_t x = 0; x < xsize; ++x) {
454
0
        row_uhf[x] = row_hf[x];
455
0
      }
456
0
    }
457
0
    Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]);
458
0
    static const double kRemoveHfRange = 1.5;
459
0
    static const double kAddHfRange = 0.132;
460
0
    static const double kRemoveUhfRange = 0.04;
461
0
    static const double kMaxclampHf = 28.4691806922;
462
0
    static const double kMaxclampUhf = 5.19175294647;
463
0
    static double kMulYHf = 2.155;
464
0
    static double kMulYUhf = 2.69313763794;
465
0
    if (i == 0) {
466
0
      for (size_t y = 0; y < ysize; ++y) {
467
0
        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y);
468
0
        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
469
0
        for (size_t x = 0; x < xsize; x += Lanes(d)) {
470
0
          auto hf = Load(d, row_hf + x);
471
0
          auto uhf = Sub(Load(d, row_uhf + x), hf);
472
0
          hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
473
0
          uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
474
0
          Store(hf, d, row_hf + x);
475
0
          Store(uhf, d, row_uhf + x);
476
0
        }
477
0
      }
478
0
    } else {
479
0
      for (size_t y = 0; y < ysize; ++y) {
480
0
        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y);
481
0
        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
482
0
        for (size_t x = 0; x < xsize; x += Lanes(d)) {
483
0
          auto hf = Load(d, row_hf + x);
484
0
          hf = MaximumClamp(d, hf, kMaxclampHf);
485
486
0
          auto uhf = Sub(Load(d, row_uhf + x), hf);
487
0
          uhf = MaximumClamp(d, uhf, kMaxclampUhf);
488
0
          uhf = Mul(uhf, Set(d, kMulYUhf));
489
0
          Store(uhf, d, row_uhf + x);
490
491
0
          hf = Mul(hf, Set(d, kMulYHf));
492
0
          hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
493
0
          Store(hf, d, row_hf + x);
494
0
        }
495
0
      }
496
0
    }
497
0
  }
498
  // Modify range around zero code only concerns the high frequency
499
  // planes and only the X and Y channels.
500
  // Convert low freq xyb to vals space so that we can do a simple squared sum
501
  // diff on the low frequencies later.
502
0
  for (size_t y = 0; y < ysize; ++y) {
503
0
    float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y);
504
0
    float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y);
505
0
    float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y);
506
0
    for (size_t x = 0; x < xsize; x += Lanes(d)) {
507
0
      auto valx = Undefined(d);
508
0
      auto valy = Undefined(d);
509
0
      auto valb = Undefined(d);
510
0
      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
511
0
                       Load(d, row_b + x), &valx, &valy, &valb);
512
0
      Store(valx, d, row_x + x);
513
0
      Store(valy, d, row_y + x);
514
0
      Store(valb, d, row_b + x);
515
0
    }
516
0
  }
517
0
}
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX3::SeparateFrequencies(unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Image3<float> const&, jxl::PsychoImage&)
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX2::SeparateFrequencies(unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Image3<float> const&, jxl::PsychoImage&)
Unexecuted instantiation: butteraugli.cc:jxl::N_SSE4::SeparateFrequencies(unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Image3<float> const&, jxl::PsychoImage&)
Unexecuted instantiation: butteraugli.cc:jxl::N_EMU128::SeparateFrequencies(unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Image3<float> const&, jxl::PsychoImage&)
518
519
namespace {
520
template <typename V>
521
0
BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
522
0
  return Add(Add(a, b), Add(c, d));
523
0
}
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
524
template <typename V>
525
0
BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e) {
526
0
  return Sum(a, b, c, Add(d, e));
527
0
}
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
528
template <typename V>
529
0
BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g) {
530
0
  return Sum(a, b, c, Sum(d, e, f, g));
531
0
}
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
532
template <typename V>
533
0
BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d, V e, V f, V g, V h, V i) {
534
0
  return Add(Add(Sum(a, b, c, d), Sum(e, f, g, h)), i);
535
0
}
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 1ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 1ul> >(hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>, hwy::N_SSE4::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::(anonymous namespace)::Sum<hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec128<float, 1ul> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec128<float, 1ul> >(hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>, hwy::N_AVX2::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX2::Vec256<float> jxl::N_AVX2::(anonymous namespace)::Sum<hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec128<float, 1ul> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec128<float, 1ul> >(hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>, hwy::N_AVX3::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_AVX3::Vec512<float> jxl::N_AVX3::(anonymous namespace)::Sum<hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 1ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 1ul> >(hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>, hwy::N_EMU128::Vec128<float, 1ul>)
Unexecuted instantiation: butteraugli.cc:hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::(anonymous namespace)::Sum<hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>, hwy::N_EMU128::Vec128<float, 4ul>)
536
}  // namespace
537
538
template <class D>
539
Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
540
0
                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
541
0
  const intptr_t xs3 = 3 * xs;
542
543
0
  const auto center = LoadU(df, d);
544
545
  // x grows, y constant
546
0
  const auto sum_yconst = Sum(LoadU(df, d - 4), LoadU(df, d - 2), center,
547
0
                              LoadU(df, d + 2), LoadU(df, d + 4));
548
  // Will return this, sum of all line kernels
549
0
  auto retval = Mul(sum_yconst, sum_yconst);
550
0
  {
551
    // y grows, x constant
552
0
    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs - xs), center,
553
0
                   LoadU(df, d + xs + xs), LoadU(df, d + xs3 + xs));
554
0
    retval = MulAdd(sum, sum, retval);
555
0
  }
556
0
  {
557
    // both grow
558
0
    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2), center,
559
0
                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
560
0
    retval = MulAdd(sum, sum, retval);
561
0
  }
562
0
  {
563
    // y grows, x shrinks
564
0
    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2), center,
565
0
                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
566
0
    retval = MulAdd(sum, sum, retval);
567
0
  }
568
0
  {
569
    // y grows -4 to 4, x shrinks 1 -> -1
570
0
    auto sum =
571
0
        Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs - xs + 1), center,
572
0
            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 1));
573
0
    retval = MulAdd(sum, sum, retval);
574
0
  }
575
0
  {
576
    //  y grows -4 to 4, x grows -1 -> 1
577
0
    auto sum =
578
0
        Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs - xs - 1), center,
579
0
            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 1));
580
0
    retval = MulAdd(sum, sum, retval);
581
0
  }
582
0
  {
583
    // x grows -4 to 4, y grows -1 to 1
584
0
    auto sum = Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 2 - xs), center,
585
0
                   LoadU(df, d + 2 + xs), LoadU(df, d + 4 + xs));
586
0
    retval = MulAdd(sum, sum, retval);
587
0
  }
588
0
  {
589
    // x grows -4 to 4, y shrinks 1 to -1
590
0
    auto sum = Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 2 + xs), center,
591
0
                   LoadU(df, d + 2 - xs), LoadU(df, d + 4 - xs));
592
0
    retval = MulAdd(sum, sum, retval);
593
0
  }
594
0
  {
595
    /* 0_________
596
       1__*______
597
       2___*_____
598
       3_________
599
       4____0____
600
       5_________
601
       6_____*___
602
       7______*__
603
       8_________ */
604
0
    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1), center,
605
0
                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
606
0
    retval = MulAdd(sum, sum, retval);
607
0
  }
608
0
  {
609
    /* 0_________
610
       1______*__
611
       2_____*___
612
       3_________
613
       4____0____
614
       5_________
615
       6___*_____
616
       7__*______
617
       8_________ */
618
0
    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1), center,
619
0
                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
620
0
    retval = MulAdd(sum, sum, retval);
621
0
  }
622
0
  {
623
    /* 0_________
624
       1_________
625
       2_*_______
626
       3__*______
627
       4____0____
628
       5______*__
629
       6_______*_
630
       7_________
631
       8_________ */
632
0
    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2), center,
633
0
                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
634
0
    retval = MulAdd(sum, sum, retval);
635
0
  }
636
0
  {
637
    /* 0_________
638
       1_________
639
       2_______*_
640
       3______*__
641
       4____0____
642
       5__*______
643
       6_*_______
644
       7_________
645
       8_________ */
646
0
    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2), center,
647
0
                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
648
0
    retval = MulAdd(sum, sum, retval);
649
0
  }
650
0
  {
651
    /* 0_________
652
       1_________
653
       2________*
654
       3______*__
655
       4____0____
656
       5__*______
657
       6*________
658
       7_________
659
       8_________ */
660
661
0
    auto sum = Sum(LoadU(df, d + xs + xs - 4), LoadU(df, d + xs - 2), center,
662
0
                   LoadU(df, d - xs + 2), LoadU(df, d - xs - xs + 4));
663
0
    retval = MulAdd(sum, sum, retval);
664
0
  }
665
0
  {
666
    /* 0_________
667
       1_________
668
       2*________
669
       3__*______
670
       4____0____
671
       5______*__
672
       6________*
673
       7_________
674
       8_________ */
675
0
    auto sum = Sum(LoadU(df, d - xs - xs - 4), LoadU(df, d - xs - 2), center,
676
0
                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 4));
677
0
    retval = MulAdd(sum, sum, retval);
678
0
  }
679
0
  {
680
    /* 0__*______
681
       1_________
682
       2___*_____
683
       3_________
684
       4____0____
685
       5_________
686
       6_____*___
687
       7_________
688
       8______*__ */
689
0
    auto sum =
690
0
        Sum(LoadU(df, d - xs3 - xs - 2), LoadU(df, d - xs - xs - 1), center,
691
0
            LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + xs + 2));
692
0
    retval = MulAdd(sum, sum, retval);
693
0
  }
694
0
  {
695
    /* 0______*__
696
       1_________
697
       2_____*___
698
       3_________
699
       4____0____
700
       5_________
701
       6___*_____
702
       7_________
703
       8__*______ */
704
0
    auto sum =
705
0
        Sum(LoadU(df, d - xs3 - xs + 2), LoadU(df, d - xs - xs + 1), center,
706
0
            LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 + xs - 2));
707
0
    retval = MulAdd(sum, sum, retval);
708
0
  }
709
0
  return retval;
710
0
}
Unexecuted instantiation: decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)())) jxl::N_SSE4::MaltaUnit<hwy::N_SSE4::Simd<float, 1ul, 0> >(jxl::MaltaTagLF, hwy::N_SSE4::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)())) jxl::N_SSE4::MaltaUnit<hwy::N_SSE4::Simd<float, 4ul, 0> >(jxl::MaltaTagLF, hwy::N_SSE4::Simd<float, 4ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)())) jxl::N_AVX2::MaltaUnit<hwy::N_AVX2::Simd<float, 1ul, 0> >(jxl::MaltaTagLF, hwy::N_AVX2::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)())) jxl::N_AVX2::MaltaUnit<hwy::N_AVX2::Simd<float, 8ul, 0> >(jxl::MaltaTagLF, hwy::N_AVX2::Simd<float, 8ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 1ul, 0>)())) jxl::N_AVX3::MaltaUnit<hwy::N_AVX3::Simd<float, 1ul, 0> >(jxl::MaltaTagLF, hwy::N_AVX3::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)())) jxl::N_AVX3::MaltaUnit<hwy::N_AVX3::Simd<float, 16ul, 0> >(jxl::MaltaTagLF, hwy::N_AVX3::Simd<float, 16ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_EMU128::Simd<float, 1ul, 0>)())) jxl::N_EMU128::MaltaUnit<hwy::N_EMU128::Simd<float, 1ul, 0> >(jxl::MaltaTagLF, hwy::N_EMU128::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_EMU128::Simd<float, 4ul, 0>)())) jxl::N_EMU128::MaltaUnit<hwy::N_EMU128::Simd<float, 4ul, 0> >(jxl::MaltaTagLF, hwy::N_EMU128::Simd<float, 4ul, 0>, float const*, long)
711
712
template <class D>
713
Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
714
0
                 const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
715
0
  const intptr_t xs3 = 3 * xs;
716
717
0
  const auto center = LoadU(df, d);
718
719
  // x grows, y constant
720
0
  const auto sum_yconst =
721
0
      Sum(LoadU(df, d - 4), LoadU(df, d - 3), LoadU(df, d - 2),
722
0
          LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2),
723
0
          LoadU(df, d + 3), LoadU(df, d + 4));
724
  // Will return this, sum of all line kernels
725
0
  auto retval = Mul(sum_yconst, sum_yconst);
726
727
0
  {
728
    // y grows, x constant
729
0
    auto sum = Sum(LoadU(df, d - xs3 - xs), LoadU(df, d - xs3),
730
0
                   LoadU(df, d - xs - xs), LoadU(df, d - xs), center,
731
0
                   LoadU(df, d + xs), LoadU(df, d + xs + xs),
732
0
                   LoadU(df, d + xs3), LoadU(df, d + xs3 + xs));
733
0
    retval = MulAdd(sum, sum, retval);
734
0
  }
735
0
  {
736
    // both grow
737
0
    auto sum = Sum(LoadU(df, d - xs3 - 3), LoadU(df, d - xs - xs - 2),
738
0
                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
739
0
                   LoadU(df, d + xs + xs + 2), LoadU(df, d + xs3 + 3));
740
0
    retval = MulAdd(sum, sum, retval);
741
0
  }
742
0
  {
743
    // y grows, x shrinks
744
0
    auto sum = Sum(LoadU(df, d - xs3 + 3), LoadU(df, d - xs - xs + 2),
745
0
                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
746
0
                   LoadU(df, d + xs + xs - 2), LoadU(df, d + xs3 - 3));
747
0
    retval = MulAdd(sum, sum, retval);
748
0
  }
749
0
  {
750
    // y grows -4 to 4, x shrinks 1 -> -1
751
0
    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
752
0
                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
753
0
                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
754
0
                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
755
0
    retval = MulAdd(sum, sum, retval);
756
0
  }
757
0
  {
758
    //  y grows -4 to 4, x grows -1 -> 1
759
0
    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
760
0
                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
761
0
                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
762
0
                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
763
0
    retval = MulAdd(sum, sum, retval);
764
0
  }
765
0
  {
766
    // x grows -4 to 4, y grows -1 to 1
767
0
    auto sum =
768
0
        Sum(LoadU(df, d - 4 - xs), LoadU(df, d - 3 - xs), LoadU(df, d - 2 - xs),
769
0
            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 + xs),
770
0
            LoadU(df, d + 3 + xs), LoadU(df, d + 4 + xs));
771
0
    retval = MulAdd(sum, sum, retval);
772
0
  }
773
0
  {
774
    // x grows -4 to 4, y shrinks 1 to -1
775
0
    auto sum =
776
0
        Sum(LoadU(df, d - 4 + xs), LoadU(df, d - 3 + xs), LoadU(df, d - 2 + xs),
777
0
            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + 2 - xs),
778
0
            LoadU(df, d + 3 - xs), LoadU(df, d + 4 - xs));
779
0
    retval = MulAdd(sum, sum, retval);
780
0
  }
781
0
  {
782
    /* 0_________
783
       1__*______
784
       2___*_____
785
       3___*_____
786
       4____0____
787
       5_____*___
788
       6_____*___
789
       7______*__
790
       8_________ */
791
0
    auto sum = Sum(LoadU(df, d - xs3 - 2), LoadU(df, d - xs - xs - 1),
792
0
                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
793
0
                   LoadU(df, d + xs + xs + 1), LoadU(df, d + xs3 + 2));
794
0
    retval = MulAdd(sum, sum, retval);
795
0
  }
796
0
  {
797
    /* 0_________
798
       1______*__
799
       2_____*___
800
       3_____*___
801
       4____0____
802
       5___*_____
803
       6___*_____
804
       7__*______
805
       8_________ */
806
0
    auto sum = Sum(LoadU(df, d - xs3 + 2), LoadU(df, d - xs - xs + 1),
807
0
                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
808
0
                   LoadU(df, d + xs + xs - 1), LoadU(df, d + xs3 - 2));
809
0
    retval = MulAdd(sum, sum, retval);
810
0
  }
811
0
  {
812
    /* 0_________
813
       1_________
814
       2_*_______
815
       3__**_____
816
       4____0____
817
       5_____**__
818
       6_______*_
819
       7_________
820
       8_________ */
821
0
    auto sum = Sum(LoadU(df, d - xs - xs - 3), LoadU(df, d - xs - 2),
822
0
                   LoadU(df, d - xs - 1), center, LoadU(df, d + xs + 1),
823
0
                   LoadU(df, d + xs + 2), LoadU(df, d + xs + xs + 3));
824
0
    retval = MulAdd(sum, sum, retval);
825
0
  }
826
0
  {
827
    /* 0_________
828
       1_________
829
       2_______*_
830
       3_____**__
831
       4____0____
832
       5__**_____
833
       6_*_______
834
       7_________
835
       8_________ */
836
0
    auto sum = Sum(LoadU(df, d - xs - xs + 3), LoadU(df, d - xs + 2),
837
0
                   LoadU(df, d - xs + 1), center, LoadU(df, d + xs - 1),
838
0
                   LoadU(df, d + xs - 2), LoadU(df, d + xs + xs - 3));
839
0
    retval = MulAdd(sum, sum, retval);
840
0
  }
841
0
  {
842
    /* 0_________
843
       1_________
844
       2_________
845
       3______***
846
       4___*0*___
847
       5***______
848
       6_________
849
       7_________
850
       8_________ */
851
852
0
    auto sum =
853
0
        Sum(LoadU(df, d + xs - 4), LoadU(df, d + xs - 3), LoadU(df, d + xs - 2),
854
0
            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d - xs + 2),
855
0
            LoadU(df, d - xs + 3), LoadU(df, d - xs + 4));
856
0
    retval = MulAdd(sum, sum, retval);
857
0
  }
858
0
  {
859
    /* 0_________
860
       1_________
861
       2_________
862
       3***______
863
       4___*0*___
864
       5______***
865
       6_________
866
       7_________
867
       8_________ */
868
0
    auto sum =
869
0
        Sum(LoadU(df, d - xs - 4), LoadU(df, d - xs - 3), LoadU(df, d - xs - 2),
870
0
            LoadU(df, d - 1), center, LoadU(df, d + 1), LoadU(df, d + xs + 2),
871
0
            LoadU(df, d + xs + 3), LoadU(df, d + xs + 4));
872
0
    retval = MulAdd(sum, sum, retval);
873
0
  }
874
0
  {
875
    /* 0___*_____
876
       1___*_____
877
       2___*_____
878
       3____*____
879
       4____0____
880
       5____*____
881
       6_____*___
882
       7_____*___
883
       8_____*___ */
884
0
    auto sum = Sum(LoadU(df, d - xs3 - xs - 1), LoadU(df, d - xs3 - 1),
885
0
                   LoadU(df, d - xs - xs - 1), LoadU(df, d - xs), center,
886
0
                   LoadU(df, d + xs), LoadU(df, d + xs + xs + 1),
887
0
                   LoadU(df, d + xs3 + 1), LoadU(df, d + xs3 + xs + 1));
888
0
    retval = MulAdd(sum, sum, retval);
889
0
  }
890
0
  {
891
    /* 0_____*___
892
       1_____*___
893
       2____ *___
894
       3____*____
895
       4____0____
896
       5____*____
897
       6___*_____
898
       7___*_____
899
       8___*_____ */
900
0
    auto sum = Sum(LoadU(df, d - xs3 - xs + 1), LoadU(df, d - xs3 + 1),
901
0
                   LoadU(df, d - xs - xs + 1), LoadU(df, d - xs), center,
902
0
                   LoadU(df, d + xs), LoadU(df, d + xs + xs - 1),
903
0
                   LoadU(df, d + xs3 - 1), LoadU(df, d + xs3 + xs - 1));
904
0
    retval = MulAdd(sum, sum, retval);
905
0
  }
906
0
  return retval;
907
0
}
Unexecuted instantiation: decltype (Zero((hwy::N_SSE4::Simd<float, 1ul, 0>)())) jxl::N_SSE4::MaltaUnit<hwy::N_SSE4::Simd<float, 1ul, 0> >(jxl::MaltaTag, hwy::N_SSE4::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_SSE4::Simd<float, 4ul, 0>)())) jxl::N_SSE4::MaltaUnit<hwy::N_SSE4::Simd<float, 4ul, 0> >(jxl::MaltaTag, hwy::N_SSE4::Simd<float, 4ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX2::Simd<float, 1ul, 0>)())) jxl::N_AVX2::MaltaUnit<hwy::N_AVX2::Simd<float, 1ul, 0> >(jxl::MaltaTag, hwy::N_AVX2::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX2::Simd<float, 8ul, 0>)())) jxl::N_AVX2::MaltaUnit<hwy::N_AVX2::Simd<float, 8ul, 0> >(jxl::MaltaTag, hwy::N_AVX2::Simd<float, 8ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 1ul, 0>)())) jxl::N_AVX3::MaltaUnit<hwy::N_AVX3::Simd<float, 1ul, 0> >(jxl::MaltaTag, hwy::N_AVX3::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)())) jxl::N_AVX3::MaltaUnit<hwy::N_AVX3::Simd<float, 16ul, 0> >(jxl::MaltaTag, hwy::N_AVX3::Simd<float, 16ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_EMU128::Simd<float, 1ul, 0>)())) jxl::N_EMU128::MaltaUnit<hwy::N_EMU128::Simd<float, 1ul, 0> >(jxl::MaltaTag, hwy::N_EMU128::Simd<float, 1ul, 0>, float const*, long)
Unexecuted instantiation: decltype (Zero((hwy::N_EMU128::Simd<float, 4ul, 0>)())) jxl::N_EMU128::MaltaUnit<hwy::N_EMU128::Simd<float, 4ul, 0> >(jxl::MaltaTag, hwy::N_EMU128::Simd<float, 4ul, 0>, float const*, long)
908
909
// Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known
910
// to be far enough from the image borders. "diffs" is a packed image.
911
template <class Tag>
912
static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
913
                                                const size_t x0,
914
0
                                                const size_t y0) {
915
0
  const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
916
0
  const HWY_CAPPED(float, 1) df;
917
0
  if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) &&
918
0
       y0 < (diffs.ysize() - 4))) {
919
0
    return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
920
0
  }
921
922
0
  PROFILER_ZONE("Padded Malta");
923
0
  float borderimage[12 * 9];  // round up to 4
924
0
  for (int dy = 0; dy < 9; ++dy) {
925
0
    int y = y0 + dy - 4;
926
0
    if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
927
0
      for (int dx = 0; dx < 12; ++dx) {
928
0
        borderimage[dy * 12 + dx] = 0.0f;
929
0
      }
930
0
      continue;
931
0
    }
932
933
0
    const float* row_diffs = diffs.ConstRow(y);
934
0
    for (int dx = 0; dx < 9; ++dx) {
935
0
      int x = x0 + dx - 4;
936
0
      if (x < 0 || static_cast<size_t>(x) >= diffs.xsize()) {
937
0
        borderimage[dy * 12 + dx] = 0.0f;
938
0
      } else {
939
0
        borderimage[dy * 12 + dx] = row_diffs[x];
940
0
      }
941
0
    }
942
0
    std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f);
943
0
  }
944
0
  return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
945
0
}
Unexecuted instantiation: butteraugli.cc:float jxl::N_SSE4::PaddedMaltaUnit<jxl::MaltaTag>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_SSE4::PaddedMaltaUnit<jxl::MaltaTagLF>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_AVX2::PaddedMaltaUnit<jxl::MaltaTag>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_AVX2::PaddedMaltaUnit<jxl::MaltaTagLF>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_AVX3::PaddedMaltaUnit<jxl::MaltaTag>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_AVX3::PaddedMaltaUnit<jxl::MaltaTagLF>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_EMU128::PaddedMaltaUnit<jxl::MaltaTag>(jxl::Plane<float> const&, unsigned long, unsigned long)
Unexecuted instantiation: butteraugli.cc:float jxl::N_EMU128::PaddedMaltaUnit<jxl::MaltaTagLF>(jxl::Plane<float> const&, unsigned long, unsigned long)
946
947
template <class Tag>
948
static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
949
                          const double w_0gt1, const double w_0lt1,
950
                          const double norm1, const double len,
951
                          const double mulli, ImageF* HWY_RESTRICT diffs,
952
0
                          Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
953
0
  JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
954
0
  const size_t xsize_ = lum0.xsize();
955
0
  const size_t ysize_ = lum0.ysize();
956
957
0
  const float kWeight0 = 0.5;
958
0
  const float kWeight1 = 0.33;
959
960
0
  const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
961
0
  const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
962
0
  const float norm2_0gt1 = w_pre0gt1 * norm1;
963
0
  const float norm2_0lt1 = w_pre0lt1 * norm1;
964
965
0
  for (size_t y = 0; y < ysize_; ++y) {
966
0
    const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
967
0
    const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
968
0
    float* HWY_RESTRICT row_diffs = diffs->Row(y);
969
0
    for (size_t x = 0; x < xsize_; ++x) {
970
0
      const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
971
0
      const float diff = row0[x] - row1[x];
972
0
      const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
973
974
      // Primary symmetric quadratic objective.
975
0
      row_diffs[x] = scaler * diff;
976
977
0
      const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
978
0
      const double fabs0 = std::fabs(row0[x]);
979
980
      // Secondary half-open quadratic objectives.
981
0
      const double too_small = 0.55 * fabs0;
982
0
      const double too_big = 1.05 * fabs0;
983
984
0
      if (row0[x] < 0) {
985
0
        if (row1[x] > -too_small) {
986
0
          double impact = scaler2 * (row1[x] + too_small);
987
0
          row_diffs[x] -= impact;
988
0
        } else if (row1[x] < -too_big) {
989
0
          double impact = scaler2 * (-row1[x] - too_big);
990
0
          row_diffs[x] += impact;
991
0
        }
992
0
      } else {
993
0
        if (row1[x] < too_small) {
994
0
          double impact = scaler2 * (too_small - row1[x]);
995
0
          row_diffs[x] += impact;
996
0
        } else if (row1[x] > too_big) {
997
0
          double impact = scaler2 * (row1[x] - too_big);
998
0
          row_diffs[x] -= impact;
999
0
        }
1000
0
      }
1001
0
    }
1002
0
  }
1003
1004
0
  size_t y0 = 0;
1005
  // Top
1006
0
  for (; y0 < 4; ++y0) {
1007
0
    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1008
0
    for (size_t x0 = 0; x0 < xsize_; ++x0) {
1009
0
      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1010
0
    }
1011
0
  }
1012
1013
0
  const HWY_FULL(float) df;
1014
0
  const size_t aligned_x = std::max(size_t(4), Lanes(df));
1015
0
  const intptr_t stride = diffs->PixelsPerRow();
1016
1017
  // Middle
1018
0
  for (; y0 < ysize_ - 4; ++y0) {
1019
0
    const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
1020
0
    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1021
0
    size_t x0 = 0;
1022
0
    for (; x0 < aligned_x; ++x0) {
1023
0
      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1024
0
    }
1025
0
    for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
1026
0
      auto diff = Load(df, row_diff + x0);
1027
0
      diff = Add(diff, MaltaUnit(Tag(), df, row_in + x0, stride));
1028
0
      Store(diff, df, row_diff + x0);
1029
0
    }
1030
1031
0
    for (; x0 < xsize_; ++x0) {
1032
0
      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1033
0
    }
1034
0
  }
1035
1036
  // Bottom
1037
0
  for (; y0 < ysize_; ++y0) {
1038
0
    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1039
0
    for (size_t x0 = 0; x0 < xsize_; ++x0) {
1040
0
      row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1041
0
    }
1042
0
  }
1043
0
}
Unexecuted instantiation: butteraugli.cc:void jxl::N_SSE4::MaltaDiffMapT<jxl::MaltaTag>(jxl::MaltaTag, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_SSE4::MaltaDiffMapT<jxl::MaltaTagLF>(jxl::MaltaTagLF, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_AVX2::MaltaDiffMapT<jxl::MaltaTag>(jxl::MaltaTag, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_AVX2::MaltaDiffMapT<jxl::MaltaTagLF>(jxl::MaltaTagLF, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_AVX3::MaltaDiffMapT<jxl::MaltaTag>(jxl::MaltaTag, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_AVX3::MaltaDiffMapT<jxl::MaltaTagLF>(jxl::MaltaTagLF, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_EMU128::MaltaDiffMapT<jxl::MaltaTag>(jxl::MaltaTag, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:void jxl::N_EMU128::MaltaDiffMapT<jxl::MaltaTagLF>(jxl::MaltaTagLF, jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
1044
1045
// Need non-template wrapper functions for HWY_EXPORT.
1046
void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1047
                  const double w_0lt1, const double norm1, const double len,
1048
                  const double mulli, ImageF* HWY_RESTRICT diffs,
1049
0
                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1050
0
  MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
1051
0
                diffs, block_diff_ac, c);
1052
0
}
Unexecuted instantiation: jxl::N_SSE4::MaltaDiffMap(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_AVX2::MaltaDiffMap(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_AVX3::MaltaDiffMap(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_EMU128::MaltaDiffMap(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
1053
1054
void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1055
                    const double w_0lt1, const double norm1, const double len,
1056
                    const double mulli, ImageF* HWY_RESTRICT diffs,
1057
0
                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1058
0
  MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
1059
0
                diffs, block_diff_ac, c);
1060
0
}
Unexecuted instantiation: jxl::N_SSE4::MaltaDiffMapLF(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_AVX2::MaltaDiffMapLF(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_AVX3::MaltaDiffMapLF(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: jxl::N_EMU128::MaltaDiffMapLF(jxl::Plane<float> const&, jxl::Plane<float> const&, double, double, double, double, double, jxl::Plane<float>*, jxl::Image3<float>*, unsigned long)
1061
1062
0
void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
1063
0
  PROFILER_FUNC;
1064
0
  const size_t xsize = xyb.xsize();
1065
0
  const size_t ysize = xyb.ysize();
1066
0
  const float bias = mul * bias_arg;
1067
0
  const float sqrt_bias = sqrt(bias);
1068
0
  for (size_t y = 0; y < ysize; ++y) {
1069
0
    const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y);
1070
0
    float* BUTTERAUGLI_RESTRICT row_out = out->Row(y);
1071
0
    for (size_t x = 0; x < xsize; ++x) {
1072
      // kBias makes sqrt behave more linearly.
1073
0
      row_out[x] = sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias;
1074
0
    }
1075
0
  }
1076
0
}
Unexecuted instantiation: jxl::N_SSE4::DiffPrecompute(jxl::Plane<float> const&, float, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::DiffPrecompute(jxl::Plane<float> const&, float, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::DiffPrecompute(jxl::Plane<float> const&, float, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::DiffPrecompute(jxl::Plane<float> const&, float, float, jxl::Plane<float>*)
1077
1078
// std::log(80.0) / std::log(255.0);
1079
constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
1080
static const float kInternalGoodQualityThreshold =
1081
    17.8f * kIntensityTargetNormalizationHack;
1082
static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
1083
1084
0
void StoreMin3(const float v, float& min0, float& min1, float& min2) {
1085
0
  if (v < min2) {
1086
0
    if (v < min0) {
1087
0
      min2 = min1;
1088
0
      min1 = min0;
1089
0
      min0 = v;
1090
0
    } else if (v < min1) {
1091
0
      min2 = min1;
1092
0
      min1 = v;
1093
0
    } else {
1094
0
      min2 = v;
1095
0
    }
1096
0
  }
1097
0
}
Unexecuted instantiation: jxl::N_SSE4::StoreMin3(float, float&, float&, float&)
Unexecuted instantiation: jxl::N_AVX2::StoreMin3(float, float&, float&, float&)
Unexecuted instantiation: jxl::N_AVX3::StoreMin3(float, float&, float&, float&)
Unexecuted instantiation: jxl::N_EMU128::StoreMin3(float, float&, float&, float&)
1098
1099
// Look for smooth areas near the area of degradation.
1100
// If the areas area generally smooth, don't do masking.
1101
0
void FuzzyErosion(const ImageF& from, ImageF* to) {
1102
0
  const size_t xsize = from.xsize();
1103
0
  const size_t ysize = from.ysize();
1104
0
  static const int kStep = 3;
1105
0
  for (size_t y = 0; y < ysize; ++y) {
1106
0
    for (size_t x = 0; x < xsize; ++x) {
1107
0
      float min0 = from.Row(y)[x];
1108
0
      float min1 = 2 * min0;
1109
0
      float min2 = min1;
1110
0
      if (x >= kStep) {
1111
0
        float v = from.Row(y)[x - kStep];
1112
0
        StoreMin3(v, min0, min1, min2);
1113
0
        if (y >= kStep) {
1114
0
          float v = from.Row(y - kStep)[x - kStep];
1115
0
          StoreMin3(v, min0, min1, min2);
1116
0
        }
1117
0
        if (y < ysize - kStep) {
1118
0
          float v = from.Row(y + kStep)[x - kStep];
1119
0
          StoreMin3(v, min0, min1, min2);
1120
0
        }
1121
0
      }
1122
0
      if (x < xsize - kStep) {
1123
0
        float v = from.Row(y)[x + kStep];
1124
0
        StoreMin3(v, min0, min1, min2);
1125
0
        if (y >= kStep) {
1126
0
          float v = from.Row(y - kStep)[x + kStep];
1127
0
          StoreMin3(v, min0, min1, min2);
1128
0
        }
1129
0
        if (y < ysize - kStep) {
1130
0
          float v = from.Row(y + kStep)[x + kStep];
1131
0
          StoreMin3(v, min0, min1, min2);
1132
0
        }
1133
0
      }
1134
0
      if (y >= kStep) {
1135
0
        float v = from.Row(y - kStep)[x];
1136
0
        StoreMin3(v, min0, min1, min2);
1137
0
      }
1138
0
      if (y < ysize - kStep) {
1139
0
        float v = from.Row(y + kStep)[x];
1140
0
        StoreMin3(v, min0, min1, min2);
1141
0
      }
1142
0
      to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
1143
0
    }
1144
0
  }
1145
0
}
Unexecuted instantiation: jxl::N_SSE4::FuzzyErosion(jxl::Plane<float> const&, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::FuzzyErosion(jxl::Plane<float> const&, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::FuzzyErosion(jxl::Plane<float> const&, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::FuzzyErosion(jxl::Plane<float> const&, jxl::Plane<float>*)
1146
1147
// Compute values of local frequency and dc masking based on the activity
1148
// in the two images. img_diff_ac may be null.
1149
void Mask(const ImageF& mask0, const ImageF& mask1,
1150
          const ButteraugliParams& params, BlurTemp* blur_temp,
1151
          ImageF* BUTTERAUGLI_RESTRICT mask,
1152
0
          ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
1153
  // Only X and Y components are involved in masking. B's influence
1154
  // is considered less important in the high frequency area, and we
1155
  // don't model masking from lower frequency signals.
1156
0
  PROFILER_FUNC;
1157
0
  const size_t xsize = mask0.xsize();
1158
0
  const size_t ysize = mask0.ysize();
1159
0
  *mask = ImageF(xsize, ysize);
1160
0
  static const float kMul = 6.19424080439;
1161
0
  static const float kBias = 12.61050594197;
1162
0
  static const float kRadius = 2.7;
1163
0
  ImageF diff0(xsize, ysize);
1164
0
  ImageF diff1(xsize, ysize);
1165
0
  ImageF blurred0(xsize, ysize);
1166
0
  ImageF blurred1(xsize, ysize);
1167
0
  DiffPrecompute(mask0, kMul, kBias, &diff0);
1168
0
  DiffPrecompute(mask1, kMul, kBias, &diff1);
1169
0
  Blur(diff0, kRadius, params, blur_temp, &blurred0);
1170
0
  FuzzyErosion(blurred0, &diff0);
1171
0
  Blur(diff1, kRadius, params, blur_temp, &blurred1);
1172
0
  FuzzyErosion(blurred1, &diff1);
1173
0
  for (size_t y = 0; y < ysize; ++y) {
1174
0
    for (size_t x = 0; x < xsize; ++x) {
1175
0
      mask->Row(y)[x] = diff0.Row(y)[x];
1176
0
      if (diff_ac != nullptr) {
1177
0
        static const float kMaskToErrorMul = 10.0;
1178
0
        float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
1179
0
        diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
1180
0
      }
1181
0
    }
1182
0
  }
1183
0
}
Unexecuted instantiation: jxl::N_SSE4::Mask(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::Mask(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::Mask(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::Mask(jxl::Plane<float> const&, jxl::Plane<float> const&, jxl::ButteraugliParams const&, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
1184
1185
// `diff_ac` may be null.
1186
void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
1187
                     const size_t xsize, const size_t ysize,
1188
                     const ButteraugliParams& params, Image3F* temp,
1189
                     BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask,
1190
0
                     ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
1191
0
  ImageF mask0(xsize, ysize);
1192
0
  ImageF mask1(xsize, ysize);
1193
0
  static const float muls[3] = {
1194
0
      2.5f,
1195
0
      0.4f,
1196
0
      0.4f,
1197
0
  };
1198
  // Silly and unoptimized approach here. TODO(jyrki): rework this.
1199
0
  for (size_t y = 0; y < ysize; ++y) {
1200
0
    const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y);
1201
0
    const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y);
1202
0
    const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y);
1203
0
    const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y);
1204
0
    const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y);
1205
0
    const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y);
1206
0
    const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y);
1207
0
    const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y);
1208
0
    float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y);
1209
0
    float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y);
1210
0
    for (size_t x = 0; x < xsize; ++x) {
1211
0
      float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0];
1212
0
      float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0];
1213
0
      float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2];
1214
0
      float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2];
1215
0
      row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0;
1216
0
      row0[x] = sqrt(row0[x]);
1217
0
      row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1;
1218
0
      row1[x] = sqrt(row1[x]);
1219
0
    }
1220
0
  }
1221
0
  Mask(mask0, mask1, params, blur_temp, mask, diff_ac);
1222
0
}
Unexecuted instantiation: jxl::N_SSE4::MaskPsychoImage(jxl::PsychoImage const&, jxl::PsychoImage const&, unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::MaskPsychoImage(jxl::PsychoImage const&, jxl::PsychoImage const&, unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::MaskPsychoImage(jxl::PsychoImage const&, jxl::PsychoImage const&, unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::MaskPsychoImage(jxl::PsychoImage const&, jxl::PsychoImage const&, unsigned long, unsigned long, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*, jxl::Plane<float>*, jxl::Plane<float>*)
1223
1224
0
double MaskY(double delta) {
1225
0
  static const double offset = 0.829591754942;
1226
0
  static const double scaler = 0.451936922203;
1227
0
  static const double mul = 2.5485944793;
1228
0
  const double c = mul / ((scaler * delta) + offset);
1229
0
  const double retval = kGlobalScale * (1.0 + c);
1230
0
  return retval * retval;
1231
0
}
Unexecuted instantiation: jxl::N_SSE4::MaskY(double)
Unexecuted instantiation: jxl::N_AVX2::MaskY(double)
Unexecuted instantiation: jxl::N_AVX3::MaskY(double)
Unexecuted instantiation: jxl::N_EMU128::MaskY(double)
1232
1233
0
double MaskDcY(double delta) {
1234
0
  static const double offset = 0.20025578522;
1235
0
  static const double scaler = 3.87449418804;
1236
0
  static const double mul = 0.505054525019;
1237
0
  const double c = mul / ((scaler * delta) + offset);
1238
0
  const double retval = kGlobalScale * (1.0 + c);
1239
0
  return retval * retval;
1240
0
}
Unexecuted instantiation: jxl::N_SSE4::MaskDcY(double)
Unexecuted instantiation: jxl::N_AVX2::MaskDcY(double)
Unexecuted instantiation: jxl::N_AVX3::MaskDcY(double)
Unexecuted instantiation: jxl::N_EMU128::MaskDcY(double)
1241
1242
0
inline float MaskColor(const float color[3], const float mask) {
1243
0
  return color[0] * mask + color[1] * mask + color[2] * mask;
1244
0
}
Unexecuted instantiation: jxl::N_SSE4::MaskColor(float const*, float)
Unexecuted instantiation: jxl::N_AVX2::MaskColor(float const*, float)
Unexecuted instantiation: jxl::N_AVX3::MaskColor(float const*, float)
Unexecuted instantiation: jxl::N_EMU128::MaskColor(float const*, float)
1245
1246
// Diffmap := sqrt of sum{diff images by multiplied by X and Y/B masks}
1247
void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
1248
                              const Image3F& block_diff_ac, float xmul,
1249
0
                              ImageF* result) {
1250
0
  PROFILER_FUNC;
1251
0
  JXL_CHECK(SameSize(mask, *result));
1252
0
  size_t xsize = mask.xsize();
1253
0
  size_t ysize = mask.ysize();
1254
0
  for (size_t y = 0; y < ysize; ++y) {
1255
0
    float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
1256
0
    for (size_t x = 0; x < xsize; ++x) {
1257
0
      float val = mask.Row(y)[x];
1258
0
      float maskval = MaskY(val);
1259
0
      float dc_maskval = MaskDcY(val);
1260
0
      float diff_dc[3];
1261
0
      float diff_ac[3];
1262
0
      for (int i = 0; i < 3; ++i) {
1263
0
        diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x];
1264
0
        diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x];
1265
0
      }
1266
0
      diff_ac[0] *= xmul;
1267
0
      diff_dc[0] *= xmul;
1268
0
      row_out[x] =
1269
0
          sqrt(MaskColor(diff_dc, dc_maskval) + MaskColor(diff_ac, maskval));
1270
0
    }
1271
0
  }
1272
0
}
Unexecuted instantiation: jxl::N_SSE4::CombineChannelsToDiffmap(jxl::Plane<float> const&, jxl::Image3<float> const&, jxl::Image3<float> const&, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX2::CombineChannelsToDiffmap(jxl::Plane<float> const&, jxl::Image3<float> const&, jxl::Image3<float> const&, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_AVX3::CombineChannelsToDiffmap(jxl::Plane<float> const&, jxl::Image3<float> const&, jxl::Image3<float> const&, float, jxl::Plane<float>*)
Unexecuted instantiation: jxl::N_EMU128::CombineChannelsToDiffmap(jxl::Plane<float> const&, jxl::Image3<float> const&, jxl::Image3<float> const&, float, jxl::Plane<float>*)
1273
1274
// Adds weighted L2 difference between i0 and i1 to diffmap.
1275
static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
1276
0
                   Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1277
0
  if (w == 0) return;
1278
1279
0
  const HWY_FULL(float) d;
1280
0
  const auto weight = Set(d, w);
1281
1282
0
  for (size_t y = 0; y < i0.ysize(); ++y) {
1283
0
    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
1284
0
    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
1285
0
    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1286
1287
0
    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1288
0
      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
1289
0
      const auto diff2 = Mul(diff, diff);
1290
0
      const auto prev = Load(d, row_diff + x);
1291
0
      Store(MulAdd(diff2, weight, prev), d, row_diff + x);
1292
0
    }
1293
0
  }
1294
0
}
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX3::L2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX2::L2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_SSE4::L2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_EMU128::L2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
1295
1296
// Initializes diffmap to the weighted L2 difference between i0 and i1.
1297
static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
1298
0
                      Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1299
0
  if (w == 0) return;
1300
1301
0
  const HWY_FULL(float) d;
1302
0
  const auto weight = Set(d, w);
1303
1304
0
  for (size_t y = 0; y < i0.ysize(); ++y) {
1305
0
    const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
1306
0
    const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
1307
0
    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1308
1309
0
    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1310
0
      const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
1311
0
      const auto diff2 = Mul(diff, diff);
1312
0
      Store(Mul(diff2, weight), d, row_diff + x);
1313
0
    }
1314
0
  }
1315
0
}
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX3::SetL2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX2::SetL2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_SSE4::SetL2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_EMU128::SetL2Diff(jxl::Plane<float> const&, jxl::Plane<float> const&, float, jxl::Image3<float>*, unsigned long)
1316
1317
// i0 is the original image.
1318
// i1 is the deformed copy.
1319
static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
1320
                             float w_0lt1,
1321
0
                             Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1322
0
  if (w_0gt1 == 0 && w_0lt1 == 0) {
1323
0
    return;
1324
0
  }
1325
1326
0
  const HWY_FULL(float) d;
1327
0
  const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
1328
0
  const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
1329
1330
0
  for (size_t y = 0; y < i0.ysize(); ++y) {
1331
0
    const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
1332
0
    const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
1333
0
    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1334
1335
0
    for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1336
0
      const auto val0 = Load(d, row0 + x);
1337
0
      const auto val1 = Load(d, row1 + x);
1338
1339
      // Primary symmetric quadratic objective.
1340
0
      const auto diff = Sub(val0, val1);
1341
0
      auto total = MulAdd(Mul(diff, diff), vw_0gt1, Load(d, row_diff + x));
1342
1343
      // Secondary half-open quadratic objectives.
1344
0
      const auto fabs0 = Abs(val0);
1345
0
      const auto too_small = Mul(Set(d, 0.4), fabs0);
1346
0
      const auto too_big = fabs0;
1347
1348
0
      const auto if_neg = IfThenElse(
1349
0
          Gt(val1, Neg(too_small)), Add(val1, too_small),
1350
0
          IfThenElseZero(Lt(val1, Neg(too_big)), Sub(Neg(val1), too_big)));
1351
0
      const auto if_pos =
1352
0
          IfThenElse(Lt(val1, too_small), Sub(too_small, val1),
1353
0
                     IfThenElseZero(Gt(val1, too_big), Sub(val1, too_big)));
1354
0
      const auto v = IfThenElse(Lt(val0, Zero(d)), if_neg, if_pos);
1355
0
      total = MulAdd(vw_0lt1, Mul(v, v), total);
1356
0
      Store(total, d, row_diff + x);
1357
0
    }
1358
0
  }
1359
0
}
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX3::L2DiffAsymmetric(jxl::Plane<float> const&, jxl::Plane<float> const&, float, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_AVX2::L2DiffAsymmetric(jxl::Plane<float> const&, jxl::Plane<float> const&, float, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_SSE4::L2DiffAsymmetric(jxl::Plane<float> const&, jxl::Plane<float> const&, float, float, jxl::Image3<float>*, unsigned long)
Unexecuted instantiation: butteraugli.cc:jxl::N_EMU128::L2DiffAsymmetric(jxl::Plane<float> const&, jxl::Plane<float> const&, float, float, jxl::Image3<float>*, unsigned long)
1360
1361
// A simple HDR compatible gamma function.
1362
template <class DF, class V>
1363
0
V Gamma(const DF df, V v) {
1364
  // ln(2) constant folded in because we want std::log but have FastLog2f.
1365
0
  const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
1366
0
  const auto kRetAdd = Set(df, -23.16046239805755);
1367
  // This should happen rarely, but may lead to a NaN in log, which is
1368
  // undesirable. Since negative photons don't exist we solve the NaNs by
1369
  // clamping here.
1370
0
  v = ZeroIfNegative(v);
1371
1372
0
  const auto biased = Add(v, Set(df, 9.9710635769299145));
1373
0
  const auto log = FastLog2f(df, biased);
1374
  // We could fold this into a custom Log2 polynomial, but there would be
1375
  // relatively little gain.
1376
0
  return MulAdd(kRetMul, log, kRetAdd);
1377
0
}
Unexecuted instantiation: hwy::N_SSE4::Vec128<float, 4ul> jxl::N_SSE4::Gamma<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>)
Unexecuted instantiation: hwy::N_AVX2::Vec256<float> jxl::N_AVX2::Gamma<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>)
Unexecuted instantiation: hwy::N_AVX3::Vec512<float> jxl::N_AVX3::Gamma<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: hwy::N_EMU128::Vec128<float, 4ul> jxl::N_EMU128::Gamma<hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul>)
1378
1379
template <bool Clamp, class DF, class V>
1380
BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
1381
                                        const V& in2, V* JXL_RESTRICT out0,
1382
                                        V* JXL_RESTRICT out1,
1383
0
                                        V* JXL_RESTRICT out2) {
1384
  // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
1385
0
  static const double mixi0 = 0.29956550340058319;
1386
0
  static const double mixi1 = 0.63373087833825936;
1387
0
  static const double mixi2 = 0.077705617820981968;
1388
0
  static const double mixi3 = 1.7557483643287353;
1389
0
  static const double mixi4 = 0.22158691104574774;
1390
0
  static const double mixi5 = 0.69391388044116142;
1391
0
  static const double mixi6 = 0.0987313588422;
1392
0
  static const double mixi7 = 1.7557483643287353;
1393
0
  static const double mixi8 = 0.02;
1394
0
  static const double mixi9 = 0.02;
1395
0
  static const double mixi10 = 0.20480129041026129;
1396
0
  static const double mixi11 = 12.226454707163354;
1397
1398
0
  const V mix0 = Set(df, mixi0);
1399
0
  const V mix1 = Set(df, mixi1);
1400
0
  const V mix2 = Set(df, mixi2);
1401
0
  const V mix3 = Set(df, mixi3);
1402
0
  const V mix4 = Set(df, mixi4);
1403
0
  const V mix5 = Set(df, mixi5);
1404
0
  const V mix6 = Set(df, mixi6);
1405
0
  const V mix7 = Set(df, mixi7);
1406
0
  const V mix8 = Set(df, mixi8);
1407
0
  const V mix9 = Set(df, mixi9);
1408
0
  const V mix10 = Set(df, mixi10);
1409
0
  const V mix11 = Set(df, mixi11);
1410
1411
0
  *out0 = MulAdd(mix0, in0, MulAdd(mix1, in1, MulAdd(mix2, in2, mix3)));
1412
0
  *out1 = MulAdd(mix4, in0, MulAdd(mix5, in1, MulAdd(mix6, in2, mix7)));
1413
0
  *out2 = MulAdd(mix8, in0, MulAdd(mix9, in1, MulAdd(mix10, in2, mix11)));
1414
1415
0
  if (Clamp) {
1416
0
    *out0 = Max(*out0, mix3);
1417
0
    *out1 = Max(*out1, mix7);
1418
0
    *out2 = Max(*out2, mix11);
1419
0
  }
1420
0
}
Unexecuted instantiation: void jxl::N_SSE4::OpsinAbsorbance<true, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*)
Unexecuted instantiation: void jxl::N_SSE4::OpsinAbsorbance<false, hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> >(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul> const&, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*, hwy::N_SSE4::Vec128<float, 4ul>*)
Unexecuted instantiation: void jxl::N_AVX2::OpsinAbsorbance<true, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*)
Unexecuted instantiation: void jxl::N_AVX2::OpsinAbsorbance<false, hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> >(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float> const&, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*, hwy::N_AVX2::Vec256<float>*)
Unexecuted instantiation: void jxl::N_AVX3::OpsinAbsorbance<true, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*)
Unexecuted instantiation: void jxl::N_AVX3::OpsinAbsorbance<false, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float> const&, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*, hwy::N_AVX3::Vec512<float>*)
Unexecuted instantiation: void jxl::N_EMU128::OpsinAbsorbance<true, hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*)
Unexecuted instantiation: void jxl::N_EMU128::OpsinAbsorbance<false, hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> >(hwy::N_EMU128::Simd<float, 4ul, 0>, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul> const&, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*, hwy::N_EMU128::Vec128<float, 4ul>*)
1421
1422
// `blurred` is a temporary image used inside this function and not returned.
1423
Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
1424
0
                           Image3F* blurred, BlurTemp* blur_temp) {
1425
0
  PROFILER_FUNC;
1426
0
  Image3F xyb(rgb.xsize(), rgb.ysize());
1427
0
  const double kSigma = 1.2;
1428
0
  Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0));
1429
0
  Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1));
1430
0
  Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2));
1431
0
  const HWY_FULL(float) df;
1432
0
  const auto intensity_target_multiplier = Set(df, params.intensity_target);
1433
0
  for (size_t y = 0; y < rgb.ysize(); ++y) {
1434
0
    const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y);
1435
0
    const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y);
1436
0
    const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y);
1437
0
    const float* BUTTERAUGLI_RESTRICT row_blurred_r =
1438
0
        blurred->ConstPlaneRow(0, y);
1439
0
    const float* BUTTERAUGLI_RESTRICT row_blurred_g =
1440
0
        blurred->ConstPlaneRow(1, y);
1441
0
    const float* BUTTERAUGLI_RESTRICT row_blurred_b =
1442
0
        blurred->ConstPlaneRow(2, y);
1443
0
    float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y);
1444
0
    float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y);
1445
0
    float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y);
1446
0
    const auto min = Set(df, 1e-4f);
1447
0
    for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
1448
0
      auto sensitivity0 = Undefined(df);
1449
0
      auto sensitivity1 = Undefined(df);
1450
0
      auto sensitivity2 = Undefined(df);
1451
0
      {
1452
        // Calculate sensitivity based on the smoothed image gamma derivative.
1453
0
        auto pre_mixed0 = Undefined(df);
1454
0
        auto pre_mixed1 = Undefined(df);
1455
0
        auto pre_mixed2 = Undefined(df);
1456
0
        OpsinAbsorbance<true>(
1457
0
            df, Mul(Load(df, row_blurred_r + x), intensity_target_multiplier),
1458
0
            Mul(Load(df, row_blurred_g + x), intensity_target_multiplier),
1459
0
            Mul(Load(df, row_blurred_b + x), intensity_target_multiplier),
1460
0
            &pre_mixed0, &pre_mixed1, &pre_mixed2);
1461
0
        pre_mixed0 = Max(pre_mixed0, min);
1462
0
        pre_mixed1 = Max(pre_mixed1, min);
1463
0
        pre_mixed2 = Max(pre_mixed2, min);
1464
0
        sensitivity0 = Div(Gamma(df, pre_mixed0), pre_mixed0);
1465
0
        sensitivity1 = Div(Gamma(df, pre_mixed1), pre_mixed1);
1466
0
        sensitivity2 = Div(Gamma(df, pre_mixed2), pre_mixed2);
1467
0
        sensitivity0 = Max(sensitivity0, min);
1468
0
        sensitivity1 = Max(sensitivity1, min);
1469
0
        sensitivity2 = Max(sensitivity2, min);
1470
0
      }
1471
0
      auto cur_mixed0 = Undefined(df);
1472
0
      auto cur_mixed1 = Undefined(df);
1473
0
      auto cur_mixed2 = Undefined(df);
1474
0
      OpsinAbsorbance<false>(
1475
0
          df, Mul(Load(df, row_r + x), intensity_target_multiplier),
1476
0
          Mul(Load(df, row_g + x), intensity_target_multiplier),
1477
0
          Mul(Load(df, row_b + x), intensity_target_multiplier), &cur_mixed0,
1478
0
          &cur_mixed1, &cur_mixed2);
1479
0
      cur_mixed0 = Mul(cur_mixed0, sensitivity0);
1480
0
      cur_mixed1 = Mul(cur_mixed1, sensitivity1);
1481
0
      cur_mixed2 = Mul(cur_mixed2, sensitivity2);
1482
      // This is a kludge. The negative values should be zeroed away before
1483
      // blurring. Ideally there would be no negative values in the first place.
1484
0
      const auto min01 = Set(df, 1.7557483643287353f);
1485
0
      const auto min2 = Set(df, 12.226454707163354f);
1486
0
      cur_mixed0 = Max(cur_mixed0, min01);
1487
0
      cur_mixed1 = Max(cur_mixed1, min01);
1488
0
      cur_mixed2 = Max(cur_mixed2, min2);
1489
1490
0
      Store(Sub(cur_mixed0, cur_mixed1), df, row_out_x + x);
1491
0
      Store(Add(cur_mixed0, cur_mixed1), df, row_out_y + x);
1492
0
      Store(cur_mixed2, df, row_out_b + x);
1493
0
    }
1494
0
  }
1495
0
  return xyb;
1496
0
}
Unexecuted instantiation: jxl::N_SSE4::OpsinDynamicsImage(jxl::Image3<float> const&, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*)
Unexecuted instantiation: jxl::N_AVX2::OpsinDynamicsImage(jxl::Image3<float> const&, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*)
Unexecuted instantiation: jxl::N_AVX3::OpsinDynamicsImage(jxl::Image3<float> const&, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*)
Unexecuted instantiation: jxl::N_EMU128::OpsinDynamicsImage(jxl::Image3<float> const&, jxl::ButteraugliParams const&, jxl::Image3<float>*, jxl::BlurTemp*)
1497
1498
// NOLINTNEXTLINE(google-readability-namespace-comments)
1499
}  // namespace HWY_NAMESPACE
1500
}  // namespace jxl
1501
HWY_AFTER_NAMESPACE();
1502
1503
#if HWY_ONCE
1504
namespace jxl {
1505
1506
HWY_EXPORT(SeparateFrequencies);       // Local function.
1507
HWY_EXPORT(MaskPsychoImage);           // Local function.
1508
HWY_EXPORT(L2DiffAsymmetric);          // Local function.
1509
HWY_EXPORT(L2Diff);                    // Local function.
1510
HWY_EXPORT(SetL2Diff);                 // Local function.
1511
HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
1512
HWY_EXPORT(MaltaDiffMap);              // Local function.
1513
HWY_EXPORT(MaltaDiffMapLF);            // Local function.
1514
HWY_EXPORT(OpsinDynamicsImage);        // Local function.
1515
1516
#if BUTTERAUGLI_ENABLE_CHECKS
1517
1518
static inline bool IsNan(const float x) {
1519
  uint32_t bits;
1520
  memcpy(&bits, &x, sizeof(bits));
1521
  const uint32_t bitmask_exp = 0x7F800000;
1522
  return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF);
1523
}
1524
1525
static inline bool IsNan(const double x) {
1526
  uint64_t bits;
1527
  memcpy(&bits, &x, sizeof(bits));
1528
  return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) ||
1529
         (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL);
1530
}
1531
1532
static inline void CheckImage(const ImageF& image, const char* name) {
1533
  PROFILER_FUNC;
1534
  for (size_t y = 0; y < image.ysize(); ++y) {
1535
    const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
1536
    for (size_t x = 0; x < image.xsize(); ++x) {
1537
      if (IsNan(row[x])) {
1538
        printf("NAN: Image %s @ %" PRIuS ",%" PRIuS " (of %" PRIuS ",%" PRIuS
1539
               ")\n",
1540
               name, x, y, image.xsize(), image.ysize());
1541
        exit(1);
1542
      }
1543
    }
1544
  }
1545
}
1546
1547
#define CHECK_NAN(x, str)                \
1548
  do {                                   \
1549
    if (IsNan(x)) {                      \
1550
      printf("%d: %s\n", __LINE__, str); \
1551
      abort();                           \
1552
    }                                    \
1553
  } while (0)
1554
1555
#define CHECK_IMAGE(image, name) CheckImage(image, name)
1556
1557
#else  // BUTTERAUGLI_ENABLE_CHECKS
1558
1559
#define CHECK_NAN(x, str)
1560
#define CHECK_IMAGE(image, name)
1561
1562
#endif  // BUTTERAUGLI_ENABLE_CHECKS
1563
1564
// Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
1565
// multiresolution.
1566
0
static Image3F SubSample2x(const Image3F& in) {
1567
0
  size_t xs = (in.xsize() + 1) / 2;
1568
0
  size_t ys = (in.ysize() + 1) / 2;
1569
0
  Image3F retval(xs, ys);
1570
0
  for (size_t c = 0; c < 3; ++c) {
1571
0
    for (size_t y = 0; y < ys; ++y) {
1572
0
      for (size_t x = 0; x < xs; ++x) {
1573
0
        retval.PlaneRow(c, y)[x] = 0;
1574
0
      }
1575
0
    }
1576
0
  }
1577
0
  for (size_t c = 0; c < 3; ++c) {
1578
0
    for (size_t y = 0; y < in.ysize(); ++y) {
1579
0
      for (size_t x = 0; x < in.xsize(); ++x) {
1580
0
        retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x];
1581
0
      }
1582
0
    }
1583
0
    if ((in.xsize() & 1) != 0) {
1584
0
      for (size_t y = 0; y < retval.ysize(); ++y) {
1585
0
        size_t last_column = retval.xsize() - 1;
1586
0
        retval.PlaneRow(c, y)[last_column] *= 2.0f;
1587
0
      }
1588
0
    }
1589
0
    if ((in.ysize() & 1) != 0) {
1590
0
      for (size_t x = 0; x < retval.xsize(); ++x) {
1591
0
        size_t last_row = retval.ysize() - 1;
1592
0
        retval.PlaneRow(c, last_row)[x] *= 2.0f;
1593
0
      }
1594
0
    }
1595
0
  }
1596
0
  return retval;
1597
0
}
1598
1599
// Supersample src by 2x and add it to dest.
1600
0
static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
1601
0
  for (size_t y = 0; y < dest.ysize(); ++y) {
1602
0
    for (size_t x = 0; x < dest.xsize(); ++x) {
1603
      // There will be less errors from the more averaged images.
1604
      // We take it into account to some extent using a scaler.
1605
0
      static const double kHeuristicMixingValue = 0.3;
1606
0
      dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w;
1607
0
      dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
1608
0
    }
1609
0
  }
1610
0
}
1611
1612
0
Image3F* ButteraugliComparator::Temp() const {
1613
0
  bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
1614
0
  JXL_ASSERT(!was_in_use);
1615
0
  (void)was_in_use;
1616
0
  return &temp_;
1617
0
}
1618
1619
0
void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
1620
1621
ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
1622
                                             const ButteraugliParams& params)
1623
    : xsize_(rgb0.xsize()),
1624
      ysize_(rgb0.ysize()),
1625
      params_(params),
1626
0
      temp_(xsize_, ysize_) {
1627
0
  if (xsize_ < 8 || ysize_ < 8) {
1628
0
    return;
1629
0
  }
1630
1631
0
  Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(),
1632
0
                                                          &blur_temp_);
1633
0
  ReleaseTemp();
1634
0
  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
1635
0
  (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_);
1636
1637
  // Awful recursive construction of samples of different resolution.
1638
  // This is an after-thought and possibly somewhat parallel in
1639
  // functionality with the PsychoImage multi-resolution approach.
1640
0
  sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params));
1641
0
}
1642
1643
0
void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
1644
0
  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
1645
0
  (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr);
1646
0
  ReleaseTemp();
1647
0
}
1648
1649
0
void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
1650
0
  PROFILER_FUNC;
1651
0
  if (xsize_ < 8 || ysize_ < 8) {
1652
0
    ZeroFillImage(&result);
1653
0
    return;
1654
0
  }
1655
0
  const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
1656
0
      rgb1, params_, Temp(), &blur_temp_);
1657
0
  ReleaseTemp();
1658
0
  DiffmapOpsinDynamicsImage(xyb1, result);
1659
0
  if (sub_) {
1660
0
    if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
1661
0
      return;
1662
0
    }
1663
0
    const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
1664
0
        SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_);
1665
0
    sub_->ReleaseTemp();
1666
0
    ImageF subresult;
1667
0
    sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult);
1668
0
    AddSupersampled2x(subresult, 0.5, result);
1669
0
  }
1670
0
}
1671
1672
void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
1673
0
                                                      ImageF& result) const {
1674
0
  PROFILER_FUNC;
1675
0
  if (xsize_ < 8 || ysize_ < 8) {
1676
0
    ZeroFillImage(&result);
1677
0
    return;
1678
0
  }
1679
0
  PsychoImage pi1;
1680
0
  HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
1681
0
  (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1);
1682
0
  result = ImageF(xsize_, ysize_);
1683
0
  DiffmapPsychoImage(pi1, result);
1684
0
}
1685
1686
namespace {
1687
1688
void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1689
                  const double w_0lt1, const double norm1,
1690
                  ImageF* HWY_RESTRICT diffs,
1691
0
                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1692
0
  PROFILER_FUNC;
1693
0
  const double len = 3.75;
1694
0
  static const double mulli = 0.39905817637;
1695
0
  HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
1696
0
  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
1697
0
}
1698
1699
void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1700
                    const double w_0lt1, const double norm1,
1701
                    ImageF* HWY_RESTRICT diffs,
1702
0
                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1703
0
  PROFILER_FUNC;
1704
0
  const double len = 3.75;
1705
0
  static const double mulli = 0.611612573796;
1706
0
  HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
1707
0
  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
1708
0
}
1709
1710
}  // namespace
1711
1712
void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
1713
0
                                               ImageF& diffmap) const {
1714
0
  PROFILER_FUNC;
1715
0
  if (xsize_ < 8 || ysize_ < 8) {
1716
0
    ZeroFillImage(&diffmap);
1717
0
    return;
1718
0
  }
1719
1720
0
  const float hf_asymmetry_ = params_.hf_asymmetry;
1721
0
  const float xmul_ = params_.xmul;
1722
1723
0
  ImageF diffs(xsize_, ysize_);
1724
0
  Image3F block_diff_ac(xsize_, ysize_);
1725
0
  ZeroFillImage(&block_diff_ac);
1726
0
  static const double wUhfMalta = 1.10039032555;
1727
0
  static const double norm1Uhf = 71.7800275169;
1728
0
  MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
1729
0
               wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
1730
1731
0
  static const double wUhfMaltaX = 173.5;
1732
0
  static const double norm1UhfX = 5.0;
1733
0
  MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
1734
0
               wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
1735
0
               0);
1736
1737
0
  static const double wHfMalta = 18.7237414387;
1738
0
  static const double norm1Hf = 4498534.45232;
1739
0
  MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
1740
0
                 wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
1741
0
                 &block_diff_ac, 1);
1742
1743
0
  static const double wHfMaltaX = 6923.99476109;
1744
0
  static const double norm1HfX = 8051.15833247;
1745
0
  MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
1746
0
                 wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
1747
0
                 &block_diff_ac, 0);
1748
1749
0
  static const double wMfMalta = 37.0819870399;
1750
0
  static const double norm1Mf = 130262059.556;
1751
0
  MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
1752
0
                 &diffs, &block_diff_ac, 1);
1753
1754
0
  static const double wMfMaltaX = 8246.75321353;
1755
0
  static const double norm1MfX = 1009002.70582;
1756
0
  MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
1757
0
                 norm1MfX, &diffs, &block_diff_ac, 0);
1758
1759
0
  static const double wmul[9] = {
1760
0
      400.0,         1.50815703118,  0,
1761
0
      2150.0,        10.6195433239,  16.2176043152,
1762
0
      29.2353797994, 0.844626970982, 0.703646627719,
1763
0
  };
1764
0
  Image3F block_diff_dc(xsize_, ysize_);
1765
0
  for (size_t c = 0; c < 3; ++c) {
1766
0
    if (c < 2) {  // No blue channel error accumulated at HF.
1767
0
      HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
1768
0
      (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
1769
0
       &block_diff_ac, c);
1770
0
    }
1771
0
    HWY_DYNAMIC_DISPATCH(L2Diff)
1772
0
    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c);
1773
0
    HWY_DYNAMIC_DISPATCH(SetL2Diff)
1774
0
    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c);
1775
0
  }
1776
1777
0
  ImageF mask;
1778
0
  HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
1779
0
  (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask,
1780
0
   &block_diff_ac.Plane(1));
1781
0
  ReleaseTemp();
1782
1783
0
  HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
1784
0
  (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
1785
0
}
1786
1787
double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
1788
0
                                   const ButteraugliParams* params) {
1789
0
  PROFILER_FUNC;
1790
0
  float retval = 0.0f;
1791
0
  for (size_t y = 0; y < diffmap.ysize(); ++y) {
1792
0
    const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
1793
0
    for (size_t x = 0; x < diffmap.xsize(); ++x) {
1794
0
      retval = std::max(retval, row[x]);
1795
0
    }
1796
0
  }
1797
0
  return retval;
1798
0
}
1799
1800
bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
1801
0
                        double hf_asymmetry, double xmul, ImageF& diffmap) {
1802
0
  ButteraugliParams params;
1803
0
  params.hf_asymmetry = hf_asymmetry;
1804
0
  params.xmul = xmul;
1805
0
  return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
1806
0
}
1807
1808
bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
1809
0
                        const ButteraugliParams& params, ImageF& diffmap) {
1810
0
  PROFILER_FUNC;
1811
0
  const size_t xsize = rgb0.xsize();
1812
0
  const size_t ysize = rgb0.ysize();
1813
0
  if (xsize < 1 || ysize < 1) {
1814
0
    return JXL_FAILURE("Zero-sized image");
1815
0
  }
1816
0
  if (!SameSize(rgb0, rgb1)) {
1817
0
    return JXL_FAILURE("Size mismatch");
1818
0
  }
1819
0
  static const int kMax = 8;
1820
0
  if (xsize < kMax || ysize < kMax) {
1821
    // Butteraugli values for small (where xsize or ysize is smaller
1822
    // than 8 pixels) images are non-sensical, but most likely it is
1823
    // less disruptive to try to compute something than just give up.
1824
    // Temporarily extend the borders of the image to fit 8 x 8 size.
1825
0
    size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
1826
0
    size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
1827
0
    size_t xscaled = std::max<size_t>(kMax, xsize);
1828
0
    size_t yscaled = std::max<size_t>(kMax, ysize);
1829
0
    Image3F scaled0(xscaled, yscaled);
1830
0
    Image3F scaled1(xscaled, yscaled);
1831
0
    for (int i = 0; i < 3; ++i) {
1832
0
      for (size_t y = 0; y < yscaled; ++y) {
1833
0
        for (size_t x = 0; x < xscaled; ++x) {
1834
0
          size_t x2 =
1835
0
              std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
1836
0
          size_t y2 =
1837
0
              std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
1838
0
          scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
1839
0
          scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
1840
0
        }
1841
0
      }
1842
0
    }
1843
0
    ImageF diffmap_scaled;
1844
0
    const bool ok =
1845
0
        ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
1846
0
    diffmap = ImageF(xsize, ysize);
1847
0
    for (size_t y = 0; y < ysize; ++y) {
1848
0
      for (size_t x = 0; x < xsize; ++x) {
1849
0
        diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
1850
0
      }
1851
0
    }
1852
0
    return ok;
1853
0
  }
1854
0
  ButteraugliComparator butteraugli(rgb0, params);
1855
0
  butteraugli.Diffmap(rgb1, diffmap);
1856
0
  return true;
1857
0
}
1858
1859
bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
1860
                          float hf_asymmetry, float xmul, ImageF& diffmap,
1861
0
                          double& diffvalue) {
1862
0
  ButteraugliParams params;
1863
0
  params.hf_asymmetry = hf_asymmetry;
1864
0
  params.xmul = xmul;
1865
0
  return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue);
1866
0
}
1867
1868
bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
1869
                          const ButteraugliParams& params, ImageF& diffmap,
1870
0
                          double& diffvalue) {
1871
#if PROFILER_ENABLED
1872
  auto trace_start = std::chrono::steady_clock::now();
1873
#endif
1874
0
  if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
1875
0
    return false;
1876
0
  }
1877
#if PROFILER_ENABLED
1878
  auto trace_end = std::chrono::steady_clock::now();
1879
  std::chrono::duration<double> elapsed = trace_end - trace_start;
1880
  const size_t mp = rgb0.xsize() * rgb0.ysize();
1881
  printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6);
1882
#endif
1883
0
  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
1884
0
  return true;
1885
0
}
1886
1887
0
double ButteraugliFuzzyClass(double score) {
1888
0
  static const double fuzzy_width_up = 4.8;
1889
0
  static const double fuzzy_width_down = 4.8;
1890
0
  static const double m0 = 2.0;
1891
0
  static const double scaler = 0.7777;
1892
0
  double val;
1893
0
  if (score < 1.0) {
1894
    // val in [scaler .. 2.0]
1895
0
    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down));
1896
0
    val -= 1.0;           // from [1 .. 2] to [0 .. 1]
1897
0
    val *= 2.0 - scaler;  // from [0 .. 1] to [0 .. 2.0 - scaler]
1898
0
    val += scaler;        // from [0 .. 2.0 - scaler] to [scaler .. 2.0]
1899
0
  } else {
1900
    // val in [0 .. scaler]
1901
0
    val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up));
1902
0
    val *= scaler;
1903
0
  }
1904
0
  return val;
1905
0
}
1906
1907
// #define PRINT_OUT_NORMALIZATION
1908
1909
0
double ButteraugliFuzzyInverse(double seek) {
1910
0
  double pos = 0;
1911
  // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
1912
0
  for (double range = 1.0; range >= 1e-10; range *= 0.5) {
1913
0
    double cur = ButteraugliFuzzyClass(pos);
1914
0
    if (cur < seek) {
1915
0
      pos -= range;
1916
0
    } else {
1917
0
      pos += range;
1918
0
    }
1919
0
  }
1920
#ifdef PRINT_OUT_NORMALIZATION
1921
  if (seek == 1.0) {
1922
    fprintf(stderr, "Fuzzy inverse %g\n", pos);
1923
  }
1924
#endif
1925
0
  return pos;
1926
0
}
1927
1928
#ifdef PRINT_OUT_NORMALIZATION
1929
static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
1930
#endif
1931
1932
namespace {
1933
1934
void ScoreToRgb(double score, double good_threshold, double bad_threshold,
1935
0
                float rgb[3]) {
1936
0
  double heatmap[12][3] = {
1937
0
      {0, 0, 0},       {0, 0, 1},
1938
0
      {0, 1, 1},       {0, 1, 0},  // Good level
1939
0
      {1, 1, 0},       {1, 0, 0},  // Bad level
1940
0
      {1, 0, 1},       {0.5, 0.5, 1.0},
1941
0
      {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
1942
0
      {1.0, 1.0, 0.5}, {1, 1, 1},
1943
0
      {1, 1, 1},  // Last color repeated to have a solid range of white.
1944
0
  };
1945
0
  if (score < good_threshold) {
1946
0
    score = (score / good_threshold) * 0.3;
1947
0
  } else if (score < bad_threshold) {
1948
0
    score = 0.3 +
1949
0
            (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
1950
0
  } else {
1951
0
    score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
1952
0
  }
1953
0
  static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
1954
0
  score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
1955
0
                           kTableSize - 2);
1956
0
  int ix = static_cast<int>(score);
1957
0
  ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
1958
0
  double mix = score - ix;
1959
0
  for (int i = 0; i < 3; ++i) {
1960
0
    double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
1961
0
    rgb[i] = pow(v, 0.5);
1962
0
  }
1963
0
}
1964
1965
}  // namespace
1966
1967
Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold,
1968
0
                           double bad_threshold) {
1969
0
  Image3F heatmap(distmap.xsize(), distmap.ysize());
1970
0
  for (size_t y = 0; y < distmap.ysize(); ++y) {
1971
0
    const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y);
1972
0
    float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y);
1973
0
    float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y);
1974
0
    float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y);
1975
0
    for (size_t x = 0; x < distmap.xsize(); ++x) {
1976
0
      const float d = row_distmap[x];
1977
0
      float rgb[3];
1978
0
      ScoreToRgb(d, good_threshold, bad_threshold, rgb);
1979
0
      row_h0[x] = rgb[0];
1980
0
      row_h1[x] = rgb[1];
1981
0
      row_h2[x] = rgb[2];
1982
0
    }
1983
0
  }
1984
0
  return heatmap;
1985
0
}
1986
1987
}  // namespace jxl
1988
#endif  // HWY_ONCE