Coverage Report

Created: 2023-08-28 07:24

/src/libjxl/lib/jpegli/upsample.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jpegli/upsample.h"
7
8
#include <string.h>
9
10
#undef HWY_TARGET_INCLUDE
11
#define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
12
#include <hwy/foreach_target.h>
13
#include <hwy/highway.h>
14
15
HWY_BEFORE_NAMESPACE();
16
namespace jpegli {
17
namespace HWY_NAMESPACE {
18
19
// These templates are not found via ADL.
20
using hwy::HWY_NAMESPACE::Mul;
21
using hwy::HWY_NAMESPACE::MulAdd;
22
using hwy::HWY_NAMESPACE::Vec;
23
24
#if HWY_CAP_GE512
25
using hwy::HWY_NAMESPACE::Half;
26
using hwy::HWY_NAMESPACE::Vec;
27
template <size_t i, class DF, class V>
28
0
HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
29
0
  using HF = Half<DF>;
30
0
  using HHF = Half<HF>;
31
0
  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
32
0
  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
33
0
}
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<0ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<1ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<2ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3::Quarter<3ul, hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<0ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<1ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<2ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_ZEN4::Quarter<3ul, hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<0ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<1ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<2ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>::Half::Half)())) jpegli::N_AVX3_SPR::Quarter<3ul, hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>)
34
35
template <class DF, class V>
36
0
HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
37
0
  using HF = Half<DF>;
38
0
  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
39
0
}
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3::Simd<float, 16ul, 0>)())) jpegli::N_AVX3::Concat4<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec128<float, 4ul> >(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>, hwy::N_AVX3::Vec128<float, 4ul>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>)())) jpegli::N_AVX3_ZEN4::Concat4<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul> >(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>, hwy::N_AVX3_ZEN4::Vec128<float, 4ul>)
Unexecuted instantiation: decltype (Zero((hwy::N_AVX3_SPR::Simd<float, 16ul, 0>)())) jpegli::N_AVX3_SPR::Concat4<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul> >(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>, hwy::N_AVX3_SPR::Vec128<float, 4ul>)
40
41
#endif
42
43
// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
44
// aligned.
45
template <class DF, class V, typename T>
46
162M
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
47
162M
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
48
#if HWY_TARGET == HWY_SCALAR
49
  Store(v0, df, mem);
50
  Store(v1, df, mem + 1);
51
#elif !HWY_CAP_GE256
52
  Store(InterleaveLower(df, v0, v1), df, mem);
53
  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
54
#else
55
50.9M
  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
56
50.9M
    auto t0 = InterleaveLower(df, v0, v1);
57
50.9M
    auto t1 = InterleaveUpper(df, v0, v1);
58
50.9M
    Store(ConcatLowerLower(df, t1, t0), df, mem);
59
50.9M
    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
60
50.9M
  } else {
61
#if HWY_CAP_GE512
62
    auto t0 = InterleaveLower(df, v0, v1);
63
    auto t1 = InterleaveUpper(df, v0, v1);
64
    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
65
                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
66
          df, mem);
67
    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
68
                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
69
          df, mem + Lanes(df));
70
#endif
71
0
  }
72
#endif
73
162M
}
void jpegli::N_SSE4::StoreInterleaved<hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, float>(hwy::N_SSE4::Simd<float, 4ul, 0>, hwy::N_SSE4::Vec128<float, 4ul>, hwy::N_SSE4::Vec128<float, 4ul>, float*)
Line
Count
Source
46
54.3M
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
47
54.3M
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
48
#if HWY_TARGET == HWY_SCALAR
49
  Store(v0, df, mem);
50
  Store(v1, df, mem + 1);
51
#elif !HWY_CAP_GE256
52
54.3M
  Store(InterleaveLower(df, v0, v1), df, mem);
53
54.3M
  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
54
#else
55
  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
56
    auto t0 = InterleaveLower(df, v0, v1);
57
    auto t1 = InterleaveUpper(df, v0, v1);
58
    Store(ConcatLowerLower(df, t1, t0), df, mem);
59
    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
60
  } else {
61
#if HWY_CAP_GE512
62
    auto t0 = InterleaveLower(df, v0, v1);
63
    auto t1 = InterleaveUpper(df, v0, v1);
64
    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
65
                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
66
          df, mem);
67
    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
68
                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
69
          df, mem + Lanes(df));
70
#endif
71
  }
72
#endif
73
54.3M
}
void jpegli::N_AVX2::StoreInterleaved<hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, float>(hwy::N_AVX2::Simd<float, 8ul, 0>, hwy::N_AVX2::Vec256<float>, hwy::N_AVX2::Vec256<float>, float*)
Line
Count
Source
46
50.9M
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
47
50.9M
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
48
#if HWY_TARGET == HWY_SCALAR
49
  Store(v0, df, mem);
50
  Store(v1, df, mem + 1);
51
#elif !HWY_CAP_GE256
52
  Store(InterleaveLower(df, v0, v1), df, mem);
53
  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
54
#else
55
50.9M
  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
56
50.9M
    auto t0 = InterleaveLower(df, v0, v1);
57
50.9M
    auto t1 = InterleaveUpper(df, v0, v1);
58
50.9M
    Store(ConcatLowerLower(df, t1, t0), df, mem);
59
50.9M
    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
60
50.9M
  } else {
61
#if HWY_CAP_GE512
62
    auto t0 = InterleaveLower(df, v0, v1);
63
    auto t1 = InterleaveUpper(df, v0, v1);
64
    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
65
                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
66
          df, mem);
67
    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
68
                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
69
          df, mem + Lanes(df));
70
#endif
71
0
  }
72
50.9M
#endif
73
50.9M
}
Unexecuted instantiation: void jpegli::N_AVX3::StoreInterleaved<hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, float>(hwy::N_AVX3::Simd<float, 16ul, 0>, hwy::N_AVX3::Vec512<float>, hwy::N_AVX3::Vec512<float>, float*)
Unexecuted instantiation: void jpegli::N_AVX3_ZEN4::StoreInterleaved<hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>, float>(hwy::N_AVX3_ZEN4::Simd<float, 16ul, 0>, hwy::N_AVX3_ZEN4::Vec512<float>, hwy::N_AVX3_ZEN4::Vec512<float>, float*)
Unexecuted instantiation: void jpegli::N_AVX3_SPR::StoreInterleaved<hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>, float>(hwy::N_AVX3_SPR::Simd<float, 16ul, 0>, hwy::N_AVX3_SPR::Vec512<float>, hwy::N_AVX3_SPR::Vec512<float>, float*)
void jpegli::N_SSE2::StoreInterleaved<hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, float>(hwy::N_SSE2::Simd<float, 4ul, 0>, hwy::N_SSE2::Vec128<float, 4ul>, hwy::N_SSE2::Vec128<float, 4ul>, float*)
Line
Count
Source
46
56.7M
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
47
56.7M
  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
48
#if HWY_TARGET == HWY_SCALAR
49
  Store(v0, df, mem);
50
  Store(v1, df, mem + 1);
51
#elif !HWY_CAP_GE256
52
56.7M
  Store(InterleaveLower(df, v0, v1), df, mem);
53
56.7M
  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
54
#else
55
  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
56
    auto t0 = InterleaveLower(df, v0, v1);
57
    auto t1 = InterleaveUpper(df, v0, v1);
58
    Store(ConcatLowerLower(df, t1, t0), df, mem);
59
    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
60
  } else {
61
#if HWY_CAP_GE512
62
    auto t0 = InterleaveLower(df, v0, v1);
63
    auto t1 = InterleaveUpper(df, v0, v1);
64
    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
65
                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
66
          df, mem);
67
    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
68
                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
69
          df, mem + Lanes(df));
70
#endif
71
  }
72
#endif
73
56.7M
}
74
75
void Upsample2Horizontal(float* JXL_RESTRICT row,
76
14.4M
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
77
14.4M
  HWY_FULL(float) df;
78
14.4M
  auto threefour = Set(df, 0.75f);
79
14.4M
  auto onefour = Set(df, 0.25f);
80
14.4M
  const size_t len_in = (len_out + 1) >> 1;
81
14.4M
  memcpy(scratch_space, row, len_in * sizeof(row[0]));
82
14.4M
  scratch_space[-1] = scratch_space[0];
83
14.4M
  scratch_space[len_in] = scratch_space[len_in - 1];
84
176M
  for (size_t x = 0; x < len_in; x += Lanes(df)) {
85
162M
    auto current = Mul(Load(df, scratch_space + x), threefour);
86
162M
    auto prev = LoadU(df, scratch_space + x - 1);
87
162M
    auto next = LoadU(df, scratch_space + x + 1);
88
162M
    auto left = MulAdd(onefour, prev, current);
89
162M
    auto right = MulAdd(onefour, next, current);
90
162M
    StoreInterleaved(df, left, right, row + x * 2);
91
162M
  }
92
14.4M
}
jpegli::N_SSE4::Upsample2Horizontal(float*, float*, unsigned long)
Line
Count
Source
76
4.74M
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
77
4.74M
  HWY_FULL(float) df;
78
4.74M
  auto threefour = Set(df, 0.75f);
79
4.74M
  auto onefour = Set(df, 0.25f);
80
4.74M
  const size_t len_in = (len_out + 1) >> 1;
81
4.74M
  memcpy(scratch_space, row, len_in * sizeof(row[0]));
82
4.74M
  scratch_space[-1] = scratch_space[0];
83
4.74M
  scratch_space[len_in] = scratch_space[len_in - 1];
84
59.1M
  for (size_t x = 0; x < len_in; x += Lanes(df)) {
85
54.3M
    auto current = Mul(Load(df, scratch_space + x), threefour);
86
54.3M
    auto prev = LoadU(df, scratch_space + x - 1);
87
54.3M
    auto next = LoadU(df, scratch_space + x + 1);
88
54.3M
    auto left = MulAdd(onefour, prev, current);
89
54.3M
    auto right = MulAdd(onefour, next, current);
90
54.3M
    StoreInterleaved(df, left, right, row + x * 2);
91
54.3M
  }
92
4.74M
}
jpegli::N_AVX2::Upsample2Horizontal(float*, float*, unsigned long)
Line
Count
Source
76
3.28M
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
77
3.28M
  HWY_FULL(float) df;
78
3.28M
  auto threefour = Set(df, 0.75f);
79
3.28M
  auto onefour = Set(df, 0.25f);
80
3.28M
  const size_t len_in = (len_out + 1) >> 1;
81
3.28M
  memcpy(scratch_space, row, len_in * sizeof(row[0]));
82
3.28M
  scratch_space[-1] = scratch_space[0];
83
3.28M
  scratch_space[len_in] = scratch_space[len_in - 1];
84
54.2M
  for (size_t x = 0; x < len_in; x += Lanes(df)) {
85
50.9M
    auto current = Mul(Load(df, scratch_space + x), threefour);
86
50.9M
    auto prev = LoadU(df, scratch_space + x - 1);
87
50.9M
    auto next = LoadU(df, scratch_space + x + 1);
88
50.9M
    auto left = MulAdd(onefour, prev, current);
89
50.9M
    auto right = MulAdd(onefour, next, current);
90
50.9M
    StoreInterleaved(df, left, right, row + x * 2);
91
50.9M
  }
92
3.28M
}
Unexecuted instantiation: jpegli::N_AVX3::Upsample2Horizontal(float*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX3_ZEN4::Upsample2Horizontal(float*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX3_SPR::Upsample2Horizontal(float*, float*, unsigned long)
jpegli::N_SSE2::Upsample2Horizontal(float*, float*, unsigned long)
Line
Count
Source
76
6.39M
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
77
6.39M
  HWY_FULL(float) df;
78
6.39M
  auto threefour = Set(df, 0.75f);
79
6.39M
  auto onefour = Set(df, 0.25f);
80
6.39M
  const size_t len_in = (len_out + 1) >> 1;
81
6.39M
  memcpy(scratch_space, row, len_in * sizeof(row[0]));
82
6.39M
  scratch_space[-1] = scratch_space[0];
83
6.39M
  scratch_space[len_in] = scratch_space[len_in - 1];
84
63.1M
  for (size_t x = 0; x < len_in; x += Lanes(df)) {
85
56.7M
    auto current = Mul(Load(df, scratch_space + x), threefour);
86
56.7M
    auto prev = LoadU(df, scratch_space + x - 1);
87
56.7M
    auto next = LoadU(df, scratch_space + x + 1);
88
56.7M
    auto left = MulAdd(onefour, prev, current);
89
56.7M
    auto right = MulAdd(onefour, next, current);
90
56.7M
    StoreInterleaved(df, left, right, row + x * 2);
91
56.7M
  }
92
6.39M
}
93
94
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
95
                       const float* JXL_RESTRICT row_mid,
96
                       const float* JXL_RESTRICT row_bot,
97
                       float* JXL_RESTRICT row_out0,
98
6.10M
                       float* JXL_RESTRICT row_out1, size_t len) {
99
6.10M
  HWY_FULL(float) df;
100
6.10M
  auto threefour = Set(df, 0.75f);
101
6.10M
  auto onefour = Set(df, 0.25f);
102
106M
  for (size_t x = 0; x < len; x += Lanes(df)) {
103
100M
    auto it = Load(df, row_top + x);
104
100M
    auto im = Load(df, row_mid + x);
105
100M
    auto ib = Load(df, row_bot + x);
106
100M
    auto im_scaled = Mul(im, threefour);
107
100M
    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
108
100M
    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
109
100M
  }
110
6.10M
}
jpegli::N_SSE4::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
Line
Count
Source
98
1.84M
                       float* JXL_RESTRICT row_out1, size_t len) {
99
1.84M
  HWY_FULL(float) df;
100
1.84M
  auto threefour = Set(df, 0.75f);
101
1.84M
  auto onefour = Set(df, 0.25f);
102
36.4M
  for (size_t x = 0; x < len; x += Lanes(df)) {
103
34.6M
    auto it = Load(df, row_top + x);
104
34.6M
    auto im = Load(df, row_mid + x);
105
34.6M
    auto ib = Load(df, row_bot + x);
106
34.6M
    auto im_scaled = Mul(im, threefour);
107
34.6M
    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
108
34.6M
    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
109
34.6M
  }
110
1.84M
}
jpegli::N_AVX2::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
Line
Count
Source
98
1.98M
                       float* JXL_RESTRICT row_out1, size_t len) {
99
1.98M
  HWY_FULL(float) df;
100
1.98M
  auto threefour = Set(df, 0.75f);
101
1.98M
  auto onefour = Set(df, 0.25f);
102
38.4M
  for (size_t x = 0; x < len; x += Lanes(df)) {
103
36.4M
    auto it = Load(df, row_top + x);
104
36.4M
    auto im = Load(df, row_mid + x);
105
36.4M
    auto ib = Load(df, row_bot + x);
106
36.4M
    auto im_scaled = Mul(im, threefour);
107
36.4M
    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
108
36.4M
    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
109
36.4M
  }
110
1.98M
}
Unexecuted instantiation: jpegli::N_AVX3::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX3_ZEN4::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
Unexecuted instantiation: jpegli::N_AVX3_SPR::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
jpegli::N_SSE2::Upsample2Vertical(float const*, float const*, float const*, float*, float*, unsigned long)
Line
Count
Source
98
2.27M
                       float* JXL_RESTRICT row_out1, size_t len) {
99
2.27M
  HWY_FULL(float) df;
100
2.27M
  auto threefour = Set(df, 0.75f);
101
2.27M
  auto onefour = Set(df, 0.25f);
102
31.9M
  for (size_t x = 0; x < len; x += Lanes(df)) {
103
29.6M
    auto it = Load(df, row_top + x);
104
29.6M
    auto im = Load(df, row_mid + x);
105
29.6M
    auto ib = Load(df, row_bot + x);
106
29.6M
    auto im_scaled = Mul(im, threefour);
107
29.6M
    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
108
29.6M
    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
109
29.6M
  }
110
2.27M
}
111
112
// NOLINTNEXTLINE(google-readability-namespace-comments)
113
}  // namespace HWY_NAMESPACE
114
}  // namespace jpegli
115
HWY_AFTER_NAMESPACE();
116
117
#if HWY_ONCE
118
namespace jpegli {
119
120
HWY_EXPORT(Upsample2Horizontal);
121
HWY_EXPORT(Upsample2Vertical);
122
123
void Upsample2Horizontal(float* JXL_RESTRICT row,
124
14.4M
                         float* JXL_RESTRICT scratch_space, size_t len_out) {
125
14.4M
  return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
126
14.4M
}
127
128
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
129
                       const float* JXL_RESTRICT row_mid,
130
                       const float* JXL_RESTRICT row_bot,
131
                       float* JXL_RESTRICT row_out0,
132
6.10M
                       float* JXL_RESTRICT row_out1, size_t len) {
133
6.10M
  return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
134
6.10M
                                                 row_out0, row_out1, len);
135
6.10M
}
136
}  // namespace jpegli
137
#endif  // HWY_ONCE