Coverage Report

Created: 2024-05-21 06:41

/src/libjxl/lib/jpegli/transpose-inl.h
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
7
#ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
8
#undef LIB_JPEGLI_TRANSPOSE_INL_H_
9
#else
10
#define LIB_JPEGLI_TRANSPOSE_INL_H_
11
#endif
12
13
#include "lib/jxl/base/compiler_specific.h"
14
15
HWY_BEFORE_NAMESPACE();
16
namespace jpegli {
17
namespace HWY_NAMESPACE {
18
namespace {
19
20
#if HWY_CAP_GE256
21
// Transposes an 8x8 block of floats stored row-major (row stride 8):
// row k of `to` receives column k of `from`. This variant is compiled when
// HWY_CAP_GE256 is set and keeps each row in one vector of up to 8 lanes.
//
// from: 64 input floats, read as eight rows of 8.
// to:   64 output floats, written as eight rows of 8.
// NOTE(review): Load/Store are presumably Highway's aligned accessors, and
// JXL_RESTRICT presumably forbids `from`/`to` overlap - confirm with callers.
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
22
35.3M
                                  float* JXL_RESTRICT to) {
23
35.3M
  const HWY_CAPPED(float, 8) d;
24
35.3M
  // Load the eight input rows, one vector per row.
  auto i0 = Load(d, from);
25
35.3M
  auto i1 = Load(d, from + 1 * 8);
26
35.3M
  auto i2 = Load(d, from + 2 * 8);
27
35.3M
  auto i3 = Load(d, from + 3 * 8);
28
35.3M
  auto i4 = Load(d, from + 4 * 8);
29
35.3M
  auto i5 = Load(d, from + 5 * 8);
30
35.3M
  auto i6 = Load(d, from + 6 * 8);
31
35.3M
  auto i7 = Load(d, from + 7 * 8);
32
33
35.3M
  // Stage 1: interleave rows two apart (0/2, 1/3, 4/6, 5/7).
  // Assumes Highway's Interleave* work independently per 128-bit block.
  const auto q0 = InterleaveLower(d, i0, i2);
34
35.3M
  const auto q1 = InterleaveLower(d, i1, i3);
35
35.3M
  const auto q2 = InterleaveUpper(d, i0, i2);
36
35.3M
  const auto q3 = InterleaveUpper(d, i1, i3);
37
35.3M
  const auto q4 = InterleaveLower(d, i4, i6);
38
35.3M
  const auto q5 = InterleaveLower(d, i5, i7);
39
35.3M
  const auto q6 = InterleaveUpper(d, i4, i6);
40
35.3M
  const auto q7 = InterleaveUpper(d, i5, i7);
41
42
35.3M
  // Stage 2: interleave the stage-1 pairs. Under the same assumption, each
  // 128-bit half of r0..r3 (r4..r7) now holds four consecutive elements of
  // one column taken from rows 0-3 (rows 4-7).
  const auto r0 = InterleaveLower(d, q0, q1);
43
35.3M
  const auto r1 = InterleaveUpper(d, q0, q1);
44
35.3M
  const auto r2 = InterleaveLower(d, q2, q3);
45
35.3M
  const auto r3 = InterleaveUpper(d, q2, q3);
46
35.3M
  const auto r4 = InterleaveLower(d, q4, q5);
47
35.3M
  const auto r5 = InterleaveUpper(d, q4, q5);
48
35.3M
  const auto r6 = InterleaveLower(d, q6, q7);
49
35.3M
  const auto r7 = InterleaveUpper(d, q6, q7);
50
51
35.3M
  // Stage 3: join matching halves of the rows-0-3 and rows-4-7 results so
  // each output vector holds one full column (lower halves give columns 0-3,
  // upper halves give columns 4-7). The i* variables are reused for output.
  i0 = ConcatLowerLower(d, r4, r0);
52
35.3M
  i1 = ConcatLowerLower(d, r5, r1);
53
35.3M
  i2 = ConcatLowerLower(d, r6, r2);
54
35.3M
  i3 = ConcatLowerLower(d, r7, r3);
55
35.3M
  i4 = ConcatUpperUpper(d, r4, r0);
56
35.3M
  i5 = ConcatUpperUpper(d, r5, r1);
57
35.3M
  i6 = ConcatUpperUpper(d, r6, r2);
58
35.3M
  i7 = ConcatUpperUpper(d, r7, r3);
59
60
35.3M
  // Write the eight transposed rows.
  Store(i0, d, to);
61
35.3M
  Store(i1, d, to + 1 * 8);
62
35.3M
  Store(i2, d, to + 2 * 8);
63
35.3M
  Store(i3, d, to + 3 * 8);
64
35.3M
  Store(i4, d, to + 4 * 8);
65
35.3M
  Store(i5, d, to + 5 * 8);
66
35.3M
  Store(i6, d, to + 6 * 8);
67
35.3M
  Store(i7, d, to + 7 * 8);
68
35.3M
}
69
#elif HWY_TARGET != HWY_SCALAR
70
// Transposes an 8x8 block of floats stored row-major (row stride 8), using
// vectors of up to 4 lanes. This variant is compiled when HWY_CAP_GE256 is
// not set and HWY_TARGET is not HWY_SCALAR.
//
// The block is handled as four 4x4 tiles: the tile at rows n..n+3, columns
// m..m+3 of `from` is transposed and written to rows m..m+3, columns n..n+3
// of `to`.
// NOTE(review): Load/Store are presumably Highway's aligned accessors, and
// JXL_RESTRICT presumably forbids `from`/`to` overlap - confirm with callers.
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
71
30.5M
                                  float* JXL_RESTRICT to) {
72
30.5M
  const HWY_CAPPED(float, 4) d;
73
91.6M
  // n: first source row of the tile; m: first source column of the tile.
  for (size_t n = 0; n < 8; n += 4) {
74
183M
    for (size_t m = 0; m < 8; m += 4) {
75
122M
      // Load the four rows of the current 4x4 tile.
      auto p0 = Load(d, from + n * 8 + m);
76
122M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
77
122M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
78
122M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
79
122M
      // Stage 1: interleave rows two apart (0/2 and 1/3).
      const auto q0 = InterleaveLower(d, p0, p2);
80
122M
      const auto q1 = InterleaveLower(d, p1, p3);
81
122M
      const auto q2 = InterleaveUpper(d, p0, p2);
82
122M
      const auto q3 = InterleaveUpper(d, p1, p3);
83
84
122M
      // Stage 2: interleave the stage-1 results; r0..r3 are then the tile's
      // four columns (assuming Highway's Interleave* lane order).
      const auto r0 = InterleaveLower(d, q0, q1);
85
122M
      const auto r1 = InterleaveUpper(d, q0, q1);
86
122M
      const auto r2 = InterleaveLower(d, q2, q3);
87
122M
      const auto r3 = InterleaveUpper(d, q2, q3);
88
122M
      // Store to the mirrored tile position: row and column offsets swap.
      Store(r0, d, to + m * 8 + n);
89
122M
      Store(r1, d, to + (1 + m) * 8 + n);
90
122M
      Store(r2, d, to + (2 + m) * 8 + n);
91
122M
      Store(r3, d, to + (3 + m) * 8 + n);
92
122M
    }
93
61.1M
  }
94
30.5M
}
idct.cc:jpegli::N_SSE4::(anonymous namespace)::Transpose8x8Block(float const*, float*)
Line
Count
Source
71
16.3M
                                  float* JXL_RESTRICT to) {
72
16.3M
  const HWY_CAPPED(float, 4) d;
73
48.9M
  for (size_t n = 0; n < 8; n += 4) {
74
97.8M
    for (size_t m = 0; m < 8; m += 4) {
75
65.2M
      auto p0 = Load(d, from + n * 8 + m);
76
65.2M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
77
65.2M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
78
65.2M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
79
65.2M
      const auto q0 = InterleaveLower(d, p0, p2);
80
65.2M
      const auto q1 = InterleaveLower(d, p1, p3);
81
65.2M
      const auto q2 = InterleaveUpper(d, p0, p2);
82
65.2M
      const auto q3 = InterleaveUpper(d, p1, p3);
83
84
65.2M
      const auto r0 = InterleaveLower(d, q0, q1);
85
65.2M
      const auto r1 = InterleaveUpper(d, q0, q1);
86
65.2M
      const auto r2 = InterleaveLower(d, q2, q3);
87
65.2M
      const auto r3 = InterleaveUpper(d, q2, q3);
88
65.2M
      Store(r0, d, to + m * 8 + n);
89
65.2M
      Store(r1, d, to + (1 + m) * 8 + n);
90
65.2M
      Store(r2, d, to + (2 + m) * 8 + n);
91
65.2M
      Store(r3, d, to + (3 + m) * 8 + n);
92
65.2M
    }
93
32.6M
  }
94
16.3M
}
idct.cc:jpegli::N_SSE2::(anonymous namespace)::Transpose8x8Block(float const*, float*)
Line
Count
Source
71
14.2M
                                  float* JXL_RESTRICT to) {
72
14.2M
  const HWY_CAPPED(float, 4) d;
73
42.7M
  for (size_t n = 0; n < 8; n += 4) {
74
85.4M
    for (size_t m = 0; m < 8; m += 4) {
75
56.9M
      auto p0 = Load(d, from + n * 8 + m);
76
56.9M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
77
56.9M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
78
56.9M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
79
56.9M
      const auto q0 = InterleaveLower(d, p0, p2);
80
56.9M
      const auto q1 = InterleaveLower(d, p1, p3);
81
56.9M
      const auto q2 = InterleaveUpper(d, p0, p2);
82
56.9M
      const auto q3 = InterleaveUpper(d, p1, p3);
83
84
56.9M
      const auto r0 = InterleaveLower(d, q0, q1);
85
56.9M
      const auto r1 = InterleaveUpper(d, q0, q1);
86
56.9M
      const auto r2 = InterleaveLower(d, q2, q3);
87
56.9M
      const auto r3 = InterleaveUpper(d, q2, q3);
88
56.9M
      Store(r0, d, to + m * 8 + n);
89
56.9M
      Store(r1, d, to + (1 + m) * 8 + n);
90
56.9M
      Store(r2, d, to + (2 + m) * 8 + n);
91
56.9M
      Store(r3, d, to + (3 + m) * 8 + n);
92
56.9M
    }
93
28.4M
  }
94
14.2M
}
95
#else
96
// Scalar fallback for transposing an 8x8 block of floats stored row-major
// (row stride 8). This variant is compiled when neither SIMD branch applies,
// i.e. HWY_CAP_GE256 is not set and HWY_TARGET equals HWY_SCALAR.
//
// from: 64 input floats; element (row m, column n) is from[8 * m + n].
// to:   64 output floats; element (row n, column m) is to[8 * n + m].
static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
97
                                         float* JXL_RESTRICT to) {
98
  // n indexes output rows, m indexes output columns.
  for (size_t n = 0; n < 8; ++n) {
99
    for (size_t m = 0; m < 8; ++m) {
100
      // Output (n, m) takes input (m, n): the indices swap roles.
      to[8 * n + m] = from[8 * m + n];
101
    }
102
  }
103
}
104
#endif
105
106
// NOLINTNEXTLINE(google-readability-namespace-comments)
107
}  // namespace
108
}  // namespace HWY_NAMESPACE
109
}  // namespace jpegli
110
HWY_AFTER_NAMESPACE();
111
#endif  // LIB_JPEGLI_TRANSPOSE_INL_H_