Coverage Report

Created: 2025-07-23 07:47

/src/libjxl/lib/jpegli/transpose-inl.h
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include <cstddef>
7
8
#if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
9
#ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
10
#undef LIB_JPEGLI_TRANSPOSE_INL_H_
11
#else
12
#define LIB_JPEGLI_TRANSPOSE_INL_H_
13
#endif
14
15
#include <hwy/highway.h>
16
17
#include "lib/jxl/base/compiler_specific.h"
18
19
HWY_BEFORE_NAMESPACE();
20
namespace jpegli {
21
namespace HWY_NAMESPACE {
22
namespace {
23
24
#if HWY_CAP_GE256
25
// Transposes an 8x8 block of floats using 8-lane vectors. This variant is the
// one compiled when HWY_CAP_GE256 is set.
//
// from: 64 input floats, read as eight rows of 8 (row k starts at from + k * 8).
// to:   64 output floats, written as eight rows of 8 (row k at to + k * 8).
//
// NOTE(review): Load/Store are Highway's aligned accessors, so both pointers
// presumably need to be vector-aligned; confirm against callers.
// NOTE(review): both pointers are JXL_RESTRICT-qualified, so the buffers are
// presumably required not to overlap (no in-place transpose); confirm.
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
26
31.2M
                                  float* JXL_RESTRICT to) {
27
31.2M
  // Descriptor for float vectors capped at 8 lanes.
  const HWY_CAPPED(float, 8) d;
28
31.2M
  // Load the eight input rows, one vector per row.
  auto i0 = Load(d, from);
29
31.2M
  auto i1 = Load(d, from + 1 * 8);
30
31.2M
  auto i2 = Load(d, from + 2 * 8);
31
31.2M
  auto i3 = Load(d, from + 3 * 8);
32
31.2M
  auto i4 = Load(d, from + 4 * 8);
33
31.2M
  auto i5 = Load(d, from + 5 * 8);
34
31.2M
  auto i6 = Load(d, from + 6 * 8);
35
31.2M
  auto i7 = Load(d, from + 7 * 8);
36
37
31.2M
  // Stage 1: interleave rows that are two apart (0/2, 1/3, 4/6, 5/7).
  // Highway's Interleave ops work independently within each 128-bit block.
  const auto q0 = InterleaveLower(d, i0, i2);
38
31.2M
  const auto q1 = InterleaveLower(d, i1, i3);
39
31.2M
  const auto q2 = InterleaveUpper(d, i0, i2);
40
31.2M
  const auto q3 = InterleaveUpper(d, i1, i3);
41
31.2M
  const auto q4 = InterleaveLower(d, i4, i6);
42
31.2M
  const auto q5 = InterleaveLower(d, i5, i7);
43
31.2M
  const auto q6 = InterleaveUpper(d, i4, i6);
44
31.2M
  const auto q7 = InterleaveUpper(d, i5, i7);
45
46
31.2M
  // Stage 2: interleave the stage-1 results pairwise. Given the per-block
  // Interleave semantics, r0..r3 each hold column c (lower half) and column
  // c + 4 (upper half) of input rows 0-3, for c = 0..3; r4..r7 hold the same
  // columns of input rows 4-7.
  const auto r0 = InterleaveLower(d, q0, q1);
47
31.2M
  const auto r1 = InterleaveUpper(d, q0, q1);
48
31.2M
  const auto r2 = InterleaveLower(d, q2, q3);
49
31.2M
  const auto r3 = InterleaveUpper(d, q2, q3);
50
31.2M
  const auto r4 = InterleaveLower(d, q4, q5);
51
31.2M
  const auto r5 = InterleaveUpper(d, q4, q5);
52
31.2M
  const auto r6 = InterleaveLower(d, q6, q7);
53
31.2M
  const auto r7 = InterleaveUpper(d, q6, q7);
54
55
31.2M
  // Stage 3: stitch the halves together. The Concat ops take (hi, lo), so the
  // rows-0-3 half lands in the low lanes and the rows-4-7 half in the high
  // lanes. Lower halves give output rows 0-3, upper halves output rows 4-7.
  // The i* variables are reused to hold the output rows.
  i0 = ConcatLowerLower(d, r4, r0);
56
31.2M
  i1 = ConcatLowerLower(d, r5, r1);
57
31.2M
  i2 = ConcatLowerLower(d, r6, r2);
58
31.2M
  i3 = ConcatLowerLower(d, r7, r3);
59
31.2M
  i4 = ConcatUpperUpper(d, r4, r0);
60
31.2M
  i5 = ConcatUpperUpper(d, r5, r1);
61
31.2M
  i6 = ConcatUpperUpper(d, r6, r2);
62
31.2M
  i7 = ConcatUpperUpper(d, r7, r3);
63
64
31.2M
  // Write the eight output rows.
  Store(i0, d, to);
65
31.2M
  Store(i1, d, to + 1 * 8);
66
31.2M
  Store(i2, d, to + 2 * 8);
67
31.2M
  Store(i3, d, to + 3 * 8);
68
31.2M
  Store(i4, d, to + 4 * 8);
69
31.2M
  Store(i5, d, to + 5 * 8);
70
31.2M
  Store(i6, d, to + 6 * 8);
71
31.2M
  Store(i7, d, to + 7 * 8);
72
31.2M
}
73
#elif HWY_TARGET != HWY_SCALAR
74
// Transposes an 8x8 block of floats using 4-lane vectors. This variant is the
// one compiled when HWY_CAP_GE256 is not set and HWY_TARGET != HWY_SCALAR.
// The block is handled as four 4x4 tiles: the tile at input rows n..n+3,
// columns m..m+3 is transposed and written to output rows m..m+3,
// columns n..n+3.
//
// from: 64 input floats, read as eight rows of 8.
// to:   64 output floats, written as eight rows of 8.
//
// NOTE(review): Load/Store are Highway's aligned accessors, so both pointers
// presumably need to be vector-aligned; confirm against callers.
// NOTE(review): both pointers are JXL_RESTRICT-qualified, so the buffers are
// presumably required not to overlap; confirm.
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
75
32.4M
                                  float* JXL_RESTRICT to) {
76
32.4M
  // Descriptor for float vectors capped at 4 lanes.
  const HWY_CAPPED(float, 4) d;
77
97.2M
  // n selects the first input row of the tile (0 or 4).
  for (size_t n = 0; n < 8; n += 4) {
78
194M
    // m selects the first input column of the tile (0 or 4).
    for (size_t m = 0; m < 8; m += 4) {
79
129M
      // Load the four 4-float row segments of the tile.
      auto p0 = Load(d, from + n * 8 + m);
80
129M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
81
129M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
82
129M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
83
129M
      // Stage 1: interleave rows that are two apart (0/2 and 1/3).
      const auto q0 = InterleaveLower(d, p0, p2);
84
129M
      const auto q1 = InterleaveLower(d, p1, p3);
85
129M
      const auto q2 = InterleaveUpper(d, p0, p2);
86
129M
      const auto q3 = InterleaveUpper(d, p1, p3);
87
88
129M
      // Stage 2: interleave the stage-1 results; r0..r3 are then columns
      // 0..3 of the tile, i.e. the rows of its transpose.
      const auto r0 = InterleaveLower(d, q0, q1);
89
129M
      const auto r1 = InterleaveUpper(d, q0, q1);
90
129M
      const auto r2 = InterleaveLower(d, q2, q3);
91
129M
      const auto r3 = InterleaveUpper(d, q2, q3);
92
129M
      // Store at the mirrored position: row and column offsets are swapped.
      Store(r0, d, to + m * 8 + n);
93
129M
      Store(r1, d, to + (1 + m) * 8 + n);
94
129M
      Store(r2, d, to + (2 + m) * 8 + n);
95
129M
      Store(r3, d, to + (3 + m) * 8 + n);
96
129M
    }
97
64.8M
  }
98
32.4M
}
idct.cc:jpegli::N_SSE4::(anonymous namespace)::Transpose8x8Block(float const*, float*)
Line
Count
Source
75
15.9M
                                  float* JXL_RESTRICT to) {
76
15.9M
  const HWY_CAPPED(float, 4) d;
77
47.7M
  for (size_t n = 0; n < 8; n += 4) {
78
95.4M
    for (size_t m = 0; m < 8; m += 4) {
79
63.6M
      auto p0 = Load(d, from + n * 8 + m);
80
63.6M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
81
63.6M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
82
63.6M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
83
63.6M
      const auto q0 = InterleaveLower(d, p0, p2);
84
63.6M
      const auto q1 = InterleaveLower(d, p1, p3);
85
63.6M
      const auto q2 = InterleaveUpper(d, p0, p2);
86
63.6M
      const auto q3 = InterleaveUpper(d, p1, p3);
87
88
63.6M
      const auto r0 = InterleaveLower(d, q0, q1);
89
63.6M
      const auto r1 = InterleaveUpper(d, q0, q1);
90
63.6M
      const auto r2 = InterleaveLower(d, q2, q3);
91
63.6M
      const auto r3 = InterleaveUpper(d, q2, q3);
92
63.6M
      Store(r0, d, to + m * 8 + n);
93
63.6M
      Store(r1, d, to + (1 + m) * 8 + n);
94
63.6M
      Store(r2, d, to + (2 + m) * 8 + n);
95
63.6M
      Store(r3, d, to + (3 + m) * 8 + n);
96
63.6M
    }
97
31.8M
  }
98
15.9M
}
idct.cc:jpegli::N_SSE2::(anonymous namespace)::Transpose8x8Block(float const*, float*)
Line
Count
Source
75
16.5M
                                  float* JXL_RESTRICT to) {
76
16.5M
  const HWY_CAPPED(float, 4) d;
77
49.5M
  for (size_t n = 0; n < 8; n += 4) {
78
99.1M
    for (size_t m = 0; m < 8; m += 4) {
79
66.0M
      auto p0 = Load(d, from + n * 8 + m);
80
66.0M
      auto p1 = Load(d, from + (n + 1) * 8 + m);
81
66.0M
      auto p2 = Load(d, from + (n + 2) * 8 + m);
82
66.0M
      auto p3 = Load(d, from + (n + 3) * 8 + m);
83
66.0M
      const auto q0 = InterleaveLower(d, p0, p2);
84
66.0M
      const auto q1 = InterleaveLower(d, p1, p3);
85
66.0M
      const auto q2 = InterleaveUpper(d, p0, p2);
86
66.0M
      const auto q3 = InterleaveUpper(d, p1, p3);
87
88
66.0M
      const auto r0 = InterleaveLower(d, q0, q1);
89
66.0M
      const auto r1 = InterleaveUpper(d, q0, q1);
90
66.0M
      const auto r2 = InterleaveLower(d, q2, q3);
91
66.0M
      const auto r3 = InterleaveUpper(d, q2, q3);
92
66.0M
      Store(r0, d, to + m * 8 + n);
93
66.0M
      Store(r1, d, to + (1 + m) * 8 + n);
94
66.0M
      Store(r2, d, to + (2 + m) * 8 + n);
95
66.0M
      Store(r3, d, to + (3 + m) * 8 + n);
96
66.0M
    }
97
33.0M
  }
98
16.5M
}
99
#else
100
// Scalar fallback: transposes an 8x8 block of floats one element at a time.
// This is the #else branch, compiled when neither vector variant is selected.
//
// from: 64 input floats, indexed as from[8 * row + col].
// to:   64 output floats; to[8 * n + m] receives from[8 * m + n].
//
// NOTE(review): both pointers are JXL_RESTRICT-qualified, so the buffers are
// presumably required not to overlap; confirm.
static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
101
                                         float* JXL_RESTRICT to) {
102
  // n is the output row index, m the output column index.
  for (size_t n = 0; n < 8; ++n) {
103
    for (size_t m = 0; m < 8; ++m) {
104
      // Swap the roles of row and column when reading the input.
      to[8 * n + m] = from[8 * m + n];
105
    }
106
  }
107
}
108
#endif
109
110
// NOLINTNEXTLINE(google-readability-namespace-comments)
111
}  // namespace
112
}  // namespace HWY_NAMESPACE
113
}  // namespace jpegli
114
HWY_AFTER_NAMESPACE();
115
#endif  // LIB_JPEGLI_TRANSPOSE_INL_H_