Coverage Report

Created: 2025-07-23 07:47

/src/libjxl/lib/jxl/modular/transform/rct.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/modular/transform/rct.h"
7
8
#include <cstddef>
9
#include <cstdint>
10
#include <utility>
11
12
#include "lib/jxl/base/data_parallel.h"
13
#include "lib/jxl/base/status.h"
14
#include "lib/jxl/modular/modular_image.h"
15
#include "lib/jxl/modular/transform/transform.h"
16
#undef HWY_TARGET_INCLUDE
17
#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc"
18
#include <hwy/foreach_target.h>
19
#include <hwy/highway.h>
20
HWY_BEFORE_NAMESPACE();
21
namespace jxl {
22
namespace HWY_NAMESPACE {
23
24
// These templates are not found via ADL.
25
using hwy::HWY_NAMESPACE::Add;
26
using hwy::HWY_NAMESPACE::ShiftRight;
27
using hwy::HWY_NAMESPACE::Sub;
28
29
template <int transform_type>
30
void InvRCTRow(const pixel_type* in0, const pixel_type* in1,
31
               const pixel_type* in2, pixel_type* out0, pixel_type* out1,
32
1.83M
               pixel_type* out2, size_t w) {
33
1.83M
  static_assert(transform_type >= 0 && transform_type < 7,
34
1.83M
                "Invalid transform type");
35
1.83M
  int second = transform_type >> 1;
36
1.83M
  int third = transform_type & 1;
37
38
1.83M
  size_t x = 0;
39
1.83M
  const HWY_FULL(pixel_type) d;
40
1.83M
  const size_t N = Lanes(d);
41
47.7M
  for (; x + N - 1 < w; x += N) {
42
45.9M
    if (transform_type == 6) {
43
18.6M
      auto Y = Load(d, in0 + x);
44
18.6M
      auto Co = Load(d, in1 + x);
45
18.6M
      auto Cg = Load(d, in2 + x);
46
18.6M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
18.6M
      auto G = Add(Cg, Y);
48
18.6M
      Y = Sub(Y, ShiftRight<1>(Co));
49
18.6M
      auto R = Add(Y, Co);
50
18.6M
      Store(R, d, out0 + x);
51
18.6M
      Store(G, d, out1 + x);
52
18.6M
      Store(Y, d, out2 + x);
53
27.3M
    } else {
54
27.3M
      auto First = Load(d, in0 + x);
55
27.3M
      auto Second = Load(d, in1 + x);
56
27.3M
      auto Third = Load(d, in2 + x);
57
27.3M
      if (third) Third = Add(Third, First);
58
27.3M
      if (second == 1) {
59
2.63M
        Second = Add(Second, First);
60
24.6M
      } else if (second == 2) {
61
22.1M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
22.1M
      }
63
27.3M
      Store(First, d, out0 + x);
64
27.3M
      Store(Second, d, out1 + x);
65
27.3M
      Store(Third, d, out2 + x);
66
27.3M
    }
67
45.9M
  }
68
4.68M
  for (; x < w; x++) {
69
2.85M
    if (transform_type == 6) {
70
1.46M
      pixel_type Y = in0[x];
71
1.46M
      pixel_type Co = in1[x];
72
1.46M
      pixel_type Cg = in2[x];
73
1.46M
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
1.46M
      pixel_type G = PixelAdd(Cg, tmp);
75
1.46M
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
1.46M
      pixel_type R = PixelAdd(B, Co);
77
1.46M
      out0[x] = R;
78
1.46M
      out1[x] = G;
79
1.46M
      out2[x] = B;
80
1.46M
    } else {
81
1.38M
      pixel_type First = in0[x];
82
1.38M
      pixel_type Second = in1[x];
83
1.38M
      pixel_type Third = in2[x];
84
1.38M
      if (third) Third = PixelAdd(Third, First);
85
1.38M
      if (second == 1) {
86
480k
        Second = PixelAdd(Second, First);
87
905k
      } else if (second == 2) {
88
188k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
188k
      }
90
1.38M
      out0[x] = First;
91
1.38M
      out1[x] = Second;
92
1.38M
      out2[x] = Third;
93
1.38M
    }
94
2.85M
  }
95
1.83M
}
Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.51k
               pixel_type* out2, size_t w) {
33
3.51k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.51k
                "Invalid transform type");
35
3.51k
  int second = transform_type >> 1;
36
3.51k
  int third = transform_type & 1;
37
38
3.51k
  size_t x = 0;
39
3.51k
  const HWY_FULL(pixel_type) d;
40
3.51k
  const size_t N = Lanes(d);
41
228k
  for (; x + N - 1 < w; x += N) {
42
225k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
225k
    } else {
54
225k
      auto First = Load(d, in0 + x);
55
225k
      auto Second = Load(d, in1 + x);
56
225k
      auto Third = Load(d, in2 + x);
57
225k
      if (third) Third = Add(Third, First);
58
225k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
225k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
225k
      Store(First, d, out0 + x);
64
225k
      Store(Second, d, out1 + x);
65
225k
      Store(Third, d, out2 + x);
66
225k
    }
67
225k
  }
68
8.12k
  for (; x < w; x++) {
69
4.60k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
4.60k
    } else {
81
4.60k
      pixel_type First = in0[x];
82
4.60k
      pixel_type Second = in1[x];
83
4.60k
      pixel_type Third = in2[x];
84
4.60k
      if (third) Third = PixelAdd(Third, First);
85
4.60k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
4.60k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
4.60k
      out0[x] = First;
91
4.60k
      out1[x] = Second;
92
4.60k
      out2[x] = Third;
93
4.60k
    }
94
4.60k
  }
95
3.51k
}
void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.88k
               pixel_type* out2, size_t w) {
33
3.88k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.88k
                "Invalid transform type");
35
3.88k
  int second = transform_type >> 1;
36
3.88k
  int third = transform_type & 1;
37
38
3.88k
  size_t x = 0;
39
3.88k
  const HWY_FULL(pixel_type) d;
40
3.88k
  const size_t N = Lanes(d);
41
85.9k
  for (; x + N - 1 < w; x += N) {
42
82.0k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
82.0k
    } else {
54
82.0k
      auto First = Load(d, in0 + x);
55
82.0k
      auto Second = Load(d, in1 + x);
56
82.0k
      auto Third = Load(d, in2 + x);
57
82.0k
      if (third) Third = Add(Third, First);
58
82.0k
      if (second == 1) {
59
82.0k
        Second = Add(Second, First);
60
82.0k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
82.0k
      Store(First, d, out0 + x);
64
82.0k
      Store(Second, d, out1 + x);
65
82.0k
      Store(Third, d, out2 + x);
66
82.0k
    }
67
82.0k
  }
68
6.00k
  for (; x < w; x++) {
69
2.11k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
2.11k
    } else {
81
2.11k
      pixel_type First = in0[x];
82
2.11k
      pixel_type Second = in1[x];
83
2.11k
      pixel_type Third = in2[x];
84
2.11k
      if (third) Third = PixelAdd(Third, First);
85
2.11k
      if (second == 1) {
86
2.11k
        Second = PixelAdd(Second, First);
87
2.11k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
2.11k
      out0[x] = First;
91
2.11k
      out1[x] = Second;
92
2.11k
      out2[x] = Third;
93
2.11k
    }
94
2.11k
  }
95
3.88k
}
void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.87k
               pixel_type* out2, size_t w) {
33
3.87k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.87k
                "Invalid transform type");
35
3.87k
  int second = transform_type >> 1;
36
3.87k
  int third = transform_type & 1;
37
38
3.87k
  size_t x = 0;
39
3.87k
  const HWY_FULL(pixel_type) d;
40
3.87k
  const size_t N = Lanes(d);
41
112k
  for (; x + N - 1 < w; x += N) {
42
109k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
109k
    } else {
54
109k
      auto First = Load(d, in0 + x);
55
109k
      auto Second = Load(d, in1 + x);
56
109k
      auto Third = Load(d, in2 + x);
57
109k
      if (third) Third = Add(Third, First);
58
109k
      if (second == 1) {
59
109k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
109k
      Store(First, d, out0 + x);
64
109k
      Store(Second, d, out1 + x);
65
109k
      Store(Third, d, out2 + x);
66
109k
    }
67
109k
  }
68
5.76k
  for (; x < w; x++) {
69
1.88k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
1.88k
    } else {
81
1.88k
      pixel_type First = in0[x];
82
1.88k
      pixel_type Second = in1[x];
83
1.88k
      pixel_type Third = in2[x];
84
1.88k
      if (third) Third = PixelAdd(Third, First);
85
1.88k
      if (second == 1) {
86
1.88k
        Second = PixelAdd(Second, First);
87
1.88k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
1.88k
      out0[x] = First;
91
1.88k
      out1[x] = Second;
92
1.88k
      out2[x] = Third;
93
1.88k
    }
94
1.88k
  }
95
3.87k
}
void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
61.8k
               pixel_type* out2, size_t w) {
33
61.8k
  static_assert(transform_type >= 0 && transform_type < 7,
34
61.8k
                "Invalid transform type");
35
61.8k
  int second = transform_type >> 1;
36
61.8k
  int third = transform_type & 1;
37
38
61.8k
  size_t x = 0;
39
61.8k
  const HWY_FULL(pixel_type) d;
40
61.8k
  const size_t N = Lanes(d);
41
3.33M
  for (; x + N - 1 < w; x += N) {
42
3.27M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.27M
    } else {
54
3.27M
      auto First = Load(d, in0 + x);
55
3.27M
      auto Second = Load(d, in1 + x);
56
3.27M
      auto Third = Load(d, in2 + x);
57
3.27M
      if (third) Third = Add(Third, First);
58
3.27M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3.27M
      } else if (second == 2) {
61
3.27M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
3.27M
      }
63
3.27M
      Store(First, d, out0 + x);
64
3.27M
      Store(Second, d, out1 + x);
65
3.27M
      Store(Third, d, out2 + x);
66
3.27M
    }
67
3.27M
  }
68
68.6k
  for (; x < w; x++) {
69
6.83k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
6.83k
    } else {
81
6.83k
      pixel_type First = in0[x];
82
6.83k
      pixel_type Second = in1[x];
83
6.83k
      pixel_type Third = in2[x];
84
6.83k
      if (third) Third = PixelAdd(Third, First);
85
6.83k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
6.83k
      } else if (second == 2) {
88
6.83k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
6.83k
      }
90
6.83k
      out0[x] = First;
91
6.83k
      out1[x] = Second;
92
6.83k
      out2[x] = Third;
93
6.83k
    }
94
6.83k
  }
95
61.8k
}
void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
75.3k
               pixel_type* out2, size_t w) {
33
75.3k
  static_assert(transform_type >= 0 && transform_type < 7,
34
75.3k
                "Invalid transform type");
35
75.3k
  int second = transform_type >> 1;
36
75.3k
  int third = transform_type & 1;
37
38
75.3k
  size_t x = 0;
39
75.3k
  const HWY_FULL(pixel_type) d;
40
75.3k
  const size_t N = Lanes(d);
41
3.96M
  for (; x + N - 1 < w; x += N) {
42
3.89M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.89M
    } else {
54
3.89M
      auto First = Load(d, in0 + x);
55
3.89M
      auto Second = Load(d, in1 + x);
56
3.89M
      auto Third = Load(d, in2 + x);
57
3.89M
      if (third) Third = Add(Third, First);
58
3.89M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3.89M
      } else if (second == 2) {
61
3.89M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
3.89M
      }
63
3.89M
      Store(First, d, out0 + x);
64
3.89M
      Store(Second, d, out1 + x);
65
3.89M
      Store(Third, d, out2 + x);
66
3.89M
    }
67
3.89M
  }
68
79.1k
  for (; x < w; x++) {
69
3.81k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
3.81k
    } else {
81
3.81k
      pixel_type First = in0[x];
82
3.81k
      pixel_type Second = in1[x];
83
3.81k
      pixel_type Third = in2[x];
84
3.81k
      if (third) Third = PixelAdd(Third, First);
85
3.81k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
3.81k
      } else if (second == 2) {
88
3.81k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
3.81k
      }
90
3.81k
      out0[x] = First;
91
3.81k
      out1[x] = Second;
92
3.81k
      out2[x] = Third;
93
3.81k
    }
94
3.81k
  }
95
75.3k
}
void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
82.9k
               pixel_type* out2, size_t w) {
33
82.9k
  static_assert(transform_type >= 0 && transform_type < 7,
34
82.9k
                "Invalid transform type");
35
82.9k
  int second = transform_type >> 1;
36
82.9k
  int third = transform_type & 1;
37
38
82.9k
  size_t x = 0;
39
82.9k
  const HWY_FULL(pixel_type) d;
40
82.9k
  const size_t N = Lanes(d);
41
3.72M
  for (; x + N - 1 < w; x += N) {
42
3.64M
    if (transform_type == 6) {
43
3.64M
      auto Y = Load(d, in0 + x);
44
3.64M
      auto Co = Load(d, in1 + x);
45
3.64M
      auto Cg = Load(d, in2 + x);
46
3.64M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
3.64M
      auto G = Add(Cg, Y);
48
3.64M
      Y = Sub(Y, ShiftRight<1>(Co));
49
3.64M
      auto R = Add(Y, Co);
50
3.64M
      Store(R, d, out0 + x);
51
3.64M
      Store(G, d, out1 + x);
52
3.64M
      Store(Y, d, out2 + x);
53
3.64M
    } else {
54
126
      auto First = Load(d, in0 + x);
55
126
      auto Second = Load(d, in1 + x);
56
126
      auto Third = Load(d, in2 + x);
57
126
      if (third) Third = Add(Third, First);
58
126
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
126
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
126
      Store(First, d, out0 + x);
64
126
      Store(Second, d, out1 + x);
65
126
      Store(Third, d, out2 + x);
66
126
    }
67
3.64M
  }
68
113k
  for (; x < w; x++) {
69
30.9k
    if (transform_type == 6) {
70
30.9k
      pixel_type Y = in0[x];
71
30.9k
      pixel_type Co = in1[x];
72
30.9k
      pixel_type Cg = in2[x];
73
30.9k
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
30.9k
      pixel_type G = PixelAdd(Cg, tmp);
75
30.9k
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
30.9k
      pixel_type R = PixelAdd(B, Co);
77
30.9k
      out0[x] = R;
78
30.9k
      out1[x] = G;
79
30.9k
      out2[x] = B;
80
18.4E
    } else {
81
18.4E
      pixel_type First = in0[x];
82
18.4E
      pixel_type Second = in1[x];
83
18.4E
      pixel_type Third = in2[x];
84
18.4E
      if (third) Third = PixelAdd(Third, First);
85
18.4E
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
18.4E
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
18.4E
      out0[x] = First;
91
18.4E
      out1[x] = Second;
92
18.4E
      out2[x] = Third;
93
18.4E
    }
94
30.9k
  }
95
82.9k
}
Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
199k
               pixel_type* out2, size_t w) {
33
199k
  static_assert(transform_type >= 0 && transform_type < 7,
34
199k
                "Invalid transform type");
35
199k
  int second = transform_type >> 1;
36
199k
  int third = transform_type & 1;
37
38
199k
  size_t x = 0;
39
199k
  const HWY_FULL(pixel_type) d;
40
199k
  const size_t N = Lanes(d);
41
2.33M
  for (; x + N - 1 < w; x += N) {
42
2.13M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
2.13M
    } else {
54
2.13M
      auto First = Load(d, in0 + x);
55
2.13M
      auto Second = Load(d, in1 + x);
56
2.13M
      auto Third = Load(d, in2 + x);
57
2.13M
      if (third) Third = Add(Third, First);
58
2.13M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
2.13M
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
2.13M
      Store(First, d, out0 + x);
64
2.13M
      Store(Second, d, out1 + x);
65
2.13M
      Store(Third, d, out2 + x);
66
2.13M
    }
67
2.13M
  }
68
906k
  for (; x < w; x++) {
69
706k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
706k
    } else {
81
706k
      pixel_type First = in0[x];
82
706k
      pixel_type Second = in1[x];
83
706k
      pixel_type Third = in2[x];
84
706k
      if (third) Third = PixelAdd(Third, First);
85
706k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
706k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
706k
      out0[x] = First;
91
706k
      out1[x] = Second;
92
706k
      out2[x] = Third;
93
706k
    }
94
706k
  }
95
199k
}
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
193k
               pixel_type* out2, size_t w) {
33
193k
  static_assert(transform_type >= 0 && transform_type < 7,
34
193k
                "Invalid transform type");
35
193k
  int second = transform_type >> 1;
36
193k
  int third = transform_type & 1;
37
38
193k
  size_t x = 0;
39
193k
  const HWY_FULL(pixel_type) d;
40
193k
  const size_t N = Lanes(d);
41
1.39M
  for (; x + N - 1 < w; x += N) {
42
1.20M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
1.20M
    } else {
54
1.20M
      auto First = Load(d, in0 + x);
55
1.20M
      auto Second = Load(d, in1 + x);
56
1.20M
      auto Third = Load(d, in2 + x);
57
1.20M
      if (third) Third = Add(Third, First);
58
1.20M
      if (second == 1) {
59
1.20M
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
1.20M
      Store(First, d, out0 + x);
64
1.20M
      Store(Second, d, out1 + x);
65
1.20M
      Store(Third, d, out2 + x);
66
1.20M
    }
67
1.20M
  }
68
541k
  for (; x < w; x++) {
69
347k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
347k
    } else {
81
347k
      pixel_type First = in0[x];
82
347k
      pixel_type Second = in1[x];
83
347k
      pixel_type Third = in2[x];
84
347k
      if (third) Third = PixelAdd(Third, First);
85
347k
      if (second == 1) {
86
347k
        Second = PixelAdd(Second, First);
87
347k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
347k
      out0[x] = First;
91
347k
      out1[x] = Second;
92
347k
      out2[x] = Third;
93
347k
    }
94
347k
  }
95
193k
}
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
96.0k
               pixel_type* out2, size_t w) {
33
96.0k
  static_assert(transform_type >= 0 && transform_type < 7,
34
96.0k
                "Invalid transform type");
35
96.0k
  int second = transform_type >> 1;
36
96.0k
  int third = transform_type & 1;
37
38
96.0k
  size_t x = 0;
39
96.0k
  const HWY_FULL(pixel_type) d;
40
96.0k
  const size_t N = Lanes(d);
41
1.20M
  for (; x + N - 1 < w; x += N) {
42
1.11M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
1.11M
    } else {
54
1.11M
      auto First = Load(d, in0 + x);
55
1.11M
      auto Second = Load(d, in1 + x);
56
1.11M
      auto Third = Load(d, in2 + x);
57
1.11M
      if (third) Third = Add(Third, First);
58
1.11M
      if (second == 1) {
59
1.11M
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
1.11M
      Store(First, d, out0 + x);
64
1.11M
      Store(Second, d, out1 + x);
65
1.11M
      Store(Third, d, out2 + x);
66
1.11M
    }
67
1.11M
  }
68
217k
  for (; x < w; x++) {
69
121k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
121k
    } else {
81
121k
      pixel_type First = in0[x];
82
121k
      pixel_type Second = in1[x];
83
121k
      pixel_type Third = in2[x];
84
121k
      if (third) Third = PixelAdd(Third, First);
85
121k
      if (second == 1) {
86
121k
        Second = PixelAdd(Second, First);
87
121k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
121k
      out0[x] = First;
91
121k
      out1[x] = Second;
92
121k
      out2[x] = Third;
93
121k
    }
94
121k
  }
95
96.0k
}
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
86.9k
               pixel_type* out2, size_t w) {
33
86.9k
  static_assert(transform_type >= 0 && transform_type < 7,
34
86.9k
                "Invalid transform type");
35
86.9k
  int second = transform_type >> 1;
36
86.9k
  int third = transform_type & 1;
37
38
86.9k
  size_t x = 0;
39
86.9k
  const HWY_FULL(pixel_type) d;
40
86.9k
  const size_t N = Lanes(d);
41
1.88M
  for (; x + N - 1 < w; x += N) {
42
1.79M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
1.79M
    } else {
54
1.79M
      auto First = Load(d, in0 + x);
55
1.79M
      auto Second = Load(d, in1 + x);
56
1.79M
      auto Third = Load(d, in2 + x);
57
1.79M
      if (third) Third = Add(Third, First);
58
1.79M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
1.79M
      } else if (second == 2) {
61
1.79M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
1.79M
      }
63
1.79M
      Store(First, d, out0 + x);
64
1.79M
      Store(Second, d, out1 + x);
65
1.79M
      Store(Third, d, out2 + x);
66
1.79M
    }
67
1.79M
  }
68
181k
  for (; x < w; x++) {
69
94.6k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
94.6k
    } else {
81
94.6k
      pixel_type First = in0[x];
82
94.6k
      pixel_type Second = in1[x];
83
94.6k
      pixel_type Third = in2[x];
84
94.6k
      if (third) Third = PixelAdd(Third, First);
85
94.6k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
94.6k
      } else if (second == 2) {
88
94.6k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
94.6k
      }
90
94.6k
      out0[x] = First;
91
94.6k
      out1[x] = Second;
92
94.6k
      out2[x] = Third;
93
94.6k
    }
94
94.6k
  }
95
86.9k
}
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
178k
               pixel_type* out2, size_t w) {
33
178k
  static_assert(transform_type >= 0 && transform_type < 7,
34
178k
                "Invalid transform type");
35
178k
  int second = transform_type >> 1;
36
178k
  int third = transform_type & 1;
37
38
178k
  size_t x = 0;
39
178k
  const HWY_FULL(pixel_type) d;
40
178k
  const size_t N = Lanes(d);
41
4.79M
  for (; x + N - 1 < w; x += N) {
42
4.61M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
4.61M
    } else {
54
4.61M
      auto First = Load(d, in0 + x);
55
4.61M
      auto Second = Load(d, in1 + x);
56
4.61M
      auto Third = Load(d, in2 + x);
57
4.66M
      if (third) Third = Add(Third, First);
58
4.61M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
4.65M
      } else if (second == 2) {
61
4.65M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
4.65M
      }
63
4.61M
      Store(First, d, out0 + x);
64
4.61M
      Store(Second, d, out1 + x);
65
4.61M
      Store(Third, d, out2 + x);
66
4.61M
    }
67
4.61M
  }
68
252k
  for (; x < w; x++) {
69
74.6k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
74.6k
    } else {
81
74.6k
      pixel_type First = in0[x];
82
74.6k
      pixel_type Second = in1[x];
83
74.6k
      pixel_type Third = in2[x];
84
74.6k
      if (third) Third = PixelAdd(Third, First);
85
74.6k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
74.6k
      } else if (second == 2) {
88
74.6k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
74.6k
      }
90
74.6k
      out0[x] = First;
91
74.6k
      out1[x] = Second;
92
74.6k
      out2[x] = Third;
93
74.6k
    }
94
74.6k
  }
95
178k
}
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
563k
               pixel_type* out2, size_t w) {
33
563k
  static_assert(transform_type >= 0 && transform_type < 7,
34
563k
                "Invalid transform type");
35
563k
  int second = transform_type >> 1;
36
563k
  int third = transform_type & 1;
37
38
563k
  size_t x = 0;
39
563k
  const HWY_FULL(pixel_type) d;
40
563k
  const size_t N = Lanes(d);
41
9.81M
  for (; x + N - 1 < w; x += N) {
42
9.24M
    if (transform_type == 6) {
43
9.24M
      auto Y = Load(d, in0 + x);
44
9.24M
      auto Co = Load(d, in1 + x);
45
9.24M
      auto Cg = Load(d, in2 + x);
46
9.24M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
9.24M
      auto G = Add(Cg, Y);
48
9.24M
      Y = Sub(Y, ShiftRight<1>(Co));
49
9.24M
      auto R = Add(Y, Co);
50
9.24M
      Store(R, d, out0 + x);
51
9.24M
      Store(G, d, out1 + x);
52
9.24M
      Store(Y, d, out2 + x);
53
9.24M
    } else {
54
57
      auto First = Load(d, in0 + x);
55
57
      auto Second = Load(d, in1 + x);
56
57
      auto Third = Load(d, in2 + x);
57
57
      if (third) Third = Add(Third, First);
58
57
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
57
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
57
      Store(First, d, out0 + x);
64
57
      Store(Second, d, out1 + x);
65
57
      Store(Third, d, out2 + x);
66
57
    }
67
9.24M
  }
68
1.98M
  for (; x < w; x++) {
69
1.41M
    if (transform_type == 6) {
70
1.41M
      pixel_type Y = in0[x];
71
1.41M
      pixel_type Co = in1[x];
72
1.41M
      pixel_type Cg = in2[x];
73
1.41M
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
1.41M
      pixel_type G = PixelAdd(Cg, tmp);
75
1.41M
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
1.41M
      pixel_type R = PixelAdd(B, Co);
77
1.41M
      out0[x] = R;
78
1.41M
      out1[x] = G;
79
1.41M
      out2[x] = B;
80
18.4E
    } else {
81
18.4E
      pixel_type First = in0[x];
82
18.4E
      pixel_type Second = in1[x];
83
18.4E
      pixel_type Third = in2[x];
84
18.4E
      if (third) Third = PixelAdd(Third, First);
85
18.4E
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
18.4E
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
18.4E
      out0[x] = First;
91
18.4E
      out1[x] = Second;
92
18.4E
      out2[x] = Third;
93
18.4E
    }
94
1.41M
  }
95
563k
}
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.28k
               pixel_type* out2, size_t w) {
33
3.28k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.28k
                "Invalid transform type");
35
3.28k
  int second = transform_type >> 1;
36
3.28k
  int third = transform_type & 1;
37
38
3.28k
  size_t x = 0;
39
3.28k
  const HWY_FULL(pixel_type) d;
40
3.28k
  const size_t N = Lanes(d);
41
195k
  for (; x + N - 1 < w; x += N) {
42
192k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
192k
    } else {
54
192k
      auto First = Load(d, in0 + x);
55
192k
      auto Second = Load(d, in1 + x);
56
192k
      auto Third = Load(d, in2 + x);
57
192k
      if (third) Third = Add(Third, First);
58
192k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
192k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
192k
      Store(First, d, out0 + x);
64
192k
      Store(Second, d, out1 + x);
65
192k
      Store(Third, d, out2 + x);
66
192k
    }
67
192k
  }
68
9.07k
  for (; x < w; x++) {
69
5.78k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
5.78k
    } else {
81
5.78k
      pixel_type First = in0[x];
82
5.78k
      pixel_type Second = in1[x];
83
5.78k
      pixel_type Third = in2[x];
84
5.78k
      if (third) Third = PixelAdd(Third, First);
85
5.78k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
5.78k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
5.78k
      out0[x] = First;
91
5.78k
      out1[x] = Second;
92
5.78k
      out2[x] = Third;
93
5.78k
    }
94
5.78k
  }
95
3.28k
}
void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.12k
               pixel_type* out2, size_t w) {
33
3.12k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.12k
                "Invalid transform type");
35
3.12k
  int second = transform_type >> 1;
36
3.12k
  int third = transform_type & 1;
37
38
3.12k
  size_t x = 0;
39
3.12k
  const HWY_FULL(pixel_type) d;
40
3.12k
  const size_t N = Lanes(d);
41
50.2k
  for (; x + N - 1 < w; x += N) {
42
47.1k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
47.1k
    } else {
54
47.1k
      auto First = Load(d, in0 + x);
55
47.1k
      auto Second = Load(d, in1 + x);
56
47.1k
      auto Third = Load(d, in2 + x);
57
47.1k
      if (third) Third = Add(Third, First);
58
47.2k
      if (second == 1) {
59
47.2k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
47.1k
      Store(First, d, out0 + x);
64
47.1k
      Store(Second, d, out1 + x);
65
47.1k
      Store(Third, d, out2 + x);
66
47.1k
    }
67
47.1k
  }
68
8.56k
  for (; x < w; x++) {
69
5.44k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
5.44k
    } else {
81
5.44k
      pixel_type First = in0[x];
82
5.44k
      pixel_type Second = in1[x];
83
5.44k
      pixel_type Third = in2[x];
84
5.44k
      if (third) Third = PixelAdd(Third, First);
85
5.44k
      if (second == 1) {
86
5.44k
        Second = PixelAdd(Second, First);
87
5.44k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
5.44k
      out0[x] = First;
91
5.44k
      out1[x] = Second;
92
5.44k
      out2[x] = Third;
93
5.44k
    }
94
5.44k
  }
95
3.12k
}
void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
2.88k
               pixel_type* out2, size_t w) {
33
2.88k
  static_assert(transform_type >= 0 && transform_type < 7,
34
2.88k
                "Invalid transform type");
35
2.88k
  int second = transform_type >> 1;
36
2.88k
  int third = transform_type & 1;
37
38
2.88k
  size_t x = 0;
39
2.88k
  const HWY_FULL(pixel_type) d;
40
2.88k
  const size_t N = Lanes(d);
41
78.6k
  for (; x + N - 1 < w; x += N) {
42
75.8k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
75.8k
    } else {
54
75.8k
      auto First = Load(d, in0 + x);
55
75.8k
      auto Second = Load(d, in1 + x);
56
75.8k
      auto Third = Load(d, in2 + x);
57
75.8k
      if (third) Third = Add(Third, First);
58
75.8k
      if (second == 1) {
59
75.8k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
75.8k
      Store(First, d, out0 + x);
64
75.8k
      Store(Second, d, out1 + x);
65
75.8k
      Store(Third, d, out2 + x);
66
75.8k
    }
67
75.8k
  }
68
5.56k
  for (; x < w; x++) {
69
2.68k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
2.68k
    } else {
81
2.68k
      pixel_type First = in0[x];
82
2.68k
      pixel_type Second = in1[x];
83
2.68k
      pixel_type Third = in2[x];
84
2.68k
      if (third) Third = PixelAdd(Third, First);
85
2.68k
      if (second == 1) {
86
2.68k
        Second = PixelAdd(Second, First);
87
2.68k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
2.68k
      out0[x] = First;
91
2.68k
      out1[x] = Second;
92
2.68k
      out2[x] = Third;
93
2.68k
    }
94
2.68k
  }
95
2.88k
}
void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
98.1k
               pixel_type* out2, size_t w) {
33
98.1k
  static_assert(transform_type >= 0 && transform_type < 7,
34
98.1k
                "Invalid transform type");
35
98.1k
  int second = transform_type >> 1;
36
98.1k
  int third = transform_type & 1;
37
38
98.1k
  size_t x = 0;
39
98.1k
  const HWY_FULL(pixel_type) d;
40
98.1k
  const size_t N = Lanes(d);
41
5.35M
  for (; x + N - 1 < w; x += N) {
42
5.25M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
5.25M
    } else {
54
5.25M
      auto First = Load(d, in0 + x);
55
5.25M
      auto Second = Load(d, in1 + x);
56
5.25M
      auto Third = Load(d, in2 + x);
57
5.25M
      if (third) Third = Add(Third, First);
58
5.25M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
5.25M
      } else if (second == 2) {
61
5.25M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
5.25M
      }
63
5.25M
      Store(First, d, out0 + x);
64
5.25M
      Store(Second, d, out1 + x);
65
5.25M
      Store(Third, d, out2 + x);
66
5.25M
    }
67
5.25M
  }
68
103k
  for (; x < w; x++) {
69
4.89k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
4.89k
    } else {
81
4.89k
      pixel_type First = in0[x];
82
4.89k
      pixel_type Second = in1[x];
83
4.89k
      pixel_type Third = in2[x];
84
4.89k
      if (third) Third = PixelAdd(Third, First);
85
4.89k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
4.89k
      } else if (second == 2) {
88
4.89k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
4.89k
      }
90
4.89k
      out0[x] = First;
91
4.89k
      out1[x] = Second;
92
4.89k
      out2[x] = Third;
93
4.89k
    }
94
4.89k
  }
95
98.1k
}
void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
68.1k
               pixel_type* out2, size_t w) {
33
68.1k
  static_assert(transform_type >= 0 && transform_type < 7,
34
68.1k
                "Invalid transform type");
35
68.1k
  int second = transform_type >> 1;
36
68.1k
  int third = transform_type & 1;
37
38
68.1k
  size_t x = 0;
39
68.1k
  const HWY_FULL(pixel_type) d;
40
68.1k
  const size_t N = Lanes(d);
41
3.36M
  for (; x + N - 1 < w; x += N) {
42
3.29M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.29M
    } else {
54
3.29M
      auto First = Load(d, in0 + x);
55
3.29M
      auto Second = Load(d, in1 + x);
56
3.29M
      auto Third = Load(d, in2 + x);
57
3.31M
      if (third) Third = Add(Third, First);
58
3.29M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3.31M
      } else if (second == 2) {
61
3.31M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
3.31M
      }
63
3.29M
      Store(First, d, out0 + x);
64
3.29M
      Store(Second, d, out1 + x);
65
3.29M
      Store(Third, d, out2 + x);
66
3.29M
    }
67
3.29M
  }
68
71.7k
  for (; x < w; x++) {
69
3.63k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
3.63k
    } else {
81
3.63k
      pixel_type First = in0[x];
82
3.63k
      pixel_type Second = in1[x];
83
3.63k
      pixel_type Third = in2[x];
84
3.63k
      if (third) Third = PixelAdd(Third, First);
85
3.63k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
3.63k
      } else if (second == 2) {
88
3.63k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
3.63k
      }
90
3.63k
      out0[x] = First;
91
3.63k
      out1[x] = Second;
92
3.63k
      out2[x] = Third;
93
3.63k
    }
94
3.63k
  }
95
68.1k
}
void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
110k
               pixel_type* out2, size_t w) {
33
110k
  static_assert(transform_type >= 0 && transform_type < 7,
34
110k
                "Invalid transform type");
35
110k
  int second = transform_type >> 1;
36
110k
  int third = transform_type & 1;
37
38
110k
  size_t x = 0;
39
110k
  const HWY_FULL(pixel_type) d;
40
110k
  const size_t N = Lanes(d);
41
5.84M
  for (; x + N - 1 < w; x += N) {
42
5.73M
    if (transform_type == 6) {
43
5.73M
      auto Y = Load(d, in0 + x);
44
5.73M
      auto Co = Load(d, in1 + x);
45
5.73M
      auto Cg = Load(d, in2 + x);
46
5.73M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
5.73M
      auto G = Add(Cg, Y);
48
5.73M
      Y = Sub(Y, ShiftRight<1>(Co));
49
5.73M
      auto R = Add(Y, Co);
50
5.73M
      Store(R, d, out0 + x);
51
5.73M
      Store(G, d, out1 + x);
52
5.73M
      Store(Y, d, out2 + x);
53
5.73M
    } else {
54
3
      auto First = Load(d, in0 + x);
55
3
      auto Second = Load(d, in1 + x);
56
3
      auto Third = Load(d, in2 + x);
57
3
      if (third) Third = Add(Third, First);
58
3
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
3
      Store(First, d, out0 + x);
64
3
      Store(Second, d, out1 + x);
65
3
      Store(Third, d, out2 + x);
66
3
    }
67
5.73M
  }
68
124k
  for (; x < w; x++) {
69
14.1k
    if (transform_type == 6) {
70
14.1k
      pixel_type Y = in0[x];
71
14.1k
      pixel_type Co = in1[x];
72
14.1k
      pixel_type Cg = in2[x];
73
14.1k
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
14.1k
      pixel_type G = PixelAdd(Cg, tmp);
75
14.1k
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
14.1k
      pixel_type R = PixelAdd(B, Co);
77
14.1k
      out0[x] = R;
78
14.1k
      out1[x] = G;
79
14.1k
      out2[x] = B;
80
14.1k
    } else {
81
0
      pixel_type First = in0[x];
82
0
      pixel_type Second = in1[x];
83
0
      pixel_type Third = in2[x];
84
0
      if (third) Third = PixelAdd(Third, First);
85
0
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
0
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
0
      out0[x] = First;
91
0
      out1[x] = Second;
92
0
      out2[x] = Third;
93
0
    }
94
14.1k
  }
95
110k
}
96
97
19.5k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
19.5k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
19.5k
  size_t m = begin_c;
100
19.5k
  Channel& c0 = input.channel[m + 0];
101
19.5k
  size_t w = c0.w;
102
19.5k
  size_t h = c0.h;
103
19.5k
  if (rct_type == 0) {  // noop
104
5.69k
    return true;
105
5.69k
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
13.8k
  int permutation = rct_type / 7;
108
13.8k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
13.8k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
13.8k
  if (custom == 0) {
118
976
    Channel ch0 = std::move(input.channel[m]);
119
976
    Channel ch1 = std::move(input.channel[m + 1]);
120
976
    Channel ch2 = std::move(input.channel[m + 2]);
121
976
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
976
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
976
        std::move(ch1);
124
976
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
976
        std::move(ch2);
126
976
    return true;
127
976
  }
128
12.9k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
12.9k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
12.9k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
12.9k
  const auto process_row = [&](const uint32_t task,
132
1.87M
                               size_t /* thread */) -> Status {
133
1.87M
    const size_t y = task;
134
1.87M
    const pixel_type* in0 = input.channel[m].Row(y);
135
1.87M
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
1.87M
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
1.87M
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
1.87M
    pixel_type* out1 =
139
1.87M
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
1.87M
    pixel_type* out2 =
141
1.87M
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
1.87M
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
1.87M
    return true;
144
1.87M
  };
rct.cc:jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
234k
                               size_t /* thread */) -> Status {
133
234k
    const size_t y = task;
134
234k
    const pixel_type* in0 = input.channel[m].Row(y);
135
234k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
234k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
234k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
234k
    pixel_type* out1 =
139
234k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
234k
    pixel_type* out2 =
141
234k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
234k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
234k
    return true;
144
234k
  };
rct.cc:jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
1.34M
                               size_t /* thread */) -> Status {
133
1.34M
    const size_t y = task;
134
1.34M
    const pixel_type* in0 = input.channel[m].Row(y);
135
1.34M
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
1.34M
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
1.34M
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
1.34M
    pixel_type* out1 =
139
1.34M
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
1.34M
    pixel_type* out2 =
141
1.34M
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
1.34M
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
1.34M
    return true;
144
1.34M
  };
rct.cc:jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
291k
                               size_t /* thread */) -> Status {
133
291k
    const size_t y = task;
134
291k
    const pixel_type* in0 = input.channel[m].Row(y);
135
291k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
291k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
291k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
291k
    pixel_type* out1 =
139
291k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
291k
    pixel_type* out2 =
141
291k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
291k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
291k
    return true;
144
291k
  };
145
12.9k
  JXL_RETURN_IF_ERROR(
146
12.9k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
12.9k
  return true;
148
12.9k
}
jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
3.43k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
3.43k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
3.43k
  size_t m = begin_c;
100
3.43k
  Channel& c0 = input.channel[m + 0];
101
3.43k
  size_t w = c0.w;
102
3.43k
  size_t h = c0.h;
103
3.43k
  if (rct_type == 0) {  // noop
104
428
    return true;
105
428
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
3.01k
  int permutation = rct_type / 7;
108
3.01k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
3.01k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
3.01k
  if (custom == 0) {
118
84
    Channel ch0 = std::move(input.channel[m]);
119
84
    Channel ch1 = std::move(input.channel[m + 1]);
120
84
    Channel ch2 = std::move(input.channel[m + 2]);
121
84
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
84
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
84
        std::move(ch1);
124
84
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
84
        std::move(ch2);
126
84
    return true;
127
84
  }
128
2.92k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
2.92k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
2.92k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
2.92k
  const auto process_row = [&](const uint32_t task,
132
2.92k
                               size_t /* thread */) -> Status {
133
2.92k
    const size_t y = task;
134
2.92k
    const pixel_type* in0 = input.channel[m].Row(y);
135
2.92k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
2.92k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
2.92k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
2.92k
    pixel_type* out1 =
139
2.92k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
2.92k
    pixel_type* out2 =
141
2.92k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
2.92k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
2.92k
    return true;
144
2.92k
  };
145
2.92k
  JXL_RETURN_IF_ERROR(
146
2.92k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
2.92k
  return true;
148
2.92k
}
jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
12.1k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
12.1k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
12.1k
  size_t m = begin_c;
100
12.1k
  Channel& c0 = input.channel[m + 0];
101
12.1k
  size_t w = c0.w;
102
12.1k
  size_t h = c0.h;
103
12.1k
  if (rct_type == 0) {  // noop
104
5.04k
    return true;
105
5.04k
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
7.09k
  int permutation = rct_type / 7;
108
7.09k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
7.09k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
7.09k
  if (custom == 0) {
118
687
    Channel ch0 = std::move(input.channel[m]);
119
687
    Channel ch1 = std::move(input.channel[m + 1]);
120
687
    Channel ch2 = std::move(input.channel[m + 2]);
121
687
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
687
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
687
        std::move(ch1);
124
687
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
687
        std::move(ch2);
126
687
    return true;
127
687
  }
128
6.40k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
6.40k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
6.40k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
6.40k
  const auto process_row = [&](const uint32_t task,
132
6.40k
                               size_t /* thread */) -> Status {
133
6.40k
    const size_t y = task;
134
6.40k
    const pixel_type* in0 = input.channel[m].Row(y);
135
6.40k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
6.40k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
6.40k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
6.40k
    pixel_type* out1 =
139
6.40k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
6.40k
    pixel_type* out2 =
141
6.40k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
6.40k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
6.40k
    return true;
144
6.40k
  };
145
6.40k
  JXL_RETURN_IF_ERROR(
146
6.40k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
6.40k
  return true;
148
6.40k
}
jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
3.99k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
3.99k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
3.99k
  size_t m = begin_c;
100
3.99k
  Channel& c0 = input.channel[m + 0];
101
3.99k
  size_t w = c0.w;
102
3.99k
  size_t h = c0.h;
103
3.99k
  if (rct_type == 0) {  // noop
104
217
    return true;
105
217
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
3.78k
  int permutation = rct_type / 7;
108
3.78k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
3.78k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
3.78k
  if (custom == 0) {
118
205
    Channel ch0 = std::move(input.channel[m]);
119
205
    Channel ch1 = std::move(input.channel[m + 1]);
120
205
    Channel ch2 = std::move(input.channel[m + 2]);
121
205
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
205
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
205
        std::move(ch1);
124
205
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
205
        std::move(ch2);
126
205
    return true;
127
205
  }
128
3.57k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
3.57k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
3.57k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
3.57k
  const auto process_row = [&](const uint32_t task,
132
3.57k
                               size_t /* thread */) -> Status {
133
3.57k
    const size_t y = task;
134
3.57k
    const pixel_type* in0 = input.channel[m].Row(y);
135
3.57k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
3.57k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
3.57k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
3.57k
    pixel_type* out1 =
139
3.57k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
3.57k
    pixel_type* out2 =
141
3.57k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
3.57k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
3.57k
    return true;
144
3.57k
  };
145
3.57k
  JXL_RETURN_IF_ERROR(
146
3.57k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
3.57k
  return true;
148
3.57k
}
149
150
}  // namespace HWY_NAMESPACE
151
}  // namespace jxl
152
HWY_AFTER_NAMESPACE();
153
154
#if HWY_ONCE
155
namespace jxl {
156
157
HWY_EXPORT(InvRCT);
158
19.5k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
159
19.5k
  return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool);
160
19.5k
}
161
162
}  // namespace jxl
163
#endif