Coverage Report

Created: 2025-12-03 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libjxl/lib/jxl/modular/transform/rct.cc
Line
Count
Source
1
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
2
//
3
// Use of this source code is governed by a BSD-style
4
// license that can be found in the LICENSE file.
5
6
#include "lib/jxl/modular/transform/rct.h"
7
8
#include <cstddef>
9
#include <cstdint>
10
#include <utility>
11
12
#include "lib/jxl/base/data_parallel.h"
13
#include "lib/jxl/base/status.h"
14
#include "lib/jxl/modular/modular_image.h"
15
#include "lib/jxl/modular/transform/transform.h"
16
#undef HWY_TARGET_INCLUDE
17
#define HWY_TARGET_INCLUDE "lib/jxl/modular/transform/rct.cc"
18
#include <hwy/foreach_target.h>
19
#include <hwy/highway.h>
20
HWY_BEFORE_NAMESPACE();
21
namespace jxl {
22
namespace HWY_NAMESPACE {
23
24
// These templates are not found via ADL.
25
using hwy::HWY_NAMESPACE::Add;
26
using hwy::HWY_NAMESPACE::ShiftRight;
27
using hwy::HWY_NAMESPACE::Sub;
28
29
template <int transform_type>
30
void InvRCTRow(const pixel_type* in0, const pixel_type* in1,
31
               const pixel_type* in2, pixel_type* out0, pixel_type* out1,
32
7.22M
               pixel_type* out2, size_t w) {
33
7.22M
  static_assert(transform_type >= 0 && transform_type < 7,
34
7.22M
                "Invalid transform type");
35
7.22M
  int second = transform_type >> 1;
36
7.22M
  int third = transform_type & 1;
37
38
7.22M
  size_t x = 0;
39
7.22M
  const HWY_FULL(pixel_type) d;
40
7.22M
  const size_t N = Lanes(d);
41
44.5M
  for (; x + N - 1 < w; x += N) {
42
37.3M
    if (transform_type == 6) {
43
10.1M
      auto Y = Load(d, in0 + x);
44
10.1M
      auto Co = Load(d, in1 + x);
45
10.1M
      auto Cg = Load(d, in2 + x);
46
10.1M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
10.1M
      auto G = Add(Cg, Y);
48
10.1M
      Y = Sub(Y, ShiftRight<1>(Co));
49
10.1M
      auto R = Add(Y, Co);
50
10.1M
      Store(R, d, out0 + x);
51
10.1M
      Store(G, d, out1 + x);
52
10.1M
      Store(Y, d, out2 + x);
53
27.2M
    } else {
54
27.2M
      auto First = Load(d, in0 + x);
55
27.2M
      auto Second = Load(d, in1 + x);
56
27.2M
      auto Third = Load(d, in2 + x);
57
27.2M
      if (third) Third = Add(Third, First);
58
27.2M
      if (second == 1) {
59
4.47M
        Second = Add(Second, First);
60
22.7M
      } else if (second == 2) {
61
21.9M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
21.9M
      }
63
27.2M
      Store(First, d, out0 + x);
64
27.2M
      Store(Second, d, out1 + x);
65
27.2M
      Store(Third, d, out2 + x);
66
27.2M
    }
67
37.3M
  }
68
47.7M
  for (; x < w; x++) {
69
40.5M
    if (transform_type == 6) {
70
39.2M
      pixel_type Y = in0[x];
71
39.2M
      pixel_type Co = in1[x];
72
39.2M
      pixel_type Cg = in2[x];
73
39.2M
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
39.2M
      pixel_type G = PixelAdd(Cg, tmp);
75
39.2M
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
39.2M
      pixel_type R = PixelAdd(B, Co);
77
39.2M
      out0[x] = R;
78
39.2M
      out1[x] = G;
79
39.2M
      out2[x] = B;
80
39.2M
    } else {
81
1.30M
      pixel_type First = in0[x];
82
1.30M
      pixel_type Second = in1[x];
83
1.30M
      pixel_type Third = in2[x];
84
1.30M
      if (third) Third = PixelAdd(Third, First);
85
1.30M
      if (second == 1) {
86
630k
        Second = PixelAdd(Second, First);
87
676k
      } else if (second == 2) {
88
525k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
525k
      }
90
1.30M
      out0[x] = First;
91
1.30M
      out1[x] = Second;
92
1.30M
      out2[x] = Third;
93
1.30M
    }
94
40.5M
  }
95
7.22M
}
Unexecuted instantiation: void jxl::N_SSE4::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_SSE4::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.14k
               pixel_type* out2, size_t w) {
33
3.14k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.14k
                "Invalid transform type");
35
3.14k
  int second = transform_type >> 1;
36
3.14k
  int third = transform_type & 1;
37
38
3.14k
  size_t x = 0;
39
3.14k
  const HWY_FULL(pixel_type) d;
40
3.14k
  const size_t N = Lanes(d);
41
60.4k
  for (; x + N - 1 < w; x += N) {
42
57.2k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
57.2k
    } else {
54
57.2k
      auto First = Load(d, in0 + x);
55
57.2k
      auto Second = Load(d, in1 + x);
56
57.2k
      auto Third = Load(d, in2 + x);
57
57.3k
      if (third) Third = Add(Third, First);
58
57.2k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
57.2k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
57.2k
      Store(First, d, out0 + x);
64
57.2k
      Store(Second, d, out1 + x);
65
57.2k
      Store(Third, d, out2 + x);
66
57.2k
    }
67
57.2k
  }
68
7.96k
  for (; x < w; x++) {
69
4.81k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
4.81k
    } else {
81
4.81k
      pixel_type First = in0[x];
82
4.81k
      pixel_type Second = in1[x];
83
4.81k
      pixel_type Third = in2[x];
84
4.81k
      if (third) Third = PixelAdd(Third, First);
85
4.81k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
4.81k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
4.81k
      out0[x] = First;
91
4.81k
      out1[x] = Second;
92
4.81k
      out2[x] = Third;
93
4.81k
    }
94
4.81k
  }
95
3.14k
}
void jxl::N_SSE4::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.26k
               pixel_type* out2, size_t w) {
33
3.26k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.26k
                "Invalid transform type");
35
3.26k
  int second = transform_type >> 1;
36
3.26k
  int third = transform_type & 1;
37
38
3.26k
  size_t x = 0;
39
3.26k
  const HWY_FULL(pixel_type) d;
40
3.26k
  const size_t N = Lanes(d);
41
103k
  for (; x + N - 1 < w; x += N) {
42
99.8k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
99.8k
    } else {
54
99.8k
      auto First = Load(d, in0 + x);
55
99.8k
      auto Second = Load(d, in1 + x);
56
99.8k
      auto Third = Load(d, in2 + x);
57
99.8k
      if (third) Third = Add(Third, First);
58
99.8k
      if (second == 1) {
59
99.8k
        Second = Add(Second, First);
60
99.8k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
99.8k
      Store(First, d, out0 + x);
64
99.8k
      Store(Second, d, out1 + x);
65
99.8k
      Store(Third, d, out2 + x);
66
99.8k
    }
67
99.8k
  }
68
5.45k
  for (; x < w; x++) {
69
2.19k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
2.19k
    } else {
81
2.19k
      pixel_type First = in0[x];
82
2.19k
      pixel_type Second = in1[x];
83
2.19k
      pixel_type Third = in2[x];
84
2.19k
      if (third) Third = PixelAdd(Third, First);
85
2.19k
      if (second == 1) {
86
2.19k
        Second = PixelAdd(Second, First);
87
2.19k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
2.19k
      out0[x] = First;
91
2.19k
      out1[x] = Second;
92
2.19k
      out2[x] = Third;
93
2.19k
    }
94
2.19k
  }
95
3.26k
}
void jxl::N_SSE4::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.60k
               pixel_type* out2, size_t w) {
33
3.60k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.60k
                "Invalid transform type");
35
3.60k
  int second = transform_type >> 1;
36
3.60k
  int third = transform_type & 1;
37
38
3.60k
  size_t x = 0;
39
3.60k
  const HWY_FULL(pixel_type) d;
40
3.60k
  const size_t N = Lanes(d);
41
103k
  for (; x + N - 1 < w; x += N) {
42
99.7k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
99.7k
    } else {
54
99.7k
      auto First = Load(d, in0 + x);
55
99.7k
      auto Second = Load(d, in1 + x);
56
99.7k
      auto Third = Load(d, in2 + x);
57
99.8k
      if (third) Third = Add(Third, First);
58
99.8k
      if (second == 1) {
59
99.8k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
99.7k
      Store(First, d, out0 + x);
64
99.7k
      Store(Second, d, out1 + x);
65
99.7k
      Store(Third, d, out2 + x);
66
99.7k
    }
67
99.7k
  }
68
5.79k
  for (; x < w; x++) {
69
2.18k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
2.18k
    } else {
81
2.18k
      pixel_type First = in0[x];
82
2.18k
      pixel_type Second = in1[x];
83
2.18k
      pixel_type Third = in2[x];
84
2.18k
      if (third) Third = PixelAdd(Third, First);
85
2.18k
      if (second == 1) {
86
2.18k
        Second = PixelAdd(Second, First);
87
2.18k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
2.18k
      out0[x] = First;
91
2.18k
      out1[x] = Second;
92
2.18k
      out2[x] = Third;
93
2.18k
    }
94
2.18k
  }
95
3.60k
}
void jxl::N_SSE4::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
27.5k
               pixel_type* out2, size_t w) {
33
27.5k
  static_assert(transform_type >= 0 && transform_type < 7,
34
27.5k
                "Invalid transform type");
35
27.5k
  int second = transform_type >> 1;
36
27.5k
  int third = transform_type & 1;
37
38
27.5k
  size_t x = 0;
39
27.5k
  const HWY_FULL(pixel_type) d;
40
27.5k
  const size_t N = Lanes(d);
41
908k
  for (; x + N - 1 < w; x += N) {
42
881k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
881k
    } else {
54
881k
      auto First = Load(d, in0 + x);
55
881k
      auto Second = Load(d, in1 + x);
56
881k
      auto Third = Load(d, in2 + x);
57
881k
      if (third) Third = Add(Third, First);
58
881k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
881k
      } else if (second == 2) {
61
880k
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
880k
      }
63
881k
      Store(First, d, out0 + x);
64
881k
      Store(Second, d, out1 + x);
65
881k
      Store(Third, d, out2 + x);
66
881k
    }
67
881k
  }
68
46.9k
  for (; x < w; x++) {
69
19.3k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
19.3k
    } else {
81
19.3k
      pixel_type First = in0[x];
82
19.3k
      pixel_type Second = in1[x];
83
19.3k
      pixel_type Third = in2[x];
84
19.3k
      if (third) Third = PixelAdd(Third, First);
85
19.3k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
19.3k
      } else if (second == 2) {
88
19.3k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
19.3k
      }
90
19.3k
      out0[x] = First;
91
19.3k
      out1[x] = Second;
92
19.3k
      out2[x] = Third;
93
19.3k
    }
94
19.3k
  }
95
27.5k
}
void jxl::N_SSE4::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
157k
               pixel_type* out2, size_t w) {
33
157k
  static_assert(transform_type >= 0 && transform_type < 7,
34
157k
                "Invalid transform type");
35
157k
  int second = transform_type >> 1;
36
157k
  int third = transform_type & 1;
37
38
157k
  size_t x = 0;
39
157k
  const HWY_FULL(pixel_type) d;
40
157k
  const size_t N = Lanes(d);
41
8.36M
  for (; x + N - 1 < w; x += N) {
42
8.21M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
8.21M
    } else {
54
8.21M
      auto First = Load(d, in0 + x);
55
8.21M
      auto Second = Load(d, in1 + x);
56
8.21M
      auto Third = Load(d, in2 + x);
57
8.24M
      if (third) Third = Add(Third, First);
58
8.21M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
8.21M
      } else if (second == 2) {
61
8.20M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
8.20M
      }
63
8.21M
      Store(First, d, out0 + x);
64
8.21M
      Store(Second, d, out1 + x);
65
8.21M
      Store(Third, d, out2 + x);
66
8.21M
    }
67
8.21M
  }
68
158k
  for (; x < w; x++) {
69
1.69k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
1.69k
    } else {
81
1.69k
      pixel_type First = in0[x];
82
1.69k
      pixel_type Second = in1[x];
83
1.69k
      pixel_type Third = in2[x];
84
1.69k
      if (third) Third = PixelAdd(Third, First);
85
1.69k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
1.69k
      } else if (second == 2) {
88
1.69k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
1.69k
      }
90
1.69k
      out0[x] = First;
91
1.69k
      out1[x] = Second;
92
1.69k
      out2[x] = Third;
93
1.69k
    }
94
1.69k
  }
95
157k
}
void jxl::N_SSE4::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
45.8k
               pixel_type* out2, size_t w) {
33
45.8k
  static_assert(transform_type >= 0 && transform_type < 7,
34
45.8k
                "Invalid transform type");
35
45.8k
  int second = transform_type >> 1;
36
45.8k
  int third = transform_type & 1;
37
38
45.8k
  size_t x = 0;
39
45.8k
  const HWY_FULL(pixel_type) d;
40
45.8k
  const size_t N = Lanes(d);
41
1.57M
  for (; x + N - 1 < w; x += N) {
42
1.53M
    if (transform_type == 6) {
43
1.53M
      auto Y = Load(d, in0 + x);
44
1.53M
      auto Co = Load(d, in1 + x);
45
1.53M
      auto Cg = Load(d, in2 + x);
46
1.53M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
1.53M
      auto G = Add(Cg, Y);
48
1.53M
      Y = Sub(Y, ShiftRight<1>(Co));
49
1.53M
      auto R = Add(Y, Co);
50
1.53M
      Store(R, d, out0 + x);
51
1.53M
      Store(G, d, out1 + x);
52
1.53M
      Store(Y, d, out2 + x);
53
18.4E
    } else {
54
18.4E
      auto First = Load(d, in0 + x);
55
18.4E
      auto Second = Load(d, in1 + x);
56
18.4E
      auto Third = Load(d, in2 + x);
57
18.4E
      if (third) Third = Add(Third, First);
58
18.4E
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
18.4E
      Store(First, d, out0 + x);
64
18.4E
      Store(Second, d, out1 + x);
65
18.4E
      Store(Third, d, out2 + x);
66
18.4E
    }
67
1.52M
  }
68
87.4k
  for (; x < w; x++) {
69
41.6k
    if (transform_type == 6) {
70
41.6k
      pixel_type Y = in0[x];
71
41.6k
      pixel_type Co = in1[x];
72
41.6k
      pixel_type Cg = in2[x];
73
41.6k
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
41.6k
      pixel_type G = PixelAdd(Cg, tmp);
75
41.6k
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
41.6k
      pixel_type R = PixelAdd(B, Co);
77
41.6k
      out0[x] = R;
78
41.6k
      out1[x] = G;
79
41.6k
      out2[x] = B;
80
18.4E
    } else {
81
18.4E
      pixel_type First = in0[x];
82
18.4E
      pixel_type Second = in1[x];
83
18.4E
      pixel_type Third = in2[x];
84
18.4E
      if (third) Third = PixelAdd(Third, First);
85
18.4E
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
18.4E
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
18.4E
      out0[x] = First;
91
18.4E
      out1[x] = Second;
92
18.4E
      out2[x] = Third;
93
18.4E
    }
94
41.5k
  }
95
45.8k
}
Unexecuted instantiation: void jxl::N_AVX2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_AVX2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
67.2k
               pixel_type* out2, size_t w) {
33
67.2k
  static_assert(transform_type >= 0 && transform_type < 7,
34
67.2k
                "Invalid transform type");
35
67.2k
  int second = transform_type >> 1;
36
67.2k
  int third = transform_type & 1;
37
38
67.2k
  size_t x = 0;
39
67.2k
  const HWY_FULL(pixel_type) d;
40
67.2k
  const size_t N = Lanes(d);
41
646k
  for (; x + N - 1 < w; x += N) {
42
579k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
579k
    } else {
54
579k
      auto First = Load(d, in0 + x);
55
579k
      auto Second = Load(d, in1 + x);
56
579k
      auto Third = Load(d, in2 + x);
57
579k
      if (third) Third = Add(Third, First);
58
579k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
579k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
579k
      Store(First, d, out0 + x);
64
579k
      Store(Second, d, out1 + x);
65
579k
      Store(Third, d, out2 + x);
66
579k
    }
67
579k
  }
68
204k
  for (; x < w; x++) {
69
137k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
137k
    } else {
81
137k
      pixel_type First = in0[x];
82
137k
      pixel_type Second = in1[x];
83
137k
      pixel_type Third = in2[x];
84
137k
      if (third) Third = PixelAdd(Third, First);
85
137k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
137k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
137k
      out0[x] = First;
91
137k
      out1[x] = Second;
92
137k
      out2[x] = Third;
93
137k
    }
94
137k
  }
95
67.2k
}
void jxl::N_AVX2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
222k
               pixel_type* out2, size_t w) {
33
222k
  static_assert(transform_type >= 0 && transform_type < 7,
34
222k
                "Invalid transform type");
35
222k
  int second = transform_type >> 1;
36
222k
  int third = transform_type & 1;
37
38
222k
  size_t x = 0;
39
222k
  const HWY_FULL(pixel_type) d;
40
222k
  const size_t N = Lanes(d);
41
3.45M
  for (; x + N - 1 < w; x += N) {
42
3.22M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.22M
    } else {
54
3.22M
      auto First = Load(d, in0 + x);
55
3.22M
      auto Second = Load(d, in1 + x);
56
3.22M
      auto Third = Load(d, in2 + x);
57
3.22M
      if (third) Third = Add(Third, First);
58
3.22M
      if (second == 1) {
59
3.22M
        Second = Add(Second, First);
60
3.22M
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
3.22M
      Store(First, d, out0 + x);
64
3.22M
      Store(Second, d, out1 + x);
65
3.22M
      Store(Third, d, out2 + x);
66
3.22M
    }
67
3.22M
  }
68
495k
  for (; x < w; x++) {
69
273k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
273k
    } else {
81
273k
      pixel_type First = in0[x];
82
273k
      pixel_type Second = in1[x];
83
273k
      pixel_type Third = in2[x];
84
273k
      if (third) Third = PixelAdd(Third, First);
85
273k
      if (second == 1) {
86
273k
        Second = PixelAdd(Second, First);
87
273k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
273k
      out0[x] = First;
91
273k
      out1[x] = Second;
92
273k
      out2[x] = Third;
93
273k
    }
94
273k
  }
95
222k
}
void jxl::N_AVX2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
81.9k
               pixel_type* out2, size_t w) {
33
81.9k
  static_assert(transform_type >= 0 && transform_type < 7,
34
81.9k
                "Invalid transform type");
35
81.9k
  int second = transform_type >> 1;
36
81.9k
  int third = transform_type & 1;
37
38
81.9k
  size_t x = 0;
39
81.9k
  const HWY_FULL(pixel_type) d;
40
81.9k
  const size_t N = Lanes(d);
41
956k
  for (; x + N - 1 < w; x += N) {
42
874k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
874k
    } else {
54
874k
      auto First = Load(d, in0 + x);
55
874k
      auto Second = Load(d, in1 + x);
56
874k
      auto Third = Load(d, in2 + x);
57
874k
      if (third) Third = Add(Third, First);
58
874k
      if (second == 1) {
59
874k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
874k
      Store(First, d, out0 + x);
64
874k
      Store(Second, d, out1 + x);
65
874k
      Store(Third, d, out2 + x);
66
874k
    }
67
874k
  }
68
428k
  for (; x < w; x++) {
69
346k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
346k
    } else {
81
346k
      pixel_type First = in0[x];
82
346k
      pixel_type Second = in1[x];
83
346k
      pixel_type Third = in2[x];
84
346k
      if (third) Third = PixelAdd(Third, First);
85
346k
      if (second == 1) {
86
346k
        Second = PixelAdd(Second, First);
87
346k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
346k
      out0[x] = First;
91
346k
      out1[x] = Second;
92
346k
      out2[x] = Third;
93
346k
    }
94
346k
  }
95
81.9k
}
void jxl::N_AVX2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
82.7k
               pixel_type* out2, size_t w) {
33
82.7k
  static_assert(transform_type >= 0 && transform_type < 7,
34
82.7k
                "Invalid transform type");
35
82.7k
  int second = transform_type >> 1;
36
82.7k
  int third = transform_type & 1;
37
38
82.7k
  size_t x = 0;
39
82.7k
  const HWY_FULL(pixel_type) d;
40
82.7k
  const size_t N = Lanes(d);
41
1.48M
  for (; x + N - 1 < w; x += N) {
42
1.40M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
1.40M
    } else {
54
1.40M
      auto First = Load(d, in0 + x);
55
1.40M
      auto Second = Load(d, in1 + x);
56
1.40M
      auto Third = Load(d, in2 + x);
57
1.40M
      if (third) Third = Add(Third, First);
58
1.40M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
1.40M
      } else if (second == 2) {
61
1.40M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
1.40M
      }
63
1.40M
      Store(First, d, out0 + x);
64
1.40M
      Store(Second, d, out1 + x);
65
1.40M
      Store(Third, d, out2 + x);
66
1.40M
    }
67
1.40M
  }
68
193k
  for (; x < w; x++) {
69
111k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
111k
    } else {
81
111k
      pixel_type First = in0[x];
82
111k
      pixel_type Second = in1[x];
83
111k
      pixel_type Third = in2[x];
84
111k
      if (third) Third = PixelAdd(Third, First);
85
111k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
111k
      } else if (second == 2) {
88
111k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
111k
      }
90
111k
      out0[x] = First;
91
111k
      out1[x] = Second;
92
111k
      out2[x] = Third;
93
111k
    }
94
111k
  }
95
82.7k
}
void jxl::N_AVX2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
185k
               pixel_type* out2, size_t w) {
33
185k
  static_assert(transform_type >= 0 && transform_type < 7,
34
185k
                "Invalid transform type");
35
185k
  int second = transform_type >> 1;
36
185k
  int third = transform_type & 1;
37
38
185k
  size_t x = 0;
39
185k
  const HWY_FULL(pixel_type) d;
40
185k
  const size_t N = Lanes(d);
41
4.67M
  for (; x + N - 1 < w; x += N) {
42
4.49M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
4.49M
    } else {
54
4.49M
      auto First = Load(d, in0 + x);
55
4.49M
      auto Second = Load(d, in1 + x);
56
4.49M
      auto Third = Load(d, in2 + x);
57
4.49M
      if (third) Third = Add(Third, First);
58
4.49M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
4.49M
      } else if (second == 2) {
61
4.49M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
4.49M
      }
63
4.49M
      Store(First, d, out0 + x);
64
4.49M
      Store(Second, d, out1 + x);
65
4.49M
      Store(Third, d, out2 + x);
66
4.49M
    }
67
4.49M
  }
68
567k
  for (; x < w; x++) {
69
382k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
382k
    } else {
81
382k
      pixel_type First = in0[x];
82
382k
      pixel_type Second = in1[x];
83
382k
      pixel_type Third = in2[x];
84
382k
      if (third) Third = PixelAdd(Third, First);
85
382k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
382k
      } else if (second == 2) {
88
382k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
382k
      }
90
382k
      out0[x] = First;
91
382k
      out1[x] = Second;
92
382k
      out2[x] = Third;
93
382k
    }
94
382k
  }
95
185k
}
void jxl::N_AVX2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
6.11M
               pixel_type* out2, size_t w) {
33
6.11M
  static_assert(transform_type >= 0 && transform_type < 7,
34
6.11M
                "Invalid transform type");
35
6.11M
  int second = transform_type >> 1;
36
6.11M
  int third = transform_type & 1;
37
38
6.11M
  size_t x = 0;
39
6.11M
  const HWY_FULL(pixel_type) d;
40
6.11M
  const size_t N = Lanes(d);
41
10.0M
  for (; x + N - 1 < w; x += N) {
42
3.97M
    if (transform_type == 6) {
43
3.97M
      auto Y = Load(d, in0 + x);
44
3.97M
      auto Co = Load(d, in1 + x);
45
3.97M
      auto Cg = Load(d, in2 + x);
46
3.97M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
3.97M
      auto G = Add(Cg, Y);
48
3.97M
      Y = Sub(Y, ShiftRight<1>(Co));
49
3.97M
      auto R = Add(Y, Co);
50
3.97M
      Store(R, d, out0 + x);
51
3.97M
      Store(G, d, out1 + x);
52
3.97M
      Store(Y, d, out2 + x);
53
3.97M
    } else {
54
12
      auto First = Load(d, in0 + x);
55
12
      auto Second = Load(d, in1 + x);
56
12
      auto Third = Load(d, in2 + x);
57
12
      if (third) Third = Add(Third, First);
58
12
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
12
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
12
      Store(First, d, out0 + x);
64
12
      Store(Second, d, out1 + x);
65
12
      Store(Third, d, out2 + x);
66
12
    }
67
3.97M
  }
68
45.2M
  for (; x < w; x++) {
69
39.1M
    if (transform_type == 6) {
70
39.1M
      pixel_type Y = in0[x];
71
39.1M
      pixel_type Co = in1[x];
72
39.1M
      pixel_type Cg = in2[x];
73
39.1M
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
39.1M
      pixel_type G = PixelAdd(Cg, tmp);
75
39.1M
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
39.1M
      pixel_type R = PixelAdd(B, Co);
77
39.1M
      out0[x] = R;
78
39.1M
      out1[x] = G;
79
39.1M
      out2[x] = B;
80
39.1M
    } else {
81
0
      pixel_type First = in0[x];
82
0
      pixel_type Second = in1[x];
83
0
      pixel_type Third = in2[x];
84
0
      if (third) Third = PixelAdd(Third, First);
85
0
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
0
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
0
      out0[x] = First;
91
0
      out1[x] = Second;
92
0
      out2[x] = Third;
93
0
    }
94
39.1M
  }
95
6.11M
}
Unexecuted instantiation: void jxl::N_SSE2::InvRCTRow<0>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
void jxl::N_SSE2::InvRCTRow<1>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
5.05k
               pixel_type* out2, size_t w) {
33
5.05k
  static_assert(transform_type >= 0 && transform_type < 7,
34
5.05k
                "Invalid transform type");
35
5.05k
  int second = transform_type >> 1;
36
5.05k
  int third = transform_type & 1;
37
38
5.05k
  size_t x = 0;
39
5.05k
  const HWY_FULL(pixel_type) d;
40
5.05k
  const size_t N = Lanes(d);
41
147k
  for (; x + N - 1 < w; x += N) {
42
142k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
142k
    } else {
54
142k
      auto First = Load(d, in0 + x);
55
142k
      auto Second = Load(d, in1 + x);
56
142k
      auto Third = Load(d, in2 + x);
57
142k
      if (third) Third = Add(Third, First);
58
142k
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
142k
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
142k
      Store(First, d, out0 + x);
64
142k
      Store(Second, d, out1 + x);
65
142k
      Store(Third, d, out2 + x);
66
142k
    }
67
142k
  }
68
13.9k
  for (; x < w; x++) {
69
8.88k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
8.88k
    } else {
81
8.88k
      pixel_type First = in0[x];
82
8.88k
      pixel_type Second = in1[x];
83
8.88k
      pixel_type Third = in2[x];
84
8.88k
      if (third) Third = PixelAdd(Third, First);
85
8.88k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
8.88k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
8.88k
      out0[x] = First;
91
8.88k
      out1[x] = Second;
92
8.88k
      out2[x] = Third;
93
8.88k
    }
94
8.88k
  }
95
5.05k
}
void jxl::N_SSE2::InvRCTRow<2>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.75k
               pixel_type* out2, size_t w) {
33
3.75k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.75k
                "Invalid transform type");
35
3.75k
  int second = transform_type >> 1;
36
3.75k
  int third = transform_type & 1;
37
38
3.75k
  size_t x = 0;
39
3.75k
  const HWY_FULL(pixel_type) d;
40
3.75k
  const size_t N = Lanes(d);
41
101k
  for (; x + N - 1 < w; x += N) {
42
97.5k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
97.5k
    } else {
54
97.5k
      auto First = Load(d, in0 + x);
55
97.5k
      auto Second = Load(d, in1 + x);
56
97.5k
      auto Third = Load(d, in2 + x);
57
97.5k
      if (third) Third = Add(Third, First);
58
97.5k
      if (second == 1) {
59
97.5k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
97.5k
      Store(First, d, out0 + x);
64
97.5k
      Store(Second, d, out1 + x);
65
97.5k
      Store(Third, d, out2 + x);
66
97.5k
    }
67
97.5k
  }
68
6.63k
  for (; x < w; x++) {
69
2.88k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
2.88k
    } else {
81
2.88k
      pixel_type First = in0[x];
82
2.88k
      pixel_type Second = in1[x];
83
2.88k
      pixel_type Third = in2[x];
84
2.88k
      if (third) Third = PixelAdd(Third, First);
85
2.88k
      if (second == 1) {
86
2.88k
        Second = PixelAdd(Second, First);
87
2.88k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
2.88k
      out0[x] = First;
91
2.88k
      out1[x] = Second;
92
2.88k
      out2[x] = Third;
93
2.88k
    }
94
2.88k
  }
95
3.75k
}
void jxl::N_SSE2::InvRCTRow<3>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
3.35k
               pixel_type* out2, size_t w) {
33
3.35k
  static_assert(transform_type >= 0 && transform_type < 7,
34
3.35k
                "Invalid transform type");
35
3.35k
  int second = transform_type >> 1;
36
3.35k
  int third = transform_type & 1;
37
38
3.35k
  size_t x = 0;
39
3.35k
  const HWY_FULL(pixel_type) d;
40
3.35k
  const size_t N = Lanes(d);
41
80.5k
  for (; x + N - 1 < w; x += N) {
42
77.2k
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
77.2k
    } else {
54
77.2k
      auto First = Load(d, in0 + x);
55
77.2k
      auto Second = Load(d, in1 + x);
56
77.2k
      auto Third = Load(d, in2 + x);
57
77.2k
      if (third) Third = Add(Third, First);
58
77.3k
      if (second == 1) {
59
77.3k
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
77.2k
      Store(First, d, out0 + x);
64
77.2k
      Store(Second, d, out1 + x);
65
77.2k
      Store(Third, d, out2 + x);
66
77.2k
    }
67
77.2k
  }
68
6.73k
  for (; x < w; x++) {
69
3.37k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
3.37k
    } else {
81
3.37k
      pixel_type First = in0[x];
82
3.37k
      pixel_type Second = in1[x];
83
3.37k
      pixel_type Third = in2[x];
84
3.37k
      if (third) Third = PixelAdd(Third, First);
85
3.37k
      if (second == 1) {
86
3.37k
        Second = PixelAdd(Second, First);
87
3.37k
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
3.37k
      out0[x] = First;
91
3.37k
      out1[x] = Second;
92
3.37k
      out2[x] = Third;
93
3.37k
    }
94
3.37k
  }
95
3.35k
}
void jxl::N_SSE2::InvRCTRow<4>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
63.6k
               pixel_type* out2, size_t w) {
33
63.6k
  static_assert(transform_type >= 0 && transform_type < 7,
34
63.6k
                "Invalid transform type");
35
63.6k
  int second = transform_type >> 1;
36
63.6k
  int third = transform_type & 1;
37
38
63.6k
  size_t x = 0;
39
63.6k
  const HWY_FULL(pixel_type) d;
40
63.6k
  const size_t N = Lanes(d);
41
3.61M
  for (; x + N - 1 < w; x += N) {
42
3.55M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.55M
    } else {
54
3.55M
      auto First = Load(d, in0 + x);
55
3.55M
      auto Second = Load(d, in1 + x);
56
3.55M
      auto Third = Load(d, in2 + x);
57
3.55M
      if (third) Third = Add(Third, First);
58
3.55M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3.55M
      } else if (second == 2) {
61
3.55M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
3.55M
      }
63
3.55M
      Store(First, d, out0 + x);
64
3.55M
      Store(Second, d, out1 + x);
65
3.55M
      Store(Third, d, out2 + x);
66
3.55M
    }
67
3.55M
  }
68
71.1k
  for (; x < w; x++) {
69
7.52k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
7.52k
    } else {
81
7.52k
      pixel_type First = in0[x];
82
7.52k
      pixel_type Second = in1[x];
83
7.52k
      pixel_type Third = in2[x];
84
7.52k
      if (third) Third = PixelAdd(Third, First);
85
7.52k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
7.52k
      } else if (second == 2) {
88
7.52k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
7.52k
      }
90
7.52k
      out0[x] = First;
91
7.52k
      out1[x] = Second;
92
7.52k
      out2[x] = Third;
93
7.52k
    }
94
7.52k
  }
95
63.6k
}
void jxl::N_SSE2::InvRCTRow<5>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
69.5k
               pixel_type* out2, size_t w) {
33
69.5k
  static_assert(transform_type >= 0 && transform_type < 7,
34
69.5k
                "Invalid transform type");
35
69.5k
  int second = transform_type >> 1;
36
69.5k
  int third = transform_type & 1;
37
38
69.5k
  size_t x = 0;
39
69.5k
  const HWY_FULL(pixel_type) d;
40
69.5k
  const size_t N = Lanes(d);
41
3.51M
  for (; x + N - 1 < w; x += N) {
42
3.44M
    if (transform_type == 6) {
43
0
      auto Y = Load(d, in0 + x);
44
0
      auto Co = Load(d, in1 + x);
45
0
      auto Cg = Load(d, in2 + x);
46
0
      Y = Sub(Y, ShiftRight<1>(Cg));
47
0
      auto G = Add(Cg, Y);
48
0
      Y = Sub(Y, ShiftRight<1>(Co));
49
0
      auto R = Add(Y, Co);
50
0
      Store(R, d, out0 + x);
51
0
      Store(G, d, out1 + x);
52
0
      Store(Y, d, out2 + x);
53
3.44M
    } else {
54
3.44M
      auto First = Load(d, in0 + x);
55
3.44M
      auto Second = Load(d, in1 + x);
56
3.44M
      auto Third = Load(d, in2 + x);
57
3.44M
      if (third) Third = Add(Third, First);
58
3.44M
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
3.44M
      } else if (second == 2) {
61
3.44M
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
3.44M
      }
63
3.44M
      Store(First, d, out0 + x);
64
3.44M
      Store(Second, d, out1 + x);
65
3.44M
      Store(Third, d, out2 + x);
66
3.44M
    }
67
3.44M
  }
68
72.6k
  for (; x < w; x++) {
69
3.10k
    if (transform_type == 6) {
70
0
      pixel_type Y = in0[x];
71
0
      pixel_type Co = in1[x];
72
0
      pixel_type Cg = in2[x];
73
0
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
0
      pixel_type G = PixelAdd(Cg, tmp);
75
0
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
0
      pixel_type R = PixelAdd(B, Co);
77
0
      out0[x] = R;
78
0
      out1[x] = G;
79
0
      out2[x] = B;
80
3.10k
    } else {
81
3.10k
      pixel_type First = in0[x];
82
3.10k
      pixel_type Second = in1[x];
83
3.10k
      pixel_type Third = in2[x];
84
3.10k
      if (third) Third = PixelAdd(Third, First);
85
3.10k
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
3.10k
      } else if (second == 2) {
88
3.10k
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
3.10k
      }
90
3.10k
      out0[x] = First;
91
3.10k
      out1[x] = Second;
92
3.10k
      out2[x] = Third;
93
3.10k
    }
94
3.10k
  }
95
69.5k
}
void jxl::N_SSE2::InvRCTRow<6>(int const*, int const*, int const*, int*, int*, int*, unsigned long)
Line
Count
Source
32
87.5k
               pixel_type* out2, size_t w) {
33
87.5k
  static_assert(transform_type >= 0 && transform_type < 7,
34
87.5k
                "Invalid transform type");
35
87.5k
  int second = transform_type >> 1;
36
87.5k
  int third = transform_type & 1;
37
38
87.5k
  size_t x = 0;
39
87.5k
  const HWY_FULL(pixel_type) d;
40
87.5k
  const size_t N = Lanes(d);
41
4.71M
  for (; x + N - 1 < w; x += N) {
42
4.62M
    if (transform_type == 6) {
43
4.62M
      auto Y = Load(d, in0 + x);
44
4.62M
      auto Co = Load(d, in1 + x);
45
4.62M
      auto Cg = Load(d, in2 + x);
46
4.62M
      Y = Sub(Y, ShiftRight<1>(Cg));
47
4.62M
      auto G = Add(Cg, Y);
48
4.62M
      Y = Sub(Y, ShiftRight<1>(Co));
49
4.62M
      auto R = Add(Y, Co);
50
4.62M
      Store(R, d, out0 + x);
51
4.62M
      Store(G, d, out1 + x);
52
4.62M
      Store(Y, d, out2 + x);
53
18.4E
    } else {
54
18.4E
      auto First = Load(d, in0 + x);
55
18.4E
      auto Second = Load(d, in1 + x);
56
18.4E
      auto Third = Load(d, in2 + x);
57
18.4E
      if (third) Third = Add(Third, First);
58
18.4E
      if (second == 1) {
59
0
        Second = Add(Second, First);
60
18.4E
      } else if (second == 2) {
61
0
        Second = Add(Second, ShiftRight<1>(Add(First, Third)));
62
0
      }
63
18.4E
      Store(First, d, out0 + x);
64
18.4E
      Store(Second, d, out1 + x);
65
18.4E
      Store(Third, d, out2 + x);
66
18.4E
    }
67
4.62M
  }
68
121k
  for (; x < w; x++) {
69
33.8k
    if (transform_type == 6) {
70
33.8k
      pixel_type Y = in0[x];
71
33.8k
      pixel_type Co = in1[x];
72
33.8k
      pixel_type Cg = in2[x];
73
33.8k
      pixel_type tmp = PixelAdd(Y, -(Cg >> 1));
74
33.8k
      pixel_type G = PixelAdd(Cg, tmp);
75
33.8k
      pixel_type B = PixelAdd(tmp, -(Co >> 1));
76
33.8k
      pixel_type R = PixelAdd(B, Co);
77
33.8k
      out0[x] = R;
78
33.8k
      out1[x] = G;
79
33.8k
      out2[x] = B;
80
33.8k
    } else {
81
0
      pixel_type First = in0[x];
82
0
      pixel_type Second = in1[x];
83
0
      pixel_type Third = in2[x];
84
0
      if (third) Third = PixelAdd(Third, First);
85
0
      if (second == 1) {
86
0
        Second = PixelAdd(Second, First);
87
0
      } else if (second == 2) {
88
0
        Second = PixelAdd(Second, (PixelAdd(First, Third) >> 1));
89
0
      }
90
0
      out0[x] = First;
91
0
      out1[x] = Second;
92
0
      out2[x] = Third;
93
0
    }
94
33.8k
  }
95
87.5k
}
96
97
18.2k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
18.2k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
18.2k
  size_t m = begin_c;
100
18.2k
  Channel& c0 = input.channel[m + 0];
101
18.2k
  size_t w = c0.w;
102
18.2k
  size_t h = c0.h;
103
18.2k
  if (rct_type == 0) {  // noop
104
4.80k
    return true;
105
4.80k
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
13.4k
  int permutation = rct_type / 7;
108
13.4k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
13.4k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
13.4k
  if (custom == 0) {
118
1.28k
    Channel ch0 = std::move(input.channel[m]);
119
1.28k
    Channel ch1 = std::move(input.channel[m + 1]);
120
1.28k
    Channel ch2 = std::move(input.channel[m + 2]);
121
1.28k
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
1.28k
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
1.28k
        std::move(ch1);
124
1.28k
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
1.28k
        std::move(ch2);
126
1.28k
    return true;
127
1.28k
  }
128
12.1k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
12.1k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
12.1k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
12.1k
  const auto process_row = [&](const uint32_t task,
132
7.23M
                               size_t /* thread */) -> Status {
133
7.23M
    const size_t y = task;
134
7.23M
    const pixel_type* in0 = input.channel[m].Row(y);
135
7.23M
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
7.23M
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
7.23M
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
7.23M
    pixel_type* out1 =
139
7.23M
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
7.23M
    pixel_type* out2 =
141
7.23M
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
7.23M
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
7.23M
    return true;
144
7.23M
  };
rct.cc:jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
245k
                               size_t /* thread */) -> Status {
133
245k
    const size_t y = task;
134
245k
    const pixel_type* in0 = input.channel[m].Row(y);
135
245k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
245k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
245k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
245k
    pixel_type* out1 =
139
245k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
245k
    pixel_type* out2 =
141
245k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
245k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
245k
    return true;
144
245k
  };
rct.cc:jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
6.75M
                               size_t /* thread */) -> Status {
133
6.75M
    const size_t y = task;
134
6.75M
    const pixel_type* in0 = input.channel[m].Row(y);
135
6.75M
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
6.75M
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
6.75M
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
6.75M
    pixel_type* out1 =
139
6.75M
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
6.75M
    pixel_type* out2 =
141
6.75M
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
6.75M
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
6.75M
    return true;
144
6.75M
  };
rct.cc:jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)::$_0::operator()(unsigned int, unsigned long) const
Line
Count
Source
132
235k
                               size_t /* thread */) -> Status {
133
235k
    const size_t y = task;
134
235k
    const pixel_type* in0 = input.channel[m].Row(y);
135
235k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
235k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
235k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
235k
    pixel_type* out1 =
139
235k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
235k
    pixel_type* out2 =
141
235k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
235k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
235k
    return true;
144
235k
  };
145
12.1k
  JXL_RETURN_IF_ERROR(
146
12.1k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
12.1k
  return true;
148
12.1k
}
jxl::N_SSE4::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
3.46k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
3.46k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
3.46k
  size_t m = begin_c;
100
3.46k
  Channel& c0 = input.channel[m + 0];
101
3.46k
  size_t w = c0.w;
102
3.46k
  size_t h = c0.h;
103
3.46k
  if (rct_type == 0) {  // noop
104
224
    return true;
105
224
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
3.24k
  int permutation = rct_type / 7;
108
3.24k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
3.24k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
3.24k
  if (custom == 0) {
118
210
    Channel ch0 = std::move(input.channel[m]);
119
210
    Channel ch1 = std::move(input.channel[m + 1]);
120
210
    Channel ch2 = std::move(input.channel[m + 2]);
121
210
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
210
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
210
        std::move(ch1);
124
210
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
210
        std::move(ch2);
126
210
    return true;
127
210
  }
128
3.03k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
3.03k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
3.03k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
3.03k
  const auto process_row = [&](const uint32_t task,
132
3.03k
                               size_t /* thread */) -> Status {
133
3.03k
    const size_t y = task;
134
3.03k
    const pixel_type* in0 = input.channel[m].Row(y);
135
3.03k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
3.03k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
3.03k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
3.03k
    pixel_type* out1 =
139
3.03k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
3.03k
    pixel_type* out2 =
141
3.03k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
3.03k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
3.03k
    return true;
144
3.03k
  };
145
3.03k
  JXL_RETURN_IF_ERROR(
146
3.03k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
3.03k
  return true;
148
3.03k
}
jxl::N_AVX2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
12.0k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
12.0k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
12.0k
  size_t m = begin_c;
100
12.0k
  Channel& c0 = input.channel[m + 0];
101
12.0k
  size_t w = c0.w;
102
12.0k
  size_t h = c0.h;
103
12.0k
  if (rct_type == 0) {  // noop
104
4.45k
    return true;
105
4.45k
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
7.57k
  int permutation = rct_type / 7;
108
7.57k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
7.57k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
7.57k
  if (custom == 0) {
118
869
    Channel ch0 = std::move(input.channel[m]);
119
869
    Channel ch1 = std::move(input.channel[m + 1]);
120
869
    Channel ch2 = std::move(input.channel[m + 2]);
121
869
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
869
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
869
        std::move(ch1);
124
869
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
869
        std::move(ch2);
126
869
    return true;
127
869
  }
128
6.70k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
6.70k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
6.70k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
6.70k
  const auto process_row = [&](const uint32_t task,
132
6.70k
                               size_t /* thread */) -> Status {
133
6.70k
    const size_t y = task;
134
6.70k
    const pixel_type* in0 = input.channel[m].Row(y);
135
6.70k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
6.70k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
6.70k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
6.70k
    pixel_type* out1 =
139
6.70k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
6.70k
    pixel_type* out2 =
141
6.70k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
6.70k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
6.70k
    return true;
144
6.70k
  };
145
6.70k
  JXL_RETURN_IF_ERROR(
146
6.70k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
6.70k
  return true;
148
6.70k
}
jxl::N_SSE2::InvRCT(jxl::Image&, unsigned long, unsigned long, jxl::ThreadPool*)
Line
Count
Source
97
2.75k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
98
2.75k
  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, begin_c + 2));
99
2.75k
  size_t m = begin_c;
100
2.75k
  Channel& c0 = input.channel[m + 0];
101
2.75k
  size_t w = c0.w;
102
2.75k
  size_t h = c0.h;
103
2.75k
  if (rct_type == 0) {  // noop
104
127
    return true;
105
127
  }
106
  // Permutation: 0=RGB, 1=GBR, 2=BRG, 3=RBG, 4=GRB, 5=BGR
107
2.63k
  int permutation = rct_type / 7;
108
2.63k
  JXL_ENSURE(permutation < 6);
109
  // 0-5 values have the low bit corresponding to Third and the high bits
110
  // corresponding to Second. 6 corresponds to YCoCg.
111
  //
112
  // Second: 0=nop, 1=SubtractFirst, 2=SubtractAvgFirstThird
113
  //
114
  // Third: 0=nop, 1=SubtractFirst
115
2.63k
  int custom = rct_type % 7;
116
  // Special case: permute-only. Swap channels around.
117
2.63k
  if (custom == 0) {
118
207
    Channel ch0 = std::move(input.channel[m]);
119
207
    Channel ch1 = std::move(input.channel[m + 1]);
120
207
    Channel ch2 = std::move(input.channel[m + 2]);
121
207
    input.channel[m + (permutation % 3)] = std::move(ch0);
122
207
    input.channel[m + ((permutation + 1 + permutation / 3) % 3)] =
123
207
        std::move(ch1);
124
207
    input.channel[m + ((permutation + 2 - permutation / 3) % 3)] =
125
207
        std::move(ch2);
126
207
    return true;
127
207
  }
128
2.42k
  constexpr decltype(&InvRCTRow<0>) inv_rct_row[] = {
129
2.42k
      InvRCTRow<0>, InvRCTRow<1>, InvRCTRow<2>, InvRCTRow<3>,
130
2.42k
      InvRCTRow<4>, InvRCTRow<5>, InvRCTRow<6>};
131
2.42k
  const auto process_row = [&](const uint32_t task,
132
2.42k
                               size_t /* thread */) -> Status {
133
2.42k
    const size_t y = task;
134
2.42k
    const pixel_type* in0 = input.channel[m].Row(y);
135
2.42k
    const pixel_type* in1 = input.channel[m + 1].Row(y);
136
2.42k
    const pixel_type* in2 = input.channel[m + 2].Row(y);
137
2.42k
    pixel_type* out0 = input.channel[m + (permutation % 3)].Row(y);
138
2.42k
    pixel_type* out1 =
139
2.42k
        input.channel[m + ((permutation + 1 + permutation / 3) % 3)].Row(y);
140
2.42k
    pixel_type* out2 =
141
2.42k
        input.channel[m + ((permutation + 2 - permutation / 3) % 3)].Row(y);
142
2.42k
    inv_rct_row[custom](in0, in1, in2, out0, out1, out2, w);
143
2.42k
    return true;
144
2.42k
  };
145
2.42k
  JXL_RETURN_IF_ERROR(
146
2.42k
      RunOnPool(pool, 0, h, ThreadPool::NoInit, process_row, "InvRCT"));
147
2.42k
  return true;
148
2.42k
}
149
150
}  // namespace HWY_NAMESPACE
151
}  // namespace jxl
152
HWY_AFTER_NAMESPACE();
153
154
#if HWY_ONCE
155
namespace jxl {
156
157
HWY_EXPORT(InvRCT);
158
18.2k
Status InvRCT(Image& input, size_t begin_c, size_t rct_type, ThreadPool* pool) {
159
18.2k
  return HWY_DYNAMIC_DISPATCH(InvRCT)(input, begin_c, rct_type, pool);
160
18.2k
}
161
162
}  // namespace jxl
163
#endif