/src/Simd/src/Simd/SimdAvx2BgrToRgb.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2021 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | |
27 | | namespace Simd |
28 | | { |
29 | | #ifdef SIMD_AVX2_ENABLE |
30 | | namespace Avx2 |
31 | | { |
// Byte-shuffle control masks used with _mm256_shuffle_epi8 by the 96-byte BgrToRgb kernel in this file.
// Naming, as that kernel uses them: K8_SHFL_<d>S<d> is applied to source vector s<d> itself when
// building output vector <d>; K8_SHFL_<d>P<k> is applied to p<k>, the copy of source vector s<k>
// whose four 64-bit quads were reversed. Each group of 32 values is one mask: 16 selectors for the
// low 128-bit lane followed by 16 for the high lane (assuming SIMD_MM256_SETR_EPI8 lists bytes in
// memory order, as the SETR name suggests; confirm in the macro definition).
// A selector of -1 has its top bit set, so the shuffle writes zero to that output byte; this is
// what allows the three shuffled vectors of one output to be merged with a plain OR.
32 | | const __m256i K8_SHFL_0S0 = SIMD_MM256_SETR_EPI8(0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1, |
33 | | 0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF); |
34 | | const __m256i K8_SHFL_0P0 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9, |
35 | | -1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); |
36 | | const __m256i K8_SHFL_0P1 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
37 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1); |
38 | | const __m256i K8_SHFL_1S1 = SIMD_MM256_SETR_EPI8(-1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD, |
39 | | 0x2, 0x1, 0x0, 0x5, 0x4, 0x3, 0x8, 0x7, 0x6, 0xB, 0xA, 0x9, 0xE, 0xD, 0xC, -1); |
40 | | const __m256i K8_SHFL_1P0 = SIMD_MM256_SETR_EPI8(0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
41 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); |
42 | | const __m256i K8_SHFL_1P2 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
43 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x9); |
44 | | const __m256i K8_SHFL_2S2 = SIMD_MM256_SETR_EPI8(0x0, -1, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0xA, 0x9, 0x8, 0xD, 0xC, 0xB, -1, 0xF, |
45 | | -1, 0x3, 0x2, 0x1, 0x6, 0x5, 0x4, 0x9, 0x8, 0x7, 0xC, 0xB, 0xA, 0xF, 0xE, 0xD); |
46 | | const __m256i K8_SHFL_2P1 = SIMD_MM256_SETR_EPI8(-1, 0x7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
47 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); |
48 | | const __m256i K8_SHFL_2P2 = SIMD_MM256_SETR_EPI8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x8, -1, |
49 | | 0x6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); |
50 | | |
// Kernel: reads three consecutive __m256i vectors (96 bytes) from src and writes 96 bytes to dst,
// with the first and last byte of every 3-byte group exchanged and the middle byte kept
// (32 three-byte pixels per call; BGR to RGB going by the function name).
// 'align' is forwarded to Load<> and Store<>; presumably it selects aligned vs. unaligned memory
// access (confirm in SimdStore.h).
// _mm256_shuffle_epi8 only picks bytes from within the same 128-bit lane, so the few bytes that
// must come from the other lane or from a neighbouring vector are fetched from p0..p2, which are
// the source vectors with their four 64-bit quads reversed (_mm256_permute4x64_epi64, imm 0x1B).
// Each output vector is the OR of three shuffles whose masks zero every byte they do not supply.
51 | | template <bool align> SIMD_INLINE void BgrToRgb(const uint8_t * src, uint8_t * dst) |
52 | 179k | { |
53 | 179k | __m256i s0 = Load<align>((__m256i*)src + 0); |
54 | 179k | __m256i s1 = Load<align>((__m256i*)src + 1); |
55 | 179k | __m256i s2 = Load<align>((__m256i*)src + 2); |
56 | 179k | __m256i p0 = _mm256_permute4x64_epi64(s0, 0x1B); |
57 | 179k | __m256i p1 = _mm256_permute4x64_epi64(s1, 0x1B); |
58 | 179k | __m256i p2 = _mm256_permute4x64_epi64(s2, 0x1B); |
59 | 179k | Store<align>((__m256i*)dst + 0, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, K8_SHFL_0S0), |
60 | 179k | _mm256_shuffle_epi8(p0, K8_SHFL_0P0)), _mm256_shuffle_epi8(p1, K8_SHFL_0P1))); |
61 | 179k | Store<align>((__m256i*)dst + 1, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1, K8_SHFL_1S1), |
62 | 179k | _mm256_shuffle_epi8(p0, K8_SHFL_1P0)), _mm256_shuffle_epi8(p2, K8_SHFL_1P2))); |
63 | 179k | Store<align>((__m256i*)dst + 2, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s2, K8_SHFL_2S2), |
64 | 179k | _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); |
65 | 179k | } void Simd::Avx2::BgrToRgb<true>(unsigned char const*, unsigned char*) Line | Count | Source | 52 | 6.82k | { | 53 | 6.82k | __m256i s0 = Load<align>((__m256i*)src + 0); | 54 | 6.82k | __m256i s1 = Load<align>((__m256i*)src + 1); | 55 | 6.82k | __m256i s2 = Load<align>((__m256i*)src + 2); | 56 | 6.82k | __m256i p0 = _mm256_permute4x64_epi64(s0, 0x1B); | 57 | 6.82k | __m256i p1 = _mm256_permute4x64_epi64(s1, 0x1B); | 58 | | __m256i p2 = _mm256_permute4x64_epi64(s2, 0x1B); | 59 | 6.82k | Store<align>((__m256i*)dst + 0, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, K8_SHFL_0S0), | 60 | 6.82k | _mm256_shuffle_epi8(p0, K8_SHFL_0P0)), _mm256_shuffle_epi8(p1, K8_SHFL_0P1))); | 61 | 6.82k | Store<align>((__m256i*)dst + 1, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1, K8_SHFL_1S1), | 62 | 6.82k | _mm256_shuffle_epi8(p0, K8_SHFL_1P0)), _mm256_shuffle_epi8(p2, K8_SHFL_1P2))); | 63 | 6.82k | Store<align>((__m256i*)dst + 2, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s2, K8_SHFL_2S2), | 64 | 6.82k | _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); | 65 | 6.82k | } |
void Simd::Avx2::BgrToRgb<false>(unsigned char const*, unsigned char*) Line | Count | Source | 52 | 172k | { | 53 | 172k | __m256i s0 = Load<align>((__m256i*)src + 0); | 54 | 172k | __m256i s1 = Load<align>((__m256i*)src + 1); | 55 | 172k | __m256i s2 = Load<align>((__m256i*)src + 2); | 56 | 172k | __m256i p0 = _mm256_permute4x64_epi64(s0, 0x1B); | 57 | 172k | __m256i p1 = _mm256_permute4x64_epi64(s1, 0x1B); | 58 | | __m256i p2 = _mm256_permute4x64_epi64(s2, 0x1B); | 59 | 172k | Store<align>((__m256i*)dst + 0, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s0, K8_SHFL_0S0), | 60 | 172k | _mm256_shuffle_epi8(p0, K8_SHFL_0P0)), _mm256_shuffle_epi8(p1, K8_SHFL_0P1))); | 61 | 172k | Store<align>((__m256i*)dst + 1, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s1, K8_SHFL_1S1), | 62 | 172k | _mm256_shuffle_epi8(p0, K8_SHFL_1P0)), _mm256_shuffle_epi8(p2, K8_SHFL_1P2))); | 63 | 172k | Store<align>((__m256i*)dst + 2, _mm256_or_si256(_mm256_or_si256(_mm256_shuffle_epi8(s2, K8_SHFL_2S2), | 64 | 172k | _mm256_shuffle_epi8(p1, K8_SHFL_2P1)), _mm256_shuffle_epi8(p2, K8_SHFL_2P2))); | 65 | 172k | } |
|
66 | | |
// Row driver: converts a width x height image of 3-byte pixels by calling the 96-byte kernel.
// bgr / rgb point at the first byte of source / destination row 0; bgrStride / rgbStride are the
// byte offsets added to those pointers after each row.
// Preconditions (asserted): width >= A, and when 'align' is true both pointers and both strides
// satisfy Aligned(). A, AlignLo and Aligned are defined outside this file; the kernel consumes
// 96 bytes per call while the loop steps by A3 = A * 3, so this presumably relies on A == 32
// (TODO confirm).
// Per row: the main loop covers 'aligned' bytes in steps of A3 using the <align> kernel. If bytes
// remain (aligned < size), one unaligned kernel call is issued on the last A3 bytes of the row,
// so it re-processes source bytes the main loop already handled instead of running a scalar tail.
// NOTE(review): because the tail call overlaps the main loop's range, calling this in place
// (bgr == rgb) would presumably swap the overlapping pixels twice; confirm callers never alias.
67 | | template <bool align> void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) |
68 | 119k | { |
69 | 119k | assert(width >= A); |
70 | 119k | if (align) |
71 | 119k | assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); |
72 | | 
73 | 119k | const size_t A3 = A * 3; |
74 | 119k | size_t size = width * 3; |
75 | 119k | size_t aligned = AlignLo(width, A) * 3; |
76 | | 
77 | 239k | for (size_t row = 0; row < height; ++row) |
78 | 120k | { |
79 | 180k | for (size_t i = 0; i < aligned; i += A3) |
80 | 60.0k | BgrToRgb<align>(bgr + i, rgb + i); |
81 | 120k | if (aligned < size) |
82 | 119k | BgrToRgb<false>(bgr + size - A3, rgb + size - A3); |
83 | 120k | bgr += bgrStride; |
84 | 120k | rgb += rgbStride; |
85 | 120k | } |
86 | 119k | } void Simd::Avx2::BgrToRgb<true>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) Line | Count | Source | 68 | 39 | { | 69 | 39 | assert(width >= A); | 70 | 39 | if (align) | 71 | 39 | assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); | 72 | | | 73 | 39 | const size_t A3 = A * 3; | 74 | 39 | size_t size = width * 3; | 75 | 39 | size_t aligned = AlignLo(width, A) * 3; | 76 | | | 77 | 78 | for (size_t row = 0; row < height; ++row) | 78 | 39 | { | 79 | 6.86k | for (size_t i = 0; i < aligned; i += A3) | 80 | 6.82k | BgrToRgb<align>(bgr + i, rgb + i); | 81 | 39 | if (aligned < size) | 82 | 0 | BgrToRgb<false>(bgr + size - A3, rgb + size - A3); | 83 | 39 | bgr += bgrStride; | 84 | 39 | rgb += rgbStride; | 85 | 39 | } | 86 | 39 | } |
void Simd::Avx2::BgrToRgb<false>(unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) Line | Count | Source | 68 | 119k | { | 69 | 119k | assert(width >= A); | 70 | 119k | if (align) | 71 | 119k | assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)); | 72 | | | 73 | 119k | const size_t A3 = A * 3; | 74 | 119k | size_t size = width * 3; | 75 | 119k | size_t aligned = AlignLo(width, A) * 3; | 76 | | | 77 | 239k | for (size_t row = 0; row < height; ++row) | 78 | 119k | { | 79 | 173k | for (size_t i = 0; i < aligned; i += A3) | 80 | 53.2k | BgrToRgb<align>(bgr + i, rgb + i); | 81 | 119k | if (aligned < size) | 82 | 119k | BgrToRgb<false>(bgr + size - A3, rgb + size - A3); | 83 | 119k | bgr += bgrStride; | 84 | 119k | rgb += rgbStride; | 85 | 119k | } | 86 | 119k | } |
|
87 | | |
// Non-template entry point: forwards all arguments unchanged to the <true> instantiation only when
// both base pointers and both strides pass Aligned(); otherwise to the <false> instantiation.
88 | | void BgrToRgb(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * rgb, size_t rgbStride) |
89 | 119k | { |
90 | 119k | if (Aligned(bgr) && Aligned(bgrStride) && Aligned(rgb) && Aligned(rgbStride)) |
91 | 39 | BgrToRgb<true>(bgr, width, height, bgrStride, rgb, rgbStride); |
92 | 119k | else |
93 | 119k | BgrToRgb<false>(bgr, width, height, bgrStride, rgb, rgbStride); |
94 | 119k | } |
95 | | } |
96 | | #endif//SIMD_AVX2_ENABLE |
97 | | } |