/src/Simd/src/Simd/SimdAvx2YuvToHue.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2017 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdStore.h" |
25 | | #include "Simd/SimdMemory.h" |
26 | | #include "Simd/SimdConversion.h" |
27 | | |
28 | | namespace Simd |
29 | | { |
30 | | #ifdef SIMD_AVX2_ENABLE |
31 | | namespace Avx2 |
32 | | { |
33 | | SIMD_INLINE __m256i MulDiv32(__m256i dividend, __m256i divisor, const __m256 & KF_255_DIV_6) |
34 | 0 | { |
35 | 0 | return _mm256_cvttps_epi32(_mm256_div_ps(_mm256_mul_ps(KF_255_DIV_6, _mm256_cvtepi32_ps(dividend)), _mm256_cvtepi32_ps(divisor))); |
36 | 0 | } |
37 | | |
38 | | SIMD_INLINE __m256i MulDiv16(__m256i dividend, __m256i divisor, const __m256 & KF_255_DIV_6) |
39 | 0 | { |
40 | 0 | const __m256i quotientLo = MulDiv32(_mm256_unpacklo_epi16(dividend, K_ZERO), _mm256_unpacklo_epi16(divisor, K_ZERO), KF_255_DIV_6); |
41 | 0 | const __m256i quotientHi = MulDiv32(_mm256_unpackhi_epi16(dividend, K_ZERO), _mm256_unpackhi_epi16(divisor, K_ZERO), KF_255_DIV_6); |
42 | 0 | return _mm256_packs_epi32(quotientLo, quotientHi); |
43 | 0 | } |
44 | | |
45 | | SIMD_INLINE __m256i AdjustedYuvToHue16(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) |
46 | 0 | { |
47 | 0 | const __m256i red = AdjustedYuvToRed16(y, v); |
48 | 0 | const __m256i green = AdjustedYuvToGreen16(y, u, v); |
49 | 0 | const __m256i blue = AdjustedYuvToBlue16(y, u); |
50 | 0 | const __m256i max = MaxI16(red, green, blue); |
51 | 0 | const __m256i range = _mm256_subs_epi16(max, MinI16(red, green, blue)); |
52 | |
|
53 | 0 | const __m256i redMaxMask = _mm256_cmpeq_epi16(red, max); |
54 | 0 | const __m256i greenMaxMask = _mm256_andnot_si256(redMaxMask, _mm256_cmpeq_epi16(green, max)); |
55 | 0 | const __m256i blueMaxMask = _mm256_andnot_si256(redMaxMask, _mm256_andnot_si256(greenMaxMask, K_INV_ZERO)); |
56 | |
|
57 | 0 | const __m256i redMaxCase = _mm256_and_si256(redMaxMask, |
58 | 0 | _mm256_add_epi16(_mm256_sub_epi16(green, blue), _mm256_mullo_epi16(range, K16_0006))); |
59 | 0 | const __m256i greenMaxCase = _mm256_and_si256(greenMaxMask, |
60 | 0 | _mm256_add_epi16(_mm256_sub_epi16(blue, red), _mm256_mullo_epi16(range, K16_0002))); |
61 | 0 | const __m256i blueMaxCase = _mm256_and_si256(blueMaxMask, |
62 | 0 | _mm256_add_epi16(_mm256_sub_epi16(red, green), _mm256_mullo_epi16(range, K16_0004))); |
63 | |
|
64 | 0 | const __m256i dividend = _mm256_or_si256(_mm256_or_si256(redMaxCase, greenMaxCase), blueMaxCase); |
65 | |
|
66 | 0 | return _mm256_andnot_si256(_mm256_cmpeq_epi16(range, K_ZERO), _mm256_and_si256(MulDiv16(dividend, range, KF_255_DIV_6), K16_00FF)); |
67 | 0 | } |
68 | | |
69 | | SIMD_INLINE __m256i YuvToHue16(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) |
70 | 0 | { |
71 | 0 | return AdjustedYuvToHue16(AdjustY16(y), AdjustUV16(u), AdjustUV16(v), KF_255_DIV_6); |
72 | 0 | } |
73 | | |
74 | | SIMD_INLINE __m256i YuvToHue8(__m256i y, __m256i u, __m256i v, const __m256 & KF_255_DIV_6) |
75 | 0 | { |
76 | 0 | return _mm256_packus_epi16( |
77 | 0 | YuvToHue16(_mm256_unpacklo_epi8(y, K_ZERO), _mm256_unpacklo_epi8(u, K_ZERO), _mm256_unpacklo_epi8(v, K_ZERO), KF_255_DIV_6), |
78 | 0 | YuvToHue16(_mm256_unpackhi_epi8(y, K_ZERO), _mm256_unpackhi_epi8(u, K_ZERO), _mm256_unpackhi_epi8(v, K_ZERO), KF_255_DIV_6)); |
79 | 0 | } |
80 | | |
81 | | template <bool align> SIMD_INLINE void Yuv420pToHue(const uint8_t * y, __m256i u, __m256i v, uint8_t * hue, const __m256 & KF_255_DIV_6) |
82 | 0 | { |
83 | 0 | Store<align>((__m256i*)(hue), YuvToHue8(Load<align>((__m256i*)(y)), |
84 | 0 | _mm256_unpacklo_epi8(u, u), _mm256_unpacklo_epi8(v, v), KF_255_DIV_6)); |
85 | 0 | Store<align>((__m256i*)(hue + A), YuvToHue8(Load<align>((__m256i*)(y + A)), |
86 | 0 | _mm256_unpackhi_epi8(u, u), _mm256_unpackhi_epi8(v, v), KF_255_DIV_6)); |
87 | 0 | } Unexecuted instantiation: void Simd::Avx2::Yuv420pToHue<true>(unsigned char const*, long long __vector(4), long long __vector(4), unsigned char*, float __vector(8) const&) Unexecuted instantiation: void Simd::Avx2::Yuv420pToHue<false>(unsigned char const*, long long __vector(4), long long __vector(4), unsigned char*, float __vector(8) const&) |
88 | | |
89 | | template <bool align> void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, |
90 | | size_t width, size_t height, uint8_t * hue, size_t hueStride) |
91 | 0 | { |
92 | 0 | assert((width % 2 == 0) && (height % 2 == 0) && (width >= DA) && (height >= 2)); |
93 | 0 | if (align) |
94 | 0 | { |
95 | 0 | assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); |
96 | 0 | assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); |
97 | 0 | } |
98 | |
|
99 | 0 | const __m256 KF_255_DIV_6 = _mm256_set1_ps(Base::KF_255_DIV_6); |
100 | |
|
101 | 0 | size_t bodyWidth = AlignLo(width, DA); |
102 | 0 | size_t tail = width - bodyWidth; |
103 | 0 | for (size_t row = 0; row < height; row += 2) |
104 | 0 | { |
105 | 0 | for (size_t colUV = 0, colY = 0, col_hue = 0; colY < bodyWidth; colY += DA, colUV += A, col_hue += DA) |
106 | 0 | { |
107 | 0 | __m256i u_ = LoadPermuted<align>((__m256i*)(u + colUV)); |
108 | 0 | __m256i v_ = LoadPermuted<align>((__m256i*)(v + colUV)); |
109 | 0 | Yuv420pToHue<align>(y + colY, u_, v_, hue + col_hue, KF_255_DIV_6); |
110 | 0 | Yuv420pToHue<align>(y + yStride + colY, u_, v_, hue + hueStride + col_hue, KF_255_DIV_6); |
111 | 0 | } |
112 | 0 | if (tail) |
113 | 0 | { |
114 | 0 | size_t offset = width - DA; |
115 | 0 | __m256i u_ = LoadPermuted<false>((__m256i*)(u + offset / 2)); |
116 | 0 | __m256i v_ = LoadPermuted<false>((__m256i*)(v + offset / 2)); |
117 | 0 | Yuv420pToHue<false>(y + offset, u_, v_, hue + offset, KF_255_DIV_6); |
118 | 0 | Yuv420pToHue<false>(y + yStride + offset, u_, v_, hue + hueStride + offset, KF_255_DIV_6); |
119 | 0 | } |
120 | 0 | y += 2 * yStride; |
121 | 0 | u += uStride; |
122 | 0 | v += vStride; |
123 | 0 | hue += 2 * hueStride; |
124 | 0 | } |
125 | 0 | } Unexecuted instantiation: void Simd::Avx2::Yuv420pToHue<true>(unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) Unexecuted instantiation: void Simd::Avx2::Yuv420pToHue<false>(unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) |
126 | | |
127 | | template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, |
128 | | size_t width, size_t height, uint8_t * hue, size_t hueStride) |
129 | 0 | { |
130 | 0 | assert(width >= A); |
131 | 0 | if (align) |
132 | 0 | { |
133 | 0 | assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); |
134 | 0 | assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); |
135 | 0 | } |
136 | |
|
137 | 0 | const __m256 KF_255_DIV_6 = _mm256_set1_ps(Base::KF_255_DIV_6); |
138 | |
|
139 | 0 | size_t bodyWidth = AlignLo(width, A); |
140 | 0 | size_t tail = width - bodyWidth; |
141 | 0 | for (size_t row = 0; row < height; row += 1) |
142 | 0 | { |
143 | 0 | for (size_t col = 0; col < bodyWidth; col += A) |
144 | 0 | { |
145 | 0 | Store<align>((__m256i*)(hue + col), YuvToHue8(Load<align>((__m256i*)(y + col)), |
146 | 0 | Load<align>((__m256i*)(u + col)), Load<align>((__m256i*)(v + col)), KF_255_DIV_6)); |
147 | 0 | } |
148 | 0 | if (tail) |
149 | 0 | { |
150 | 0 | size_t offset = width - A; |
151 | 0 | Store<false>((__m256i*)(hue + offset), YuvToHue8(Load<false>((__m256i*)(y + offset)), |
152 | 0 | Load<false>((__m256i*)(u + offset)), Load<false>((__m256i*)(v + offset)), KF_255_DIV_6)); |
153 | 0 | } |
154 | 0 | y += yStride; |
155 | 0 | u += uStride; |
156 | 0 | v += vStride; |
157 | 0 | hue += hueStride; |
158 | 0 | } |
159 | 0 | } Unexecuted instantiation: void Simd::Avx2::Yuv444pToHue<true>(unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) Unexecuted instantiation: void Simd::Avx2::Yuv444pToHue<false>(unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned long, unsigned long, unsigned char*, unsigned long) |
160 | | |
161 | | void Yuv420pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, |
162 | | size_t width, size_t height, uint8_t * hue, size_t hueStride) |
163 | 0 | { |
164 | 0 | if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) |
165 | 0 | Yuv420pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); |
166 | 0 | else |
167 | 0 | Yuv420pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); |
168 | 0 | } |
169 | | |
170 | | void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, |
171 | | size_t width, size_t height, uint8_t * hue, size_t hueStride) |
172 | 0 | { |
173 | 0 | if (Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)) |
174 | 0 | Yuv444pToHue<true>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); |
175 | 0 | else |
176 | 0 | Yuv444pToHue<false>(y, yStride, u, uStride, v, vStride, width, height, hue, hueStride); |
177 | 0 | } |
178 | | } |
179 | | #endif// SIMD_AVX2_ENABLE |
180 | | } |