/src/Simd/src/Simd/SimdAvx512bwYuvToUyvy.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2022 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | |
27 | | namespace Simd |
28 | | { |
29 | | #ifdef SIMD_AVX512BW_ENABLE |
30 | | namespace Avx512bw |
31 | | { |
        // Converts a 64-pixel-wide, two-row tile of planar YUV420P into packed
        // UYVY422 (byte order U0 Y0 V0 Y1 per pixel pair). The same 32 U and 32 V
        // samples are reused for both luma rows, since 4:2:0 chroma is subsampled
        // vertically as well as horizontally.
        //
        // y0         - first of two consecutive luma rows (64 bytes consumed per row).
        // yStride    - byte stride between luma rows.
        // u, v       - 32 chroma samples each for this tile.
        // uyvy0      - first of two output rows (128 bytes written per row).
        // uyvyStride - byte stride between output rows.
        // yuvMask    - lane mask for the input loads (32 bytes of U/V; 32 16-bit
        //              units = 64 bytes of Y, i.e. one mask bit per pixel pair).
        // uyvyMask0/1 - lane masks (16-bit units) for the two 64-byte output stores.
        SIMD_INLINE void Yuv420pToUyvy422(const uint8_t* y0, size_t yStride, const uint8_t* u, const uint8_t* v,
            uint8_t* uyvy0, size_t uyvyStride, __mmask32 yuvMask, __mmask32 uyvyMask0, __mmask32 uyvyMask1)
        {
            // Dword selectors for _mm512_permutex2var_epi32: indices 0x00-0x0F pick
            // dwords from the UV register (first operand), 0x10-0x1F from the Y
            // register (second operand). Each 128-bit lane of the result receives
            // 4 U bytes, 4 V bytes and 8 Y bytes; PRM0 covers the first 32 Y bytes,
            // PRM1 the last 32.
            static const __m512i PRM0 = SIMD_MM512_SETR_EPI32(0x00, 0x08, 0x10, 0x11, 0x01, 0x09, 0x12, 0x13, 0x02, 0x0A, 0x14, 0x15, 0x03, 0x0B, 0x16, 0x17);
            static const __m512i PRM1 = SIMD_MM512_SETR_EPI32(0x04, 0x0C, 0x18, 0x19, 0x05, 0x0D, 0x1A, 0x1B, 0x06, 0x0E, 0x1C, 0x1D, 0x07, 0x0F, 0x1E, 0x1F);
            // In-lane byte shuffle: with each lane laid out as {U0..U3, V0..V3, Y0..Y7}
            // by the permute above, this emits U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... — packed UYVY.
            static const __m512i SHFL = SIMD_MM512_SETR_EPI8(
                0x0, 0x8, 0x4, 0x9, 0x1, 0xA, 0x5, 0xB, 0x2, 0xC, 0x6, 0xD, 0x3, 0xE, 0x7, 0xF,
                0x0, 0x8, 0x4, 0x9, 0x1, 0xA, 0x5, 0xB, 0x2, 0xC, 0x6, 0xD, 0x3, 0xE, 0x7, 0xF,
                0x0, 0x8, 0x4, 0x9, 0x1, 0xA, 0x5, 0xB, 0x2, 0xC, 0x6, 0xD, 0x3, 0xE, 0x7, 0xF,
                0x0, 0x8, 0x4, 0x9, 0x1, 0xA, 0x5, 0xB, 0x2, 0xC, 0x6, 0xD, 0x3, 0xE, 0x7, 0xF);
            // Project helper: masked load of U and V into one zmm register. The PRM
            // index tables require U in the low 256 bits and V in the high 256 bits
            // (dwords 0x00-0x07 = U, 0x08-0x0F = V) — presumably Load() provides that
            // layout; confirm against its definition in SimdStore.h.
            __m512i uv = Load(u, v, yuvMask);
            // 64 bytes of the first luma row, masked in 16-bit (pixel-pair) units.
            __m512i _y0 = _mm512_maskz_loadu_epi16(yuvMask, y0);
            _mm512_mask_storeu_epi16(uyvy0 + 0 * 64, uyvyMask0, _mm512_shuffle_epi8(_mm512_permutex2var_epi32(uv, PRM0, _y0), SHFL));
            _mm512_mask_storeu_epi16(uyvy0 + 1 * 64, uyvyMask1, _mm512_shuffle_epi8(_mm512_permutex2var_epi32(uv, PRM1, _y0), SHFL));
            // Second luma row: same chroma, same permutes, next output row.
            __m512i _y1 = _mm512_maskz_loadu_epi16(yuvMask, y0 + yStride);
            uint8_t* uyvy1 = uyvy0 + uyvyStride;
            _mm512_mask_storeu_epi16(uyvy1 + 0 * 64, uyvyMask0, _mm512_shuffle_epi8(_mm512_permutex2var_epi32(uv, PRM0, _y1), SHFL));
            _mm512_mask_storeu_epi16(uyvy1 + 1 * 64, uyvyMask1, _mm512_shuffle_epi8(_mm512_permutex2var_epi32(uv, PRM1, _y1), SHFL));
        }
51 | | |
52 | | void Yuv420pToUyvy422(const uint8_t* y, size_t yStride, const uint8_t* u, size_t uStride, |
53 | | const uint8_t* v, size_t vStride, size_t width, size_t height, uint8_t* uyvy, size_t uyvyStride) |
54 | 0 | { |
55 | 0 | assert((width % 2 == 0) && (height % 2 == 0) && width >= 2 * A); |
56 | |
|
57 | 0 | size_t size = width / 2; |
58 | 0 | size_t size32 = AlignLo(size, 32); |
59 | 0 | size_t tail = size - size32; |
60 | 0 | __mmask32 yuvMask = TailMask32(tail); |
61 | 0 | __mmask32 uyvyMask0 = TailMask32(tail * 2 - 32 * 0); |
62 | 0 | __mmask32 uyvyMask1 = TailMask32(tail * 2 - 32 * 1); |
63 | |
|
64 | 0 | for (size_t row = 0; row < height; row += 2) |
65 | 0 | { |
66 | 0 | size_t colY = 0, colUV = 0, colUyvy = 0; |
67 | 0 | for (; colUV < size32; colY += 64, colUV += 32, colUyvy += 128) |
68 | 0 | Yuv420pToUyvy422(y + colY, yStride, u + colUV, v + colUV, uyvy + colUyvy, uyvyStride, __mmask32(-1), __mmask32(-1), __mmask32(-1)); |
69 | 0 | if (tail) |
70 | 0 | Yuv420pToUyvy422(y + colY, yStride, u + colUV, v + colUV, uyvy + colUyvy, uyvyStride, yuvMask, uyvyMask0, uyvyMask1); |
71 | 0 | y += 2 * yStride; |
72 | 0 | u += uStride; |
73 | 0 | v += vStride; |
74 | 0 | uyvy += 2 * uyvyStride; |
75 | 0 | } |
76 | 0 | } |
77 | | } |
78 | | #endif |
79 | | } |