/src/Simd/src/Simd/SimdAvx2SynetDeconvolution32f.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2024 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynetDeconvolution32f.h" |
25 | | #include "Simd/SimdSynetConvolution32f.h" |
26 | | #include "Simd/SimdSynetConvolution32fCommon.h" |
27 | | #include "Simd/SimdExtract.h" |
28 | | #include "Simd/SimdSynet.h" |
29 | | #include "Simd/SimdAvx2.h" |
30 | | #include "Simd/SimdGemm.h" |
31 | | #include "Simd/SimdExp.h" |
32 | | #include "Simd/SimdCpu.h" |
33 | | |
34 | | namespace Simd |
35 | | { |
36 | | #if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE) |
37 | | namespace Avx2 |
38 | | { |
39 | | SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam & p) |
40 | 0 | : Sse41::SynetDeconvolution32fGemmNN(p) |
41 | 0 | { |
42 | 0 | _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNN, "Avx2")); |
43 | 0 | if (_param.trans && _param.group == 1) |
44 | 0 | { |
45 | 0 | if (NHWC_GEMM_RUNTIME) |
46 | 0 | { |
47 | 0 | _gemmCb.Init(InitGemmCbFuncs(Avx2::Gemm32fNNcbBufferSize, Avx2::Gemm32fNNcbReorderB, Avx2::Gemm32fNNcbRun, "Avx2", GemmKernelF2, GemmKernelF3)); |
48 | 0 | _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K)); |
49 | 0 | } |
50 | 0 | else |
51 | 0 | _nhwcWeight.Resize(Avx2::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE)); |
52 | 0 | _nhwcRun = Avx2::Gemm32fNNcbRun; |
53 | 0 | _nhwcReorderB = Avx2::Gemm32fNNcbReorderB; |
54 | 0 | } |
55 | 0 | _biasAndActivation = Avx2::ConvolutionBiasAndActivation; |
56 | 0 | } |
57 | | |
58 | | //--------------------------------------------------------------------- |
59 | | |
60 | | typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam & p, size_t srcC, size_t dstC, |
61 | | const float * weight, const __m256 * bias, const __m256 * params, float * ds, int first); |
62 | | |
63 | | template<TermType term, SimdConvolutionActivationType type, size_t tail> void DeconvolutionNhwcDirect2x2_M(const float * src0, |
64 | | const DeconvParam & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst, int first) |
65 | 0 | { |
66 | 0 | size_t dS = p.srcC, dD = p.dstC; |
67 | 0 | const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5; |
68 | 0 | if (tail > 1) src1 = src0 + 1 * dS; |
69 | 0 | if (tail > 2) src2 = src0 + 2 * dS; |
70 | 0 | if (tail > 3) src3 = src0 + 3 * dS; |
71 | 0 | if (tail > 4) src4 = src0 + 4 * dS; |
72 | 0 | if (tail > 5) src5 = src0 + 5 * dS; |
73 | 0 | __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; |
74 | 0 | if (first) |
75 | 0 | { |
76 | 0 | if (tail > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); |
77 | 0 | if (tail > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); |
78 | 0 | if (tail > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); |
79 | 0 | if (tail > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); |
80 | 0 | if (tail > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps(); |
81 | 0 | if (tail > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps(); |
82 | 0 | } |
83 | 0 | else |
84 | 0 | { |
85 | 0 | if (tail > 0) d00 = _mm256_loadu_ps(dst + 0x0 * dD), d01 = _mm256_loadu_ps(dst + 0x1 * dD); |
86 | 0 | if (tail > 1) d10 = _mm256_loadu_ps(dst + 0x2 * dD), d11 = _mm256_loadu_ps(dst + 0x3 * dD); |
87 | 0 | if (tail > 2) d20 = _mm256_loadu_ps(dst + 0x4 * dD), d21 = _mm256_loadu_ps(dst + 0x5 * dD); |
88 | 0 | if (tail > 3) d30 = _mm256_loadu_ps(dst + 0x6 * dD), d31 = _mm256_loadu_ps(dst + 0x7 * dD); |
89 | 0 | if (tail > 4) d40 = _mm256_loadu_ps(dst + 0x8 * dD), d41 = _mm256_loadu_ps(dst + 0x9 * dD); |
90 | 0 | if (tail > 5) d50 = _mm256_loadu_ps(dst + 0xa * dD), d51 = _mm256_loadu_ps(dst + 0xb * dD); |
91 | 0 | } |
92 | 0 | for (size_t sc = 0; sc < srcC; ++sc) |
93 | 0 | { |
94 | 0 | w0 = _mm256_loadu_ps(weight0); |
95 | 0 | w1 = _mm256_loadu_ps(weight1); |
96 | 0 | if (tail > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01); |
97 | 0 | if (tail > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11); |
98 | 0 | if (tail > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21); |
99 | 0 | if (tail > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31); |
100 | 0 | if (tail > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41); |
101 | 0 | if (tail > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51); |
102 | 0 | weight0 += F; |
103 | 0 | weight1 += F; |
104 | 0 | } |
105 | 0 | if (dstC == F) |
106 | 0 | { |
107 | 0 | if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params); |
108 | 0 | if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params); |
109 | 0 | if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params); |
110 | 0 | if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params); |
111 | 0 | if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params); |
112 | 0 | if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params); |
113 | 0 | } |
114 | 0 | else |
115 | 0 | { |
116 | 0 | if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params, dstC); |
117 | 0 | if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params, dstC); |
118 | 0 | if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params, dstC); |
119 | 0 | if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params, dstC); |
120 | 0 | if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params, dstC); |
121 | 0 | if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params, dstC); |
122 | 0 | } |
123 | 0 | } Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) |
124 | | |
125 | | template <TermType term, SimdConvolutionActivationType type> SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetDeconvolutionNhwcDirect2x2(size_t tail) |
126 | 0 | { |
127 | 0 | switch (tail) |
128 | 0 | { |
129 | 0 | case 0: return NULL; |
130 | 0 | case 1: return DeconvolutionNhwcDirect2x2_M<term, type, 1>; |
131 | 0 | case 2: return DeconvolutionNhwcDirect2x2_M<term, type, 2>; |
132 | 0 | case 3: return DeconvolutionNhwcDirect2x2_M<term, type, 3>; |
133 | 0 | case 4: return DeconvolutionNhwcDirect2x2_M<term, type, 4>; |
134 | 0 | case 5: return DeconvolutionNhwcDirect2x2_M<term, type, 5>; |
135 | 0 | case 6: return DeconvolutionNhwcDirect2x2_M<term, type, 6>; |
136 | 0 | default: |
137 | 0 | assert(0); |
138 | 0 | return NULL; |
139 | 0 | } |
140 | 0 | } Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, 
float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int) |
141 | | |
142 | | template<TermType term, SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float* src, const DeconvParam& p, |
143 | | size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst, int first) |
144 | 0 | { |
145 | 0 | size_t body = 6, srcWb = AlignLoAny(p.srcW, body), tail = p.srcW - srcWb; |
146 | 0 | DeconvolutionNhwcDirect2x2_Ptr bodyKernel = GetDeconvolutionNhwcDirect2x2<term, type>(body); |
147 | 0 | DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetDeconvolutionNhwcDirect2x2<term, type>(tail); |
148 | |
|
149 | 0 | __m256 _params[2], _bias[1]; |
150 | 0 | _params[0] = _mm256_set1_ps(params[0]); |
151 | 0 | if (type == SimdConvolutionActivationRestrictRange || |
152 | 0 | type == SimdConvolutionActivationHswish || |
153 | 0 | type == SimdConvolutionActivationHardSigmoid) |
154 | 0 | _params[1] = _mm256_set1_ps(params[1]); |
155 | |
|
156 | 0 | for (size_t dc = 0; dc < dstC; dc += F) |
157 | 0 | { |
158 | 0 | size_t dC = Simd::Min(F, dstC - dc); |
159 | 0 | _bias[0] = _mm256_loadu_ps(bias + dc); |
160 | 0 | if (type == ::SimdConvolutionActivationPrelu) |
161 | 0 | _params[0] = _mm256_loadu_ps(params + dc); |
162 | 0 | const float* s = src + yBeg * p.srcW * p.srcC; |
163 | 0 | float* d = dst + yBeg * p.strideY * p.dstW * p.dstC; |
164 | 0 | const float* w0 = weight + 0 * p.kernelX * srcC * F; |
165 | 0 | const float* w1 = weight + 1 * p.kernelX * srcC * F; |
166 | 0 | for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC) |
167 | 0 | { |
168 | 0 | for (size_t sx = 0; sx < srcWb; sx += body) |
169 | 0 | bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += body * p.strideX * p.dstC; |
170 | 0 | if (tail) |
171 | 0 | tailKernel(s + srcWb * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += tail * p.strideX * p.dstC; |
172 | 0 | for (size_t sx = 0; sx < srcWb; sx += body) |
173 | 0 | bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += body * p.strideX * p.dstC; |
174 | 0 | if (tail) |
175 | 0 | tailKernel(s + srcWb * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += tail * p.strideX * p.dstC; |
176 | 0 | } |
177 | 0 | weight += p.kernelY * p.kernelX * srcC * F; |
178 | 0 | dst += F; |
179 | 0 | } |
180 | 0 | } Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void 
Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int) |
181 | | |
// Macro-tiling driver for the 2x2 NHWC direct deconvolution.
// Walks the output channels in steps of a.macroD, the input channels in steps of a.macroC and the
// source rows in steps of a.macroH, and calls the lower-level
// DeconvolutionNhwcDirect2x2<TermType, activation> overload once per (dc, sc, [yBeg, yEnd)) tile.
//   src    - input base pointer; each call receives src + sc.
//   p      - deconvolution parameters (dstC, srcC, srcH, kernelY, kernelX are read here).
//   a      - tiling parameters (macroD, macroC, macroH, microD are read here).
//   weight - weight pointer, advanced after every input-channel tile (see below).
//   bias   - bias base pointer; each call receives bias + dc.
//   params - activation parameters; advanced per output-channel tile only for Prelu.
//   dst    - output base pointer; each call receives dst + dc.
// NOTE(review): the trailing int argument of the inner call (1 on the first input-channel tile,
// 0 afterwards) presumably tells the kernel whether to start a fresh accumulation - confirm
// against the kernel definition, which is outside this view.
182 | | template<SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float* src, const DeconvParam& p, |
183 | | const SynetDeconvolution32fNhwcDirect2x2::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst) |
184 | 0 | { |
185 | 0 | for (size_t dc = 0; dc < p.dstC; dc += a.macroD) |
186 | 0 | { |
// Width of this output-channel tile; the last tile is clipped to p.dstC.
187 | 0 | size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc; |
188 | 0 | for (size_t sc = 0; sc < p.srcC; sc += a.macroC) |
189 | 0 | { |
// Width of this input-channel tile (clipped to p.srcC) and the matching number of
// kernelY * kernelX * channel weight rows.
190 | 0 | size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc; |
191 | 0 | size_t macroK = p.kernelY * p.kernelX * macroC; |
192 | 0 | for (size_t yBeg = 0; yBeg < p.srcH;) |
193 | 0 | { |
194 | 0 | size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH); |
// Kernel selection by position of the input-channel tile:
//   single tile covering all of srcC -> TermLast with the requested activation, flag 1;
//   first of several tiles (sc == 0) -> TermInterim with Identity activation, flag 1;
//   final tile (sc + macroC == srcC) -> TermLast with the requested activation, flag 0;
//   any tile in between              -> TermInterim with Identity activation, flag 0.
195 | 0 | if (a.macroC == p.srcC) |
196 | 0 | DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1); |
197 | 0 | else if (sc == 0) |
198 | 0 | DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1); |
199 | 0 | else if (sc + macroC == p.srcC) |
200 | 0 | DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0); |
201 | 0 | else |
202 | 0 | DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0); |
203 | 0 | yBeg = yEnd; |
204 | 0 | } |
// Step past the weights consumed by this (dc, sc) tile; the output width is rounded up to a
// multiple of a.microD. The pointer is never rewound, so the weights are presumably stored
// in exactly this tile visiting order - confirm against the weight packing code.
205 | 0 | weight += AlignHiAny(macroD, a.microD) * macroK; |
206 | 0 | } |
// Only Prelu moves the activation parameters along with the output channels
// (presumably one value per output channel - confirm).
207 | 0 | if (type == ::SimdConvolutionActivationPrelu) |
208 | 0 | params += macroD; |
209 | 0 | } |
210 | 0 | } Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, 
Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*) |
211 | | |
// AVX2 constructor of the 2x2 NHWC direct deconvolution.
// Always runs the Sse41 base constructor first. Only when p.dstC > HF does it replace
// _deconvolution with an AVX2 instantiation chosen by p.activation and call SetAlgParam with F
// and the three cache sizes; otherwise the state set up by the Sse41 base is left untouched.
// NOTE(review): F / HF are presumably the full / half AVX2 float vector widths - confirm.
212 | | SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam & p) |
213 | 0 | : Sse41::SynetDeconvolution32fNhwcDirect2x2(p) |
214 | 0 | { |
215 | 0 | if (p.dstC > HF) |
216 | 0 | { |
// Identity and Relu reuse the RestrictRange kernel and LeakyRelu reuses the Prelu kernel, so
// only activation types RestrictRange..Gelu are instantiated. Presumably the activation
// parameters are prepared elsewhere so that these substitutions are equivalent - confirm.
217 | 0 | switch (p.activation) |
218 | 0 | { |
219 | 0 | case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break; |
220 | 0 | case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break; |
221 | 0 | case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break; |
222 | 0 | case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break; |
223 | 0 | case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break; |
224 | 0 | case SimdConvolutionActivationElu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationElu>; break; |
225 | 0 | case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHswish>; break; |
226 | 0 | case SimdConvolutionActivationMish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationMish>; break; |
227 | 0 | case SimdConvolutionActivationHardSigmoid: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHardSigmoid>; break; |
228 | 0 | case SimdConvolutionActivationSwish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationSwish>; break; |
229 | 0 | case SimdConvolutionActivationGelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationGelu>; break; |
// Unknown activation: asserts in debug builds; _deconvolution is not reassigned here.
230 | 0 | default: assert(0); |
231 | 0 | } |
232 | 0 | SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3()); |
233 | 0 | } |
234 | 0 | } |
235 | | |
236 | | //--------------------------------------------------------------------- |
237 | | |
// Factory for the AVX2 32-bit float deconvolution context.
// Builds a DeconvParam from (batch, conv, compatibility) and returns:
//   NULL                                   - when param.Valid(SimdTensorData32f) fails;
//   new SynetDeconvolution32fNhwcDirect2x2 - when that algorithm reports Preferable(param);
//   new SynetDeconvolution32fGemmNN        - in every other case.
// The object is allocated with `new` and returned as void*; it is not released here, so
// presumably the caller frees it through the library's release routine - confirm.
238 | | void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility) |
239 | 0 | { |
240 | 0 | DeconvParam param(batch, conv, compatibility); |
241 | 0 | if (!param.Valid(SimdTensorData32f)) |
242 | 0 | return NULL; |
243 | 0 | if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param)) |
244 | 0 | return new SynetDeconvolution32fNhwcDirect2x2(param); |
245 | 0 | else |
246 | 0 | return new SynetDeconvolution32fGemmNN(param); |
247 | 0 | } |
248 | | } |
249 | | #endif//SIMD_AVX2_ENABLE |
250 | | } |