Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx2SynetDeconvolution32f.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetDeconvolution32f.h"
25
#include "Simd/SimdSynetConvolution32f.h"
26
#include "Simd/SimdSynetConvolution32fCommon.h"
27
#include "Simd/SimdExtract.h"
28
#include "Simd/SimdSynet.h"
29
#include "Simd/SimdAvx2.h"
30
#include "Simd/SimdGemm.h"
31
#include "Simd/SimdExp.h"
32
#include "Simd/SimdCpu.h"
33
34
namespace Simd
35
{
36
#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)  
37
    namespace Avx2
38
    {
39
        // Builds the AVX2 flavour of the GEMM-based deconvolution on top of the SSE4.1
        // implementation: the base class is constructed first, then the GEMM entry points
        // and the bias/activation routine are replaced with their Avx2 counterparts.
        SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam & p)
            : Sse41::SynetDeconvolution32fGemmNN(p)
        {
            _gemm.Init(InitGemmFuncs(Avx2::Gemm32fNN, "Avx2"));

            // The pre-reordered ("cb") weight path is configured only for transposed,
            // non-grouped layers.
            const bool useNhwcGemm = _param.trans && _param.group == 1;
            if (useNhwcGemm)
            {
                if (NHWC_GEMM_RUNTIME)
                {
                    // Register the candidate cb-GEMM kernels and size the weight buffer
                    // using the first registered candidate.
                    _gemmCb.Init(InitGemmCbFuncs(Avx2::Gemm32fNNcbBufferSize, Avx2::Gemm32fNNcbReorderB, Avx2::Gemm32fNNcbRun, "Avx2", GemmKernelF2, GemmKernelF3));
                    _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K));
                }
                else
                {
                    // Fixed configuration: size the buffer directly from the Avx2 helper.
                    _nhwcWeight.Resize(Avx2::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE));
                }
                _nhwcRun = Avx2::Gemm32fNNcbRun;
                _nhwcReorderB = Avx2::Gemm32fNNcbReorderB;
            }

            _biasAndActivation = Avx2::ConvolutionBiasAndActivation;
        }
57
58
        //---------------------------------------------------------------------
59
60
        typedef void(*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam & p, size_t srcC, size_t dstC, 
61
            const float * weight, const __m256 * bias, const __m256 * params, float * ds, int first);
62
63
        template<TermType term, SimdConvolutionActivationType type, size_t tail> void DeconvolutionNhwcDirect2x2_M(const float * src0,
64
            const DeconvParam & p, size_t srcC, size_t dstC, const float * weight0, const __m256 * bias, const __m256 * params, float * dst, int first)
65
0
        {
66
0
            size_t dS = p.srcC, dD = p.dstC;
67
0
            const float * weight1 = weight0 + srcC * F, *src1, *src2, *src3, *src4, *src5;
68
0
            if (tail > 1) src1 = src0 + 1 * dS;
69
0
            if (tail > 2) src2 = src0 + 2 * dS;
70
0
            if (tail > 3) src3 = src0 + 3 * dS;
71
0
            if (tail > 4) src4 = src0 + 4 * dS;
72
0
            if (tail > 5) src5 = src0 + 5 * dS;
73
0
            __m256 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1;
74
0
            if (first)
75
0
            {
76
0
                if (tail > 0) d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps();
77
0
                if (tail > 1) d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps();
78
0
                if (tail > 2) d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps();
79
0
                if (tail > 3) d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps();
80
0
                if (tail > 4) d40 = _mm256_setzero_ps(), d41 = _mm256_setzero_ps();
81
0
                if (tail > 5) d50 = _mm256_setzero_ps(), d51 = _mm256_setzero_ps();
82
0
            }
83
0
            else
84
0
            {
85
0
                if (tail > 0) d00 = _mm256_loadu_ps(dst + 0x0 * dD), d01 = _mm256_loadu_ps(dst + 0x1 * dD);
86
0
                if (tail > 1) d10 = _mm256_loadu_ps(dst + 0x2 * dD), d11 = _mm256_loadu_ps(dst + 0x3 * dD);
87
0
                if (tail > 2) d20 = _mm256_loadu_ps(dst + 0x4 * dD), d21 = _mm256_loadu_ps(dst + 0x5 * dD);
88
0
                if (tail > 3) d30 = _mm256_loadu_ps(dst + 0x6 * dD), d31 = _mm256_loadu_ps(dst + 0x7 * dD);
89
0
                if (tail > 4) d40 = _mm256_loadu_ps(dst + 0x8 * dD), d41 = _mm256_loadu_ps(dst + 0x9 * dD);
90
0
                if (tail > 5) d50 = _mm256_loadu_ps(dst + 0xa * dD), d51 = _mm256_loadu_ps(dst + 0xb * dD);
91
0
            }
92
0
            for (size_t sc = 0; sc < srcC; ++sc)
93
0
            {
94
0
                w0 = _mm256_loadu_ps(weight0);
95
0
                w1 = _mm256_loadu_ps(weight1);
96
0
                if (tail > 0) s0 = _mm256_set1_ps(src0[sc]), d00 = _mm256_fmadd_ps(s0, w0, d00), d01 = _mm256_fmadd_ps(s0, w1, d01);
97
0
                if (tail > 1) s0 = _mm256_set1_ps(src1[sc]), d10 = _mm256_fmadd_ps(s0, w0, d10), d11 = _mm256_fmadd_ps(s0, w1, d11);
98
0
                if (tail > 2) s0 = _mm256_set1_ps(src2[sc]), d20 = _mm256_fmadd_ps(s0, w0, d20), d21 = _mm256_fmadd_ps(s0, w1, d21);
99
0
                if (tail > 3) s0 = _mm256_set1_ps(src3[sc]), d30 = _mm256_fmadd_ps(s0, w0, d30), d31 = _mm256_fmadd_ps(s0, w1, d31);
100
0
                if (tail > 4) s0 = _mm256_set1_ps(src4[sc]), d40 = _mm256_fmadd_ps(s0, w0, d40), d41 = _mm256_fmadd_ps(s0, w1, d41);
101
0
                if (tail > 5) s0 = _mm256_set1_ps(src5[sc]), d50 = _mm256_fmadd_ps(s0, w0, d50), d51 = _mm256_fmadd_ps(s0, w1, d51);
102
0
                weight0 += F;
103
0
                weight1 += F;
104
0
            }
105
0
            if (dstC == F)
106
0
            {
107
0
                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params);
108
0
                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params);
109
0
                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params);
110
0
                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params);
111
0
                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params);
112
0
                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params);
113
0
            }
114
0
            else
115
0
            {
116
0
                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params, dstC);
117
0
                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params, dstC);
118
0
                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params, dstC);
119
0
                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params, dstC);
120
0
                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params, dstC);
121
0
                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params, dstC);
122
0
            }
123
0
        }
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
124
125
        // Returns the DeconvolutionNhwcDirect2x2_M instantiation that handles `tail` source
        // pixels per call. tail == 0 yields NULL (no kernel needed); any value above 6 is a
        // programming error: it asserts in debug builds and yields NULL otherwise.
        template <TermType term, SimdConvolutionActivationType type> SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetDeconvolutionNhwcDirect2x2(size_t tail)
        {
            static const DeconvolutionNhwcDirect2x2_Ptr kernels[] =
            {
                NULL,
                DeconvolutionNhwcDirect2x2_M<term, type, 1>,
                DeconvolutionNhwcDirect2x2_M<term, type, 2>,
                DeconvolutionNhwcDirect2x2_M<term, type, 3>,
                DeconvolutionNhwcDirect2x2_M<term, type, 4>,
                DeconvolutionNhwcDirect2x2_M<term, type, 5>,
                DeconvolutionNhwcDirect2x2_M<term, type, 6>
            };
            if (tail < sizeof(kernels) / sizeof(kernels[0]))
                return kernels[tail];
            assert(0);
            return NULL;
        }
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
Unexecuted instantiation: void (*Simd::Avx2::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(8) const*, float __vector(8) const*, float*, int)
141
142
        template<TermType term, SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float* src, const DeconvParam& p,
143
            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst, int first)
144
0
        {
145
0
            size_t body = 6, srcWb = AlignLoAny(p.srcW, body), tail = p.srcW - srcWb;
146
0
            DeconvolutionNhwcDirect2x2_Ptr bodyKernel = GetDeconvolutionNhwcDirect2x2<term, type>(body);
147
0
            DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetDeconvolutionNhwcDirect2x2<term, type>(tail);
148
149
0
            __m256 _params[2], _bias[1];
150
0
            _params[0] = _mm256_set1_ps(params[0]);
151
0
            if (type == SimdConvolutionActivationRestrictRange ||
152
0
                type == SimdConvolutionActivationHswish ||
153
0
                type == SimdConvolutionActivationHardSigmoid)
154
0
                _params[1] = _mm256_set1_ps(params[1]);
155
156
0
            for (size_t dc = 0; dc < dstC; dc += F)
157
0
            {
158
0
                size_t dC = Simd::Min(F, dstC - dc);
159
0
                _bias[0] = _mm256_loadu_ps(bias + dc);
160
0
                if (type == ::SimdConvolutionActivationPrelu)
161
0
                    _params[0] = _mm256_loadu_ps(params + dc);
162
0
                const float* s = src + yBeg * p.srcW * p.srcC;
163
0
                float* d = dst + yBeg * p.strideY * p.dstW * p.dstC;
164
0
                const float* w0 = weight + 0 * p.kernelX * srcC * F;
165
0
                const float* w1 = weight + 1 * p.kernelX * srcC * F;
166
0
                for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC)
167
0
                {
168
0
                    for (size_t sx = 0; sx < srcWb; sx += body)
169
0
                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += body * p.strideX * p.dstC;
170
0
                    if (tail)
171
0
                        tailKernel(s + srcWb * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += tail * p.strideX * p.dstC;
172
0
                    for (size_t sx = 0; sx < srcWb; sx += body)
173
0
                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += body * p.strideX * p.dstC;
174
0
                    if (tail)
175
0
                        tailKernel(s + srcWb * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += tail * p.strideX * p.dstC;
176
0
                }
177
0
                weight += p.kernelY * p.kernelX * srcC * F;
178
0
                dst += F;
179
0
            }
180
0
        }
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
181
182
        // Macro-block driver for the 2x2 NHWC direct deconvolution.
        // Walks the problem in three nested loops - output-channel blocks (step a.macroD),
        // input-channel blocks (step a.macroC) and source-row blocks (step a.macroH) - and
        // calls the block kernel DeconvolutionNhwcDirect2x2<term, activation> for each tile.
        //   src    - source buffer; the kernel receives src + sc.
        //   p      - deconvolution parameters (dstC, srcC, srcH, kernelY, kernelX are read here).
        //   a      - blocking parameters (macroD, macroC, macroH, microD are read here).
        //   weight - advanced after every input-channel block and never rewound, so the
        //            kernel always sees the weights of the current (dc, sc) block.
        //            NOTE(review): this presumes the weights were reordered block by block in
        //            exactly this traversal order - confirm against the weight packing code.
        //   bias   - the kernel receives bias + dc.
        //   params - activation parameters; advanced per output-channel block for Prelu only.
        //   dst    - destination buffer; the kernel receives dst + dc.
        template<SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float* src, const DeconvParam& p,
183
            const SynetDeconvolution32fNhwcDirect2x2::AlgParam& a, const float* weight, const float* bias, const float* params, float* dst)
184
0
        {
185
0
            for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
186
0
            {
187
0
                // Size of the current output-channel block; the last one may be shorter than a.macroD.
                size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
188
0
                for (size_t sc = 0; sc < p.srcC; sc += a.macroC)
189
0
                {
190
0
                    // Size of the current input-channel block; the last one may be shorter than a.macroC.
                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
191
0
                    // Weight elements per output channel that belong to this input-channel block.
                    size_t macroK = p.kernelY * p.kernelX * macroC;
192
0
                    // No increment clause: yBeg is moved to yEnd at the bottom of the body.
                    for (size_t yBeg = 0; yBeg < p.srcH;)
193
0
                    {
194
0
                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH);
195
0
                        // Kernel selection. The requested activation `type` is used only together
                        // with TermLast; every other pass uses TermInterim with Identity.
                        // The trailing int argument is 1 exactly when sc == 0 and 0 otherwise.
                        // NOTE(review): presumably a "first input-channel pass" flag - confirm in the kernel.
                        // NOTE(review): if a.macroC could ever exceed p.srcC, the only block would take
                        // the `sc == 0` branch below and TermLast would never run - confirm that
                        // SetAlgParam keeps a.macroC <= p.srcC.
                        // Case 1: all input channels fit into one block - single, final pass.
                        if (a.macroC == p.srcC)
196
0
                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1);
197
0
                        // Case 2: first of several input-channel blocks.
                        else if (sc == 0)
198
0
                            DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1);
199
0
                        // Case 3: last of several input-channel blocks.
                        else if (sc + macroC == p.srcC)
200
0
                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0);
201
0
                        // Case 4: a block that is neither first nor last.
                        else
202
0
                            DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0);
203
0
                        yBeg = yEnd;
204
0
                    }
205
0
                    // Skip the weights consumed by this block; the channel count is rounded up
                    // to a multiple of a.microD before multiplying by macroK.
                    weight += AlignHiAny(macroD, a.microD) * macroK;
206
0
                }
207
0
                // Only Prelu moves params along with the output-channel block; for every other
                // activation the same params pointer is reused for all blocks.
                if (type == ::SimdConvolutionActivationPrelu)
208
0
                    params += macroD;
209
0
            }
210
0
        }
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx2::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
211
212
        // Constructs the AVX2 variant on top of the SSE4.1 one.
        // The base constructor runs first; the AVX2 kernel and blocking parameters are
        // installed only when p.dstC > HF. Otherwise nothing is changed here and the object
        // keeps whatever the Sse41 constructor configured.
        // NOTE(review): F and HF are defined outside this view - presumably the AVX2 float
        // vector width and half of it; confirm.
        SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam & p)
213
0
            : Sse41::SynetDeconvolution32fNhwcDirect2x2(p)
214
0
        {
215
0
            if (p.dstC > HF)
216
0
            {
217
0
                // Select the driver instantiation for the requested activation.
                // Identity and Relu share the RestrictRange instantiation, and LeakyRelu shares
                // the Prelu one; no dedicated instantiations exist for them in this file.
                // NOTE(review): this relies on the activation parameters being prepared elsewhere
                // (range bounds / per-channel slopes) so that the results match - confirm.
                switch (p.activation)
218
0
                {
219
0
                case SimdConvolutionActivationIdentity: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break;
220
0
                case SimdConvolutionActivationRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break;
221
0
                case SimdConvolutionActivationLeakyRelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break;
222
0
                case SimdConvolutionActivationRestrictRange: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>; break;
223
0
                case SimdConvolutionActivationPrelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>; break;
224
0
                case SimdConvolutionActivationElu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationElu>; break;
225
0
                case SimdConvolutionActivationHswish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHswish>; break;
226
0
                case SimdConvolutionActivationMish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationMish>; break;
227
0
                case SimdConvolutionActivationHardSigmoid: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHardSigmoid>; break;
228
0
                case SimdConvolutionActivationSwish: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationSwish>; break;
229
0
                case SimdConvolutionActivationGelu: _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationGelu>; break;
230
0
                // Unknown activation: trips the assert when asserts are enabled. With asserts
                // compiled out, _deconvolution is left unchanged and SetAlgParam below still runs.
                default: assert(0);
231
0
                }
232
0
                // Blocking parameters are derived from F and the three cache sizes reported by Base.
                SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
233
0
            }
234
0
        }
235
236
        //---------------------------------------------------------------------
237
238
        // Factory for the AVX2 32-bit float deconvolution.
        //   batch, conv, compatibility - forwarded unchanged to the DeconvParam constructor.
        // Returns NULL when the parameters fail DeconvParam::Valid(SimdTensorData32f).
        // Otherwise returns a heap-allocated implementation, handed out as void *:
        // SynetDeconvolution32fNhwcDirect2x2 when it reports itself Preferable for these
        // parameters, SynetDeconvolution32fGemmNN in every other case.
        // NOTE(review): the object is created with `new` and nothing here frees it - the
        // caller presumably releases it through the library's release call; confirm.
        void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
239
0
        {
240
0
            DeconvParam param(batch, conv, compatibility);
241
0
            // Reject parameter sets that are not valid for 32-bit float tensors.
            if (!param.Valid(SimdTensorData32f))
242
0
                return NULL;
243
0
            // The specialised 2x2 NHWC direct path is tried first; GemmNN is the fallback.
            if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param))
244
0
                return new SynetDeconvolution32fNhwcDirect2x2(param);
245
0
            else
246
0
                return new SynetDeconvolution32fGemmNN(param);
247
0
        }
248
    }
249
#endif//SIMD_AVX2_ENABLE
250
}