Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdSse41SynetDeconvolution32f.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetDeconvolution32f.h"
25
#include "Simd/SimdSynetConvolution32f.h"
26
#include "Simd/SimdSynetConvolution32fCommon.h"
27
#include "Simd/SimdExtract.h"
28
#include "Simd/SimdSynet.h"
29
#include "Simd/SimdSse41.h"
30
#include "Simd/SimdGemm.h"
31
#include "Simd/SimdExp.h"
32
#include "Simd/SimdCpu.h"
33
34
namespace Simd
35
{
36
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)  
37
    namespace Sse41
38
    {
39
        // Builds the SSE4.1 flavour of the GEMM-based deconvolution: installs the
        // SSE4.1 GEMM kernels and, for transposed single-group layouts, sizes the
        // reordered weight buffer and selects the matching run/reorder routines.
        SynetDeconvolution32fGemmNN::SynetDeconvolution32fGemmNN(const DeconvParam & p)
            : Base::SynetDeconvolution32fGemmNN(p)
        {
            _gemm.Init(InitGemmFuncs(Sse41::Gemm32fNN, "Sse41"));

            const bool nhwcGemm = _param.trans && _param.group == 1;
            if (nhwcGemm)
            {
                if (!NHWC_GEMM_RUNTIME)
                {
                    // Static configuration: size the weight buffer directly.
                    _nhwcWeight.Resize(Sse41::Gemm32fNNcbBufferSize(_M*_merge, _N, _K, GemmKernelAny, NHWC_GEMM_COMPATIBLE));
                }
                else
                {
                    // Runtime configuration: register the candidates first, then
                    // size the buffer from the first registered entry.
                    _gemmCb.Init(InitGemmCbFuncs(Sse41::Gemm32fNNcbBufferSize, Sse41::Gemm32fNNcbReorderB, Sse41::Gemm32fNNcbRun, "Sse41", GemmKernelF2, GemmKernelF3));
                    _nhwcWeight.Resize(_gemmCb.At(0).BufferSize(_M*_merge, _N, _K));
                }
                _nhwcRun = Sse41::Gemm32fNNcbRun;
                _nhwcReorderB = Sse41::Gemm32fNNcbReorderB;
            }

            _biasAndActivation = Sse41::ConvolutionBiasAndActivation;
        }
57
58
        //-------------------------------------------------------------------------------------------------
59
60
        void SynetDeconvolution32fGemmNN::RowToImg(const float* src, float* dst)
61
0
        {
62
0
            const DeconvParam& p = _param;
63
0
            assert(p.trans && p.group == 1);
64
0
            if ((p.IsPad(0) && p.IsDilation(1) && p.kernelY == p.strideX && p.kernelX == p.strideX) || p.dstC < F)
65
0
            {
66
0
                Base::SynetDeconvolution32fGemmNN::RowToImg(src, dst);
67
0
                return;
68
0
            }
69
0
            else
70
0
            {
71
0
                size_t dstCF = AlignLo(p.dstC, F);
72
0
                for (size_t dy = 0; dy < p.dstH; ++dy)
73
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
74
0
                        memset(dst + (dy * p.dstW + dx) * p.dstC, 0, p.dstC * sizeof(float));
75
0
                for (size_t sy = 0; sy < p.srcH; ++sy)
76
0
                {
77
0
                    for (size_t sx = 0; sx < p.srcW; ++sx)
78
0
                    {
79
0
                        size_t dy = sy * p.strideY - p.padY;
80
0
                        for (size_t ky = 0; ky < p.kernelY; ky++, dy += p.dilationY)
81
0
                        {
82
0
                            if (dy < p.dstH)
83
0
                            {
84
0
                                size_t dx = sx * p.strideX - p.padX;
85
0
                                for (size_t kx = 0; kx < p.kernelX; kx++, dx += p.dilationX)
86
0
                                {
87
0
                                    if (dx < p.dstW)
88
0
                                    {
89
0
                                        float* d = dst + (dy * p.dstW + dx) * p.dstC;
90
0
                                        size_t dc = 0;
91
0
                                        for (; dc < dstCF; dc += F)
92
0
                                            _mm_storeu_ps(d + dc, _mm_add_ps(_mm_loadu_ps(d + dc), _mm_loadu_ps(src + dc)));
93
0
                                        for (; dc < p.dstC; ++dc)
94
0
                                            d[dc] += src[dc];
95
0
                                    }
96
0
                                    src += p.dstC;
97
0
                                }
98
0
                            }
99
0
                            else
100
0
                                src += p.kernelX * p.dstC;
101
0
                        }
102
0
                    }
103
0
                }
104
0
            }
105
0
        }
106
107
        //-------------------------------------------------------------------------------------------------
108
109
        // Pointer to a DeconvolutionNhwcDirect2x2_M instantiation; returned by
        // GetDeconvolutionNhwcDirect2x2 and invoked from DeconvolutionNhwcDirect2x2.
        typedef void (*DeconvolutionNhwcDirect2x2_Ptr) (const float * src0, const DeconvParam & p, size_t srcC, size_t dstC, const float * weight, const __m128 * bias, const __m128 * params, float * ds, int first);
110
111
        // Micro-kernel: processes `tail` (1..6) consecutive source pixels for one
        // F-wide group of output channels and produces two output vectors per pixel.
        //
        //   src0    - first source pixel; pixel i is read at src0 + i * p.srcC.
        //   srcC    - number of source channels accumulated over.
        //   dstC    - number of valid output channels in this group; selects which
        //             Save overload is used (dstC == F or not).
        //   weight0 - weights for the first output vector; the second output vector
        //             uses weights located srcC * F floats further.
        //   bias, params - forwarded unchanged to Term<term>::Save.
        //   dst     - output; pixel i writes to dst + (2 * i) * p.dstC and
        //             dst + (2 * i + 1) * p.dstC.
        //   first   - non-zero: accumulators start from zero; zero: they are
        //             loaded from dst and accumulation continues.
        //
        // `tail` is a template parameter, so every `if (tail > k)` below is a
        // constant expression; pointers and accumulators with index >= tail are
        // never initialised and never used.
        template<TermType term, SimdConvolutionActivationType type, size_t tail> void DeconvolutionNhwcDirect2x2_M(const float * src0,
112
            const DeconvParam & p, size_t srcC, size_t dstC, const float * weight0, const __m128 * bias, const __m128 * params, float * dst, int first)
113
0
        {
114
0
            size_t dS = p.srcC, dD = p.dstC;
115
0
            // weight1 addresses the weights of the second output vector.
            const float * weight1 = weight0 + srcC * F, * src1, * src2, * src3, * src4, * src5;
116
0
            if (tail > 1) src1 = src0 + 1 * dS;
117
0
            if (tail > 2) src2 = src0 + 2 * dS;
118
0
            if (tail > 3) src3 = src0 + 3 * dS;
119
0
            if (tail > 4) src4 = src0 + 4 * dS;
120
0
            if (tail > 5) src5 = src0 + 5 * dS;
121
0
            __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1;
122
0
            // Initialise the accumulators: zero on the first pass, otherwise
            // resume from the values already stored in dst.
            if (first)
123
0
            {
124
0
                if (tail > 0) d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps();
125
0
                if (tail > 1) d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps();
126
0
                if (tail > 2) d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps();
127
0
                if (tail > 3) d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps();
128
0
                if (tail > 4) d40 = _mm_setzero_ps(), d41 = _mm_setzero_ps();
129
0
                if (tail > 5) d50 = _mm_setzero_ps(), d51 = _mm_setzero_ps();
130
0
            }
131
0
            else
132
0
            {
133
0
                if (tail > 0) d00 = _mm_loadu_ps(dst + 0x0 * dD), d01 = _mm_loadu_ps(dst + 0x1 * dD);
134
0
                if (tail > 1) d10 = _mm_loadu_ps(dst + 0x2 * dD), d11 = _mm_loadu_ps(dst + 0x3 * dD);
135
0
                if (tail > 2) d20 = _mm_loadu_ps(dst + 0x4 * dD), d21 = _mm_loadu_ps(dst + 0x5 * dD);
136
0
                if (tail > 3) d30 = _mm_loadu_ps(dst + 0x6 * dD), d31 = _mm_loadu_ps(dst + 0x7 * dD);
137
0
                if (tail > 4) d40 = _mm_loadu_ps(dst + 0x8 * dD), d41 = _mm_loadu_ps(dst + 0x9 * dD);
138
0
                if (tail > 5) d50 = _mm_loadu_ps(dst + 0xa * dD), d51 = _mm_loadu_ps(dst + 0xb * dD);
139
0
            }
140
0
            // For every source channel: load one F-wide weight vector from each
            // weight pointer, broadcast the source value of each pixel and add
            // the products to that pixel's two accumulators.
            for (size_t sc = 0; sc < srcC; ++sc)
141
0
            {
142
0
                w0 = _mm_loadu_ps(weight0);
143
0
                w1 = _mm_loadu_ps(weight1);
144
0
                if (tail > 0) s0 = _mm_set1_ps(src0[sc]), d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00), d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01);
145
0
                if (tail > 1) s0 = _mm_set1_ps(src1[sc]), d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10), d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11);
146
0
                if (tail > 2) s0 = _mm_set1_ps(src2[sc]), d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20), d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21);
147
0
                if (tail > 3) s0 = _mm_set1_ps(src3[sc]), d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30), d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31);
148
0
                if (tail > 4) s0 = _mm_set1_ps(src4[sc]), d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40), d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41);
149
0
                if (tail > 5) s0 = _mm_set1_ps(src5[sc]), d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50), d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51);
150
0
                weight0 += F;
151
0
                weight1 += F;
152
0
            }
153
0
            // Hand the accumulators to Term<term>::Save (defined elsewhere).
            // NOTE(review): the overload taking dstC is presumably a partial
            // store for an incomplete channel group - confirm against Term.
            if (dstC == F)
154
0
            {
155
0
                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params);
156
0
                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params);
157
0
                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params);
158
0
                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params);
159
0
                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params);
160
0
                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params);
161
0
            }
162
0
            else
163
0
            {
164
0
                if (tail > 0) Term<term>::template Save<type, 0>(dst + 0x0 * dD, d00, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x1 * dD, d01, bias, params, dstC);
165
0
                if (tail > 1) Term<term>::template Save<type, 0>(dst + 0x2 * dD, d10, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x3 * dD, d11, bias, params, dstC);
166
0
                if (tail > 2) Term<term>::template Save<type, 0>(dst + 0x4 * dD, d20, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x5 * dD, d21, bias, params, dstC);
167
0
                if (tail > 3) Term<term>::template Save<type, 0>(dst + 0x6 * dD, d30, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x7 * dD, d31, bias, params, dstC);
168
0
                if (tail > 4) Term<term>::template Save<type, 0>(dst + 0x8 * dD, d40, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0x9 * dD, d41, bias, params, dstC);
169
0
                if (tail > 5) Term<term>::template Save<type, 0>(dst + 0xA * dD, d50, bias, params, dstC), Term<term>::template Save<type, 0>(dst + 0xB * dD, d51, bias, params, dstC);
170
0
            }
171
0
        }
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)3, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)1, (SimdConvolutionActivationType)0, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)4, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)5, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)6, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)7, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)8, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)9, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 1ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 2ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 3ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 4ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 5ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2_M<(Simd::TermType)0, (SimdConvolutionActivationType)10, 6ul>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
172
173
        // Returns the DeconvolutionNhwcDirect2x2_M instantiation that handles
        // `tail` source pixels per call (1..6). A tail of 0 yields NULL; any
        // larger value triggers an assertion and also yields NULL.
        template <TermType term, SimdConvolutionActivationType type> SIMD_INLINE DeconvolutionNhwcDirect2x2_Ptr GetDeconvolutionNhwcDirect2x2(size_t tail)
        {
            if (tail > 6)
            {
                assert(0);
                return NULL;
            }
            // Indexed by the number of pixels handled per call.
            const DeconvolutionNhwcDirect2x2_Ptr kernels[7] =
            {
                NULL,
                DeconvolutionNhwcDirect2x2_M<term, type, 1>,
                DeconvolutionNhwcDirect2x2_M<term, type, 2>,
                DeconvolutionNhwcDirect2x2_M<term, type, 3>,
                DeconvolutionNhwcDirect2x2_M<term, type, 4>,
                DeconvolutionNhwcDirect2x2_M<term, type, 5>,
                DeconvolutionNhwcDirect2x2_M<term, type, 6>
            };
            return kernels[tail];
        }
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
Unexecuted instantiation: void (*Simd::Sse41::GetDeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(unsigned long))(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, float const*, float __vector(4) const*, float __vector(4) const*, float*, int)
189
190
        template<TermType term, SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam & p,
191
            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float * weight, const float * bias, const float * params, float * dst, int first)
192
0
        {
193
0
            size_t body = 6, srcWb = AlignLoAny(p.srcW, body), tail = p.srcW - srcWb;
194
0
            DeconvolutionNhwcDirect2x2_Ptr bodyKernel = GetDeconvolutionNhwcDirect2x2<term, type>(body);
195
0
            DeconvolutionNhwcDirect2x2_Ptr tailKernel = GetDeconvolutionNhwcDirect2x2<term, type>(tail);
196
197
0
            __m128 _params[2], _bias[1];
198
0
            _params[0] = _mm_set1_ps(params[0]);
199
0
            if (type == SimdConvolutionActivationRestrictRange ||
200
0
                type == SimdConvolutionActivationHswish ||
201
0
                type == SimdConvolutionActivationHardSigmoid)
202
0
                _params[1] = _mm_set1_ps(params[1]);
203
204
0
            for (size_t dc = 0; dc < dstC; dc += F)
205
0
            {
206
0
                size_t dC = Simd::Min(F, dstC - dc);
207
0
                _bias[0] = _mm_loadu_ps(bias + dc);
208
0
                if (type == ::SimdConvolutionActivationPrelu)
209
0
                    _params[0] = _mm_loadu_ps(params + dc);
210
0
                const float * s = src + yBeg * p.srcW * p.srcC;
211
0
                float * d = dst + yBeg * p.strideY * p.dstW * p.dstC;
212
0
                const float * w0 = weight + 0 * p.kernelX * srcC * F;
213
0
                const float * w1 = weight + 1 * p.kernelX * srcC * F;
214
0
                for (size_t sy = yBeg; sy < yEnd; sy += 1, s += p.srcW * p.srcC)
215
0
                {
216
0
                    for (size_t sx = 0; sx < srcWb; sx += body)
217
0
                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += body * p.strideX * p.dstC;
218
0
                    if(tail)
219
0
                        tailKernel(s + srcWb * p.srcC, p, srcC, dC, w0, _bias, _params, d, first), d += tail * p.strideX * p.dstC;
220
0
                    for (size_t sx = 0; sx < srcWb; sx += body)
221
0
                        bodyKernel(s + sx * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += body * p.strideX * p.dstC;
222
0
                    if (tail)
223
0
                        tailKernel(s + srcWb * p.srcC, p, srcC, dC, w1, _bias, _params, d, first), d += tail * p.strideX * p.dstC;
224
0
                }
225
0
                weight += p.kernelY * p.kernelX*srcC*F;
226
0
                dst += F;
227
0
            }
228
0
        }
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
229
230
        template<SimdConvolutionActivationType type> void DeconvolutionNhwcDirect2x2(const float * src, const DeconvParam & p,
231
            const SynetDeconvolution32fNhwcDirect2x2::AlgParam & a, const float * weight, const float * bias, const float * params, float * dst)
232
0
        {
233
0
            for (size_t dc = 0; dc < p.dstC; dc += a.macroD)
234
0
            {
235
0
                size_t macroD = Simd::Min(p.dstC, dc + a.macroD) - dc;
236
0
                for (size_t sc = 0; sc < p.srcC; sc += a.macroC)
237
0
                {
238
0
                    size_t macroC = Simd::Min(p.srcC, sc + a.macroC) - sc;
239
0
                    size_t macroK = p.kernelY * p.kernelX * macroC;
240
0
                    for (size_t yBeg = 0; yBeg < p.srcH;)
241
0
                    {
242
0
                        size_t yEnd = Simd::Min(yBeg + a.macroH, p.srcH);
243
0
                        if (a.macroC == p.srcC)
244
0
                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1);
245
0
                        else if (sc == 0)
246
0
                            DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 1);
247
0
                        else if (sc + macroC == p.srcC)
248
0
                            DeconvolutionNhwcDirect2x2<TermLast, type>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0);
249
0
                        else
250
0
                            DeconvolutionNhwcDirect2x2<TermInterim, SimdConvolutionActivationIdentity>(src + sc, p, macroD, yBeg, yEnd, macroC, weight, bias + dc, params, dst + dc, 0);
251
0
                        yBeg = yEnd;
252
0
                    }
253
0
                    weight += AlignHiAny(macroD, a.microD)*macroK;
254
0
                }
255
0
                if (type == ::SimdConvolutionActivationPrelu)
256
0
                    params += macroD;
257
0
            }
258
0
        }
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)3>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)4>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)5>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)6>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)7>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)8>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)9>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::DeconvolutionNhwcDirect2x2<(SimdConvolutionActivationType)10>(float const*, Simd::DeconvParam const&, Simd::Base::SynetDeconvolution32fNhwcDirect2x2::AlgParam const&, float const*, float const*, float const*, float*)
259
260
        // Selects the deconvolution routine matching p.activation and then sets the
        // algorithm parameters from the micro-kernel width F and the cache sizes.
        //
        // Identity and Relu reuse the RestrictRange instantiation, and LeakyRelu
        // reuses the Prelu one, so only eight template instantiations are needed.
        // NOTE(review): this presumably relies on the base class preparing equivalent
        // activation parameters for the shared kernels - confirm in Base.
        SynetDeconvolution32fNhwcDirect2x2::SynetDeconvolution32fNhwcDirect2x2(const DeconvParam & p)
            : Base::SynetDeconvolution32fNhwcDirect2x2(p)
        {
            switch (p.activation)
            {
            case SimdConvolutionActivationIdentity:
            case SimdConvolutionActivationRelu:
            case SimdConvolutionActivationRestrictRange:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationRestrictRange>;
                break;
            case SimdConvolutionActivationLeakyRelu:
            case SimdConvolutionActivationPrelu:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationPrelu>;
                break;
            case SimdConvolutionActivationElu:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationElu>;
                break;
            case SimdConvolutionActivationHswish:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHswish>;
                break;
            case SimdConvolutionActivationMish:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationMish>;
                break;
            case SimdConvolutionActivationHardSigmoid:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationHardSigmoid>;
                break;
            case SimdConvolutionActivationSwish:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationSwish>;
                break;
            case SimdConvolutionActivationGelu:
                _deconvolution = DeconvolutionNhwcDirect2x2<SimdConvolutionActivationGelu>;
                break;
            default:
                // Unknown activation: trips in debug builds, leaves _deconvolution untouched otherwise.
                assert(0);
                break;
            }
            SetAlgParam(F, Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3());
        }
280
281
        bool SynetDeconvolution32fNhwcDirect2x2::Preferable(const DeconvParam & p)
282
0
        {
283
0
            return p.IsPad(0) && p.IsDilation(1) && p.IsKernel(2) && p.IsStride(2) && p.group == 1 && p.trans;
284
0
        }
285
286
        //-------------------------------------------------------------------------------------------------
287
288
        void * SynetDeconvolution32fInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
289
0
        {
290
0
            DeconvParam param(batch, conv, compatibility);
291
0
            if (!param.Valid(SimdTensorData32f))
292
0
                return NULL;
293
0
            if (SynetDeconvolution32fNhwcDirect2x2::Preferable(param))
294
0
                return new SynetDeconvolution32fNhwcDirect2x2(param);
295
0
            else
296
0
                return new SynetDeconvolution32fGemmNN(param);
297
0
        }
298
    }
299
#endif
300
}