Coverage Report

Created: 2025-09-27 07:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdSse41SynetGridSample2d32fBlZ.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2025 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
25
#include "Simd/SimdSynetGridSample.h"
26
27
#include "Simd/SimdLoad.h"
28
#include "Simd/SimdSet.h"
29
30
namespace Simd
31
{
32
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)    
33
    namespace Sse41
34
    {
35
        // Maps a normalized grid coordinate 'pos' onto an axis of 'dim' samples.
        // A non-zero 'align' uses (pos + 1) / 2 * (dim - 1);
        // a zero 'align' uses ((pos + 1) * dim - 1) / 2.
        template <int align> SIMD_INLINE float Denormalize32f(float pos, int dim)
        {
            const float shifted = pos + 1;
            return align ? float(shifted / 2.0f * (dim - 1)) : float((shifted * dim - 1) / 2.0f);
        }
Unexecuted instantiation: float Simd::Sse41::Denormalize32f<1>(float, int)
Unexecuted instantiation: float Simd::Sse41::Denormalize32f<0>(float, int)
42
43
        template<int align, int range>  void IndexCoeffs32fBlZ(const float* grd, size_t dstS, int srcH, int srcW, int padW, uint32_t* idx, float* dy, float* dx, int& yMin, int& yMax)
44
0
        {
45
0
            size_t dstSF = AlignLo(dstS, F), d = 0;
46
0
            const __m128 a = SetFloat((srcW - align) / 2.0f, (srcH - align) / 2.0f);
47
0
            const __m128 b = SetFloat((srcW - 1) / 2.0f, (srcH - 1) / 2.0f);
48
0
            const __m128i _0 = _mm_setzero_si128();
49
0
            const __m128i _2 = _mm_set1_epi32(2);
50
0
            const __m128i _srcH = _mm_set1_epi32(srcH + 2);
51
0
            const __m128i _srcW = _mm_set1_epi32(srcW + 2);
52
0
            const __m128i _padW = _mm_set1_epi32(padW);
53
0
            __m128i _yMin, _yMax;
54
0
            if (range)
55
0
            {
56
0
                _yMin = _mm_set1_epi32(yMin);
57
0
                _yMax = _mm_set1_epi32(yMax);
58
0
            }
59
0
            for (; d < dstSF; d += F)
60
0
            {
61
0
                __m128 xy0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + 0), a), b);
62
0
                __m128 xy1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + F), a), b);
63
0
                __m128 x = _mm_shuffle_ps(xy0, xy1, 0x88);
64
0
                __m128 y = _mm_shuffle_ps(xy0, xy1, 0xDD);
65
0
                __m128 xf = _mm_round_ps(x, _MM_FROUND_FLOOR);
66
0
                __m128 yf = _mm_round_ps(y, _MM_FROUND_FLOOR);
67
0
                _mm_storeu_ps(dy + d, _mm_sub_ps(y, yf));
68
0
                _mm_storeu_ps(dx + d, _mm_sub_ps(x, xf));
69
0
                __m128i xi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(xf), _2), _0), _srcW);
70
0
                __m128i yi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(yf), _2), _0), _srcH);
71
0
                _mm_storeu_si128((__m128i*)(idx + d), _mm_add_epi32(_mm_mullo_epi32(_padW, yi), xi));
72
0
                if (range)
73
0
                {
74
0
                    _yMin = _mm_min_epi32(_yMin, yi);
75
0
                    _yMax = _mm_max_epi32(_yMax, yi);
76
0
                }
77
0
                grd += 2 * F;
78
0
            }
79
0
            if (range)
80
0
            {
81
0
                yMin = MinVal32i(_yMin);
82
0
                yMax = MaxVal32i(_yMax);
83
0
            }
84
0
            for (; d < dstS; ++d)
85
0
            {
86
0
                float x = Denormalize32f<align>(grd[0], srcW);
87
0
                float y = Denormalize32f<align>(grd[1], srcH);
88
0
                int x0 = int(std::floor(x));
89
0
                int y0 = int(std::floor(y));
90
0
                dy[d] = y - float(y0);
91
0
                dx[d] = x - float(x0);
92
0
                x0 = Simd::RestrictRange(x0, -2, srcW) + 2;
93
0
                y0 = Simd::RestrictRange(y0, -2, srcH) + 2;
94
0
                idx[d] = padW * y0 + x0;
95
0
                if (range)
96
0
                {
97
0
                    yMin = Min(yMin, y0);
98
0
                    yMax = Max(yMax, y0);
99
0
                }
100
0
                grd += 2;
101
0
            }
102
0
        }
Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 1>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&)
Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 1>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&)
Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 0>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&)
Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 0>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&)
103
104
        //-------------------------------------------------------------------------------------------------
105
106
        void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
107
0
        {
108
0
            size_t dstSF = AlignLo(dstS, F), d = 0;
109
0
            const float* pad1 = pad0 + padW;
110
0
            __m128 p0, p1, _1 = _mm_set1_ps(1.0f);
111
0
            for (; d < dstSF; d += F)
112
0
            {
113
0
                int i0 = idx[d + 0], i1 = idx[d + 1], i2 = idx[d + 2], i3 = idx[d + 3];
114
0
                p0 = Load(pad0 + i0, pad0 + i1);
115
0
                p1 = Load(pad0 + i2, pad0 + i3);
116
0
                __m128 p00 = _mm_shuffle_ps(p0, p1, 0x88);
117
0
                __m128 p01 = _mm_shuffle_ps(p0, p1, 0xDD);
118
0
                p0 = Load(pad1 + i0, pad1 + i1);
119
0
                p1 = Load(pad1 + i2, pad1 + i3);
120
0
                __m128 p10 = _mm_shuffle_ps(p0, p1, 0x88);
121
0
                __m128 p11 = _mm_shuffle_ps(p0, p1, 0xDD);
122
0
                __m128 dy1 = _mm_loadu_ps(dy + d);
123
0
                __m128 dy0 = _mm_sub_ps(_1, dy1);
124
0
                __m128 dx1 = _mm_loadu_ps(dx + d);
125
0
                __m128 dx0 = _mm_sub_ps(_1, dx1);
126
0
                __m128 d0 = _mm_add_ps(_mm_mul_ps(dx0, p00), _mm_mul_ps(dx1, p01));
127
0
                __m128 d1 = _mm_add_ps(_mm_mul_ps(dx0, p10), _mm_mul_ps(dx1, p11));
128
0
                _mm_storeu_ps(dst + d, _mm_add_ps(_mm_mul_ps(dy0, d0), _mm_mul_ps(dy1, d1)));
129
0
            }
130
0
            for (; d < dstS; ++d)
131
0
            {
132
0
                int offs = idx[d];
133
0
                float p00 = pad0[offs + 0];
134
0
                float p01 = pad0[offs + 1];
135
0
                float p10 = pad1[offs + 0];
136
0
                float p11 = pad1[offs + 1];
137
0
                float dy1 = dy[d];
138
0
                float dy0 = 1.0f - dy1;
139
0
                float dx1 = dx[d];
140
0
                float dx0 = 1.0f - dx1;
141
0
                dst[d] = dy0 * (dx0 * p00 + dx1 * p01) + dy1 * (dx0 * p10 + dx1 * p11);
142
0
            }
143
0
        }
144
145
        //-------------------------------------------------------------------------------------------------
146
147
        // Installs the SSE4.1 kernels on top of the base implementation.
        // The index/coefficient kernel is chosen by two flags: _param.align selects the
        // first template argument and _sparse selects the second (range tracking).
        SynetGridSample2d32fBlZ::SynetGridSample2d32fBlZ(const GridSample2dParam& param)
            : Base::SynetGridSample2d32fBlZ(param)
        {
            _bilinearInterp = BilinearInterp32fBlZ;
            if (_param.align)
                _indexCoeffs = _sparse ? IndexCoeffs32fBlZ<1, 1> : IndexCoeffs32fBlZ<1, 0>;
            else
                _indexCoeffs = _sparse ? IndexCoeffs32fBlZ<0, 1> : IndexCoeffs32fBlZ<0, 0>;
        }
156
    }
157
#endif
158
}