/src/Simd/src/Simd/SimdSse41SynetGridSample2d32fBlZ.cpp

Source
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2025 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "Simd/SimdSynetGridSample.h"

#include "Simd/SimdLoad.h"
#include "Simd/SimdSet.h"

namespace Simd
{
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)    
    namespace Sse41
    {
        // Converts a normalized grid coordinate 'pos' into a source coordinate along an axis of
        // 'dim' samples.
        //   align != 0 : (pos + 1) / 2 * (dim - 1)
        //   align == 0 : ((pos + 1) * dim - 1) / 2
        // The vectorised loop of IndexCoeffs32fBlZ applies the same mapping in scale/shift form.
        template <int align> SIMD_INLINE float Denormalize32f(float pos, int dim)
        {
            return align ?
                float((pos + 1) / 2.0f * (dim - 1)) :
                float(((pos + 1) * dim - 1) / 2.0f);
        }

        // For each of the 'dstS' sampling points computes the offset of its top-left neighbour in
        // the source plane and the fractional bilinear weights.
        //
        //   grd        - interleaved (x, y) normalized coordinates, two floats per point.
        //   srcH, srcW - source extent used for denormalization and clamping.
        //   padW       - row stride used to build the linear offset.
        //   idx        - out: padW * row + col, where row/col are the floored coordinates shifted
        //                by +2 and clamped to [0, srcH + 2] / [0, srcW + 2].
        //   dy, dx     - out: coordinate minus its floor (taken before clamping).
        //   yMin, yMax - in/out, touched only when 'range' != 0: extended to cover every
        //                clamped (already +2 shifted) row value produced here.
        //
        // 'align' selects the denormalization formula (see Denormalize32f).
        template<int align, int range>  void IndexCoeffs32fBlZ(const float* grd, size_t dstS, int srcH, int srcW, int padW, uint32_t* idx, float* dy, float* dx, int& yMin, int& yMax)
        {
            const size_t vecEnd = AlignLo(dstS, F);
            size_t pos = 0;
            // Scale/shift form of the denormalization: coord = grd * scale + shift.
            // NOTE(review): SetFloat is assumed to alternate its two arguments across lanes so
            // that x lanes get the width factors and y lanes the height factors - confirm.
            const __m128 scale = SetFloat((srcW - align) / 2.0f, (srcH - align) / 2.0f);
            const __m128 shift = SetFloat((srcW - 1) / 2.0f, (srcH - 1) / 2.0f);
            const __m128i zero = _mm_setzero_si128();
            const __m128i two = _mm_set1_epi32(2);
            const __m128i rowLimit = _mm_set1_epi32(srcH + 2);
            const __m128i colLimit = _mm_set1_epi32(srcW + 2);
            const __m128i stride = _mm_set1_epi32(padW);
            __m128i rowMin, rowMax;
            if (range)
            {
                rowMin = _mm_set1_epi32(yMin);
                rowMax = _mm_set1_epi32(yMax);
            }
            // Vector part: F points (2 * F floats of grid data) per iteration.
            while (pos < vecEnd)
            {
                const __m128 pair0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + 0), scale), shift);
                const __m128 pair1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + F), scale), shift);
                // De-interleave: even elements are x, odd elements are y.
                const __m128 xs = _mm_shuffle_ps(pair0, pair1, 0x88);
                const __m128 ys = _mm_shuffle_ps(pair0, pair1, 0xDD);
                const __m128 xsFloor = _mm_round_ps(xs, _MM_FROUND_FLOOR);
                const __m128 ysFloor = _mm_round_ps(ys, _MM_FROUND_FLOOR);
                _mm_storeu_ps(dy + pos, _mm_sub_ps(ys, ysFloor));
                _mm_storeu_ps(dx + pos, _mm_sub_ps(xs, xsFloor));
                // Shift by +2, then clamp to [0, limit].
                __m128i cols = _mm_add_epi32(_mm_cvtps_epi32(xsFloor), two);
                cols = _mm_max_epi32(cols, zero);
                cols = _mm_min_epi32(cols, colLimit);
                __m128i rows = _mm_add_epi32(_mm_cvtps_epi32(ysFloor), two);
                rows = _mm_max_epi32(rows, zero);
                rows = _mm_min_epi32(rows, rowLimit);
                _mm_storeu_si128((__m128i*)(idx + pos), _mm_add_epi32(_mm_mullo_epi32(stride, rows), cols));
                if (range)
                {
                    rowMin = _mm_min_epi32(rowMin, rows);
                    rowMax = _mm_max_epi32(rowMax, rows);
                }
                grd += 2 * F;
                pos += F;
            }
            if (range)
            {
                // Fold the per-lane extrema back into the scalar in/out arguments.
                yMin = MinVal32i(rowMin);
                yMax = MaxVal32i(rowMax);
            }
            // Scalar tail for the remaining (dstS - vecEnd) points.
            for (; pos < dstS; ++pos, grd += 2)
            {
                const float x = Denormalize32f<align>(grd[0], srcW);
                const float y = Denormalize32f<align>(grd[1], srcH);
                const int xFloor = int(std::floor(x));
                const int yFloor = int(std::floor(y));
                dy[pos] = y - float(yFloor);
                dx[pos] = x - float(xFloor);
                const int col = Simd::RestrictRange(xFloor, -2, srcW) + 2;
                const int row = Simd::RestrictRange(yFloor, -2, srcH) + 2;
                idx[pos] = padW * row + col;
                if (range)
                {
                    yMin = Min(yMin, row);
                    yMax = Max(yMax, row);
                }
            }
        }

        //-------------------------------------------------------------------------------------------------

        // Bilinear interpolation of 'dstS' points from a source plane with row stride 'padW'.
        //
        // For point d the four neighbours are read at offsets idx[d], idx[d] + 1 (row 'pad0')
        // and idx[d] + padW, idx[d] + padW + 1 (next row); dx[d] / dy[d] are the weights of the
        // right / lower neighbours. The blended value is written to dst[d].
        void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
        {
            const float* const pad1 = pad0 + padW;
            const size_t vecEnd = AlignLo(dstS, F);
            const __m128 one = _mm_set1_ps(1.0f);
            size_t pos = 0;
            // Vector part: four points per iteration.
            for (; pos < vecEnd; pos += F)
            {
                const int o0 = idx[pos + 0], o1 = idx[pos + 1], o2 = idx[pos + 2], o3 = idx[pos + 3];
                // NOTE(review): Load(a, b) presumably packs two floats from 'a' into the low half
                // and two floats from 'b' into the high half - confirm in SimdLoad.h.
                const __m128 top01 = Load(pad0 + o0, pad0 + o1);
                const __m128 top23 = Load(pad0 + o2, pad0 + o3);
                // Even elements are the left neighbours, odd elements the right ones.
                const __m128 topLeft = _mm_shuffle_ps(top01, top23, 0x88);
                const __m128 topRight = _mm_shuffle_ps(top01, top23, 0xDD);
                const __m128 bot01 = Load(pad1 + o0, pad1 + o1);
                const __m128 bot23 = Load(pad1 + o2, pad1 + o3);
                const __m128 botLeft = _mm_shuffle_ps(bot01, bot23, 0x88);
                const __m128 botRight = _mm_shuffle_ps(bot01, bot23, 0xDD);
                const __m128 wBot = _mm_loadu_ps(dy + pos);
                const __m128 wTop = _mm_sub_ps(one, wBot);
                const __m128 wRight = _mm_loadu_ps(dx + pos);
                const __m128 wLeft = _mm_sub_ps(one, wRight);
                // Horizontal blend inside each row, then vertical blend between the rows.
                const __m128 rowTop = _mm_add_ps(_mm_mul_ps(wLeft, topLeft), _mm_mul_ps(wRight, topRight));
                const __m128 rowBot = _mm_add_ps(_mm_mul_ps(wLeft, botLeft), _mm_mul_ps(wRight, botRight));
                _mm_storeu_ps(dst + pos, _mm_add_ps(_mm_mul_ps(wTop, rowTop), _mm_mul_ps(wBot, rowBot)));
            }
            // Scalar tail for the remaining points.
            for (; pos < dstS; ++pos)
            {
                const int base = idx[pos];
                const float topLeft = pad0[base + 0];
                const float topRight = pad0[base + 1];
                const float botLeft = pad1[base + 0];
                const float botRight = pad1[base + 1];
                const float wBot = dy[pos];
                const float wTop = 1.0f - wBot;
                const float wRight = dx[pos];
                const float wLeft = 1.0f - wRight;
                dst[pos] = wTop * (wLeft * topLeft + wRight * topRight) + wBot * (wLeft * botLeft + wRight * botRight);
            }
        }

        //-------------------------------------------------------------------------------------------------

        // Builds on the Base implementation and replaces its worker callbacks with the SSE4.1
        // kernels from this file. The index/coefficient kernel is chosen by two flags:
        // '_param.align' picks the denormalization formula and '_sparse' picks the variant
        // that also tracks the touched row range (template argument range = 1).
        SynetGridSample2d32fBlZ::SynetGridSample2d32fBlZ(const GridSample2dParam& param)
            : Base::SynetGridSample2d32fBlZ(param)
        {
            _bilinearInterp = BilinearInterp32fBlZ;
            if (_param.align)
                _indexCoeffs = _sparse ? IndexCoeffs32fBlZ<1, 1> : IndexCoeffs32fBlZ<1, 0>;
            else
                _indexCoeffs = _sparse ? IndexCoeffs32fBlZ<0, 1> : IndexCoeffs32fBlZ<0, 0>;
        }
    }
#endif
}

Line	Count	Source
1		/*
2		* Simd Library (http://ermig1979.github.io/Simd).
3		*
4		* Copyright (c) 2011-2025 Yermalayeu Ihar.
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a copy
7		* of this software and associated documentation files (the "Software"), to deal
8		* in the Software without restriction, including without limitation the rights
9		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10		* copies of the Software, and to permit persons to whom the Software is
11		* furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice shall be included in
14		* all copies or substantial portions of the Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24
25		#include "Simd/SimdSynetGridSample.h"
26
27		#include "Simd/SimdLoad.h"
28		#include "Simd/SimdSet.h"
29
30		namespace Simd
31		{
32		#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)
33		namespace Sse41
34		{
35		template <int align> SIMD_INLINE float Denormalize32f(float pos, int dim)
36	0	{
37	0	if (align)
38	0	return float((pos + 1) / 2.0f * (dim - 1));
39	0	else
40	0	return float(((pos + 1) * dim - 1) / 2.0f);
41	0	} Unexecuted instantiation: float Simd::Sse41::Denormalize32f<1>(float, int) Unexecuted instantiation: float Simd::Sse41::Denormalize32f<0>(float, int)
42
43		template<int align, int range> void IndexCoeffs32fBlZ(const float* grd, size_t dstS, int srcH, int srcW, int padW, uint32_t* idx, float* dy, float* dx, int& yMin, int& yMax)
44	0	{
45	0	size_t dstSF = AlignLo(dstS, F), d = 0;
46	0	const __m128 a = SetFloat((srcW - align) / 2.0f, (srcH - align) / 2.0f);
47	0	const __m128 b = SetFloat((srcW - 1) / 2.0f, (srcH - 1) / 2.0f);
48	0	const __m128i _0 = _mm_setzero_si128();
49	0	const __m128i _2 = _mm_set1_epi32(2);
50	0	const __m128i _srcH = _mm_set1_epi32(srcH + 2);
51	0	const __m128i _srcW = _mm_set1_epi32(srcW + 2);
52	0	const __m128i _padW = _mm_set1_epi32(padW);
53	0	__m128i _yMin, _yMax;
54	0	if (range)
55	0	{
56	0	_yMin = _mm_set1_epi32(yMin);
57	0	_yMax = _mm_set1_epi32(yMax);
58	0	}
59	0	for (; d < dstSF; d += F)
60	0	{
61	0	__m128 xy0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + 0), a), b);
62	0	__m128 xy1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + F), a), b);
63	0	__m128 x = _mm_shuffle_ps(xy0, xy1, 0x88);
64	0	__m128 y = _mm_shuffle_ps(xy0, xy1, 0xDD);
65	0	__m128 xf = _mm_round_ps(x, _MM_FROUND_FLOOR);
66	0	__m128 yf = _mm_round_ps(y, _MM_FROUND_FLOOR);
67	0	_mm_storeu_ps(dy + d, _mm_sub_ps(y, yf));
68	0	_mm_storeu_ps(dx + d, _mm_sub_ps(x, xf));
69	0	__m128i xi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(xf), _2), _0), _srcW);
70	0	__m128i yi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(yf), _2), _0), _srcH);
71	0	_mm_storeu_si128((__m128i*)(idx + d), _mm_add_epi32(_mm_mullo_epi32(_padW, yi), xi));
72	0	if (range)
73	0	{
74	0	_yMin = _mm_min_epi32(_yMin, yi);
75	0	_yMax = _mm_max_epi32(_yMax, yi);
76	0	}
77	0	grd += 2 * F;
78	0	}
79	0	if (range)
80	0	{
81	0	yMin = MinVal32i(_yMin);
82	0	yMax = MaxVal32i(_yMax);
83	0	}
84	0	for (; d < dstS; ++d)
85	0	{
86	0	float x = Denormalize32f<align>(grd[0], srcW);
87	0	float y = Denormalize32f<align>(grd[1], srcH);
88	0	int x0 = int(std::floor(x));
89	0	int y0 = int(std::floor(y));
90	0	dy[d] = y - float(y0);
91	0	dx[d] = x - float(x0);
92	0	x0 = Simd::RestrictRange(x0, -2, srcW) + 2;
93	0	y0 = Simd::RestrictRange(y0, -2, srcH) + 2;
94	0	idx[d] = padW * y0 + x0;
95	0	if (range)
96	0	{
97	0	yMin = Min(yMin, y0);
98	0	yMax = Max(yMax, y0);
99	0	}
100	0	grd += 2;
101	0	}
102	0	} Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 1>(float const, unsigned long, int, int, int, unsigned int, float, float, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 1>(float const, unsigned long, int, int, int, unsigned int, float, float, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 0>(float const, unsigned long, int, int, int, unsigned int, float, float, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 0>(float const, unsigned long, int, int, int, unsigned int, float, float, int&, int&)
103
104		//-------------------------------------------------------------------------------------------------
105
106		void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst)
107	0	{
108	0	size_t dstSF = AlignLo(dstS, F), d = 0;
109	0	const float* pad1 = pad0 + padW;
110	0	__m128 p0, p1, _1 = _mm_set1_ps(1.0f);
111	0	for (; d < dstSF; d += F)
112	0	{
113	0	int i0 = idx[d + 0], i1 = idx[d + 1], i2 = idx[d + 2], i3 = idx[d + 3];
114	0	p0 = Load(pad0 + i0, pad0 + i1);
115	0	p1 = Load(pad0 + i2, pad0 + i3);
116	0	__m128 p00 = _mm_shuffle_ps(p0, p1, 0x88);
117	0	__m128 p01 = _mm_shuffle_ps(p0, p1, 0xDD);
118	0	p0 = Load(pad1 + i0, pad1 + i1);
119	0	p1 = Load(pad1 + i2, pad1 + i3);
120	0	__m128 p10 = _mm_shuffle_ps(p0, p1, 0x88);
121	0	__m128 p11 = _mm_shuffle_ps(p0, p1, 0xDD);
122	0	__m128 dy1 = _mm_loadu_ps(dy + d);
123	0	__m128 dy0 = _mm_sub_ps(_1, dy1);
124	0	__m128 dx1 = _mm_loadu_ps(dx + d);
125	0	__m128 dx0 = _mm_sub_ps(_1, dx1);
126	0	__m128 d0 = _mm_add_ps(_mm_mul_ps(dx0, p00), _mm_mul_ps(dx1, p01));
127	0	__m128 d1 = _mm_add_ps(_mm_mul_ps(dx0, p10), _mm_mul_ps(dx1, p11));
128	0	_mm_storeu_ps(dst + d, _mm_add_ps(_mm_mul_ps(dy0, d0), _mm_mul_ps(dy1, d1)));
129	0	}
130	0	for (; d < dstS; ++d)
131	0	{
132	0	int offs = idx[d];
133	0	float p00 = pad0[offs + 0];
134	0	float p01 = pad0[offs + 1];
135	0	float p10 = pad1[offs + 0];
136	0	float p11 = pad1[offs + 1];
137	0	float dy1 = dy[d];
138	0	float dy0 = 1.0f - dy1;
139	0	float dx1 = dx[d];
140	0	float dx0 = 1.0f - dx1;
141	0	dst[d] = dy0 * (dx0 * p00 + dx1 * p01) + dy1 * (dx0 * p10 + dx1 * p11);
142	0	}
143	0	}
144
145		//-------------------------------------------------------------------------------------------------
146
147		SynetGridSample2d32fBlZ::SynetGridSample2d32fBlZ(const GridSample2dParam& param)
148	0	: Base::SynetGridSample2d32fBlZ(param)
149	0	{
150	0	if (_sparse)
151	0	_indexCoeffs = _param.align ? IndexCoeffs32fBlZ<1, 1> : IndexCoeffs32fBlZ<0, 1>;
152	0	else
153	0	_indexCoeffs = _param.align ? IndexCoeffs32fBlZ<1, 0> : IndexCoeffs32fBlZ<0, 0>;
154	0	_bilinearInterp = BilinearInterp32fBlZ;
155	0	}
156		}
157		#endif
158		}

Coverage Report

Created: 2025-09-27 07:34