Coverage Report

Created: 2025-06-13 06:29

/src/gdal/alg/gdalgridsse.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Gridding API.
4
 * Purpose:  Implementation of GDAL scattered data gridder.
5
 * Author:   Even Rouault, <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "gdalgrid.h"
14
#include "gdalgrid_priv.h"
15
16
#ifdef HAVE_SSE_AT_COMPILE_TIME
17
18
#ifdef USE_NEON_OPTIMIZATIONS
19
#include "include_sse2neon.h"
20
#else
21
#include <xmmintrin.h>
22
#endif
23
24
/************************************************************************/
25
/*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE()     */
26
/************************************************************************/
27
28
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
29
    const void *poOptions, GUInt32 nPoints,
30
    CPL_UNUSED const double *unused_padfX,
31
    CPL_UNUSED const double *unused_padfY,
32
    CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
33
    double *pdfValue, void *hExtraParamsIn)
34
0
{
35
0
    size_t i = 0;
36
0
    GDALGridExtraParameters *psExtraParams =
37
0
        static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
38
0
    const float *pafX = psExtraParams->pafX;
39
0
    const float *pafY = psExtraParams->pafY;
40
0
    const float *pafZ = psExtraParams->pafZ;
41
42
0
    const float fEpsilon = 0.0000000000001f;
43
0
    const float fXPoint = static_cast<float>(dfXPoint);
44
0
    const float fYPoint = static_cast<float>(dfYPoint);
45
0
    const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon));
46
0
    const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint));
47
0
    const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint));
48
0
    __m128 xmm_nominator = _mm_setzero_ps();
49
0
    __m128 xmm_denominator = _mm_setzero_ps();
50
0
    int mask = 0;
51
52
0
#if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)
53
    // This would also work in 32bit mode, but there are only 8 XMM registers
54
    // whereas we have 16 for 64bit.
55
0
    const size_t LOOP_SIZE = 8;
56
0
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
57
0
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
58
0
    {
59
        // rx = pafX[i] - fXPoint
60
0
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);
61
0
        __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
62
        // ry = pafY[i] - fYPoint
63
0
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);
64
0
        __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
65
        // r2 = rx * rx + ry * ry
66
0
        __m128 xmm_r2 =
67
0
            _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry));
68
0
        __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
69
0
                                     _mm_mul_ps(xmm_ry_4, xmm_ry_4));
70
        // invr2 = 1.0f / r2
71
0
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);
72
0
        __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
73
        // nominator += invr2 * pafZ[i]
74
0
        xmm_nominator = _mm_add_ps(
75
0
            xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
76
0
        xmm_nominator = _mm_add_ps(
77
0
            xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
78
        // denominator += invr2
79
0
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);
80
0
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
81
        // if( r2 < fEpsilon)
82
0
        mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |
83
0
               (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
84
0
        if (mask)
85
0
            break;
86
0
    }
87
#else
88
#define LOOP_SIZE 4
89
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
90
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
91
    {
92
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i),
93
                                   xmm_x); /* rx = pafX[i] - fXPoint */
94
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i),
95
                                   xmm_y); /* ry = pafY[i] - fYPoint */
96
        __m128 xmm_r2 =
97
            _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
98
                       _mm_mul_ps(xmm_ry, xmm_ry));
99
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
100
        xmm_nominator =
101
            _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
102
                       _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
103
        xmm_denominator =
104
            _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
105
        mask = _mm_movemask_ps(
106
            _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
107
        if (mask)
108
            break;
109
    }
110
#endif
111
112
    // Find which i triggered r2 < fEpsilon.
113
0
    if (mask)
114
0
    {
115
0
        for (size_t j = 0; j < LOOP_SIZE; j++)
116
0
        {
117
0
            if (mask & (1 << j))
118
0
            {
119
0
                (*pdfValue) = (pafZ)[i + j];
120
0
                return CE_None;
121
0
            }
122
0
        }
123
0
    }
124
125
    // Get back nominator and denominator values for XMM registers.
126
0
    float afNominator[4];
127
0
    float afDenominator[4];
128
0
    _mm_storeu_ps(afNominator, xmm_nominator);
129
0
    _mm_storeu_ps(afDenominator, xmm_denominator);
130
131
0
    float fNominator =
132
0
        afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3];
133
0
    float fDenominator = afDenominator[0] + afDenominator[1] +
134
0
                         afDenominator[2] + afDenominator[3];
135
136
    /* Do the few remaining loop iterations */
137
0
    for (; i < nPoints; i++)
138
0
    {
139
0
        const float fRX = pafX[i] - fXPoint;
140
0
        const float fRY = pafY[i] - fYPoint;
141
0
        const float fR2 = fRX * fRX + fRY * fRY;
142
143
        // If the test point is close to the grid node, use the point
144
        // value directly as a node value to avoid singularity.
145
0
        if (fR2 < 0.0000000000001)
146
0
        {
147
0
            break;
148
0
        }
149
0
        else
150
0
        {
151
0
            const float fInvR2 = 1.0f / fR2;
152
0
            fNominator += fInvR2 * pafZ[i];
153
0
            fDenominator += fInvR2;
154
0
        }
155
0
    }
156
157
0
    if (i != nPoints)
158
0
    {
159
0
        (*pdfValue) = pafZ[i];
160
0
    }
161
0
    else if (fDenominator == 0.0)
162
0
    {
163
0
        (*pdfValue) =
164
0
            static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
165
0
                poOptions)
166
0
                ->dfNoDataValue;
167
0
    }
168
0
    else
169
0
    {
170
0
        (*pdfValue) = fNominator / fDenominator;
171
0
    }
172
173
0
    return CE_None;
174
0
}
175
176
#endif /* HAVE_SSE_AT_COMPILE_TIME */