Coverage Report

Created: 2025-06-13 06:18

/src/gdal/alg/gdalgridavx.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Gridding API.
4
 * Purpose:  Implementation of GDAL scattered data gridder.
5
 * Author:   Even Rouault, <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "gdalgrid.h"
14
#include "gdalgrid_priv.h"
15
16
#ifdef HAVE_AVX_AT_COMPILE_TIME
17
#include <immintrin.h>
18
19
/************************************************************************/
20
/*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
21
/************************************************************************/
22
23
0
#define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)
24
25
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
26
    const void *poOptions, GUInt32 nPoints,
27
    CPL_UNUSED const double *unused_padfX,
28
    CPL_UNUSED const double *unused_padfY,
29
    CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
30
    double *pdfValue, void *hExtraParamsIn)
31
0
{
32
0
    size_t i = 0;
33
0
    GDALGridExtraParameters *psExtraParams =
34
0
        static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
35
0
    const float *pafX = psExtraParams->pafX;
36
0
    const float *pafY = psExtraParams->pafY;
37
0
    const float *pafZ = psExtraParams->pafZ;
38
39
0
    const float fEpsilon = 0.0000000000001f;
40
0
    const float fXPoint = static_cast<float>(dfXPoint);
41
0
    const float fYPoint = static_cast<float>(dfYPoint);
42
0
    const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
43
0
    const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
44
0
    const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
45
0
    __m256 ymm_nominator = _mm256_setzero_ps();
46
0
    __m256 ymm_denominator = _mm256_setzero_ps();
47
0
    int mask = 0;
48
49
0
#undef LOOP_SIZE
50
0
#if defined(__x86_64) || defined(_M_X64)
51
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
52
    /* whereas we have 16 for 64bit */
53
0
#define LOOP_SIZE 16
54
0
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
55
0
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
56
0
    {
57
0
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
58
0
                                      ymm_x); /* rx = pafX[i] - fXPoint */
59
0
        __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
60
0
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
61
0
                                      ymm_y); /* ry = pafY[i] - fYPoint */
62
0
        __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
63
0
        __m256 ymm_r2 = _mm256_add_ps(
64
0
            _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
65
0
            _mm256_mul_ps(ymm_ry, ymm_ry));
66
0
        __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
67
0
                                        _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
68
0
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
69
0
        __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
70
0
        ymm_nominator =
71
0
            _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
72
0
                          _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
73
0
        ymm_nominator = _mm256_add_ps(
74
0
            ymm_nominator,
75
0
            _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
76
0
        ymm_denominator = _mm256_add_ps(ymm_denominator,
77
0
                                        ymm_invr2); /* denominator += invr2 */
78
0
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
79
0
        mask =
80
0
            _mm256_movemask_ps(_mm256_cmp_ps(
81
0
                ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
82
0
            (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
83
0
             << 8);
84
0
        if (mask)
85
0
            break;
86
0
    }
87
#else
88
#define LOOP_SIZE 8
89
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
90
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
91
    {
92
        __m256 ymm_rx =
93
            _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
94
                          ymm_x); /* rx = pafX[i] - fXPoint */
95
        __m256 ymm_ry =
96
            _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
97
                          ymm_y); /* ry = pafY[i] - fYPoint */
98
        __m256 ymm_r2 = _mm256_add_ps(
99
            _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
100
            _mm256_mul_ps(ymm_ry, ymm_ry));
101
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
102
        ymm_nominator = _mm256_add_ps(
103
            ymm_nominator, /* nominator += invr2 * pafZ[i] */
104
            _mm256_mul_ps(ymm_invr2,
105
                          _mm256_load_ps(const_cast<float *>(pafZ) + i)));
106
        ymm_denominator = _mm256_add_ps(ymm_denominator,
107
                                        ymm_invr2); /* denominator += invr2 */
108
        mask = _mm256_movemask_ps(_mm256_cmp_ps(
109
            ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
110
        if (mask)
111
            break;
112
    }
113
#endif
114
115
    // Find which i triggered r2 < fEpsilon.
116
0
    if (mask)
117
0
    {
118
0
        for (int j = 0; j < LOOP_SIZE; j++)
119
0
        {
120
0
            if (mask & (1 << j))
121
0
            {
122
0
                (*pdfValue) = (pafZ)[i + j];
123
124
                // GCC and MSVC need explicit zeroing.
125
#if !defined(__clang__)
126
                _mm256_zeroupper();
127
#endif
128
0
                return CE_None;
129
0
            }
130
0
        }
131
0
    }
132
0
#undef LOOP_SIZE
133
134
    // Get back nominator and denominator values for YMM registers.
135
0
    float afNominator[8];
136
0
    float afDenominator[8];
137
0
    _mm256_storeu_ps(afNominator, ymm_nominator);
138
0
    _mm256_storeu_ps(afDenominator, ymm_denominator);
139
140
    // MSVC doesn't emit AVX afterwards but may use SSE, so clear
141
    // upper bits.  Other compilers will continue using AVX for the
142
    // below floating points operations.
143
#if defined(_MSC_FULL_VER)
144
    _mm256_zeroupper();
145
#endif
146
147
0
    float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
148
0
                       afNominator[3] + afNominator[4] + afNominator[5] +
149
0
                       afNominator[6] + afNominator[7];
150
0
    float fDenominator = afDenominator[0] + afDenominator[1] +
151
0
                         afDenominator[2] + afDenominator[3] +
152
0
                         afDenominator[4] + afDenominator[5] +
153
0
                         afDenominator[6] + afDenominator[7];
154
155
    // Do the few remaining loop iterations.
156
0
    for (; i < nPoints; i++)
157
0
    {
158
0
        const float fRX = pafX[i] - fXPoint;
159
0
        const float fRY = pafY[i] - fYPoint;
160
0
        const float fR2 = fRX * fRX + fRY * fRY;
161
162
        // If the test point is close to the grid node, use the point
163
        // value directly as a node value to avoid singularity.
164
0
        if (fR2 < 0.0000000000001)
165
0
        {
166
0
            break;
167
0
        }
168
0
        else
169
0
        {
170
0
            const float fInvR2 = 1.0f / fR2;
171
0
            fNominator += fInvR2 * pafZ[i];
172
0
            fDenominator += fInvR2;
173
0
        }
174
0
    }
175
176
0
    if (i != nPoints)
177
0
    {
178
0
        (*pdfValue) = pafZ[i];
179
0
    }
180
0
    else if (fDenominator == 0.0)
181
0
    {
182
0
        (*pdfValue) =
183
0
            static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
184
0
                poOptions)
185
0
                ->dfNoDataValue;
186
0
    }
187
0
    else
188
0
        (*pdfValue) = fNominator / fDenominator;
189
190
        // GCC needs explicit zeroing.
191
#if defined(__GNUC__) && !defined(__clang__)
192
    _mm256_zeroupper();
193
#endif
194
195
0
    return CE_None;
196
0
}
197
198
#endif /* HAVE_AVX_AT_COMPILE_TIME */