Coverage Report

Created: 2025-06-22 06:59

/src/gdal/alg/gdalgridavx.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Gridding API.
4
 * Purpose:  Implementation of GDAL scattered data gridder.
5
 * Author:   Even Rouault, <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "gdalgrid.h"
14
#include "gdalgrid_priv.h"
15
16
#ifdef HAVE_AVX_AT_COMPILE_TIME
17
#include <immintrin.h>
18
19
/************************************************************************/
20
/*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */
21
/************************************************************************/
22
23
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
24
    const void *poOptions, GUInt32 nPoints,
25
    CPL_UNUSED const double *unused_padfX,
26
    CPL_UNUSED const double *unused_padfY,
27
    CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,
28
    double *pdfValue, void *hExtraParamsIn)
29
0
{
30
0
    size_t i = 0;
31
0
    GDALGridExtraParameters *psExtraParams =
32
0
        static_cast<GDALGridExtraParameters *>(hExtraParamsIn);
33
0
    const float *pafX = psExtraParams->pafX;
34
0
    const float *pafY = psExtraParams->pafY;
35
0
    const float *pafZ = psExtraParams->pafZ;
36
37
0
    const float fEpsilon = 0.0000000000001f;
38
0
    const float fXPoint = static_cast<float>(dfXPoint);
39
0
    const float fYPoint = static_cast<float>(dfYPoint);
40
0
    const __m256 ymm_small = _mm256_set1_ps(fEpsilon);
41
0
    const __m256 ymm_x = _mm256_set1_ps(fXPoint);
42
0
    const __m256 ymm_y = _mm256_set1_ps(fYPoint);
43
0
    __m256 ymm_nominator = _mm256_setzero_ps();
44
0
    __m256 ymm_denominator = _mm256_setzero_ps();
45
0
    int mask = 0;
46
47
0
#undef LOOP_SIZE
48
0
#if defined(__x86_64) || defined(_M_X64)
49
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
50
    /* whereas we have 16 for 64bit */
51
0
#define LOOP_SIZE 16
52
0
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
53
0
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
54
0
    {
55
0
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),
56
0
                                      ymm_x); /* rx = pafX[i] - fXPoint */
57
0
        __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
58
0
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),
59
0
                                      ymm_y); /* ry = pafY[i] - fYPoint */
60
0
        __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
61
0
        __m256 ymm_r2 = _mm256_add_ps(
62
0
            _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
63
0
            _mm256_mul_ps(ymm_ry, ymm_ry));
64
0
        __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
65
0
                                        _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
66
0
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
67
0
        __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
68
0
        ymm_nominator =
69
0
            _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */
70
0
                          _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
71
0
        ymm_nominator = _mm256_add_ps(
72
0
            ymm_nominator,
73
0
            _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
74
0
        ymm_denominator = _mm256_add_ps(ymm_denominator,
75
0
                                        ymm_invr2); /* denominator += invr2 */
76
0
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
77
0
        mask =
78
0
            _mm256_movemask_ps(_mm256_cmp_ps(
79
0
                ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */
80
0
            (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))
81
0
             << 8);
82
0
        if (mask)
83
0
            break;
84
0
    }
85
#else
86
#define LOOP_SIZE 8
87
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
88
    for (i = 0; i < nPointsRound; i += LOOP_SIZE)
89
    {
90
        __m256 ymm_rx =
91
            _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),
92
                          ymm_x); /* rx = pafX[i] - fXPoint */
93
        __m256 ymm_ry =
94
            _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),
95
                          ymm_y); /* ry = pafY[i] - fYPoint */
96
        __m256 ymm_r2 = _mm256_add_ps(
97
            _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */
98
            _mm256_mul_ps(ymm_ry, ymm_ry));
99
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */
100
        ymm_nominator = _mm256_add_ps(
101
            ymm_nominator, /* nominator += invr2 * pafZ[i] */
102
            _mm256_mul_ps(ymm_invr2,
103
                          _mm256_load_ps(const_cast<float *>(pafZ) + i)));
104
        ymm_denominator = _mm256_add_ps(ymm_denominator,
105
                                        ymm_invr2); /* denominator += invr2 */
106
        mask = _mm256_movemask_ps(_mm256_cmp_ps(
107
            ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */
108
        if (mask)
109
            break;
110
    }
111
#endif
112
113
    // Find which i triggered r2 < fEpsilon.
114
0
    if (mask)
115
0
    {
116
0
        for (int j = 0; j < LOOP_SIZE; j++)
117
0
        {
118
0
            if (mask & (1 << j))
119
0
            {
120
0
                (*pdfValue) = (pafZ)[i + j];
121
122
0
                return CE_None;
123
0
            }
124
0
        }
125
0
    }
126
0
#undef LOOP_SIZE
127
128
    // Get back nominator and denominator values for YMM registers.
129
0
    float afNominator[8];
130
0
    float afDenominator[8];
131
0
    _mm256_storeu_ps(afNominator, ymm_nominator);
132
0
    _mm256_storeu_ps(afDenominator, ymm_denominator);
133
134
    // MSVC doesn't emit AVX afterwards but may use SSE, so clear
135
    // upper bits.  Other compilers will continue using AVX for the
136
    // below floating points operations.
137
#if defined(_MSC_FULL_VER)
138
    _mm256_zeroupper();
139
#endif
140
141
0
    float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
142
0
                       afNominator[3] + afNominator[4] + afNominator[5] +
143
0
                       afNominator[6] + afNominator[7];
144
0
    float fDenominator = afDenominator[0] + afDenominator[1] +
145
0
                         afDenominator[2] + afDenominator[3] +
146
0
                         afDenominator[4] + afDenominator[5] +
147
0
                         afDenominator[6] + afDenominator[7];
148
149
    // Do the few remaining loop iterations.
150
0
    for (; i < nPoints; i++)
151
0
    {
152
0
        const float fRX = pafX[i] - fXPoint;
153
0
        const float fRY = pafY[i] - fYPoint;
154
0
        const float fR2 = fRX * fRX + fRY * fRY;
155
156
        // If the test point is close to the grid node, use the point
157
        // value directly as a node value to avoid singularity.
158
0
        if (fR2 < 0.0000000000001)
159
0
        {
160
0
            break;
161
0
        }
162
0
        else
163
0
        {
164
0
            const float fInvR2 = 1.0f / fR2;
165
0
            fNominator += fInvR2 * pafZ[i];
166
0
            fDenominator += fInvR2;
167
0
        }
168
0
    }
169
170
0
    if (i != nPoints)
171
0
    {
172
0
        (*pdfValue) = pafZ[i];
173
0
    }
174
0
    else if (fDenominator == 0.0)
175
0
    {
176
0
        (*pdfValue) =
177
0
            static_cast<const GDALGridInverseDistanceToAPowerOptions *>(
178
0
                poOptions)
179
0
                ->dfNoDataValue;
180
0
    }
181
0
    else
182
0
        (*pdfValue) = fNominator / fDenominator;
183
184
0
    return CE_None;
185
0
}
186
187
#endif /* HAVE_AVX_AT_COMPILE_TIME */