/src/gdal/alg/gdalgridavx.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /******************************************************************************  | 
2  |  |  *  | 
3  |  |  * Project:  GDAL Gridding API.  | 
4  |  |  * Purpose:  Implementation of GDAL scattered data gridder.  | 
5  |  |  * Author:   Even Rouault, <even dot rouault at spatialys.com>  | 
6  |  |  *  | 
7  |  |  ******************************************************************************  | 
8  |  |  * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com>  | 
9  |  |  *  | 
10  |  |  * SPDX-License-Identifier: MIT  | 
11  |  |  ****************************************************************************/  | 
12  |  |  | 
13  |  | #include "gdalgrid.h"  | 
14  |  | #include "gdalgrid_priv.h"  | 
15  |  |  | 
16  |  | #ifdef HAVE_AVX_AT_COMPILE_TIME  | 
17  |  | #include <immintrin.h>  | 
18  |  |  | 
19  |  | /************************************************************************/  | 
20  |  | /*         GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX()     */  | 
21  |  | /************************************************************************/  | 
22  |  |  | 
23  | 0  | #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x)  | 
24  |  |  | 
25  |  | CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(  | 
26  |  |     const void *poOptions, GUInt32 nPoints,  | 
27  |  |     CPL_UNUSED const double *unused_padfX,  | 
28  |  |     CPL_UNUSED const double *unused_padfY,  | 
29  |  |     CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint,  | 
30  |  |     double *pdfValue, void *hExtraParamsIn)  | 
31  | 0  | { | 
32  | 0  |     size_t i = 0;  | 
33  | 0  |     GDALGridExtraParameters *psExtraParams =  | 
34  | 0  |         static_cast<GDALGridExtraParameters *>(hExtraParamsIn);  | 
35  | 0  |     const float *pafX = psExtraParams->pafX;  | 
36  | 0  |     const float *pafY = psExtraParams->pafY;  | 
37  | 0  |     const float *pafZ = psExtraParams->pafZ;  | 
38  |  | 
  | 
39  | 0  |     const float fEpsilon = 0.0000000000001f;  | 
40  | 0  |     const float fXPoint = static_cast<float>(dfXPoint);  | 
41  | 0  |     const float fYPoint = static_cast<float>(dfYPoint);  | 
42  | 0  |     const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);  | 
43  | 0  |     const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);  | 
44  | 0  |     const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);  | 
45  | 0  |     __m256 ymm_nominator = _mm256_setzero_ps();  | 
46  | 0  |     __m256 ymm_denominator = _mm256_setzero_ps();  | 
47  | 0  |     int mask = 0;  | 
48  |  | 
  | 
49  | 0  | #undef LOOP_SIZE  | 
50  | 0  | #if defined(__x86_64) || defined(_M_X64)  | 
51  |  |     /* This would also work in 32bit mode, but there are only 8 XMM registers */  | 
52  |  |     /* whereas we have 16 for 64bit */  | 
53  | 0  | #define LOOP_SIZE 16  | 
54  | 0  |     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;  | 
55  | 0  |     for (i = 0; i < nPointsRound; i += LOOP_SIZE)  | 
56  | 0  |     { | 
57  | 0  |         __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i),  | 
58  | 0  |                                       ymm_x); /* rx = pafX[i] - fXPoint */  | 
59  | 0  |         __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);  | 
60  | 0  |         __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i),  | 
61  | 0  |                                       ymm_y); /* ry = pafY[i] - fYPoint */  | 
62  | 0  |         __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);  | 
63  | 0  |         __m256 ymm_r2 = _mm256_add_ps(  | 
64  | 0  |             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */  | 
65  | 0  |             _mm256_mul_ps(ymm_ry, ymm_ry));  | 
66  | 0  |         __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),  | 
67  | 0  |                                         _mm256_mul_ps(ymm_ry_8, ymm_ry_8));  | 
68  | 0  |         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */  | 
69  | 0  |         __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);  | 
70  | 0  |         ymm_nominator =  | 
71  | 0  |             _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */  | 
72  | 0  |                           _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));  | 
73  | 0  |         ymm_nominator = _mm256_add_ps(  | 
74  | 0  |             ymm_nominator,  | 
75  | 0  |             _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));  | 
76  | 0  |         ymm_denominator = _mm256_add_ps(ymm_denominator,  | 
77  | 0  |                                         ymm_invr2); /* denominator += invr2 */  | 
78  | 0  |         ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);  | 
79  | 0  |         mask =  | 
80  | 0  |             _mm256_movemask_ps(_mm256_cmp_ps(  | 
81  | 0  |                 ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */  | 
82  | 0  |             (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS))  | 
83  | 0  |              << 8);  | 
84  | 0  |         if (mask)  | 
85  | 0  |             break;  | 
86  | 0  |     }  | 
87  |  | #else  | 
88  |  | #define LOOP_SIZE 8  | 
89  |  |     size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;  | 
90  |  |     for (i = 0; i < nPointsRound; i += LOOP_SIZE)  | 
91  |  |     { | 
92  |  |         __m256 ymm_rx =  | 
93  |  |             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i),  | 
94  |  |                           ymm_x); /* rx = pafX[i] - fXPoint */  | 
95  |  |         __m256 ymm_ry =  | 
96  |  |             _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i),  | 
97  |  |                           ymm_y); /* ry = pafY[i] - fYPoint */  | 
98  |  |         __m256 ymm_r2 = _mm256_add_ps(  | 
99  |  |             _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */  | 
100  |  |             _mm256_mul_ps(ymm_ry, ymm_ry));  | 
101  |  |         __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */  | 
102  |  |         ymm_nominator = _mm256_add_ps(  | 
103  |  |             ymm_nominator, /* nominator += invr2 * pafZ[i] */  | 
104  |  |             _mm256_mul_ps(ymm_invr2,  | 
105  |  |                           _mm256_load_ps(const_cast<float *>(pafZ) + i)));  | 
106  |  |         ymm_denominator = _mm256_add_ps(ymm_denominator,  | 
107  |  |                                         ymm_invr2); /* denominator += invr2 */  | 
108  |  |         mask = _mm256_movemask_ps(_mm256_cmp_ps(  | 
109  |  |             ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */  | 
110  |  |         if (mask)  | 
111  |  |             break;  | 
112  |  |     }  | 
113  |  | #endif  | 
114  |  |  | 
115  |  |     // Find which i triggered r2 < fEpsilon.  | 
116  | 0  |     if (mask)  | 
117  | 0  |     { | 
118  | 0  |         for (int j = 0; j < LOOP_SIZE; j++)  | 
119  | 0  |         { | 
120  | 0  |             if (mask & (1 << j))  | 
121  | 0  |             { | 
122  | 0  |                 (*pdfValue) = (pafZ)[i + j];  | 
123  |  |  | 
124  |  |                 // GCC and MSVC need explicit zeroing.  | 
125  |  | #if !defined(__clang__)  | 
126  |  |                 _mm256_zeroupper();  | 
127  |  | #endif  | 
128  | 0  |                 return CE_None;  | 
129  | 0  |             }  | 
130  | 0  |         }  | 
131  | 0  |     }  | 
132  | 0  | #undef LOOP_SIZE  | 
133  |  |  | 
134  |  |     // Get back nominator and denominator values for YMM registers.  | 
135  | 0  |     float afNominator[8];  | 
136  | 0  |     float afDenominator[8];  | 
137  | 0  |     _mm256_storeu_ps(afNominator, ymm_nominator);  | 
138  | 0  |     _mm256_storeu_ps(afDenominator, ymm_denominator);  | 
139  |  |  | 
140  |  |     // MSVC doesn't emit AVX afterwards but may use SSE, so clear  | 
141  |  |     // upper bits.  Other compilers will continue using AVX for the  | 
142  |  |     // below floating points operations.  | 
143  |  | #if defined(_MSC_FULL_VER)  | 
144  |  |     _mm256_zeroupper();  | 
145  |  | #endif  | 
146  |  | 
  | 
147  | 0  |     float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +  | 
148  | 0  |                        afNominator[3] + afNominator[4] + afNominator[5] +  | 
149  | 0  |                        afNominator[6] + afNominator[7];  | 
150  | 0  |     float fDenominator = afDenominator[0] + afDenominator[1] +  | 
151  | 0  |                          afDenominator[2] + afDenominator[3] +  | 
152  | 0  |                          afDenominator[4] + afDenominator[5] +  | 
153  | 0  |                          afDenominator[6] + afDenominator[7];  | 
154  |  |  | 
155  |  |     // Do the few remaining loop iterations.  | 
156  | 0  |     for (; i < nPoints; i++)  | 
157  | 0  |     { | 
158  | 0  |         const float fRX = pafX[i] - fXPoint;  | 
159  | 0  |         const float fRY = pafY[i] - fYPoint;  | 
160  | 0  |         const float fR2 = fRX * fRX + fRY * fRY;  | 
161  |  |  | 
162  |  |         // If the test point is close to the grid node, use the point  | 
163  |  |         // value directly as a node value to avoid singularity.  | 
164  | 0  |         if (fR2 < 0.0000000000001)  | 
165  | 0  |         { | 
166  | 0  |             break;  | 
167  | 0  |         }  | 
168  | 0  |         else  | 
169  | 0  |         { | 
170  | 0  |             const float fInvR2 = 1.0f / fR2;  | 
171  | 0  |             fNominator += fInvR2 * pafZ[i];  | 
172  | 0  |             fDenominator += fInvR2;  | 
173  | 0  |         }  | 
174  | 0  |     }  | 
175  |  | 
  | 
176  | 0  |     if (i != nPoints)  | 
177  | 0  |     { | 
178  | 0  |         (*pdfValue) = pafZ[i];  | 
179  | 0  |     }  | 
180  | 0  |     else if (fDenominator == 0.0)  | 
181  | 0  |     { | 
182  | 0  |         (*pdfValue) =  | 
183  | 0  |             static_cast<const GDALGridInverseDistanceToAPowerOptions *>(  | 
184  | 0  |                 poOptions)  | 
185  | 0  |                 ->dfNoDataValue;  | 
186  | 0  |     }  | 
187  | 0  |     else  | 
188  | 0  |         (*pdfValue) = fNominator / fDenominator;  | 
189  |  |  | 
190  |  |         // GCC needs explicit zeroing.  | 
191  |  | #if defined(__GNUC__) && !defined(__clang__)  | 
192  |  |     _mm256_zeroupper();  | 
193  |  | #endif  | 
194  |  | 
  | 
195  | 0  |     return CE_None;  | 
196  | 0  | }  | 
197  |  |  | 
198  |  | #endif /* HAVE_AVX_AT_COMPILE_TIME */  |