/src/gdal/alg/gdalgridavx.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: GDAL Gridding API. |
4 | | * Purpose: Implementation of GDAL scattered data gridder. |
5 | | * Author: Even Rouault, <even dot rouault at spatialys.com> |
6 | | * |
7 | | ****************************************************************************** |
8 | | * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> |
9 | | * |
10 | | * SPDX-License-Identifier: MIT |
11 | | ****************************************************************************/ |
12 | | |
13 | | #include "gdalgrid.h" |
14 | | #include "gdalgrid_priv.h" |
15 | | |
16 | | #ifdef HAVE_AVX_AT_COMPILE_TIME |
17 | | #include <immintrin.h> |
18 | | |
19 | | /************************************************************************/ |
20 | | /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX() */ |
21 | | /************************************************************************/ |
22 | | |
23 | 0 | #define GDAL_mm256_load1_ps(x) _mm256_set_ps(x, x, x, x, x, x, x, x) |
24 | | |
25 | | CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( |
26 | | const void *poOptions, GUInt32 nPoints, |
27 | | CPL_UNUSED const double *unused_padfX, |
28 | | CPL_UNUSED const double *unused_padfY, |
29 | | CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, |
30 | | double *pdfValue, void *hExtraParamsIn) |
31 | 0 | { |
32 | 0 | size_t i = 0; |
33 | 0 | GDALGridExtraParameters *psExtraParams = |
34 | 0 | static_cast<GDALGridExtraParameters *>(hExtraParamsIn); |
35 | 0 | const float *pafX = psExtraParams->pafX; |
36 | 0 | const float *pafY = psExtraParams->pafY; |
37 | 0 | const float *pafZ = psExtraParams->pafZ; |
38 | |
|
39 | 0 | const float fEpsilon = 0.0000000000001f; |
40 | 0 | const float fXPoint = static_cast<float>(dfXPoint); |
41 | 0 | const float fYPoint = static_cast<float>(dfYPoint); |
42 | 0 | const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon); |
43 | 0 | const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint); |
44 | 0 | const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint); |
45 | 0 | __m256 ymm_nominator = _mm256_setzero_ps(); |
46 | 0 | __m256 ymm_denominator = _mm256_setzero_ps(); |
47 | 0 | int mask = 0; |
48 | |
|
49 | 0 | #undef LOOP_SIZE |
50 | 0 | #if defined(__x86_64) || defined(_M_X64) |
51 | | /* This would also work in 32bit mode, but there are only 8 XMM registers */ |
52 | | /* whereas we have 16 for 64bit */ |
53 | 0 | #define LOOP_SIZE 16 |
54 | 0 | size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; |
55 | 0 | for (i = 0; i < nPointsRound; i += LOOP_SIZE) |
56 | 0 | { |
57 | 0 | __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), |
58 | 0 | ymm_x); /* rx = pafX[i] - fXPoint */ |
59 | 0 | __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x); |
60 | 0 | __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), |
61 | 0 | ymm_y); /* ry = pafY[i] - fYPoint */ |
62 | 0 | __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y); |
63 | 0 | __m256 ymm_r2 = _mm256_add_ps( |
64 | 0 | _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ |
65 | 0 | _mm256_mul_ps(ymm_ry, ymm_ry)); |
66 | 0 | __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8), |
67 | 0 | _mm256_mul_ps(ymm_ry_8, ymm_ry_8)); |
68 | 0 | __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ |
69 | 0 | __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8); |
70 | 0 | ymm_nominator = |
71 | 0 | _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ |
72 | 0 | _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i))); |
73 | 0 | ymm_nominator = _mm256_add_ps( |
74 | 0 | ymm_nominator, |
75 | 0 | _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8))); |
76 | 0 | ymm_denominator = _mm256_add_ps(ymm_denominator, |
77 | 0 | ymm_invr2); /* denominator += invr2 */ |
78 | 0 | ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8); |
79 | 0 | mask = |
80 | 0 | _mm256_movemask_ps(_mm256_cmp_ps( |
81 | 0 | ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */ |
82 | 0 | (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) |
83 | 0 | << 8); |
84 | 0 | if (mask) |
85 | 0 | break; |
86 | 0 | } |
87 | | #else |
88 | | #define LOOP_SIZE 8 |
89 | | size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; |
90 | | for (i = 0; i < nPointsRound; i += LOOP_SIZE) |
91 | | { |
92 | | __m256 ymm_rx = |
93 | | _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafX) + i), |
94 | | ymm_x); /* rx = pafX[i] - fXPoint */ |
95 | | __m256 ymm_ry = |
96 | | _mm256_sub_ps(_mm256_load_ps(const_cast<float *>(pafY) + i), |
97 | | ymm_y); /* ry = pafY[i] - fYPoint */ |
98 | | __m256 ymm_r2 = _mm256_add_ps( |
99 | | _mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ |
100 | | _mm256_mul_ps(ymm_ry, ymm_ry)); |
101 | | __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ |
102 | | ymm_nominator = _mm256_add_ps( |
103 | | ymm_nominator, /* nominator += invr2 * pafZ[i] */ |
104 | | _mm256_mul_ps(ymm_invr2, |
105 | | _mm256_load_ps(const_cast<float *>(pafZ) + i))); |
106 | | ymm_denominator = _mm256_add_ps(ymm_denominator, |
107 | | ymm_invr2); /* denominator += invr2 */ |
108 | | mask = _mm256_movemask_ps(_mm256_cmp_ps( |
109 | | ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */ |
110 | | if (mask) |
111 | | break; |
112 | | } |
113 | | #endif |
114 | | |
115 | | // Find which i triggered r2 < fEpsilon. |
116 | 0 | if (mask) |
117 | 0 | { |
118 | 0 | for (int j = 0; j < LOOP_SIZE; j++) |
119 | 0 | { |
120 | 0 | if (mask & (1 << j)) |
121 | 0 | { |
122 | 0 | (*pdfValue) = (pafZ)[i + j]; |
123 | | |
124 | | // GCC and MSVC need explicit zeroing. |
125 | | #if !defined(__clang__) |
126 | | _mm256_zeroupper(); |
127 | | #endif |
128 | 0 | return CE_None; |
129 | 0 | } |
130 | 0 | } |
131 | 0 | } |
132 | 0 | #undef LOOP_SIZE |
133 | | |
134 | | // Get back nominator and denominator values for YMM registers. |
135 | 0 | float afNominator[8]; |
136 | 0 | float afDenominator[8]; |
137 | 0 | _mm256_storeu_ps(afNominator, ymm_nominator); |
138 | 0 | _mm256_storeu_ps(afDenominator, ymm_denominator); |
139 | | |
140 | | // MSVC doesn't emit AVX afterwards but may use SSE, so clear |
141 | | // upper bits. Other compilers will continue using AVX for the |
142 | | // below floating points operations. |
143 | | #if defined(_MSC_FULL_VER) |
144 | | _mm256_zeroupper(); |
145 | | #endif |
146 | |
|
147 | 0 | float fNominator = afNominator[0] + afNominator[1] + afNominator[2] + |
148 | 0 | afNominator[3] + afNominator[4] + afNominator[5] + |
149 | 0 | afNominator[6] + afNominator[7]; |
150 | 0 | float fDenominator = afDenominator[0] + afDenominator[1] + |
151 | 0 | afDenominator[2] + afDenominator[3] + |
152 | 0 | afDenominator[4] + afDenominator[5] + |
153 | 0 | afDenominator[6] + afDenominator[7]; |
154 | | |
155 | | // Do the few remaining loop iterations. |
156 | 0 | for (; i < nPoints; i++) |
157 | 0 | { |
158 | 0 | const float fRX = pafX[i] - fXPoint; |
159 | 0 | const float fRY = pafY[i] - fYPoint; |
160 | 0 | const float fR2 = fRX * fRX + fRY * fRY; |
161 | | |
162 | | // If the test point is close to the grid node, use the point |
163 | | // value directly as a node value to avoid singularity. |
164 | 0 | if (fR2 < 0.0000000000001) |
165 | 0 | { |
166 | 0 | break; |
167 | 0 | } |
168 | 0 | else |
169 | 0 | { |
170 | 0 | const float fInvR2 = 1.0f / fR2; |
171 | 0 | fNominator += fInvR2 * pafZ[i]; |
172 | 0 | fDenominator += fInvR2; |
173 | 0 | } |
174 | 0 | } |
175 | |
|
176 | 0 | if (i != nPoints) |
177 | 0 | { |
178 | 0 | (*pdfValue) = pafZ[i]; |
179 | 0 | } |
180 | 0 | else if (fDenominator == 0.0) |
181 | 0 | { |
182 | 0 | (*pdfValue) = |
183 | 0 | static_cast<const GDALGridInverseDistanceToAPowerOptions *>( |
184 | 0 | poOptions) |
185 | 0 | ->dfNoDataValue; |
186 | 0 | } |
187 | 0 | else |
188 | 0 | (*pdfValue) = fNominator / fDenominator; |
189 | | |
190 | | // GCC needs explicit zeroing. |
191 | | #if defined(__GNUC__) && !defined(__clang__) |
192 | | _mm256_zeroupper(); |
193 | | #endif |
194 | |
|
195 | 0 | return CE_None; |
196 | 0 | } |
197 | | |
198 | | #endif /* HAVE_AVX_AT_COMPILE_TIME */ |