/src/gdal/alg/gdalgridsse.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: GDAL Gridding API. |
4 | | * Purpose: Implementation of GDAL scattered data gridder. |
5 | | * Author: Even Rouault, <even dot rouault at spatialys.com> |
6 | | * |
7 | | ****************************************************************************** |
8 | | * Copyright (c) 2013, Even Rouault <even dot rouault at spatialys.com> |
9 | | * |
10 | | * SPDX-License-Identifier: MIT |
11 | | ****************************************************************************/ |
12 | | |
13 | | #include "gdalgrid.h" |
14 | | #include "gdalgrid_priv.h" |
15 | | |
16 | | #ifdef HAVE_SSE_AT_COMPILE_TIME |
17 | | |
18 | | #ifdef USE_NEON_OPTIMIZATIONS |
19 | | #include "include_sse2neon.h" |
20 | | #else |
21 | | #include <xmmintrin.h> |
22 | | #endif |
23 | | |
24 | | /************************************************************************/ |
25 | | /* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ |
26 | | /************************************************************************/ |
27 | | |
28 | | CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( |
29 | | const void *poOptions, GUInt32 nPoints, |
30 | | CPL_UNUSED const double *unused_padfX, |
31 | | CPL_UNUSED const double *unused_padfY, |
32 | | CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, |
33 | | double *pdfValue, void *hExtraParamsIn) |
34 | 0 | { |
35 | 0 | size_t i = 0; |
36 | 0 | GDALGridExtraParameters *psExtraParams = |
37 | 0 | static_cast<GDALGridExtraParameters *>(hExtraParamsIn); |
38 | 0 | const float *pafX = psExtraParams->pafX; |
39 | 0 | const float *pafY = psExtraParams->pafY; |
40 | 0 | const float *pafZ = psExtraParams->pafZ; |
41 | |
|
42 | 0 | const float fEpsilon = 0.0000000000001f; |
43 | 0 | const float fXPoint = static_cast<float>(dfXPoint); |
44 | 0 | const float fYPoint = static_cast<float>(dfYPoint); |
45 | 0 | const __m128 xmm_small = _mm_load1_ps(const_cast<float *>(&fEpsilon)); |
46 | 0 | const __m128 xmm_x = _mm_load1_ps(const_cast<float *>(&fXPoint)); |
47 | 0 | const __m128 xmm_y = _mm_load1_ps(const_cast<float *>(&fYPoint)); |
48 | 0 | __m128 xmm_nominator = _mm_setzero_ps(); |
49 | 0 | __m128 xmm_denominator = _mm_setzero_ps(); |
50 | 0 | int mask = 0; |
51 | |
|
52 | 0 | #if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS) |
53 | | // This would also work in 32bit mode, but there are only 8 XMM registers |
54 | | // whereas we have 16 for 64bit. |
55 | 0 | const size_t LOOP_SIZE = 8; |
56 | 0 | size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; |
57 | 0 | for (i = 0; i < nPointsRound; i += LOOP_SIZE) |
58 | 0 | { |
59 | | // rx = pafX[i] - fXPoint |
60 | 0 | __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); |
61 | 0 | __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); |
62 | | // ry = pafY[i] - fYPoint |
63 | 0 | __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); |
64 | 0 | __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); |
65 | | // r2 = rx * rx + ry * ry |
66 | 0 | __m128 xmm_r2 = |
67 | 0 | _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), _mm_mul_ps(xmm_ry, xmm_ry)); |
68 | 0 | __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), |
69 | 0 | _mm_mul_ps(xmm_ry_4, xmm_ry_4)); |
70 | | // invr2 = 1.0f / r2 |
71 | 0 | __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); |
72 | 0 | __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); |
73 | | // nominator += invr2 * pafZ[i] |
74 | 0 | xmm_nominator = _mm_add_ps( |
75 | 0 | xmm_nominator, _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); |
76 | 0 | xmm_nominator = _mm_add_ps( |
77 | 0 | xmm_nominator, _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); |
78 | | // denominator += invr2 |
79 | 0 | xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); |
80 | 0 | xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); |
81 | | // if( r2 < fEpsilon) |
82 | 0 | mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | |
83 | 0 | (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); |
84 | 0 | if (mask) |
85 | 0 | break; |
86 | 0 | } |
87 | | #else |
88 | | #define LOOP_SIZE 4 |
89 | | size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; |
90 | | for (i = 0; i < nPointsRound; i += LOOP_SIZE) |
91 | | { |
92 | | __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), |
93 | | xmm_x); /* rx = pafX[i] - fXPoint */ |
94 | | __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), |
95 | | xmm_y); /* ry = pafY[i] - fYPoint */ |
96 | | __m128 xmm_r2 = |
97 | | _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ |
98 | | _mm_mul_ps(xmm_ry, xmm_ry)); |
99 | | __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ |
100 | | xmm_nominator = |
101 | | _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ |
102 | | _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); |
103 | | xmm_denominator = |
104 | | _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ |
105 | | mask = _mm_movemask_ps( |
106 | | _mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ |
107 | | if (mask) |
108 | | break; |
109 | | } |
110 | | #endif |
111 | | |
112 | | // Find which i triggered r2 < fEpsilon. |
113 | 0 | if (mask) |
114 | 0 | { |
115 | 0 | for (size_t j = 0; j < LOOP_SIZE; j++) |
116 | 0 | { |
117 | 0 | if (mask & (1 << j)) |
118 | 0 | { |
119 | 0 | (*pdfValue) = (pafZ)[i + j]; |
120 | 0 | return CE_None; |
121 | 0 | } |
122 | 0 | } |
123 | 0 | } |
124 | | |
125 | | // Get back nominator and denominator values for XMM registers. |
126 | 0 | float afNominator[4]; |
127 | 0 | float afDenominator[4]; |
128 | 0 | _mm_storeu_ps(afNominator, xmm_nominator); |
129 | 0 | _mm_storeu_ps(afDenominator, xmm_denominator); |
130 | |
|
131 | 0 | float fNominator = |
132 | 0 | afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3]; |
133 | 0 | float fDenominator = afDenominator[0] + afDenominator[1] + |
134 | 0 | afDenominator[2] + afDenominator[3]; |
135 | | |
136 | | /* Do the few remaining loop iterations */ |
137 | 0 | for (; i < nPoints; i++) |
138 | 0 | { |
139 | 0 | const float fRX = pafX[i] - fXPoint; |
140 | 0 | const float fRY = pafY[i] - fYPoint; |
141 | 0 | const float fR2 = fRX * fRX + fRY * fRY; |
142 | | |
143 | | // If the test point is close to the grid node, use the point |
144 | | // value directly as a node value to avoid singularity. |
145 | 0 | if (fR2 < 0.0000000000001) |
146 | 0 | { |
147 | 0 | break; |
148 | 0 | } |
149 | 0 | else |
150 | 0 | { |
151 | 0 | const float fInvR2 = 1.0f / fR2; |
152 | 0 | fNominator += fInvR2 * pafZ[i]; |
153 | 0 | fDenominator += fInvR2; |
154 | 0 | } |
155 | 0 | } |
156 | |
|
157 | 0 | if (i != nPoints) |
158 | 0 | { |
159 | 0 | (*pdfValue) = pafZ[i]; |
160 | 0 | } |
161 | 0 | else if (fDenominator == 0.0) |
162 | 0 | { |
163 | 0 | (*pdfValue) = |
164 | 0 | static_cast<const GDALGridInverseDistanceToAPowerOptions *>( |
165 | 0 | poOptions) |
166 | 0 | ->dfNoDataValue; |
167 | 0 | } |
168 | 0 | else |
169 | 0 | { |
170 | 0 | (*pdfValue) = fNominator / fDenominator; |
171 | 0 | } |
172 | |
|
173 | 0 | return CE_None; |
174 | 0 | } |
175 | | |
176 | | #endif /* HAVE_SSE_AT_COMPILE_TIME */ |