Coverage Report

Created: 2025-11-16 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/gcore/overview.cpp
Line
Count
Source
1
2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14
15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17
18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21
22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30
31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39
#include "gdal_vrt.h"
40
#include "vrtdataset.h"
41
42
#ifdef USE_NEON_OPTIMIZATIONS
43
#include "include_sse2neon.h"
44
45
#if (!defined(__aarch64__) && !defined(_M_ARM64))
46
#define ARM_V7
47
#endif
48
49
#define USE_SSE2
50
51
#include "gdalsse_priv.h"
52
53
// Restrict to 64bit processors because they are guaranteed to have SSE2,
54
// or if __AVX2__ is defined.
55
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
56
#define USE_SSE2
57
58
#include "gdalsse_priv.h"
59
60
#ifdef __SSE3__
61
#include <pmmintrin.h>
62
#endif
63
#ifdef __SSSE3__
64
#include <tmmintrin.h>
65
#endif
66
#ifdef __SSE4_1__
67
#include <smmintrin.h>
68
#endif
69
#ifdef __AVX2__
70
#include <immintrin.h>
71
#endif
72
73
#endif
74
75
// To be included after above USE_SSE2 and include gdalsse_priv.h
76
// to avoid build issue on Windows x86
77
#include "gdal_priv_templates.hpp"
78
79
/************************************************************************/
80
/*                      GDALResampleChunk_Near()                        */
81
/************************************************************************/
82
83
template <class T>
84
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
85
                                      const T *pChunk, T **ppDstBuffer)
86
87
0
{
88
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
89
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
90
0
    const GDALDataType eWrkDataType = args.eWrkDataType;
91
0
    const int nChunkXOff = args.nChunkXOff;
92
0
    const int nChunkXSize = args.nChunkXSize;
93
0
    const int nChunkYOff = args.nChunkYOff;
94
0
    const int nDstXOff = args.nDstXOff;
95
0
    const int nDstXOff2 = args.nDstXOff2;
96
0
    const int nDstYOff = args.nDstYOff;
97
0
    const int nDstYOff2 = args.nDstYOff2;
98
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
99
100
    /* -------------------------------------------------------------------- */
101
    /*      Allocate buffers.                                               */
102
    /* -------------------------------------------------------------------- */
103
0
    *ppDstBuffer = static_cast<T *>(
104
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
105
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
106
0
    if (*ppDstBuffer == nullptr)
107
0
    {
108
0
        return CE_Failure;
109
0
    }
110
0
    T *const pDstBuffer = *ppDstBuffer;
111
112
0
    int *panSrcXOff =
113
0
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
114
115
0
    if (panSrcXOff == nullptr)
116
0
    {
117
0
        return CE_Failure;
118
0
    }
119
120
    /* ==================================================================== */
121
    /*      Precompute inner loop constants.                                */
122
    /* ==================================================================== */
123
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
124
0
    {
125
0
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
126
0
        if (nSrcXOff < nChunkXOff)
127
0
            nSrcXOff = nChunkXOff;
128
129
0
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
130
0
    }
131
132
    /* ==================================================================== */
133
    /*      Loop over destination scanlines.                                */
134
    /* ==================================================================== */
135
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
136
0
    {
137
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
138
0
        if (nSrcYOff < nChunkYOff)
139
0
            nSrcYOff = nChunkYOff;
140
141
0
        const T *const pSrcScanline =
142
0
            pChunk +
143
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
144
0
            nChunkXOff;
145
146
        /* --------------------------------------------------------------------
147
         */
148
        /*      Loop over destination pixels */
149
        /* --------------------------------------------------------------------
150
         */
151
0
        T *pDstScanline =
152
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
153
0
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
154
0
        {
155
0
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
156
0
        }
157
0
    }
158
159
0
    CPLFree(panSrcXOff);
160
161
0
    return CE_None;
162
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>**)
163
164
/**
 * Type-erased entry point for nearest-neighbour resampling.
 *
 * Writes args.eWrkDataType to *peDstBufferDataType, then forwards to
 * GDALResampleChunk_NearT instantiated on a type that has the same size as
 * the working data type. Nearest resampling only copies samples, so the
 * element size is the only property of the type that matters.
 *
 * @return The result of the typed implementation, or CE_Failure (after a
 *         failed CPLAssert) for GDT_Unknown / GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    // Reinterpret both buffers as arrays of the tag's type and run the typed
    // implementation. The tag value itself is never used.
    const auto ResampleAs = [&args, pChunk, ppDstBuffer](auto tag) -> CPLErr
    {
        using Elt = decltype(tag);
        return GDALResampleChunk_NearT(args,
                                       static_cast<const Elt *>(pChunk),
                                       reinterpret_cast<Elt **>(ppDstBuffer));
    };

    // No default label on purpose: every enumerator is listed explicitly.
    switch (eType)
    {
        // 1-byte samples
        case GDT_Byte:
        case GDT_Int8:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            return ResampleAs(uint8_t{});

        // 2-byte samples
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            return ResampleAs(uint16_t{});

        // 4-byte samples
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            return ResampleAs(uint32_t{});

        // 8-byte samples
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            return ResampleAs(uint64_t{});

        // Complex double samples
        case GDT_CFloat64:
            return ResampleAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    CPLAssert(false);
    return CE_Failure;
}
230
231
namespace
232
{
233
234
// Find in the color table the entry whose RGB value is the closest
235
// (using quadratic distance) to the test color, ignoring transparent entries.
236
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
237
                   const GDALColorEntry &test)
238
0
{
239
0
    int nMinDist = std::numeric_limits<int>::max();
240
0
    size_t bestEntry = 0;
241
0
    for (size_t i = 0; i < entries.size(); ++i)
242
0
    {
243
0
        const GDALColorEntry &entry = entries[i];
244
        // Ignore transparent entries
245
0
        if (entry.c4 == 0)
246
0
            continue;
247
248
0
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
249
0
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
250
0
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
251
0
        if (nDist < nMinDist)
252
0
        {
253
0
            nMinDist = nDist;
254
0
            bestEntry = i;
255
0
        }
256
0
    }
257
0
    return static_cast<int>(bestEntry);
258
0
}
259
260
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
261
                                           int &transparentIdx)
262
0
{
263
0
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
264
265
0
    transparentIdx = -1;
266
0
    int i = 0;
267
0
    for (auto &entry : entries)
268
0
    {
269
0
        table.GetColorEntryAsRGB(i, &entry);
270
0
        if (transparentIdx < 0 && entry.c4 == 0)
271
0
            transparentIdx = i;
272
0
        ++i;
273
0
    }
274
0
    return entries;
275
0
}
276
277
}  // unnamed  namespace
278
279
/************************************************************************/
280
/*                             SQUARE()                                 */
281
/************************************************************************/
282
283
// Return val * val. The left operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be chosen to hold the product.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
Unexecuted instantiation: int SQUARE<int, int>(int)
Unexecuted instantiation: double SQUARE<double, double>(double)
Unexecuted instantiation: float SQUARE<float, float>(float)
287
288
/************************************************************************/
289
/*                          ComputeIntegerRMS()                         */
290
/************************************************************************/
291
// Return the integer r that minimizes abs(r * r - sumSquares / weight),
// starting from floor(sqrt(sumSquares / weight)) and moving to the next
// integer when that one is strictly closer.
//
// T is the integer result type; Twork is the type that seeds the evaluation
// of 2 * r * (r + 1) + 1 and should be wide enough for that product.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;
    const T floorRoot = static_cast<T>(std::sqrt(meanSquare));

    // (r + 1)^2 is closer to meanSquare than r^2 exactly when
    //     (r + 1)^2 - meanSquare < meanSquare - r^2
    // which rearranges to 2 * r * (r + 1) + 1 < 2 * meanSquare.
    const auto twiceProductPlusOne =
        static_cast<Twork>(2) * floorRoot * (floorRoot + 1) + 1;
    const bool roundUp =
        static_cast<double>(twiceProductPlusOne) < 2 * meanSquare;

    return roundUp ? static_cast<T>(floorRoot + 1) : floorRoot;
}
Unexecuted instantiation: unsigned char ComputeIntegerRMS<unsigned char, int>(double, double)
Unexecuted instantiation: unsigned short ComputeIntegerRMS<unsigned short, unsigned long>(double, double)
307
308
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
309
{
310
    CPLAssert(false);
311
    return 0;
312
}
313
314
// Byte specialization: integer RMS of 4 samples given the sum of their
// squares. Returns the integer r minimizing abs(r * r - sumSquares / 4).
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // With the rounding correction applied below, taking the square root of
    // (sumSquares + 1) / 4 in integer arithmetic has been verified to give
    // the same result as sqrt(sumSquares * 0.25f). The integer form is kept
    // because the quotient is needed again for the correction.
    const int nQuarter = (sumSquares + 1) / 4;
    const GByte nFloorRoot =
        static_cast<GByte>(std::sqrt(static_cast<float>(nQuarter)));

    // Is r^2 or (r + 1)^2 the closest to sumSquares / 4 ?
    // Integer form, for a weight of 4, of
    //   4 * (r + 1)^2 - sumSquares < sumSquares - 4 * r^2
    const bool bRoundUp =
        static_cast<int>(nFloorRoot) * (nFloorRoot + 1) < nQuarter;
    return bRoundUp ? static_cast<GByte>(nFloorRoot + 1) : nFloorRoot;
}
331
332
template <>
333
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
334
0
{
335
0
    const double sumDivWeight = sumSquares * 0.25;
336
0
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
337
338
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
339
    // Naive version:
340
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
341
    // Optimized version for integer case and weight == 4
342
0
    if (static_cast<GUInt32>(rms) * (rms + 1) <
343
0
        static_cast<GUInt32>(sumDivWeight + 0.25))
344
0
        rms += 1;
345
0
    return rms;
346
0
}
347
348
#ifdef USE_SSE2
349
350
/************************************************************************/
351
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
352
/************************************************************************/
353
354
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
#define sse2_packus_epi32 _mm_packus_epi32
#else
// SSE2-only replacement for _mm_packus_epi32(): narrows the eight signed
// 32-bit lanes of (a, b) to unsigned 16-bit lanes with saturation to
// [0, 65535].
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    // SSE2 only offers a *signed* saturating pack. Shifting the inputs down
    // by 32768 maps [0, 65535] onto the int16 range, so the signed
    // saturation clamps at the right places; the shift is undone afterwards
    // with wrapping 16-bit arithmetic.
    const __m128i bias32 = _mm_set1_epi32(-32768);
    const __m128i packedBiased = _mm_packs_epi32(_mm_add_epi32(a, bias32),
                                                 _mm_add_epi32(b, bias32));
    return _mm_sub_epi16(packedBiased, _mm_set1_epi16(-32768));
}
#endif
368
369
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
#define sse2_hadd_epi16 _mm_hadd_epi16
#else
// SSE2-only stand-in for _mm_hadd_epi16(): adds adjacent 16-bit lanes of a,
// then of b, and returns the eight sums (those of a first).
//
// The 16-bit lanes are zero-extended before being added and the final pack
// saturates to the int16 range, so the result equals _mm_hadd_epi16() only
// for non-negative lanes whose pair sums do not exceed 32767.
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i lowHalfMask = _mm_set1_epi32(0xFFFF);

    // For each 32-bit lane: (low 16 bits) + (high 16 bits).
    const auto SumAdjacentPairs = [lowHalfMask](__m128i v)
    {
        const __m128i evenLanes = _mm_and_si128(v, lowHalfMask);
        const __m128i oddLanes = _mm_srli_epi32(v, 16);
        return _mm_add_epi32(evenLanes, oddLanes);
    };

    // Narrow the 4 + 4 32-bit sums back to eight 16-bit lanes.
    return _mm_packs_epi32(SumAdjacentPairs(a), SumAdjacentPairs(b));
}
#endif
385
386
#ifdef __AVX2__

// Width-agnostic aliases, AVX2 flavour: each short name maps to the 256-bit
// intrinsic, so that the byte kernels written with these names operate on
// 32-byte registers.
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define setzero _mm256_setzero_si256
#define set1_ps _mm256_set1_ps
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8
#define madd_epi16 _mm256_madd_epi16
#define add_epi32 _mm256_add_epi32
#define mul_ps _mm256_mul_ps
#define cvtepi32_ps _mm256_cvtepi32_ps
#define sqrt_ps _mm256_sqrt_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define srli_epi32 _mm256_srli_epi32
#define mullo_epi16 _mm256_mullo_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define packus_epi16 _mm256_packus_epi16

/* AVX2 pack/unpack instructions operate on 2 separate 128-bit lanes, so */
/* the 64-bit quarters of a packed result come out as (q0, q2, q1, q3)   */
/* relative to what a true 256-bit wide operation would produce.         */
/* FIXUP_LANES() swaps the two middle quarters to restore linear order.  */

inline __m256i FIXUP_LANES(__m256i x)
{
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// store_lo(): reorder the lanes of y, then store its low 128 bits at x.
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
// storeu_int(): reorder the lanes of y, then store all 256 bits at x.
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
#define hadd_epi16 _mm256_hadd_epi16
#else
// Width-agnostic aliases, 128-bit flavour: each short name maps to the SSE2
// intrinsic, or to the sse2_* emulation for the operations that plain SSE2
// lacks (packus_epi32, hadd_epi16).
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define setzero _mm_setzero_si128
#define set1_ps _mm_set1_ps
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8
#define madd_epi16 _mm_madd_epi16
#define add_epi32 _mm_add_epi32
#define mul_ps _mm_mul_ps
#define cvtepi32_ps _mm_cvtepi32_ps
#define sqrt_ps _mm_sqrt_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define srli_epi32 _mm_srli_epi32
#define mullo_epi16 _mm_mullo_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define packus_epi16 _mm_packus_epi16
// store_lo(): store the low 64 bits of y at x.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
// storeu_int(): store all 128 bits of y at x (no alignment requirement).
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
#define hadd_epi16 sse2_hadd_epi16
#endif
453
454
template <class T>
455
static int
456
#if defined(__GNUC__)
457
    __attribute__((noinline))
458
#endif
459
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
460
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
461
                                T *CPL_RESTRICT pDstScanline)
462
0
{
463
    // Optimized implementation for RMS on Byte by
464
    // processing by group of 8 output pixels, so as to use
465
    // a single _mm_sqrt_ps() call for 4 output pixels
466
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
467
468
0
    int iDstPixel = 0;
469
0
    const auto one16 = set1_epi16(1);
470
0
    const auto one32 = set1_epi32(1);
471
0
    const auto zero = setzero();
472
0
    const auto minus32768 = set1_epi16(-32768);
473
474
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
475
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
476
0
    {
477
        // Load 2 * DEST_ELTS bytes from each line
478
0
        auto firstLine = loadu_int(pSrcScanlineShifted);
479
0
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
480
        // Extend those Bytes as UInt16s
481
0
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
482
0
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
483
0
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
484
0
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
485
486
        // Multiplication of 16 bit values and horizontal
487
        // addition of 32 bit results
488
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
489
0
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
490
0
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
491
0
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
492
0
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
493
494
        // Vertical addition
495
0
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
496
0
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
497
498
0
        const auto sumSquaresPlusOneDiv4Lo =
499
0
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
500
0
        const auto sumSquaresPlusOneDiv4Hi =
501
0
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
502
503
        // Take square root and truncate/floor to int32
504
0
        const auto rmsLo =
505
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
506
0
        const auto rmsHi =
507
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
508
509
        // Merge back low and high registers with each RMS value
510
        // as a 16 bit value.
511
0
        auto rms = packs_epi32(rmsLo, rmsHi);
512
513
        // Round to upper value if it minimizes the
514
        // error |rms^2 - sumSquares/4|
515
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
516
        //    rms += 1;
517
        // which is equivalent to:
518
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
519
        //    rms += 1;
520
        // And both left and right parts fit on 16 (unsigned) bits
521
0
        const auto sumSquaresPlusOneDiv4 =
522
0
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
523
        // cmpgt_epi16 operates on signed int16, but here
524
        // we have unsigned values, so shift them by -32768 before
525
0
        const auto mask = cmpgt_epi16(
526
0
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
527
0
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
528
        // The value of the mask will be -1 when the correction needs to be
529
        // applied
530
0
        rms = sub_epi16(rms, mask);
531
532
        // Pack each 16 bit RMS value to 8 bits
533
0
        rms = packus_epi16(rms, rms /* could be anything */);
534
0
        store_lo(&pDstScanline[iDstPixel], rms);
535
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
536
0
    }
537
538
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
539
0
    return iDstPixel;
540
0
}
541
542
/************************************************************************/
543
/*                      AverageByteSSE2OrAVX2()                         */
544
/************************************************************************/
545
546
static int
547
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
548
                      const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
549
                      GByte *CPL_RESTRICT pDstScanline)
550
0
{
551
    // Optimized implementation for average on Byte by
552
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
553
554
0
    const auto zero = setzero();
555
0
    const auto two16 = set1_epi16(2);
556
0
    const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
557
558
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
559
0
    int iDstPixel = 0;
560
0
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
561
0
         iDstPixel += 2 * DEST_ELTS)
562
0
    {
563
0
        decltype(setzero()) average0;
564
0
        {
565
            // Load 2 * DEST_ELTS bytes from each line
566
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
567
0
            const auto secondLine =
568
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
569
            // Extend those Bytes as UInt16s
570
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
571
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
572
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
573
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
574
575
            // Vertical addition
576
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
577
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
578
579
            // Horizontal addition of adjacent pairs, and recombine low and high
580
            // parts
581
0
            const auto sum = hadd_epi16(sumLo, sumHi);
582
583
            // average = (sum + 2) / 4
584
0
            average0 = srli_epi16(add_epi16(sum, two16), 2);
585
586
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
587
0
        }
588
589
0
        decltype(setzero()) average1;
590
0
        {
591
            // Load 2 * DEST_ELTS bytes from each line
592
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
593
0
            const auto secondLine =
594
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
595
            // Extend those Bytes as UInt16s
596
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
597
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
598
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
599
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
600
601
            // Vertical addition
602
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
603
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
604
605
            // Horizontal addition of adjacent pairs, and recombine low and high
606
            // parts
607
0
            const auto sum = hadd_epi16(sumLo, sumHi);
608
609
            // average = (sum + 2) / 4
610
0
            average1 = srli_epi16(add_epi16(sum, two16), 2);
611
612
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
613
0
        }
614
615
        // Pack each 16 bit average value to 8 bits
616
0
        const auto average = packus_epi16(average0, average1);
617
0
        storeu_int(&pDstScanline[iDstPixel], average);
618
0
    }
619
620
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
621
0
    return iDstPixel;
622
0
}
623
624
/************************************************************************/
625
/*                     QuadraticMeanUInt16SSE2()                        */
626
/************************************************************************/
627
628
#ifdef __SSE3__
#define sse2_hadd_pd _mm_hadd_pd
#else
// SSE2-only replacement for _mm_hadd_pd(): returns (a0 + a1, b0 + b1).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    // The single-precision move instructions are used purely as 64-bit
    // shuffles; the casts only reinterpret the registers.
    const __m128 aBits = _mm_castpd_ps(a);
    const __m128 bBits = _mm_castpd_ps(b);
    const __m128d lows = _mm_castps_pd(_mm_movelh_ps(aBits, bBits));   // (a0, b0)
    const __m128d highs = _mm_castps_pd(_mm_movehl_ps(bBits, aBits));  // (a1, b1)
    return _mm_add_pd(lows, highs);
}
#endif
640
641
// Square each of the two double-precision lanes of x.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
645
646
#ifdef __AVX2__

// Square each of the four double-precision lanes of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorder the four 64-bit lanes of x from (q0, q1, q2, q3) to
// (q0, q2, q1, q3).
inline __m256d FIXUP_LANES(__m256d x)
{
    constexpr int kSwapMiddleLanes = _MM_SHUFFLE(3, 1, 2, 0);
    return _mm256_permute4x64_pd(x, kSwapMiddleLanes);
}

// Same reordering of the 64-bit quarters for a register of floats. The casts
// only reinterpret the register.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(
        _mm256_permute4x64_pd(asDoubles, _MM_SHUFFLE(3, 1, 2, 0)));
}

#endif
664
665
static int
666
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
667
                        const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
668
                        uint16_t *CPL_RESTRICT pDstScanline)
669
0
{
670
    // Optimized implementation for RMS on UInt16 by
671
    // processing by group of 4 output pixels.
672
0
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
673
674
0
    int iDstPixel = 0;
675
0
    const auto zero = _mm_setzero_si128();
676
677
#ifdef __AVX2__
678
    const auto zeroDot25 = _mm256_set1_pd(0.25);
679
    const auto zeroDot5 = _mm256_set1_pd(0.5);
680
681
    // The first four 0's could be anything, as we only take the bottom
682
    // 128 bits.
683
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
684
#else
685
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
686
0
    const auto zeroDot5 = _mm_set1_pd(0.5);
687
0
#endif
688
689
0
    constexpr int DEST_ELTS =
690
0
        static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
691
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
692
0
    {
693
        // Load 8 UInt16 from each line
694
0
        const auto firstLine = _mm_loadu_si128(
695
0
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
696
0
        const auto secondLine =
697
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
698
0
                pSrcScanlineShifted + nChunkXSize));
699
700
        // Detect if all of the source values fit in 14 bits.
701
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
702
        // and we can do a much faster implementation.
703
0
        const auto maskTmp =
704
0
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
705
#if defined(__i386__) || defined(_M_IX86)
706
        uint64_t nMaskFitsIn14Bits = 0;
707
        _mm_storel_epi64(
708
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
709
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
710
#else
711
0
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
712
0
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
713
0
#endif
714
0
        if (nMaskFitsIn14Bits == 0)
715
0
        {
716
            // Multiplication of 16 bit values and horizontal
717
            // addition of 32 bit results
718
0
            const auto firstLineHSumSquare =
719
0
                _mm_madd_epi16(firstLine, firstLine);
720
0
            const auto secondLineHSumSquare =
721
0
                _mm_madd_epi16(secondLine, secondLine);
722
            // Vertical addition
723
0
            const auto sumSquares =
724
0
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
725
            // In theory we should take sqrt(sumSquares * 0.25f)
726
            // but given the rounding we do, this is equivalent to
727
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
728
            // sumSquares <= 4 * 16383^2
729
0
            const auto one32 = _mm_set1_epi32(1);
730
0
            const auto sumSquaresPlusOneDiv4 =
731
0
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
732
            // Take square root and truncate/floor to int32
733
0
            auto rms = _mm_cvttps_epi32(
734
0
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
735
736
            // Round to upper value if it minimizes the
737
            // error |rms^2 - sumSquares/4|
738
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
739
            //    rms += 1;
740
            // which is equivalent to:
741
            // if( rms * rms + rms < (sumSquares+1) / 4 )
742
            //    rms += 1;
743
0
            auto mask =
744
0
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
745
0
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
746
0
            rms = _mm_sub_epi32(rms, mask);
747
            // Pack each 32 bit RMS value to 16 bits
748
0
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
749
0
            _mm_storel_epi64(
750
0
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
751
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
752
0
            continue;
753
0
        }
754
755
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
756
        // to 32 bit would result in 4 multiplications instead of 8, but
757
        // mullo/mulhi have a worse throughput than mul_pd.
758
759
        // Extend those UInt16s as UInt32s
760
0
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
761
0
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
762
0
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
763
0
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
764
765
#ifdef __AVX2__
766
        // Multiplication of 32 bit values previously converted to 64 bit double
767
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
768
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
769
        const auto secondLineLoDbl =
770
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
771
        const auto secondLineHiDbl =
772
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
773
774
        // Vertical addition of squares
775
        const auto sumSquaresLo =
776
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
777
        const auto sumSquaresHi =
778
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
779
780
        // Horizontal addition of squares
781
        const auto sumSquares =
782
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
783
784
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
785
786
        // Take square root and truncate/floor to int32
787
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
788
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
789
        const auto right = _mm256_sub_pd(
790
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
791
792
        auto mask =
793
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
794
        // Extract 32-bit from each of the 4 64-bit masks
795
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
796
        // _MM_SHUFFLE(2,0,2,0)));
797
        mask = _mm256_permutevar8x32_ps(mask, permutation);
798
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
799
800
        // Apply the correction
801
        rms = _mm_sub_epi32(rms, maskI);
802
803
        // Pack each 32 bit RMS value to 16 bits
804
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
805
#else
806
        // Multiplication of 32 bit values previously converted to 64 bit double
807
0
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
808
0
        const auto firstLineLoHi =
809
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
810
0
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
811
0
        const auto firstLineHiHi =
812
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
813
814
0
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
815
0
        const auto secondLineLoHi =
816
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
817
0
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
818
0
        const auto secondLineHiHi =
819
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
820
821
        // Vertical addition of squares
822
0
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
823
0
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
824
0
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
825
0
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
826
827
        // Horizontal addition of squares
828
0
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
829
0
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
830
831
0
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
832
0
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
833
        // Take square root and truncate/floor to int32
834
0
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
835
0
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
836
837
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
838
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
839
        //     rms += 1;
840
0
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
841
0
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
842
0
        const auto rightLo = _mm_sub_pd(
843
0
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
844
0
        const auto rightHi = _mm_sub_pd(
845
0
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
846
847
0
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
848
0
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
849
        // The value of the mask will be -1 when the correction needs to be
850
        // applied
851
0
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
852
0
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
853
854
0
        auto rms = _mm_castps_si128(
855
0
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
856
        // Apply the correction
857
0
        rms = _mm_sub_epi32(rms, mask);
858
859
        // Pack each 32 bit RMS value to 16 bits
860
0
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
861
0
#endif
862
863
0
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
864
0
                         rms);
865
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
866
0
    }
867
868
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
869
0
    return iDstPixel;
870
0
}
871
872
/************************************************************************/
873
/*                         AverageUInt16SSE2()                          */
874
/************************************************************************/
875
876
static int
877
AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
878
                  const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
879
                  uint16_t *CPL_RESTRICT pDstScanline)
880
0
{
881
    // Optimized implementation for average on UInt16 by
882
    // processing by group of 8 output pixels.
883
884
0
    const auto mask = _mm_set1_epi32(0xFFFF);
885
0
    const auto two = _mm_set1_epi32(2);
886
0
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
887
888
0
    int iDstPixel = 0;
889
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
890
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
891
0
    {
892
0
        __m128i averageLow;
893
        // Load 8 UInt16 from each line
894
0
        {
895
0
            const auto firstLine = _mm_loadu_si128(
896
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
897
0
            const auto secondLine =
898
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
899
0
                    pSrcScanlineShifted + nChunkXSize));
900
901
            // Horizontal addition and extension to 32 bit
902
0
            const auto horizAddFirstLine = _mm_add_epi32(
903
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
904
0
            const auto horizAddSecondLine =
905
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
906
0
                              _mm_srli_epi32(secondLine, 16));
907
908
            // Vertical addition and average computation
909
            // average = (sum + 2) >> 2
910
0
            const auto sum = _mm_add_epi32(
911
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
912
0
            averageLow = _mm_srli_epi32(sum, 2);
913
0
        }
914
        // Load 8 UInt16 from each line
915
0
        __m128i averageHigh;
916
0
        {
917
0
            const auto firstLine =
918
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
919
0
                    pSrcScanlineShifted + DEST_ELTS));
920
0
            const auto secondLine =
921
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
922
0
                    pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
923
924
            // Horizontal addition and extension to 32 bit
925
0
            const auto horizAddFirstLine = _mm_add_epi32(
926
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
927
0
            const auto horizAddSecondLine =
928
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
929
0
                              _mm_srli_epi32(secondLine, 16));
930
931
            // Vertical addition and average computation
932
            // average = (sum + 2) >> 2
933
0
            const auto sum = _mm_add_epi32(
934
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
935
0
            averageHigh = _mm_srli_epi32(sum, 2);
936
0
        }
937
938
        // Pack each 32 bit average value to 16 bits
939
0
        auto average = sse2_packus_epi32(averageLow, averageHigh);
940
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
941
0
                         average);
942
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
943
0
    }
944
945
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
946
0
    return iDstPixel;
947
0
}
948
949
/************************************************************************/
950
/*                      QuadraticMeanFloatSSE2()                        */
951
/************************************************************************/
952
953
#if !defined(ARM_V7)
954
955
#ifdef __SSE3__
956
#define sse2_hadd_ps _mm_hadd_ps
957
#else
958
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
959
0
{
960
0
    auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
961
0
    auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
962
0
    return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
963
0
}
964
#endif
965
966
#ifdef __AVX2__
967
#define set1_ps _mm256_set1_ps
968
#define loadu_ps _mm256_loadu_ps
969
#define andnot_ps _mm256_andnot_ps
970
#define and_ps _mm256_and_ps
971
#define max_ps _mm256_max_ps
972
#define shuffle_ps _mm256_shuffle_ps
973
#define div_ps _mm256_div_ps
974
#define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
975
#define mul_ps _mm256_mul_ps
976
#define add_ps _mm256_add_ps
977
#define hadd_ps _mm256_hadd_ps
978
#define sqrt_ps _mm256_sqrt_ps
979
#define or_ps _mm256_or_ps
980
#define unpacklo_ps _mm256_unpacklo_ps
981
#define unpackhi_ps _mm256_unpackhi_ps
982
#define storeu_ps _mm256_storeu_ps
983
#define blendv_ps _mm256_blendv_ps
984
985
inline __m256 SQUARE_PS(__m256 x)
986
{
987
    return _mm256_mul_ps(x, x);
988
}
989
990
#else
991
992
0
#define set1_ps _mm_set1_ps
993
0
#define loadu_ps _mm_loadu_ps
994
0
#define andnot_ps _mm_andnot_ps
995
#define and_ps _mm_and_ps
996
0
#define max_ps _mm_max_ps
997
0
#define shuffle_ps _mm_shuffle_ps
998
0
#define div_ps _mm_div_ps
999
0
#define cmpeq_ps _mm_cmpeq_ps
1000
0
#define mul_ps _mm_mul_ps
1001
0
#define add_ps _mm_add_ps
1002
#define hadd_ps sse2_hadd_ps
1003
0
#define sqrt_ps _mm_sqrt_ps
1004
#define or_ps _mm_or_ps
1005
#define unpacklo_ps _mm_unpacklo_ps
1006
#define unpackhi_ps _mm_unpackhi_ps
1007
0
#define storeu_ps _mm_storeu_ps
1008
1009
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
1010
{
1011
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
1012
    return _mm_blendv_ps(a, b, mask);
1013
#else
1014
    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
1015
#endif
1016
}
1017
1018
inline __m128 SQUARE_PS(__m128 x)
1019
0
{
1020
0
    return _mm_mul_ps(x, x);
1021
0
}
1022
1023
inline __m128 FIXUP_LANES(__m128 x)
1024
0
{
1025
0
    return x;
1026
0
}
1027
1028
#endif
1029
1030
static int
1031
#if defined(__GNUC__)
1032
    __attribute__((noinline))
1033
#endif
1034
    QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1035
                           const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1036
                           float *CPL_RESTRICT pDstScanline)
1037
0
{
1038
    // Optimized implementation for RMS on Float32 by
1039
    // processing by group of output pixels.
1040
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1041
1042
0
    int iDstPixel = 0;
1043
0
    const auto minus_zero = set1_ps(-0.0f);
1044
0
    const auto zeroDot25 = set1_ps(0.25f);
1045
0
    const auto one = set1_ps(1.0f);
1046
0
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1047
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1048
1049
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1050
0
    {
1051
        // Load 2*DEST_ELTS Float32 from each line
1052
0
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1053
0
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1054
0
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1055
0
        auto secondLineHi =
1056
0
            loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1057
1058
        // Take the absolute value
1059
0
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
1060
0
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
1061
0
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
1062
0
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
1063
1064
0
        auto firstLineEven =
1065
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1066
0
        auto firstLineOdd =
1067
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1068
0
        auto secondLineEven =
1069
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1070
0
        auto secondLineOdd =
1071
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1072
1073
        // Compute the maximum of each DEST_ELTS value to RMS-average
1074
0
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1075
0
                                 max_ps(secondLineEven, secondLineEven));
1076
1077
        // Normalize each value by the maximum of the DEST_ELTS ones.
1078
        // This step is important to avoid that the square evaluates to infinity
1079
        // for sufficiently big input.
1080
0
        auto invMax = div_ps(one, maxV);
1081
        // Deal with 0 being the maximum to correct division by zero
1082
        // note: comparing to -0 leads to identical results as to comparing with
1083
        // 0
1084
0
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1085
1086
0
        firstLineEven = mul_ps(firstLineEven, invMax);
1087
0
        firstLineOdd = mul_ps(firstLineOdd, invMax);
1088
0
        secondLineEven = mul_ps(secondLineEven, invMax);
1089
0
        secondLineOdd = mul_ps(secondLineOdd, invMax);
1090
1091
        // Compute squares
1092
0
        firstLineEven = SQUARE_PS(firstLineEven);
1093
0
        firstLineOdd = SQUARE_PS(firstLineOdd);
1094
0
        secondLineEven = SQUARE_PS(secondLineEven);
1095
0
        secondLineOdd = SQUARE_PS(secondLineOdd);
1096
1097
0
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1098
0
                                       add_ps(secondLineEven, secondLineOdd));
1099
1100
0
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1101
1102
        // Deal with infinity being the maximum
1103
0
        const auto maskIsInf = cmpeq_ps(maxV, infv);
1104
0
        rms = blendv_ps(rms, infv, maskIsInf);
1105
1106
0
        rms = FIXUP_LANES(rms);
1107
1108
0
        storeu_ps(&pDstScanline[iDstPixel], rms);
1109
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1110
0
    }
1111
1112
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1113
0
    return iDstPixel;
1114
0
}
1115
1116
/************************************************************************/
1117
/*                        AverageFloatSSE2()                            */
1118
/************************************************************************/
1119
1120
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1121
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1122
                            float *CPL_RESTRICT pDstScanline)
1123
0
{
1124
    // Optimized implementation for average on Float32 by
1125
    // processing by group of output pixels.
1126
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1127
1128
0
    int iDstPixel = 0;
1129
0
    const auto zeroDot25 = _mm_set1_ps(0.25f);
1130
0
    constexpr int DEST_ELTS =
1131
0
        static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1132
1133
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1134
0
    {
1135
        // Load 2 * DEST_ELTS Float32 from each line
1136
0
        const auto firstLineLo =
1137
0
            _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1138
0
        const auto firstLineHi = _mm_mul_ps(
1139
0
            _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1140
0
        const auto secondLineLo = _mm_mul_ps(
1141
0
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1142
0
        const auto secondLineHi = _mm_mul_ps(
1143
0
            _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1144
0
            zeroDot25);
1145
1146
        // Vertical addition
1147
0
        const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1148
0
        const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1149
1150
        // Horizontal addition
1151
0
        const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1152
1153
0
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1154
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1155
0
    }
1156
1157
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1158
0
    return iDstPixel;
1159
0
}
1160
1161
/************************************************************************/
1162
/*                        AverageDoubleSSE2()                           */
1163
/************************************************************************/
1164
1165
static int
1166
AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1167
                  const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1168
                  double *CPL_RESTRICT pDstScanline)
1169
0
{
1170
    // Optimized implementation for average on Float64 by
1171
    // processing by group of output pixels.
1172
0
    const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1173
1174
0
    int iDstPixel = 0;
1175
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
1176
0
    constexpr int DEST_ELTS =
1177
0
        static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1178
1179
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1180
0
    {
1181
        // Load 4 * DEST_ELTS Float64 from each line
1182
0
        const auto firstLine0 = _mm_mul_pd(
1183
0
            _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1184
0
        const auto firstLine1 = _mm_mul_pd(
1185
0
            _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1186
0
        const auto secondLine0 = _mm_mul_pd(
1187
0
            _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1188
0
            zeroDot25);
1189
0
        const auto secondLine1 = _mm_mul_pd(
1190
0
            _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1191
0
            zeroDot25);
1192
1193
        // Vertical addition
1194
0
        const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1195
0
        const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1196
1197
        // Horizontal addition
1198
0
        const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1199
1200
0
        _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1201
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1202
0
    }
1203
1204
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1205
0
    return iDstPixel;
1206
0
}
1207
1208
#endif
1209
1210
#endif
1211
1212
/************************************************************************/
1213
/*                    GDALResampleChunk_AverageOrRMS()                  */
1214
/************************************************************************/
1215
1216
template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1217
static CPLErr
1218
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1219
                                 const T *pChunk, void **ppDstBuffer)
1220
0
{
1221
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1222
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1223
0
    const double dfSrcXDelta = args.dfSrcXDelta;
1224
0
    const double dfSrcYDelta = args.dfSrcYDelta;
1225
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1226
0
    const int nChunkXOff = args.nChunkXOff;
1227
0
    const int nChunkYOff = args.nChunkYOff;
1228
0
    const int nChunkXSize = args.nChunkXSize;
1229
0
    const int nChunkYSize = args.nChunkYSize;
1230
0
    const int nDstXOff = args.nDstXOff;
1231
0
    const int nDstXOff2 = args.nDstXOff2;
1232
0
    const int nDstYOff = args.nDstYOff;
1233
0
    const int nDstYOff2 = args.nDstYOff2;
1234
0
    const char *pszResampling = args.pszResampling;
1235
0
    bool bHasNoData = args.bHasNoData;
1236
0
    const double dfNoDataValue = args.dfNoDataValue;
1237
0
    const GDALColorTable *const poColorTable =
1238
0
        !bQuadraticMean &&
1239
                // AVERAGE_BIT2GRAYSCALE
1240
0
                CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1241
0
            ? nullptr
1242
0
            : args.poColorTable;
1243
0
    const bool bPropagateNoData = args.bPropagateNoData;
1244
1245
0
    T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1246
0
    const T tReplacementVal =
1247
0
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1248
0
                         args.eOvrDataType, dfNoDataValue))
1249
0
                   : 0;
1250
1251
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1252
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1253
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
1254
1255
    /* -------------------------------------------------------------------- */
1256
    /*      Allocate buffers.                                               */
1257
    /* -------------------------------------------------------------------- */
1258
0
    *ppDstBuffer = static_cast<T *>(
1259
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1260
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1261
0
    if (*ppDstBuffer == nullptr)
1262
0
    {
1263
0
        return CE_Failure;
1264
0
    }
1265
0
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1266
1267
0
    struct PrecomputedXValue
1268
0
    {
1269
0
        int nLeftXOffShifted;
1270
0
        int nRightXOffShifted;
1271
0
        double dfLeftWeight;
1272
0
        double dfRightWeight;
1273
0
        double dfTotalWeightFullLine;
1274
0
    };
1275
1276
0
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1277
0
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1278
1279
0
    if (pasSrcX == nullptr)
1280
0
    {
1281
0
        return CE_Failure;
1282
0
    }
1283
1284
0
    std::vector<GDALColorEntry> colorEntries;
1285
1286
0
    if (poColorTable)
1287
0
    {
1288
0
        int nTransparentIdx = -1;
1289
0
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1290
1291
        // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1292
        // it as nodata value
1293
0
        if (bHasNoData && dfNoDataValue >= 0.0 &&
1294
0
            tNoDataValue < colorEntries.size())
1295
0
            colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1296
1297
        // Or if we have no explicit nodata, but a color table entry that is
1298
        // transparent, consider it as the nodata value
1299
0
        else if (!bHasNoData && nTransparentIdx >= 0)
1300
0
        {
1301
0
            bHasNoData = true;
1302
0
            tNoDataValue = static_cast<T>(nTransparentIdx);
1303
0
        }
1304
0
    }
1305
1306
    /* ==================================================================== */
1307
    /*      Precompute inner loop constants.                                */
1308
    /* ==================================================================== */
1309
0
    bool bSrcXSpacingIsTwo = true;
1310
0
    int nLastSrcXOff2 = -1;
1311
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1312
0
    {
1313
0
        const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1314
        // Apply some epsilon to avoid numerical precision issues
1315
0
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1316
0
        const double dfSrcXOff2 =
1317
0
            dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1318
0
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1319
1320
0
        if (nSrcXOff < nChunkXOff)
1321
0
            nSrcXOff = nChunkXOff;
1322
0
        if (nSrcXOff2 == nSrcXOff)
1323
0
            nSrcXOff2++;
1324
0
        if (nSrcXOff2 > nChunkRightXOff)
1325
0
            nSrcXOff2 = nChunkRightXOff;
1326
1327
0
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1328
0
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1329
0
            nSrcXOff2 - nChunkXOff;
1330
0
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1331
0
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1332
0
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1333
0
            1 - (nSrcXOff2 - dfSrcXOff2);
1334
0
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1335
0
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1336
0
        if (nSrcXOff + 1 < nSrcXOff2)
1337
0
        {
1338
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1339
0
                nSrcXOff2 - nSrcXOff - 2;
1340
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1341
0
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1342
0
        }
1343
1344
0
        if (nSrcXOff2 - nSrcXOff != 2 ||
1345
0
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1346
0
        {
1347
0
            bSrcXSpacingIsTwo = false;
1348
0
        }
1349
0
        nLastSrcXOff2 = nSrcXOff2;
1350
0
    }
1351
1352
    /* ==================================================================== */
1353
    /*      Loop over destination scanlines.                                */
1354
    /* ==================================================================== */
1355
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1356
0
    {
1357
0
        const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1358
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1359
0
        if (nSrcYOff < nChunkYOff)
1360
0
            nSrcYOff = nChunkYOff;
1361
1362
0
        const double dfSrcYOff2 =
1363
0
            dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1364
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1365
0
        if (nSrcYOff2 == nSrcYOff)
1366
0
            ++nSrcYOff2;
1367
0
        if (nSrcYOff2 > nChunkBottomYOff)
1368
0
            nSrcYOff2 = nChunkBottomYOff;
1369
1370
0
        T *const pDstScanline =
1371
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1372
1373
        /* --------------------------------------------------------------------
1374
         */
1375
        /*      Loop over destination pixels */
1376
        /* --------------------------------------------------------------------
1377
         */
1378
0
        if (poColorTable == nullptr)
1379
0
        {
1380
0
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1381
0
                pabyChunkNodataMask == nullptr)
1382
0
            {
1383
                if constexpr (eWrkDataType == GDT_Byte ||
1384
                              eWrkDataType == GDT_UInt16)
1385
0
                {
1386
                    // Optimized case : no nodata, overview by a factor of 2 and
1387
                    // regular x and y src spacing.
1388
0
                    const T *pSrcScanlineShifted =
1389
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1390
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1391
0
                            nChunkXSize;
1392
0
                    int iDstPixel = 0;
1393
0
#ifdef USE_SSE2
1394
                    if constexpr (eWrkDataType == GDT_Byte)
1395
0
                    {
1396
                        if constexpr (bQuadraticMean)
1397
0
                        {
1398
0
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1399
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1400
0
                                pDstScanline);
1401
                        }
1402
                        else
1403
0
                        {
1404
0
                            iDstPixel = AverageByteSSE2OrAVX2(
1405
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406
0
                                pDstScanline);
1407
0
                        }
1408
                    }
1409
                    else
1410
0
                    {
1411
0
                        static_assert(eWrkDataType == GDT_UInt16);
1412
                        if constexpr (bQuadraticMean)
1413
0
                        {
1414
0
                            iDstPixel = QuadraticMeanUInt16SSE2(
1415
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1416
0
                                pDstScanline);
1417
                        }
1418
                        else
1419
0
                        {
1420
0
                            iDstPixel = AverageUInt16SSE2(
1421
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1422
0
                                pDstScanline);
1423
0
                        }
1424
0
                    }
1425
0
#endif
1426
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1427
0
                    {
1428
0
                        Tsum nTotal = 0;
1429
0
                        T nVal;
1430
                        if constexpr (bQuadraticMean)
1431
0
                            nTotal =
1432
0
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1433
0
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1434
0
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1435
0
                                SQUARE<Tsum>(
1436
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1437
                        else
1438
0
                            nTotal = pSrcScanlineShifted[0] +
1439
0
                                     pSrcScanlineShifted[1] +
1440
0
                                     pSrcScanlineShifted[nChunkXSize] +
1441
0
                                     pSrcScanlineShifted[1 + nChunkXSize];
1442
1443
0
                        constexpr int nTotalWeight = 4;
1444
                        if constexpr (bQuadraticMean)
1445
0
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
1446
                        else
1447
0
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1448
0
                                                  nTotalWeight);
1449
1450
                        // No need to compare nVal against tNoDataValue as we
1451
                        // are in a case where pabyChunkNodataMask == nullptr
1452
                        // implies the absence of nodata value.
1453
0
                        pDstScanline[iDstPixel] = nVal;
1454
0
                        pSrcScanlineShifted += 2;
1455
0
                    }
1456
                }
1457
                else
1458
0
                {
1459
0
                    static_assert(eWrkDataType == GDT_Float32 ||
1460
0
                                  eWrkDataType == GDT_Float64);
1461
0
                    const T *pSrcScanlineShifted =
1462
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1463
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1464
0
                            nChunkXSize;
1465
0
                    int iDstPixel = 0;
1466
0
#if defined(USE_SSE2) && !defined(ARM_V7)
1467
                    if constexpr (eWrkDataType == GDT_Float32)
1468
0
                    {
1469
0
                        static_assert(std::is_same_v<T, float>);
1470
                        if constexpr (bQuadraticMean)
1471
0
                        {
1472
0
                            iDstPixel = QuadraticMeanFloatSSE2(
1473
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1474
0
                                pDstScanline);
1475
                        }
1476
                        else
1477
0
                        {
1478
0
                            iDstPixel = AverageFloatSSE2(
1479
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1480
0
                                pDstScanline);
1481
0
                        }
1482
                    }
1483
                    else
1484
0
                    {
1485
                        if constexpr (!bQuadraticMean)
1486
0
                        {
1487
0
                            iDstPixel = AverageDoubleSSE2(
1488
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1489
0
                                pDstScanline);
1490
0
                        }
1491
0
                    }
1492
0
#endif
1493
1494
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1495
0
                    {
1496
0
                        T nVal;
1497
1498
                        if constexpr (bQuadraticMean)
1499
0
                        {
1500
                            // Avoid issues with large values by renormalizing
1501
0
                            const auto max = std::max(
1502
0
                                {std::fabs(pSrcScanlineShifted[0]),
1503
0
                                 std::fabs(pSrcScanlineShifted[1]),
1504
0
                                 std::fabs(pSrcScanlineShifted[nChunkXSize]),
1505
0
                                 std::fabs(
1506
0
                                     pSrcScanlineShifted[1 + nChunkXSize])});
1507
0
                            if (max == 0)
1508
0
                            {
1509
0
                                nVal = 0;
1510
0
                            }
1511
0
                            else if (std::isinf(max))
1512
0
                            {
1513
                                // If there is at least one infinity value,
1514
                                // then just summing, and taking the abs
1515
                                // value will give the expected result:
1516
                                // * +inf if all values are +inf
1517
                                // * +inf if all values are -inf
1518
                                // * NaN otherwise
1519
0
                                nVal = std::fabs(
1520
0
                                    pSrcScanlineShifted[0] +
1521
0
                                    pSrcScanlineShifted[1] +
1522
0
                                    pSrcScanlineShifted[nChunkXSize] +
1523
0
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1524
0
                            }
1525
0
                            else
1526
0
                            {
1527
0
                                const auto inv_max = static_cast<T>(1.0) / max;
1528
0
                                nVal =
1529
0
                                    max *
1530
0
                                    std::sqrt(
1531
0
                                        static_cast<T>(0.25) *
1532
0
                                        (SQUARE(pSrcScanlineShifted[0] *
1533
0
                                                inv_max) +
1534
0
                                         SQUARE(pSrcScanlineShifted[1] *
1535
0
                                                inv_max) +
1536
0
                                         SQUARE(
1537
0
                                             pSrcScanlineShifted[nChunkXSize] *
1538
0
                                             inv_max) +
1539
0
                                         SQUARE(
1540
0
                                             pSrcScanlineShifted[1 +
1541
0
                                                                 nChunkXSize] *
1542
0
                                             inv_max)));
1543
0
                            }
1544
                        }
1545
                        else
1546
0
                        {
1547
0
                            constexpr auto weight = static_cast<T>(0.25);
1548
                            // Multiply each value by weight to avoid
1549
                            // potential overflow
1550
0
                            nVal =
1551
0
                                (weight * pSrcScanlineShifted[0] +
1552
0
                                 weight * pSrcScanlineShifted[1] +
1553
0
                                 weight * pSrcScanlineShifted[nChunkXSize] +
1554
0
                                 weight * pSrcScanlineShifted[1 + nChunkXSize]);
1555
0
                        }
1556
1557
                        // No need to compare nVal against tNoDataValue as we
1558
                        // are in a case where pabyChunkNodataMask == nullptr
1559
                        // implies the absence of nodata value.
1560
0
                        pDstScanline[iDstPixel] = nVal;
1561
0
                        pSrcScanlineShifted += 2;
1562
0
                    }
1563
0
                }
1564
0
            }
1565
0
            else
1566
0
            {
1567
0
                const double dfBottomWeight =
1568
0
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1569
0
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
1570
0
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1571
0
                nSrcYOff -= nChunkYOff;
1572
0
                nSrcYOff2 -= nChunkYOff;
1573
1574
0
                double dfTotalWeightFullColumn = dfBottomWeight;
1575
0
                if (nSrcYOff + 1 < nSrcYOff2)
1576
0
                {
1577
0
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1578
0
                    dfTotalWeightFullColumn += dfTopWeight;
1579
0
                }
1580
1581
0
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1582
0
                {
1583
0
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1584
0
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1585
1586
0
                    double dfTotal = 0;
1587
0
                    double dfTotalWeight = 0;
1588
0
                    [[maybe_unused]] double dfMulFactor = 1.0;
1589
0
                    [[maybe_unused]] double dfInvMulFactor = 1.0;
1590
0
                    constexpr bool bUseMulFactor =
1591
0
                        (eWrkDataType == GDT_Float32 ||
1592
0
                         eWrkDataType == GDT_Float64);
1593
0
                    if (pabyChunkNodataMask == nullptr)
1594
0
                    {
1595
                        if constexpr (bUseMulFactor)
1596
0
                        {
1597
                            if constexpr (bQuadraticMean)
1598
0
                            {
1599
0
                                T mulFactor = 0;
1600
0
                                auto pChunkShifted =
1601
0
                                    pChunk +
1602
0
                                    static_cast<size_t>(nSrcYOff) * nChunkXSize;
1603
1604
0
                                for (int iY = nSrcYOff; iY < nSrcYOff2;
1605
0
                                     ++iY, pChunkShifted += nChunkXSize)
1606
0
                                {
1607
0
                                    for (int iX = nSrcXOff; iX < nSrcXOff2;
1608
0
                                         ++iX)
1609
0
                                        mulFactor = std::max(
1610
0
                                            mulFactor,
1611
0
                                            std::fabs(pChunkShifted[iX]));
1612
0
                                }
1613
0
                                dfMulFactor = double(mulFactor);
1614
0
                                dfInvMulFactor =
1615
0
                                    dfMulFactor > 0 &&
1616
0
                                            std::isfinite(dfMulFactor)
1617
0
                                        ? 1.0 / dfMulFactor
1618
0
                                        : 1.0;
1619
                            }
1620
                            else
1621
0
                            {
1622
0
                                dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1623
0
                                              (nSrcXOff2 - nSrcXOff);
1624
0
                                dfInvMulFactor = 1.0 / dfMulFactor;
1625
0
                            }
1626
0
                        }
1627
1628
0
                        auto pChunkShifted =
1629
0
                            pChunk +
1630
0
                            static_cast<size_t>(nSrcYOff) * nChunkXSize;
1631
0
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1632
0
                        double dfWeightY = dfBottomWeight;
1633
0
                        while (true)
1634
0
                        {
1635
0
                            double dfTotalLine;
1636
                            if constexpr (bQuadraticMean)
1637
0
                            {
1638
                                // Left pixel
1639
0
                                {
1640
0
                                    const T val = pChunkShifted[nSrcXOff];
1641
0
                                    dfTotalLine =
1642
0
                                        SQUARE(double(val) * dfInvMulFactor) *
1643
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1644
0
                                }
1645
1646
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1647
0
                                {
1648
                                    // Middle pixels
1649
0
                                    for (int iX = nSrcXOff + 1;
1650
0
                                         iX < nSrcXOff2 - 1; ++iX)
1651
0
                                    {
1652
0
                                        const T val = pChunkShifted[iX];
1653
0
                                        dfTotalLine += SQUARE(double(val) *
1654
0
                                                              dfInvMulFactor);
1655
0
                                    }
1656
1657
                                    // Right pixel
1658
0
                                    {
1659
0
                                        const T val =
1660
0
                                            pChunkShifted[nSrcXOff2 - 1];
1661
0
                                        dfTotalLine +=
1662
0
                                            SQUARE(double(val) *
1663
0
                                                   dfInvMulFactor) *
1664
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1665
0
                                    }
1666
0
                                }
1667
                            }
1668
                            else
1669
0
                            {
1670
                                // Left pixel
1671
0
                                {
1672
0
                                    const T val = pChunkShifted[nSrcXOff];
1673
0
                                    dfTotalLine =
1674
0
                                        double(val) * dfInvMulFactor *
1675
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1676
0
                                }
1677
1678
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1679
0
                                {
1680
                                    // Middle pixels
1681
0
                                    for (int iX = nSrcXOff + 1;
1682
0
                                         iX < nSrcXOff2 - 1; ++iX)
1683
0
                                    {
1684
0
                                        const T val = pChunkShifted[iX];
1685
0
                                        dfTotalLine +=
1686
0
                                            double(val) * dfInvMulFactor;
1687
0
                                    }
1688
1689
                                    // Right pixel
1690
0
                                    {
1691
0
                                        const T val =
1692
0
                                            pChunkShifted[nSrcXOff2 - 1];
1693
0
                                        dfTotalLine +=
1694
0
                                            double(val) * dfInvMulFactor *
1695
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1696
0
                                    }
1697
0
                                }
1698
0
                            }
1699
1700
0
                            dfTotal += dfTotalLine * dfWeightY;
1701
0
                            --nCounterY;
1702
0
                            if (nCounterY < 0)
1703
0
                                break;
1704
0
                            pChunkShifted += nChunkXSize;
1705
0
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1706
0
                        }
1707
1708
0
                        dfTotalWeight =
1709
0
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1710
0
                            dfTotalWeightFullColumn;
1711
0
                    }
1712
0
                    else
1713
0
                    {
1714
0
                        size_t nCount = 0;
1715
0
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1716
0
                        {
1717
0
                            const auto pChunkShifted =
1718
0
                                pChunk + static_cast<size_t>(iY) * nChunkXSize;
1719
1720
0
                            double dfTotalLine = 0;
1721
0
                            double dfTotalWeightLine = 0;
1722
                            // Left pixel
1723
0
                            {
1724
0
                                const int iX = nSrcXOff;
1725
0
                                const T val = pChunkShifted[iX];
1726
0
                                if (pabyChunkNodataMask
1727
0
                                        [iX +
1728
0
                                         static_cast<size_t>(iY) * nChunkXSize])
1729
0
                                {
1730
0
                                    nCount++;
1731
0
                                    const double dfWeightX =
1732
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1733
0
                                    dfTotalWeightLine = dfWeightX;
1734
                                    if constexpr (bQuadraticMean)
1735
0
                                        dfTotalLine =
1736
                                            SQUARE(double(val)) * dfWeightX;
1737
                                    else
1738
0
                                        dfTotalLine = double(val) * dfWeightX;
1739
0
                                }
1740
0
                            }
1741
1742
0
                            if (nSrcXOff < nSrcXOff2 - 1)
1743
0
                            {
1744
                                // Middle pixels
1745
0
                                for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1746
0
                                     ++iX)
1747
0
                                {
1748
0
                                    const T val = pChunkShifted[iX];
1749
0
                                    if (pabyChunkNodataMask
1750
0
                                            [iX + static_cast<size_t>(iY) *
1751
0
                                                      nChunkXSize])
1752
0
                                    {
1753
0
                                        nCount++;
1754
0
                                        dfTotalWeightLine += 1;
1755
                                        if constexpr (bQuadraticMean)
1756
0
                                            dfTotalLine += SQUARE(double(val));
1757
                                        else
1758
0
                                            dfTotalLine += double(val);
1759
0
                                    }
1760
0
                                }
1761
1762
                                // Right pixel
1763
0
                                {
1764
0
                                    const int iX = nSrcXOff2 - 1;
1765
0
                                    const T val = pChunkShifted[iX];
1766
0
                                    if (pabyChunkNodataMask
1767
0
                                            [iX + static_cast<size_t>(iY) *
1768
0
                                                      nChunkXSize])
1769
0
                                    {
1770
0
                                        nCount++;
1771
0
                                        const double dfWeightX =
1772
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1773
0
                                        dfTotalWeightLine += dfWeightX;
1774
                                        if constexpr (bQuadraticMean)
1775
0
                                            dfTotalLine +=
1776
                                                SQUARE(double(val)) * dfWeightX;
1777
                                        else
1778
0
                                            dfTotalLine +=
1779
0
                                                double(val) * dfWeightX;
1780
0
                                    }
1781
0
                                }
1782
0
                            }
1783
1784
0
                            const double dfWeightY =
1785
0
                                (iY == nSrcYOff)        ? dfBottomWeight
1786
0
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
1787
0
                                                        : 1.0;
1788
0
                            dfTotal += dfTotalLine * dfWeightY;
1789
0
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
1790
0
                        }
1791
1792
0
                        if (nCount == 0 ||
1793
0
                            (bPropagateNoData &&
1794
0
                             nCount <
1795
0
                                 static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1796
0
                                     (nSrcXOff2 - nSrcXOff)))
1797
0
                        {
1798
0
                            pDstScanline[iDstPixel] = tNoDataValue;
1799
0
                            continue;
1800
0
                        }
1801
0
                    }
1802
                    if constexpr (eWrkDataType == GDT_Byte)
1803
0
                    {
1804
0
                        T nVal;
1805
                        if constexpr (bQuadraticMean)
1806
0
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
1807
                                                             dfTotalWeight);
1808
                        else
1809
0
                            nVal =
1810
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1811
0
                        if (bHasNoData && nVal == tNoDataValue)
1812
0
                            nVal = tReplacementVal;
1813
0
                        pDstScanline[iDstPixel] = nVal;
1814
                    }
1815
                    else if constexpr (eWrkDataType == GDT_UInt16)
1816
0
                    {
1817
0
                        T nVal;
1818
                        if constexpr (bQuadraticMean)
1819
0
                            nVal = ComputeIntegerRMS<T, uint64_t>(
1820
                                dfTotal, dfTotalWeight);
1821
                        else
1822
0
                            nVal =
1823
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1824
0
                        if (bHasNoData && nVal == tNoDataValue)
1825
0
                            nVal = tReplacementVal;
1826
0
                        pDstScanline[iDstPixel] = nVal;
1827
                    }
1828
                    else
1829
0
                    {
1830
0
                        T nVal;
1831
                        if constexpr (bQuadraticMean)
1832
0
                        {
1833
                            if constexpr (bUseMulFactor)
1834
0
                                nVal = static_cast<T>(
1835
0
                                    dfMulFactor *
1836
                                    sqrt(dfTotal / dfTotalWeight));
1837
                            else
1838
                                nVal = static_cast<T>(
1839
                                    sqrt(dfTotal / dfTotalWeight));
1840
                        }
1841
                        else
1842
0
                        {
1843
                            if constexpr (bUseMulFactor)
1844
0
                                nVal = static_cast<T>(
1845
                                    dfMulFactor * (dfTotal / dfTotalWeight));
1846
                            else
1847
                                nVal = static_cast<T>(dfTotal / dfTotalWeight);
1848
0
                        }
1849
0
                        if (bHasNoData && nVal == tNoDataValue)
1850
0
                            nVal = tReplacementVal;
1851
0
                        pDstScanline[iDstPixel] = nVal;
1852
0
                    }
1853
0
                }
1854
0
            }
1855
0
        }
1856
0
        else
1857
0
        {
1858
0
            nSrcYOff -= nChunkYOff;
1859
0
            nSrcYOff2 -= nChunkYOff;
1860
1861
0
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1862
0
            {
1863
0
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1864
0
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1865
1866
0
                uint64_t nTotalR = 0;
1867
0
                uint64_t nTotalG = 0;
1868
0
                uint64_t nTotalB = 0;
1869
0
                size_t nCount = 0;
1870
1871
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1872
0
                {
1873
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1874
0
                    {
1875
0
                        const T val =
1876
0
                            pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1877
                        // cppcheck-suppress unsignedLessThanZero
1878
0
                        if (val < 0 || val >= colorEntries.size())
1879
0
                            continue;
1880
0
                        const size_t idx = static_cast<size_t>(val);
1881
0
                        const auto &entry = colorEntries[idx];
1882
0
                        if (entry.c4)
1883
0
                        {
1884
                            if constexpr (bQuadraticMean)
1885
0
                            {
1886
0
                                nTotalR += SQUARE<int>(entry.c1);
1887
0
                                nTotalG += SQUARE<int>(entry.c2);
1888
0
                                nTotalB += SQUARE<int>(entry.c3);
1889
0
                                ++nCount;
1890
                            }
1891
                            else
1892
0
                            {
1893
0
                                nTotalR += entry.c1;
1894
0
                                nTotalG += entry.c2;
1895
0
                                nTotalB += entry.c3;
1896
0
                                ++nCount;
1897
0
                            }
1898
0
                        }
1899
0
                    }
1900
0
                }
1901
1902
0
                if (nCount == 0 ||
1903
0
                    (bPropagateNoData &&
1904
0
                     nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1905
0
                                  (nSrcXOff2 - nSrcXOff)))
1906
0
                {
1907
0
                    pDstScanline[iDstPixel] = tNoDataValue;
1908
0
                }
1909
0
                else
1910
0
                {
1911
0
                    GDALColorEntry color;
1912
                    if constexpr (bQuadraticMean)
1913
0
                    {
1914
0
                        color.c1 =
1915
0
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1916
0
                        color.c2 =
1917
0
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1918
0
                        color.c3 =
1919
0
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1920
                    }
1921
                    else
1922
0
                    {
1923
0
                        color.c1 =
1924
0
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
1925
0
                        color.c2 =
1926
0
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
1927
0
                        color.c3 =
1928
0
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
1929
0
                    }
1930
0
                    pDstScanline[iDstPixel] =
1931
0
                        static_cast<T>(BestColorEntry(colorEntries, color));
1932
0
                }
1933
0
            }
1934
0
        }
1935
0
    }
1936
1937
0
    CPLFree(pasSrcX);
1938
1939
0
    return CE_None;
1940
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1, true>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, double, (GDALDataType)2, true>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6, true>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7, true>(GDALOverviewResampleArgs const&, double const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1, false>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, unsigned int, (GDALDataType)2, false>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6, false>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7, false>(GDALOverviewResampleArgs const&, double const*, void**)
1941
1942
template <bool bQuadraticMean>
1943
static CPLErr
1944
GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1945
                                       const void *pChunk, void **ppDstBuffer,
1946
                                       GDALDataType *peDstBufferDataType)
1947
0
{
1948
0
    *peDstBufferDataType = args.eWrkDataType;
1949
0
    switch (args.eWrkDataType)
1950
0
    {
1951
0
        case GDT_Byte:
1952
0
        {
1953
0
            return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte,
1954
0
                                                    bQuadraticMean>(
1955
0
                args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1956
0
        }
1957
1958
0
        case GDT_UInt16:
1959
0
        {
1960
            if constexpr (bQuadraticMean)
1961
0
            {
1962
                // Use double as accumulation type, because UInt32 could overflow
1963
0
                return GDALResampleChunk_AverageOrRMS_T<
1964
0
                    GUInt16, double, GDT_UInt16, bQuadraticMean>(
1965
0
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1966
            }
1967
            else
1968
0
            {
1969
0
                return GDALResampleChunk_AverageOrRMS_T<
1970
0
                    GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1971
0
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1972
0
            }
1973
0
        }
1974
1975
0
        case GDT_Float32:
1976
0
        {
1977
0
            return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1978
0
                                                    bQuadraticMean>(
1979
0
                args, static_cast<const float *>(pChunk), ppDstBuffer);
1980
0
        }
1981
1982
0
        case GDT_Float64:
1983
0
        {
1984
0
            return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1985
0
                                                    bQuadraticMean>(
1986
0
                args, static_cast<const double *>(pChunk), ppDstBuffer);
1987
0
        }
1988
1989
0
        default:
1990
0
            break;
1991
0
    }
1992
1993
0
    CPLAssert(false);
1994
0
    return CE_Failure;
1995
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMSInternal<true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMSInternal<false>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
1996
1997
/** Entry point for average and RMS chunk resampling.
 *
 * Picks the quadratic-mean instantiation of
 * GDALResampleChunk_AverageOrRMSInternal when EQUAL(args.pszResampling,
 * "RMS") holds, and the plain-average instantiation otherwise. All
 * arguments are forwarded untouched.
 *
 * @param args resampling arguments; args.pszResampling selects the variant.
 * @param pChunk source chunk, forwarded as is.
 * @param[out] ppDstBuffer forwarded as is.
 * @param[out] peDstBufferDataType forwarded as is.
 * @return the value returned by the selected instantiation.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    // Anything that is not "RMS" is handled as a regular average.
    if (!EQUAL(args.pszResampling, "RMS"))
    {
        return GDALResampleChunk_AverageOrRMSInternal<false>(
            args, pChunk, ppDstBuffer, peDstBufferDataType);
    }

    return GDALResampleChunk_AverageOrRMSInternal<true>(
        args, pChunk, ppDstBuffer, peDstBufferDataType);
}
2009
2010
/************************************************************************/
2011
/*                     GDALResampleChunk_Gauss()                        */
2012
/************************************************************************/
2013
2014
/** Resample a chunk with a binomial ("Gauss") weighted kernel.
 *
 * The source chunk is read as an array of double. The output buffer is
 * allocated here with VSI_MALLOC3_VERBOSE() as
 * (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) Float64 values and is not
 * freed by this function.
 *
 * The kernel size is chosen from the rounded vertical ratio
 * (int)(0.5 + dfYRatioDstToSrc): <= 2 gives 3x3, <= 4 gives 5x5, anything
 * larger gives 7x7. The same kernel dimension is used horizontally.
 *
 * Without a color table, each output value is the weighted mean of the
 * source values whose nodata-mask byte is non-zero (all values if there is
 * no mask). With a color table, the R/G/B components of the non-transparent
 * entries are averaged and the result is mapped back to a palette index
 * through BestColorEntry(). When no source pixel contributes, the nodata
 * value is written (0 when the band has neither a nodata value nor a
 * transparent palette entry).
 *
 * @param args                 Resampling parameters (window, ratios, nodata,
 *                             optional color table and nodata mask).
 * @param pChunk               Source chunk, interpreted as const double*.
 * @param ppDstBuffer          Receives the newly allocated output buffer.
 * @param peDstBufferDataType  Set to GDT_Float64 on success.
 * @return CE_None on success, CE_Failure if the output buffer cannot be
 *         allocated.
 */
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
                                      const void *pChunk, void **ppDstBuffer,
                                      GDALDataType *peDstBufferDataType)
{
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    const int nChunkXOff = args.nChunkXOff;
    const int nChunkXSize = args.nChunkXSize;
    const int nChunkYOff = args.nChunkYOff;
    const int nChunkYSize = args.nChunkYSize;
    const int nDstXOff = args.nDstXOff;
    const int nDstXOff2 = args.nDstXOff2;
    const int nDstYOff = args.nDstYOff;
    const int nDstYOff2 = args.nDstYOff2;
    const bool bHasNoData = args.bHasNoData;
    double dfNoDataValue = args.dfNoDataValue;
    const GDALColorTable *poColorTable = args.poColorTable;

    const double *const padfChunk = static_cast<const double *>(pChunk);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
                            GDALGetDataTypeSizeBytes(GDT_Float64));
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = GDT_Float64;
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);

    /* -------------------------------------------------------------------- */
    /*      Create the filter kernel and allocate scanline buffer.          */
    /* -------------------------------------------------------------------- */
    int nGaussMatrixDim = 3;
    const int *panGaussMatrix;
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
                                        16, 4, 1,  4,  6,  4, 1};
    constexpr int anGaussMatrix7x7[] = {
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};

    const int nOXSize = args.nOvrXSize;
    const int nOYSize = args.nOvrYSize;
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);

    // Pick the kernel from the (rounded) vertical downsampling factor.
    if (nResYFactor <= 2)
    {
        panGaussMatrix = anGaussMatrix3x3;
        nGaussMatrixDim = 3;
    }
    else if (nResYFactor <= 4)
    {
        panGaussMatrix = anGaussMatrix5x5;
        nGaussMatrixDim = 5;
    }
    else
    {
        panGaussMatrix = anGaussMatrix7x7;
        nGaussMatrixDim = 7;
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    // Work on a heap copy of the kernel so that memory checkers can detect
    // reads past its end.
    int *panGaussMatrixDup = static_cast<int *>(
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    memcpy(panGaussMatrixDup, panGaussMatrix,
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    panGaussMatrix = panGaussMatrixDup;
#endif

    if (!bHasNoData)
        dfNoDataValue = 0.0;

    std::vector<GDALColorEntry> colorEntries;
    int nTransparentIdx = -1;
    if (poColorTable)
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);

    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    // it as nodata value.
    if (bHasNoData && dfNoDataValue >= 0.0 &&
        dfNoDataValue < colorEntries.size())
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;

    // Or if we have no explicit nodata, but a color table entry that is
    // transparent, consider it as the nodata value.
    else if (!bHasNoData && nTransparentIdx >= 0)
    {
        dfNoDataValue = nTransparentIdx;
    }

    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    const int nDstXWidth = nDstXOff2 - nDstXOff;

    /* ==================================================================== */
    /*      Loop over destination scanlines.                                */
    /* ==================================================================== */
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    {
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
        int nSrcYOff2 =
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;

        if (nSrcYOff < nChunkYOff)
        {
            nSrcYOff = nChunkYOff;
            nSrcYOff2++;
        }

        // Center a window of nGaussMatrixDim rows on the source footprint.
        const int iSizeY = nSrcYOff2 - nSrcYOff;
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;

        if (nSrcYOff2 > nChunkBottomYOff ||
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
        {
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
        }

        // If the window starts above the chunk, skip the corresponding
        // kernel rows and start at the first chunk row.
        int nYShiftGaussMatrix = 0;
        if (nSrcYOff < nChunkYOff)
        {
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
            nSrcYOff = nChunkYOff;
        }

        // Widen before multiplying: row index * chunk width may not fit in
        // an int for very large chunks.
        const GPtrDiff_t nSrcScanlineOff =
            static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
        const double *const padfSrcScanline = padfChunk + nSrcScanlineOff;
        const GByte *pabySrcScanlineNodataMask = nullptr;
        if (pabyChunkNodataMask != nullptr)
            pabySrcScanlineNodataMask = pabyChunkNodataMask + nSrcScanlineOff;

        /* ---------------------------------------------------------------- */
        /*      Loop over destination pixels                                */
        /* ---------------------------------------------------------------- */
        double *const padfDstScanline =
            padfDstBuffer +
            static_cast<GPtrDiff_t>(iDstLine - nDstYOff) * nDstXWidth;
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
        {
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
            int nSrcXOff2 =
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;

            if (nSrcXOff < nChunkXOff)
            {
                nSrcXOff = nChunkXOff;
                nSrcXOff2++;
            }

            // Center a window of nGaussMatrixDim columns on the footprint.
            const int iSizeX = nSrcXOff2 - nSrcXOff;
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;

            if (nSrcXOff2 > nChunkRightXOff ||
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
            {
                nSrcXOff2 =
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
            }

            // Same as for rows: skip the kernel columns that fall left of
            // the chunk.
            int nXShiftGaussMatrix = 0;
            if (nSrcXOff < nChunkXOff)
            {
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
                nSrcXOff = nChunkXOff;
            }

            // First kernel weight that applies to (nSrcXOff, nSrcYOff).
            const int *const panFirstLineWeight =
                panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
                nXShiftGaussMatrix;

            if (poColorTable == nullptr)
            {
                double dfTotal = 0.0;
                GInt64 nCount = 0;
                const int *panLineWeight = panFirstLineWeight;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    // Offset of column 0 of row iY relative to
                    // padfSrcScanline.
                    const GPtrDiff_t nRowOff =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
                        nChunkXOff;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val = padfSrcScanline[iX + nRowOff];
                        if (pabySrcScanlineNodataMask == nullptr ||
                            pabySrcScanlineNodataMask[iX + nRowOff])
                        {
                            const int nWeight = panLineWeight[i];
                            dfTotal += val * nWeight;
                            nCount += nWeight;
                        }
                    }
                }

                if (nCount == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
                }
            }
            else
            {
                GInt64 nTotalR = 0;
                GInt64 nTotalG = 0;
                GInt64 nTotalB = 0;
                GInt64 nTotalWeight = 0;
                const int *panLineWeight = panFirstLineWeight;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    const GPtrDiff_t nRowOff =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
                        nChunkXOff;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val = padfSrcScanline[iX + nRowOff];
                        // Written as a negated in-range test so that NaN is
                        // rejected too: converting NaN to size_t below would
                        // be undefined behaviour.
                        if (!(val >= 0 && val < colorEntries.size()))
                            continue;

                        size_t idx = static_cast<size_t>(val);
                        // Entries with a zero c4 (alpha) do not contribute.
                        if (colorEntries[idx].c4)
                        {
                            const int nWeight = panLineWeight[i];
                            nTotalR +=
                                static_cast<GInt64>(colorEntries[idx].c1) *
                                nWeight;
                            nTotalG +=
                                static_cast<GInt64>(colorEntries[idx].c2) *
                                nWeight;
                            nTotalB +=
                                static_cast<GInt64>(colorEntries[idx].c3) *
                                nWeight;
                            nTotalWeight += nWeight;
                        }
                    }
                }

                if (nTotalWeight == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    GDALColorEntry color;

                    // Rounded integer division of the weighted sums.
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
                                                  nTotalWeight);
                    padfDstScanline[iDstPixel - nDstXOff] =
                        BestColorEntry(colorEntries, color);
                }
            }
        }
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    CPLFree(panGaussMatrixDup);
#endif

    return CE_None;
}
2299
2300
/************************************************************************/
2301
/*                      GDALResampleChunk_Mode()                        */
2302
/************************************************************************/
2303
2304
/** Tell whether two values count as the same value.
 *
 * Primary template: plain operator== comparison.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned char>(unsigned char, unsigned char)
Unexecuted instantiation: overview.cpp:bool IsSame<signed char>(signed char, signed char)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned short>(unsigned short, unsigned short)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned int>(unsigned int, unsigned int)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned long>(unsigned long, unsigned long)
2308
2309
/** GFloat16 flavour of IsSame(): equal values, or both NaN. */
template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
{
    if (a == b)
        return true;
    // NaN never compares equal to itself, so handle it explicitly.
    return CPLIsNan(a) && CPLIsNan(b);
}
2313
2314
/** float flavour of IsSame(): equal values, or both NaN. */
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    // NaN never compares equal to itself, so handle it explicitly.
    return std::isnan(a) && std::isnan(b);
}
2318
2319
/** double flavour of IsSame(): equal values, or both NaN. */
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    // NaN never compares equal to itself, so handle it explicitly.
    return std::isnan(a) && std::isnan(b);
}
2323
2324
namespace
2325
{
2326
struct ComplexFloat16
2327
{
2328
    GFloat16 r;
2329
    GFloat16 i;
2330
};
2331
}  // namespace
2332
2333
/** ComplexFloat16 flavour of IsSame().
 *
 * True when both components compare equal, or when all four components
 * (real and imaginary parts of both values) are NaN.
 */
template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
{
    if (a.r == b.r && a.i == b.i)
        return true;
    return CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i);
}
2338
2339
template <>
2340
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2341
0
{
2342
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2343
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2344
0
}
2345
2346
template <>
2347
bool IsSame<std::complex<double>>(std::complex<double> a,
2348
                                  std::complex<double> b)
2349
0
{
2350
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2351
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2352
0
}
2353
2354
template <class T>
2355
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2356
                                      const T *pChunk, T *const pDstBuffer)
2357
2358
0
{
2359
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2360
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2361
0
    const double dfSrcXDelta = args.dfSrcXDelta;
2362
0
    const double dfSrcYDelta = args.dfSrcYDelta;
2363
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2364
0
    const int nChunkXOff = args.nChunkXOff;
2365
0
    const int nChunkXSize = args.nChunkXSize;
2366
0
    const int nChunkYOff = args.nChunkYOff;
2367
0
    const int nChunkYSize = args.nChunkYSize;
2368
0
    const int nDstXOff = args.nDstXOff;
2369
0
    const int nDstXOff2 = args.nDstXOff2;
2370
0
    const int nDstYOff = args.nDstYOff;
2371
0
    const int nDstYOff2 = args.nDstYOff2;
2372
0
    const bool bHasNoData = args.bHasNoData;
2373
0
    const GDALColorTable *poColorTable = args.poColorTable;
2374
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
2375
2376
0
    T tNoDataValue;
2377
    if constexpr (std::is_same<T, ComplexFloat16>::value)
2378
0
    {
2379
0
        tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2380
0
        tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381
    }
2382
    else if constexpr (std::is_same<T, std::complex<float>>::value ||
2383
                       std::is_same<T, std::complex<double>>::value)
2384
0
    {
2385
0
        using BaseT = typename T::value_type;
2386
0
        tNoDataValue =
2387
0
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2388
0
                                std::numeric_limits<BaseT>::quiet_NaN());
2389
    }
2390
0
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2391
0
        tNoDataValue = 0;
2392
0
    else
2393
0
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
2394
2395
0
    using CountType = uint32_t;
2396
0
    CountType nMaxNumPx = 0;
2397
0
    T *paVals = nullptr;
2398
0
    CountType *panCounts = nullptr;
2399
2400
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2401
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2402
0
    std::vector<int> anVals(256, 0);
2403
2404
    /* ==================================================================== */
2405
    /*      Loop over destination scanlines.                                */
2406
    /* ==================================================================== */
2407
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2408
0
    {
2409
0
        const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2410
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2411
#ifdef only_pixels_with_more_than_10_pct_participation
2412
        // When oversampling, don't take into account pixels that have a tiny
2413
        // participation in the resulting pixel
2414
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2415
            nSrcYOff < nChunkBottomYOff)
2416
            nSrcYOff++;
2417
#endif
2418
0
        if (nSrcYOff < nChunkYOff)
2419
0
            nSrcYOff = nChunkYOff;
2420
2421
0
        const double dfSrcYOff2 =
2422
0
            dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2423
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2424
#ifdef only_pixels_with_more_than_10_pct_participation
2425
        // When oversampling, don't take into account pixels that have a tiny
2426
        // participation in the resulting pixel
2427
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2428
            nSrcYOff2 > nChunkYOff)
2429
            nSrcYOff2--;
2430
#endif
2431
0
        if (nSrcYOff2 == nSrcYOff)
2432
0
            ++nSrcYOff2;
2433
0
        if (nSrcYOff2 > nChunkBottomYOff)
2434
0
            nSrcYOff2 = nChunkBottomYOff;
2435
2436
0
        const T *const paSrcScanline =
2437
0
            pChunk +
2438
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2439
0
        const GByte *pabySrcScanlineNodataMask = nullptr;
2440
0
        if (pabyChunkNodataMask != nullptr)
2441
0
            pabySrcScanlineNodataMask =
2442
0
                pabyChunkNodataMask +
2443
0
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2444
2445
0
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2446
        /* --------------------------------------------------------------------
2447
         */
2448
        /*      Loop over destination pixels */
2449
        /* --------------------------------------------------------------------
2450
         */
2451
0
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2452
0
        {
2453
0
            const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2454
            // Apply some epsilon to avoid numerical precision issues
2455
0
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2456
#ifdef only_pixels_with_more_than_10_pct_participation
2457
            // When oversampling, don't take into account pixels that have a
2458
            // tiny participation in the resulting pixel
2459
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2460
                nSrcXOff < nChunkRightXOff)
2461
                nSrcXOff++;
2462
#endif
2463
0
            if (nSrcXOff < nChunkXOff)
2464
0
                nSrcXOff = nChunkXOff;
2465
2466
0
            const double dfSrcXOff2 =
2467
0
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2468
0
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2469
#ifdef only_pixels_with_more_than_10_pct_participation
2470
            // When oversampling, don't take into account pixels that have a
2471
            // tiny participation in the resulting pixel
2472
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2473
                nSrcXOff2 > nChunkXOff)
2474
                nSrcXOff2--;
2475
#endif
2476
0
            if (nSrcXOff2 == nSrcXOff)
2477
0
                nSrcXOff2++;
2478
0
            if (nSrcXOff2 > nChunkRightXOff)
2479
0
                nSrcXOff2 = nChunkRightXOff;
2480
2481
0
            bool bRegularProcessing = false;
2482
            if constexpr (!std::is_same<T, GByte>::value)
2483
0
                bRegularProcessing = true;
2484
0
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2485
0
                bRegularProcessing = true;
2486
2487
0
            if (bRegularProcessing)
2488
0
            {
2489
                // Sanity check to make sure the allocation of paVals and
2490
                // panCounts don't overflow.
2491
0
                static_assert(sizeof(CountType) <= sizeof(size_t));
2492
0
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2493
0
                    static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2494
0
                        (std::numeric_limits<CountType>::max() /
2495
0
                         std::max(sizeof(T), sizeof(CountType))) /
2496
0
                            static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2497
0
                {
2498
0
                    CPLError(CE_Failure, CPLE_NotSupported,
2499
0
                             "Too big downsampling factor");
2500
0
                    CPLFree(paVals);
2501
0
                    CPLFree(panCounts);
2502
0
                    return CE_Failure;
2503
0
                }
2504
0
                const CountType nNumPx =
2505
0
                    static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2506
0
                    (nSrcXOff2 - nSrcXOff);
2507
0
                CountType iMaxInd = 0;
2508
0
                CountType iMaxVal = 0;
2509
2510
0
                if (paVals == nullptr || nNumPx > nMaxNumPx)
2511
0
                {
2512
0
                    T *paValsNew = static_cast<T *>(
2513
0
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2514
0
                    CountType *panCountsNew =
2515
0
                        static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2516
0
                            panCounts, nNumPx * sizeof(CountType)));
2517
0
                    if (paValsNew != nullptr)
2518
0
                        paVals = paValsNew;
2519
0
                    if (panCountsNew != nullptr)
2520
0
                        panCounts = panCountsNew;
2521
0
                    if (paValsNew == nullptr || panCountsNew == nullptr)
2522
0
                    {
2523
0
                        CPLFree(paVals);
2524
0
                        CPLFree(panCounts);
2525
0
                        return CE_Failure;
2526
0
                    }
2527
0
                    nMaxNumPx = nNumPx;
2528
0
                }
2529
2530
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2531
0
                {
2532
0
                    const GPtrDiff_t iTotYOff =
2533
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2534
0
                        nChunkXOff;
2535
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2536
0
                    {
2537
0
                        if (pabySrcScanlineNodataMask == nullptr ||
2538
0
                            pabySrcScanlineNodataMask[iX + iTotYOff])
2539
0
                        {
2540
0
                            const T val = paSrcScanline[iX + iTotYOff];
2541
0
                            CountType i = 0;  // Used after for.
2542
2543
                            // Check array for existing entry.
2544
0
                            for (; i < iMaxInd; ++i)
2545
0
                            {
2546
0
                                if (IsSame(paVals[i], val))
2547
0
                                {
2548
0
                                    if (++panCounts[i] > panCounts[iMaxVal])
2549
0
                                    {
2550
0
                                        iMaxVal = i;
2551
0
                                    }
2552
0
                                    break;
2553
0
                                }
2554
0
                            }
2555
2556
                            // Add to arr if entry not already there.
2557
0
                            if (i == iMaxInd)
2558
0
                            {
2559
0
                                paVals[iMaxInd] = val;
2560
0
                                panCounts[iMaxInd] = 1;
2561
2562
0
                                if (iMaxInd == 0)
2563
0
                                {
2564
0
                                    iMaxVal = iMaxInd;
2565
0
                                }
2566
2567
0
                                ++iMaxInd;
2568
0
                            }
2569
0
                        }
2570
0
                    }
2571
0
                }
2572
2573
0
                if (iMaxInd == 0)
2574
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2575
0
                else
2576
0
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2577
0
            }
2578
            else if constexpr (std::is_same<T, GByte>::value)
2579
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2580
0
            {
2581
                // So we go here for a paletted or non-paletted byte band.
2582
                // The input values are then between 0 and 255.
2583
0
                int nMaxVal = 0;
2584
0
                int iMaxInd = -1;
2585
2586
                // The cost of this zeroing might be high. Perhaps we should
2587
                // just use the above generic case, and go to this one if the
2588
                // number of source pixels is large enough
2589
0
                std::fill(anVals.begin(), anVals.end(), 0);
2590
2591
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2592
0
                {
2593
0
                    const GPtrDiff_t iTotYOff =
2594
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2595
0
                        nChunkXOff;
2596
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2597
0
                    {
2598
0
                        const T val = paSrcScanline[iX + iTotYOff];
2599
0
                        if (!bHasNoData || val != tNoDataValue)
2600
0
                        {
2601
0
                            int nVal = static_cast<int>(val);
2602
0
                            if (++anVals[nVal] > nMaxVal)
2603
0
                            {
2604
                                // Sum the density.
2605
                                // Is it the most common value so far?
2606
0
                                iMaxInd = nVal;
2607
0
                                nMaxVal = anVals[nVal];
2608
0
                            }
2609
0
                        }
2610
0
                    }
2611
0
                }
2612
2613
0
                if (iMaxInd == -1)
2614
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2615
0
                else
2616
0
                    paDstScanline[iDstPixel - nDstXOff] =
2617
0
                        static_cast<T>(iMaxInd);
2618
0
            }
2619
0
        }
2620
0
    }
2621
2622
0
    CPLFree(paVals);
2623
0
    CPLFree(panCounts);
2624
2625
0
    return CE_None;
2626
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<signed char>(GDALOverviewResampleArgs const&, signed char const*, signed char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<cpl::Float16>(GDALOverviewResampleArgs const&, cpl::Float16 const*, cpl::Float16*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<float>(GDALOverviewResampleArgs const&, float const*, float*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<double>(GDALOverviewResampleArgs const&, double const*, double*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<(anonymous namespace)::ComplexFloat16>(GDALOverviewResampleArgs const&, (anonymous namespace)::ComplexFloat16 const*, (anonymous namespace)::ComplexFloat16*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<float> >(GDALOverviewResampleArgs const&, std::__1::complex<float> const*, std::__1::complex<float>*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>*)
2627
2628
// Entry point for "mode" resampling of one chunk.
//
// Allocates *ppDstBuffer with VSI_MALLOC3_VERBOSE, sized for
// (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) elements of
// args.eWrkDataType, stores args.eWrkDataType in *peDstBufferDataType and
// forwards to the GDALResampleChunk_ModeT<> instantiation matching the
// working data type.
//
// Returns CE_Failure when the allocation fails or when the working data type
// is GDT_Unknown / GDT_TypeCount; otherwise returns whatever the typed
// implementation returns. *ppDstBuffer is not released here on failure.
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;

    *ppDstBuffer = VSI_MALLOC3_VERBOSE(args.nDstXOff2 - args.nDstXOff,
                                       args.nDstYOff2 - args.nDstYOff,
                                       GDALGetDataTypeSizeBytes(eType));
    if (*ppDstBuffer == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    *peDstBufferDataType = eType;

    // Mode resampling performs no arithmetic on the samples, so integer
    // types of identical width share a single unsigned instantiation.
    // Byte, Int8 and the floating point / complex floating point types get
    // their own instantiation.
    switch (eType)
    {
        case GDT_Byte:
        {
            const auto *pSrc = static_cast<const GByte *>(pChunk);
            auto *pDst = static_cast<GByte *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_Int8:
        {
            const auto *pSrc = static_cast<const int8_t *>(pChunk);
            auto *pDst = static_cast<int8_t *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            const auto *pSrc = static_cast<const uint16_t *>(pChunk);
            auto *pDst = static_cast<uint16_t *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        // 4-byte elements
        case GDT_CInt16:
        case GDT_Int32:
        case GDT_UInt32:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            const auto *pSrc = static_cast<const uint32_t *>(pChunk);
            auto *pDst = static_cast<uint32_t *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        // 8-byte elements
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            const auto *pSrc = static_cast<const uint64_t *>(pChunk);
            auto *pDst = static_cast<uint64_t *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_Float16:
        {
            const auto *pSrc = static_cast<const GFloat16 *>(pChunk);
            auto *pDst = static_cast<GFloat16 *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_Float32:
        {
            const auto *pSrc = static_cast<const float *>(pChunk);
            auto *pDst = static_cast<float *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_Float64:
        {
            const auto *pSrc = static_cast<const double *>(pChunk);
            auto *pDst = static_cast<double *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_CFloat16:
        {
            const auto *pSrc = static_cast<const ComplexFloat16 *>(pChunk);
            auto *pDst = static_cast<ComplexFloat16 *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_CFloat32:
        {
            const auto *pSrc = static_cast<const std::complex<float> *>(pChunk);
            auto *pDst = static_cast<std::complex<float> *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_CFloat64:
        {
            const auto *pSrc =
                static_cast<const std::complex<double> *>(pChunk);
            auto *pDst = static_cast<std::complex<double> *>(*ppDstBuffer);
            return GDALResampleChunk_ModeT(args, pSrc, pDst);
        }

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Only reached for GDT_Unknown / GDT_TypeCount.
    CPLAssert(false);
    return CE_Failure;
}
2741
2742
/************************************************************************/
2743
/*                  GDALResampleConvolutionHorizontal()                 */
2744
/************************************************************************/
2745
2746
// Returns the sum over i in [0, nSrcPixelCount) of
// double(pChunk[i]) * padfWeights[i].
//
// Two partial sums are kept: one for samples 0-1 of each group of four (and
// for the trailing samples), one for samples 2-3. They are added at the end.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPixel = 0;  // Shared by the unrolled loop and the tail loop.
    // The manually unrolled loop is disabled for the Intel compiler:
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on it
    // in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    const int nUnrolledEnd = nSrcPixelCount - 3;
    while (iPixel < nUnrolledEnd)
    {
        dfSumA += double(pChunk[iPixel + 0]) * padfWeights[iPixel];
        dfSumA += double(pChunk[iPixel + 1]) * padfWeights[iPixel + 1];
        dfSumB += double(pChunk[iPixel + 2]) * padfWeights[iPixel + 2];
        dfSumB += double(pChunk[iPixel + 3]) * padfWeights[iPixel + 3];
        iPixel += 4;
    }
#endif
    // Remaining samples (all of them when the unrolled loop is compiled out).
    while (iPixel < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPixel]) * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumA + dfSumB;
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<float>(float const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<double>(double const*, double const*, int)
2772
2773
template <class T>
2774
static inline void GDALResampleConvolutionHorizontalWithMask(
2775
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2776
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2777
0
{
2778
0
    dfVal = 0;
2779
0
    dfWeightSum = 0;
2780
0
    int i = 0;
2781
0
    for (; i < nSrcPixelCount - 3; i += 4)
2782
0
    {
2783
0
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
2784
0
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2785
0
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2786
0
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2787
0
        dfVal += double(pChunk[i + 0]) * dfWeight0;
2788
0
        dfVal += double(pChunk[i + 1]) * dfWeight1;
2789
0
        dfVal += double(pChunk[i + 2]) * dfWeight2;
2790
0
        dfVal += double(pChunk[i + 3]) * dfWeight3;
2791
0
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2792
0
    }
2793
0
    for (; i < nSrcPixelCount; ++i)
2794
0
    {
2795
0
        const double dfWeight = padfWeights[i] * pabyMask[i];
2796
0
        dfVal += double(pChunk[i]) * dfWeight;
2797
0
        dfWeightSum += dfWeight;
2798
0
    }
2799
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<float>(float const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<double>(double const*, unsigned char const*, double const*, int, double&, double&)
2800
2801
// Applies the same nSrcPixelCount weights to three rows at once.
//
// dfRes1, dfRes2 and dfRes3 receive, for pChunkRow1, pChunkRow2 and
// pChunkRow3 respectively, the sum of double(row[i]) * padfWeights[i].
// Each row uses two partial sums (samples 0-1 plus the tail, and samples
// 2-3 of each group of four) that are added at the end.
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    const int nUnrolledEnd = nSrcPixelCount - 3;
    int iPixel = 0;
    while (iPixel < nUnrolledEnd)
    {
        dfRow1A += double(pChunkRow1[iPixel + 0]) * padfWeights[iPixel + 0];
        dfRow1A += double(pChunkRow1[iPixel + 1]) * padfWeights[iPixel + 1];
        dfRow1B += double(pChunkRow1[iPixel + 2]) * padfWeights[iPixel + 2];
        dfRow1B += double(pChunkRow1[iPixel + 3]) * padfWeights[iPixel + 3];

        dfRow2A += double(pChunkRow2[iPixel + 0]) * padfWeights[iPixel + 0];
        dfRow2A += double(pChunkRow2[iPixel + 1]) * padfWeights[iPixel + 1];
        dfRow2B += double(pChunkRow2[iPixel + 2]) * padfWeights[iPixel + 2];
        dfRow2B += double(pChunkRow2[iPixel + 3]) * padfWeights[iPixel + 3];

        dfRow3A += double(pChunkRow3[iPixel + 0]) * padfWeights[iPixel + 0];
        dfRow3A += double(pChunkRow3[iPixel + 1]) * padfWeights[iPixel + 1];
        dfRow3B += double(pChunkRow3[iPixel + 2]) * padfWeights[iPixel + 2];
        dfRow3B += double(pChunkRow3[iPixel + 3]) * padfWeights[iPixel + 3];

        iPixel += 4;
    }
    // Trailing samples go to the first partial sum of each row.
    while (iPixel < nSrcPixelCount)
    {
        dfRow1A += double(pChunkRow1[iPixel]) * padfWeights[iPixel];
        dfRow2A += double(pChunkRow2[iPixel]) * padfWeights[iPixel];
        dfRow3A += double(pChunkRow3[iPixel]) * padfWeights[iPixel];
        ++iPixel;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2839
2840
// Generic version of the "fewer than 8 source pixels" three-row helper:
// it simply forwards every argument to
// GDALResampleConvolutionHorizontal_3rows().
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
        padfWeights, nSrcPixelCount,         // shared weights
        dfRes1, dfRes2, dfRes3);             // one result per row
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2850
2851
// Generic version of the "exactly 4 source pixels" three-row helper:
// forwards to GDALResampleConvolutionHorizontal_3rows() with a pixel count
// fixed to 4.
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
        padfWeights, 4,                      // four weights, four pixels
        dfRes1, dfRes2, dfRes3);             // one result per row
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<float>(float const*, float const*, float const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<double>(double const*, double const*, double const*, double const*, double&, double&, double&)
2860
2861
/************************************************************************/
2862
/*                  GDALResampleConvolutionVertical()                   */
2863
/************************************************************************/
2864
2865
// Returns the sum over i in [0, nSrcLineCount) of
// pChunk[i * nStride] * padfWeights[i], i.e. a weighted sum down one column
// whose consecutive samples are nStride elements apart.
//
// Two partial sums are kept (lines 0-1 of each group of four plus the tail,
// and lines 2-3) and added at the end.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;

    const int nUnrolledEnd = nSrcLineCount - 3;
    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    while (iLine < nUnrolledEnd)
    {
        dfSumA += pChunk[nOffset + 0 * nStride] * padfWeights[iLine + 0];
        dfSumA += pChunk[nOffset + 1 * nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Trailing lines.
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2887
2888
// Vertical weighted sum for two adjacent columns at once.
//
// dfRes1 receives the sum of pChunk[i * nStride] * padfWeights[i] and
// dfRes2 the sum of pChunk[i * nStride + 1] * padfWeights[i], for i in
// [0, nSrcLineCount). Each column uses two partial sums (lines 0-1 of each
// group of four plus the tail, and lines 2-3) that are added at the end.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;

    const int nUnrolledEnd = nSrcLineCount - 3;
    int iLine = 0;
    size_t nOffset = 0;  // Element offset of line iLine within pChunk.
    while (iLine < nUnrolledEnd)
    {
        dfCol0A += pChunk[nOffset + 0 + 0 * nStride] * padfWeights[iLine + 0];
        dfCol1A += pChunk[nOffset + 1 + 0 * nStride] * padfWeights[iLine + 0];
        dfCol0A += pChunk[nOffset + 0 + 1 * nStride] * padfWeights[iLine + 1];
        dfCol1A += pChunk[nOffset + 1 + 1 * nStride] * padfWeights[iLine + 1];
        dfCol0B += pChunk[nOffset + 0 + 2 * nStride] * padfWeights[iLine + 2];
        dfCol1B += pChunk[nOffset + 1 + 2 * nStride] * padfWeights[iLine + 2];
        dfCol0B += pChunk[nOffset + 0 + 3 * nStride] * padfWeights[iLine + 3];
        dfCol1B += pChunk[nOffset + 1 + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Trailing lines.
    while (iLine < nSrcLineCount)
    {
        dfCol0A += pChunk[nOffset + 0] * padfWeights[iLine];
        dfCol1A += pChunk[nOffset + 1] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2918
2919
#ifdef USE_SSE2
2920
2921
#ifdef __AVX__
2922
/************************************************************************/
2923
/*             GDALResampleConvolutionVertical_16cols<T>                */
2924
/************************************************************************/
2925
2926
template <class T>
2927
static inline void
2928
GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2929
                                       const double *padfWeights,
2930
                                       int nSrcLineCount, float *afDest)
2931
{
2932
    int i = 0;
2933
    size_t j = 0;
2934
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2935
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2936
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2937
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2938
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2939
    {
2940
        XMMReg4Double w0 =
2941
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2942
        XMMReg4Double w1 =
2943
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2944
        XMMReg4Double w2 =
2945
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2946
        XMMReg4Double w3 =
2947
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2948
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2949
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2950
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2951
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2952
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2953
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2954
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2955
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2956
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2957
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2958
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2959
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2960
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2961
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2962
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2963
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2964
    }
2965
    for (; i < nSrcLineCount; ++i, j += nStride)
2966
    {
2967
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2968
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2969
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2970
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2971
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2972
    }
2973
    v_acc0.Store4Val(afDest);
2974
    v_acc1.Store4Val(afDest + 4);
2975
    v_acc2.Store4Val(afDest + 8);
2976
    v_acc3.Store4Val(afDest + 12);
2977
}
2978
2979
template <class T>
2980
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2981
                                                          const double *, int,
2982
                                                          double *)
2983
{
2984
    // Cannot be reached
2985
    CPLAssert(false);
2986
}
2987
2988
#else
2989
2990
/************************************************************************/
2991
/*              GDALResampleConvolutionVertical_8cols<T>                */
2992
/************************************************************************/
2993
2994
template <class T>
2995
static inline void
2996
GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2997
                                      const double *padfWeights,
2998
                                      int nSrcLineCount, float *afDest)
2999
0
{
3000
0
    int i = 0;
3001
0
    size_t j = 0;
3002
0
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3003
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3004
0
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3005
0
    {
3006
0
        XMMReg4Double w0 =
3007
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3008
0
        XMMReg4Double w1 =
3009
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3010
0
        XMMReg4Double w2 =
3011
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3012
0
        XMMReg4Double w3 =
3013
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3014
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3015
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3016
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3017
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3018
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3019
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3020
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3021
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3022
0
    }
3023
0
    for (; i < nSrcLineCount; ++i, j += nStride)
3024
0
    {
3025
0
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3026
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3027
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3028
0
    }
3029
0
    v_acc0.Store4Val(afDest);
3030
0
    v_acc1.Store4Val(afDest + 4);
3031
0
}
3032
3033
template <class T>
3034
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3035
                                                         const double *, int,
3036
                                                         double *)
3037
{
3038
    // Cannot be reached
3039
    CPLAssert(false);
3040
}
3041
3042
#endif  // __AVX__
3043
3044
/************************************************************************/
3045
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
3046
/************************************************************************/
3047
3048
template <class T>
3049
static inline double GDALResampleConvolutionHorizontalSSE2(
3050
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3051
0
{
3052
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3053
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3054
0
    int i = 0;  // Used after for.
3055
0
    for (; i < nSrcPixelCount - 7; i += 8)
3056
0
    {
3057
        // Retrieve the pixel & accumulate
3058
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3059
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3060
0
        const XMMReg4Double v_weight1 =
3061
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3062
0
        const XMMReg4Double v_weight2 =
3063
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3064
3065
0
        v_acc1 += v_pixels1 * v_weight1;
3066
0
        v_acc2 += v_pixels2 * v_weight2;
3067
0
    }
3068
3069
0
    v_acc1 += v_acc2;
3070
3071
0
    double dfVal = v_acc1.GetHorizSum();
3072
0
    for (; i < nSrcPixelCount; ++i)
3073
0
    {
3074
0
        dfVal += pChunk[i] * padfWeightsAligned[i];
3075
0
    }
3076
0
    return dfVal;
3077
0
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned char>(unsigned char const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned short>(unsigned short const*, double const*, int)
3078
3079
/************************************************************************/
3080
/*              GDALResampleConvolutionHorizontal<GByte>                */
3081
/************************************************************************/
3082
3083
template <>
3084
inline double GDALResampleConvolutionHorizontal<GByte>(
3085
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3086
0
{
3087
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3088
0
                                                 nSrcPixelCount);
3089
0
}
3090
3091
template <>
3092
inline double GDALResampleConvolutionHorizontal<GUInt16>(
3093
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3094
0
{
3095
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3096
0
                                                 nSrcPixelCount);
3097
0
}
3098
3099
/************************************************************************/
3100
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
3101
/************************************************************************/
3102
3103
template <class T>
3104
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3105
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3106
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3107
0
{
3108
0
    int i = 0;  // Used after for.
3109
0
    XMMReg4Double v_acc = XMMReg4Double::Zero();
3110
0
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3111
0
    for (; i < nSrcPixelCount - 3; i += 4)
3112
0
    {
3113
0
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3114
0
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3115
0
        XMMReg4Double v_weight =
3116
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3117
0
        v_weight *= v_mask;
3118
0
        v_acc += v_pixels * v_weight;
3119
0
        v_acc_weight += v_weight;
3120
0
    }
3121
3122
0
    dfVal = v_acc.GetHorizSum();
3123
0
    dfWeightSum = v_acc_weight.GetHorizSum();
3124
0
    for (; i < nSrcPixelCount; ++i)
3125
0
    {
3126
0
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3127
0
        dfVal += pChunk[i] * dfWeight;
3128
0
        dfWeightSum += dfWeight;
3129
0
    }
3130
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned char>(unsigned char const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned short>(unsigned short const*, unsigned char const*, double const*, int, double&, double&)
3131
3132
/************************************************************************/
3133
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
3134
/************************************************************************/
3135
3136
template <>
3137
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
3138
    const GByte *pChunk, const GByte *pabyMask,
3139
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3140
    double &dfWeightSum)
3141
0
{
3142
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
3143
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3144
0
        dfWeightSum);
3145
0
}
3146
3147
template <>
3148
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
3149
    const GUInt16 *pChunk, const GByte *pabyMask,
3150
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3151
    double &dfWeightSum)
3152
0
{
3153
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
3154
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3155
0
        dfWeightSum);
3156
0
}
3157
3158
/************************************************************************/
3159
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
3160
/************************************************************************/
3161
3162
template <class T>
3163
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3164
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3165
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3166
    double &dfRes2, double &dfRes3)
3167
0
{
3168
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3169
0
                  v_acc2 = XMMReg4Double::Zero(),
3170
0
                  v_acc3 = XMMReg4Double::Zero();
3171
0
    int i = 0;
3172
0
    for (; i < nSrcPixelCount - 7; i += 8)
3173
0
    {
3174
        // Retrieve the pixel & accumulate.
3175
0
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3176
0
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3177
0
        const XMMReg4Double v_weight1 =
3178
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3179
0
        const XMMReg4Double v_weight2 =
3180
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3181
3182
0
        v_acc1 += v_pixels1 * v_weight1;
3183
0
        v_acc1 += v_pixels2 * v_weight2;
3184
3185
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3186
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3187
0
        v_acc2 += v_pixels1 * v_weight1;
3188
0
        v_acc2 += v_pixels2 * v_weight2;
3189
3190
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3191
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3192
0
        v_acc3 += v_pixels1 * v_weight1;
3193
0
        v_acc3 += v_pixels2 * v_weight2;
3194
0
    }
3195
3196
0
    dfRes1 = v_acc1.GetHorizSum();
3197
0
    dfRes2 = v_acc2.GetHorizSum();
3198
0
    dfRes3 = v_acc3.GetHorizSum();
3199
0
    for (; i < nSrcPixelCount; ++i)
3200
0
    {
3201
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3202
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3203
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3204
0
    }
3205
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3206
3207
/************************************************************************/
3208
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
3209
/************************************************************************/
3210
3211
template <>
3212
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3213
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3214
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3215
    double &dfRes2, double &dfRes3)
3216
0
{
3217
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3218
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3219
0
        dfRes1, dfRes2, dfRes3);
3220
0
}
3221
3222
template <>
3223
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3224
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3225
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3226
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3227
0
{
3228
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3229
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3230
0
        dfRes1, dfRes2, dfRes3);
3231
0
}
3232
3233
/************************************************************************/
3234
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
3235
/************************************************************************/
3236
3237
template <class T>
3238
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3239
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3240
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3241
    double &dfRes2, double &dfRes3)
3242
0
{
3243
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3244
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3245
0
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3246
0
    int i = 0;  // Use after for.
3247
0
    for (; i < nSrcPixelCount - 3; i += 4)
3248
0
    {
3249
        // Retrieve the pixel & accumulate.
3250
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3251
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3252
0
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3253
0
        const XMMReg4Double v_weight =
3254
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3255
3256
0
        v_acc1 += v_pixels1 * v_weight;
3257
0
        v_acc2 += v_pixels2 * v_weight;
3258
0
        v_acc3 += v_pixels3 * v_weight;
3259
0
    }
3260
3261
0
    dfRes1 = v_acc1.GetHorizSum();
3262
0
    dfRes2 = v_acc2.GetHorizSum();
3263
0
    dfRes3 = v_acc3.GetHorizSum();
3264
3265
0
    for (; i < nSrcPixelCount; ++i)
3266
0
    {
3267
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3268
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3269
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3270
0
    }
3271
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3272
3273
/************************************************************************/
3274
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3275
/************************************************************************/
3276
3277
// GByte specialization (this section is closed by "#endif  // USE_SSE2").
// Forwards every argument, unchanged, to
// GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2().
//
// pChunkRow1..3:      first source sample of each of the three rows.
// padfWeightsAligned: convolution weights; the SSE2 helper reads them with
//                     XMMReg4Double::Load4ValAligned(), so presumably the
//                     buffer must meet that load's alignment requirement.
// nSrcPixelCount:     number of samples/weights consumed per row.
// dfRes1..3:          outputs, one weighted sum per row.
// NOTE(review): the "Less8" in the name suggests a small pixel count is
// expected; nothing here enforces it - confirm against the call site.
template <>
3278
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3279
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3280
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3281
    double &dfRes2, double &dfRes3)
3282
0
{
3283
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3284
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3285
0
        dfRes1, dfRes2, dfRes3);
3286
0
}
3287
3288
// GUInt16 specialization (this section is closed by "#endif  // USE_SSE2").
// Forwards every argument, unchanged, to
// GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2().
//
// pChunkRow1..3:      first source sample of each of the three rows.
// padfWeightsAligned: convolution weights; the SSE2 helper reads them with
//                     XMMReg4Double::Load4ValAligned(), so presumably the
//                     buffer must meet that load's alignment requirement.
// nSrcPixelCount:     number of samples/weights consumed per row.
// dfRes1..3:          outputs, one weighted sum per row.
template <>
3289
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3290
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3291
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3292
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3293
0
{
3294
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3295
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3296
0
        dfRes1, dfRes2, dfRes3);
3297
0
}
3298
3299
/************************************************************************/
3300
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3301
/************************************************************************/
3302
3303
template <class T>
3304
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3305
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3306
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3307
    double &dfRes3)
3308
0
{
3309
0
    const XMMReg4Double v_weight =
3310
0
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3311
3312
    // Retrieve the pixel & accumulate.
3313
0
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3314
0
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3315
0
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3316
3317
0
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3318
0
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3319
0
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3320
3321
0
    dfRes1 = v_acc1.GetHorizSum();
3322
0
    dfRes2 = v_acc2.GetHorizSum();
3323
0
    dfRes3 = v_acc3.GetHorizSum();
3324
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, double&, double&, double&)
3325
3326
/************************************************************************/
3327
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3328
/************************************************************************/
3329
3330
// GByte specialization (this section is closed by "#endif  // USE_SSE2").
// Forwards every argument, unchanged, to
// GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which loads 4
// samples from each row and 4 weights, and stores the per-row sum of
// products in dfRes1, dfRes2 and dfRes3.
//
// padfWeightsAligned is read by the helper with
// XMMReg4Double::Load4ValAligned(); presumably it must meet that load's
// alignment requirement.
template <>
3331
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3332
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3333
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3334
    double &dfRes3)
3335
0
{
3336
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3337
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3338
0
        dfRes3);
3339
0
}
3340
3341
// GUInt16 specialization (this section is closed by "#endif  // USE_SSE2").
// Forwards every argument, unchanged, to
// GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which loads 4
// samples from each row and 4 weights, and stores the per-row sum of
// products in dfRes1, dfRes2 and dfRes3.
//
// padfWeightsAligned is read by the helper with
// XMMReg4Double::Load4ValAligned(); presumably it must meet that load's
// alignment requirement.
template <>
3342
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3343
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3344
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3345
    double &dfRes2, double &dfRes3)
3346
0
{
3347
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3348
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3349
0
        dfRes3);
3350
0
}
3351
3352
#endif  // USE_SSE2
3353
3354
/************************************************************************/
3355
/*                    GDALResampleChunk_ConvolutionT()                  */
3356
/************************************************************************/
3357
3358
template <class T, class Twork, GDALDataType eWrkDataType,
3359
          bool bKernelWithNegativeWeights, bool bNeedRescale>
3360
static CPLErr GDALResampleChunk_ConvolutionT(
3361
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3362
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3363
    int nKernelRadius, float fMaxVal)
3364
3365
0
{
3366
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3367
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3368
0
    const double dfSrcXDelta = args.dfSrcXDelta;
3369
0
    const double dfSrcYDelta = args.dfSrcYDelta;
3370
0
    constexpr int nBands = 1;
3371
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3372
0
    const int nChunkXOff = args.nChunkXOff;
3373
0
    const int nChunkXSize = args.nChunkXSize;
3374
0
    const int nChunkYOff = args.nChunkYOff;
3375
0
    const int nChunkYSize = args.nChunkYSize;
3376
0
    const int nDstXOff = args.nDstXOff;
3377
0
    const int nDstXOff2 = args.nDstXOff2;
3378
0
    const int nDstYOff = args.nDstYOff;
3379
0
    const int nDstYOff2 = args.nDstYOff2;
3380
0
    const bool bHasNoData = args.bHasNoData;
3381
0
    double dfNoDataValue = args.dfNoDataValue;
3382
3383
0
    if (!bHasNoData)
3384
0
        dfNoDataValue = 0.0;
3385
0
    const auto dstDataType = args.eOvrDataType;
3386
0
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3387
0
    const double dfReplacementVal =
3388
0
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3389
0
                   : dfNoDataValue;
3390
    // cppcheck-suppress unreadVariable
3391
0
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3392
0
    const bool bNoDataValueInt64Valid =
3393
0
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3394
0
    const auto nNodataValueInt64 =
3395
0
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3396
0
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3397
3398
    // TODO: we should have some generic function to do this.
3399
0
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3400
0
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3401
0
    if (dstDataType == GDT_Byte)
3402
0
    {
3403
0
        fDstMin = std::numeric_limits<GByte>::min();
3404
0
        fDstMax = std::numeric_limits<GByte>::max();
3405
0
    }
3406
0
    else if (dstDataType == GDT_Int8)
3407
0
    {
3408
0
        fDstMin = std::numeric_limits<GInt8>::min();
3409
0
        fDstMax = std::numeric_limits<GInt8>::max();
3410
0
    }
3411
0
    else if (dstDataType == GDT_UInt16)
3412
0
    {
3413
0
        fDstMin = std::numeric_limits<GUInt16>::min();
3414
0
        fDstMax = std::numeric_limits<GUInt16>::max();
3415
0
    }
3416
0
    else if (dstDataType == GDT_Int16)
3417
0
    {
3418
0
        fDstMin = std::numeric_limits<GInt16>::min();
3419
0
        fDstMax = std::numeric_limits<GInt16>::max();
3420
0
    }
3421
0
    else if (dstDataType == GDT_UInt32)
3422
0
    {
3423
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3424
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3425
0
    }
3426
0
    else if (dstDataType == GDT_Int32)
3427
0
    {
3428
        // cppcheck-suppress unreadVariable
3429
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3430
        // cppcheck-suppress unreadVariable
3431
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3432
0
    }
3433
0
    else if (dstDataType == GDT_UInt64)
3434
0
    {
3435
        // cppcheck-suppress unreadVariable
3436
0
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3437
        // cppcheck-suppress unreadVariable
3438
        // (1 << 64) - 2048: largest uint64 value a double can hold
3439
0
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
3440
0
    }
3441
0
    else if (dstDataType == GDT_Int64)
3442
0
    {
3443
        // cppcheck-suppress unreadVariable
3444
0
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3445
        // cppcheck-suppress unreadVariable
3446
        // (1 << 63) - 1024: largest int64 that a double can hold
3447
0
        fDstMax = static_cast<Twork>(9223372036854774784LL);
3448
0
    }
3449
3450
0
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3451
0
                               bNoDataValueInt64Valid, nNodataValueInt64,
3452
0
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3453
0
    {
3454
0
        if (!bHasNoData)
3455
0
            return fVal;
3456
3457
        // Clamp value before comparing to nodata: this is only needed for
3458
        // kernels with negative weights (Lanczos)
3459
0
        Twork fClamped = fVal;
3460
0
        if (fClamped < fDstMin)
3461
0
            fClamped = fDstMin;
3462
0
        else if (fClamped > fDstMax)
3463
0
            fClamped = fDstMax;
3464
0
        if (isIntegerDT)
3465
0
        {
3466
0
            if (bNoDataValueInt64Valid)
3467
0
            {
3468
0
                const double fClampedRounded = double(std::round(fClamped));
3469
0
                if (fClampedRounded >=
3470
0
                        static_cast<double>(static_cast<Twork>(
3471
0
                            std::numeric_limits<int64_t>::min())) &&
3472
0
                    fClampedRounded <= static_cast<double>(static_cast<Twork>(
3473
0
                                           9223372036854774784LL)) &&
3474
0
                    nNodataValueInt64 ==
3475
0
                        static_cast<GInt64>(std::round(fClamped)))
3476
0
                {
3477
                    // Do not use the nodata value
3478
0
                    return static_cast<Twork>(dfReplacementVal);
3479
0
                }
3480
0
            }
3481
0
        }
3482
0
        else if (dfNoDataValue == static_cast<double>(fClamped))
3483
0
        {
3484
            // Do not use the nodata value
3485
0
            return static_cast<Twork>(dfReplacementVal);
3486
0
        }
3487
0
        return fClamped;
3488
0
    };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
3489
3490
    /* -------------------------------------------------------------------- */
3491
    /*      Allocate work buffers.                                          */
3492
    /* -------------------------------------------------------------------- */
3493
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
3494
0
    Twork *pafWrkScanline = nullptr;
3495
0
    if (dstDataType != eWrkDataType)
3496
0
    {
3497
0
        pafWrkScanline =
3498
0
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3499
0
        if (pafWrkScanline == nullptr)
3500
0
            return CE_Failure;
3501
0
    }
3502
3503
0
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3504
0
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3505
0
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3506
0
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3507
0
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3508
0
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3509
3510
    // Temporary array to store result of horizontal filter.
3511
0
    double *const padfHorizontalFiltered = static_cast<double *>(
3512
0
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3513
3514
    // To store convolution coefficients.
3515
0
    double *const padfWeights =
3516
0
        static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3517
0
            static_cast<int>(
3518
0
                2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3519
0
            sizeof(double)));
3520
3521
0
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3522
0
    if (pabyChunkNodataMask)
3523
0
        pabyChunkNodataMaskHorizontalFiltered =
3524
0
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3525
0
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3526
0
        (pabyChunkNodataMask != nullptr &&
3527
0
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3528
0
    {
3529
0
        VSIFree(pafWrkScanline);
3530
0
        VSIFree(padfHorizontalFiltered);
3531
0
        VSIFreeAligned(padfWeights);
3532
0
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3533
0
        return CE_Failure;
3534
0
    }
3535
3536
    /* ==================================================================== */
3537
    /*      First pass: horizontal filter                                   */
3538
    /* ==================================================================== */
3539
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3540
0
#ifdef USE_SSE2
3541
0
    const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3542
0
#endif
3543
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3544
0
    {
3545
0
        const double dfSrcPixel =
3546
0
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3547
0
        int nSrcPixelStart =
3548
0
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3549
0
        if (nSrcPixelStart < nChunkXOff)
3550
0
            nSrcPixelStart = nChunkXOff;
3551
0
        int nSrcPixelStop =
3552
0
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3553
0
        if (nSrcPixelStop > nChunkRightXOff)
3554
0
            nSrcPixelStop = nChunkRightXOff;
3555
#if 0
3556
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3557
        {
3558
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3559
        }
3560
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3561
        {
3562
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3563
        }
3564
#endif
3565
0
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3566
0
        double dfWeightSum = 0.0;
3567
3568
        // Compute convolution coefficients.
3569
0
        int nSrcPixel = nSrcPixelStart;
3570
0
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3571
0
        for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3572
0
        {
3573
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3574
0
            dfX += dfXScaleWeight;
3575
0
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3576
0
            dfX += dfXScaleWeight;
3577
0
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3578
0
            dfX += dfXScaleWeight;
3579
0
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3580
0
            dfX += dfXScaleWeight;
3581
0
            dfWeightSum +=
3582
0
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3583
0
        }
3584
0
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3585
0
        {
3586
0
            const double dfWeight = pfnFilterFunc(dfX);
3587
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3588
0
            dfWeightSum += dfWeight;
3589
0
        }
3590
3591
0
        const int nHeight = nChunkYSize * nBands;
3592
0
        if (pabyChunkNodataMask == nullptr)
3593
0
        {
3594
            // For floating-point data types, we must scale down a bit values
3595
            // if input values are close to +/- std::numeric_limits<T>::max()
3596
#ifdef OLD_CPPCHECK
3597
            constexpr double mulFactor = 1;
3598
#else
3599
0
            constexpr double mulFactor =
3600
0
                (bNeedRescale &&
3601
0
                 (std::is_same_v<T, float> || std::is_same_v<T, double>))
3602
0
                    ? 2
3603
0
                    : 1;
3604
0
#endif
3605
3606
0
            if (dfWeightSum != 0)
3607
0
            {
3608
0
                const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3609
0
                for (int i = 0; i < nSrcPixelCount; ++i)
3610
0
                {
3611
0
                    padfWeights[i] *= dfInvWeightSum;
3612
0
                }
3613
0
            }
3614
3615
0
            const auto ScaleValue = [
3616
#ifdef _MSC_VER
3617
                                        mulFactor
3618
#endif
3619
0
            ](double dfVal, [[maybe_unused]] const T *inputValues,
3620
0
                                    [[maybe_unused]] int nInputValues)
3621
0
            {
3622
0
                constexpr bool isFloat =
3623
0
                    std::is_same_v<T, float> || std::is_same_v<T, double>;
3624
                if constexpr (isFloat)
3625
0
                {
3626
0
                    if (std::isfinite(dfVal))
3627
0
                    {
3628
0
                        return std::clamp(dfVal,
3629
0
                                          -std::numeric_limits<double>::max() /
3630
0
                                              mulFactor,
3631
0
                                          std::numeric_limits<double>::max() /
3632
0
                                              mulFactor) *
3633
0
                               mulFactor;
3634
0
                    }
3635
                    else if constexpr (bKernelWithNegativeWeights)
3636
0
                    {
3637
0
                        if (std::isnan(dfVal))
3638
0
                        {
3639
                            // Either one of the input value is NaN or they are +/-Inf
3640
0
                            const bool isPositive = inputValues[0] >= 0;
3641
0
                            for (int i = 0; i < nInputValues; ++i)
3642
0
                            {
3643
0
                                if (std::isnan(inputValues[i]))
3644
0
                                    return dfVal;
3645
                                // cppcheck-suppress knownConditionTrueFalse
3646
0
                                if ((inputValues[i] >= 0) != isPositive)
3647
0
                                    return dfVal;
3648
0
                            }
3649
                            // All values are positive or negative infinity
3650
0
                            return static_cast<double>(inputValues[0]);
3651
0
                        }
3652
0
                    }
3653
0
                }
3654
0
                return dfVal;
3655
0
            };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
3656
3657
0
            int iSrcLineOff = 0;
3658
0
#ifdef USE_SSE2
3659
0
            if (nSrcPixelCount == 4)
3660
0
            {
3661
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3662
0
                {
3663
0
                    const size_t j =
3664
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3665
0
                        (nSrcPixelStart - nChunkXOff);
3666
0
                    double dfVal1 = 0.0;
3667
0
                    double dfVal2 = 0.0;
3668
0
                    double dfVal3 = 0.0;
3669
0
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
3670
0
                        pChunk + j, pChunk + j + nChunkXSize,
3671
0
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3672
0
                        dfVal2, dfVal3);
3673
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3674
0
                                               nDstXSize +
3675
0
                                           iDstPixel - nDstXOff] =
3676
0
                        ScaleValue(dfVal1, pChunk + j, 4);
3677
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3678
0
                                            1) *
3679
0
                                               nDstXSize +
3680
0
                                           iDstPixel - nDstXOff] =
3681
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3682
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3683
0
                                            2) *
3684
0
                                               nDstXSize +
3685
0
                                           iDstPixel - nDstXOff] =
3686
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3687
0
                }
3688
0
            }
3689
0
            else if (bSrcPixelCountLess8)
3690
0
            {
3691
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3692
0
                {
3693
0
                    const size_t j =
3694
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3695
0
                        (nSrcPixelStart - nChunkXOff);
3696
0
                    double dfVal1 = 0.0;
3697
0
                    double dfVal2 = 0.0;
3698
0
                    double dfVal3 = 0.0;
3699
0
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3700
0
                        pChunk + j, pChunk + j + nChunkXSize,
3701
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3702
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3703
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3704
0
                                               nDstXSize +
3705
0
                                           iDstPixel - nDstXOff] =
3706
0
                        ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3707
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3708
0
                                            1) *
3709
0
                                               nDstXSize +
3710
0
                                           iDstPixel - nDstXOff] =
3711
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3712
0
                                   nSrcPixelCount);
3713
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3714
0
                                            2) *
3715
0
                                               nDstXSize +
3716
0
                                           iDstPixel - nDstXOff] =
3717
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3718
0
                                   nSrcPixelCount);
3719
0
                }
3720
0
            }
3721
0
            else
3722
0
#endif
3723
0
            {
3724
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3725
0
                {
3726
0
                    const size_t j =
3727
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3728
0
                        (nSrcPixelStart - nChunkXOff);
3729
0
                    double dfVal1 = 0.0;
3730
0
                    double dfVal2 = 0.0;
3731
0
                    double dfVal3 = 0.0;
3732
0
                    GDALResampleConvolutionHorizontal_3rows(
3733
0
                        pChunk + j, pChunk + j + nChunkXSize,
3734
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3735
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3736
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3737
0
                                               nDstXSize +
3738
0
                                           iDstPixel - nDstXOff] =
3739
0
                        ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3740
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3741
0
                                            1) *
3742
0
                                               nDstXSize +
3743
0
                                           iDstPixel - nDstXOff] =
3744
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3745
0
                                   nSrcPixelCount);
3746
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3747
0
                                            2) *
3748
0
                                               nDstXSize +
3749
0
                                           iDstPixel - nDstXOff] =
3750
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3751
0
                                   nSrcPixelCount);
3752
0
                }
3753
0
            }
3754
0
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3755
0
            {
3756
0
                const size_t j =
3757
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3758
0
                    (nSrcPixelStart - nChunkXOff);
3759
0
                const double dfVal = GDALResampleConvolutionHorizontal(
3760
0
                    pChunk + j, padfWeights, nSrcPixelCount);
3761
0
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3762
0
                                           nDstXSize +
3763
0
                                       iDstPixel - nDstXOff] =
3764
0
                    ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3765
0
            }
3766
0
        }
3767
0
        else
3768
0
        {
3769
0
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3770
0
            {
3771
0
                const size_t j =
3772
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3773
0
                    (nSrcPixelStart - nChunkXOff);
3774
3775
0
                if (bKernelWithNegativeWeights)
3776
0
                {
3777
0
                    int nConsecutiveValid = 0;
3778
0
                    int nMaxConsecutiveValid = 0;
3779
0
                    for (int k = 0; k < nSrcPixelCount; k++)
3780
0
                    {
3781
0
                        if (pabyChunkNodataMask[j + k])
3782
0
                            nConsecutiveValid++;
3783
0
                        else if (nConsecutiveValid)
3784
0
                        {
3785
0
                            nMaxConsecutiveValid = std::max(
3786
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3787
0
                            nConsecutiveValid = 0;
3788
0
                        }
3789
0
                    }
3790
0
                    nMaxConsecutiveValid =
3791
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3792
0
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3793
0
                    {
3794
0
                        const size_t nTempOffset =
3795
0
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
3796
0
                            iDstPixel - nDstXOff;
3797
0
                        padfHorizontalFiltered[nTempOffset] = 0.0;
3798
0
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3799
0
                        continue;
3800
0
                    }
3801
0
                }
3802
3803
0
                double dfVal = 0.0;
3804
0
                GDALResampleConvolutionHorizontalWithMask(
3805
0
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
3806
0
                    nSrcPixelCount, dfVal, dfWeightSum);
3807
0
                const size_t nTempOffset =
3808
0
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3809
0
                    nDstXOff;
3810
0
                if (dfWeightSum > 0.0)
3811
0
                {
3812
0
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3813
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3814
0
                }
3815
0
                else
3816
0
                {
3817
0
                    padfHorizontalFiltered[nTempOffset] = 0.0;
3818
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3819
0
                }
3820
0
            }
3821
0
        }
3822
0
    }
3823
3824
    /* ==================================================================== */
3825
    /*      Second pass: vertical filter                                    */
3826
    /* ==================================================================== */
3827
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3828
3829
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3830
0
    {
3831
0
        Twork *const pafDstScanline =
3832
0
            pafWrkScanline
3833
0
                ? pafWrkScanline
3834
0
                : static_cast<Twork *>(pDstBuffer) +
3835
0
                      static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3836
3837
0
        const double dfSrcLine =
3838
0
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3839
0
        int nSrcLineStart =
3840
0
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3841
0
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3842
0
        if (nSrcLineStart < nChunkYOff)
3843
0
            nSrcLineStart = nChunkYOff;
3844
0
        if (nSrcLineStop > nChunkBottomYOff)
3845
0
            nSrcLineStop = nChunkBottomYOff;
3846
#if 0
3847
        if( nSrcLineStart < nChunkYOff &&
3848
            nChunkYOff > 0 )
3849
        {
3850
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3851
        }
3852
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3853
        {
3854
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3855
        }
3856
#endif
3857
0
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3858
0
        double dfWeightSum = 0.0;
3859
3860
        // Compute convolution coefficients.
3861
0
        int nSrcLine = nSrcLineStart;  // Used after for.
3862
0
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3863
0
        for (; nSrcLine < nSrcLineStop - 3;
3864
0
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3865
0
        {
3866
0
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
3867
0
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3868
0
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
3869
0
                dfY + 2 * dfYScaleWeight;
3870
0
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
3871
0
                dfY + 3 * dfYScaleWeight;
3872
0
            dfWeightSum +=
3873
0
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3874
0
        }
3875
0
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3876
0
        {
3877
0
            const double dfWeight = pfnFilterFunc(dfY);
3878
0
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3879
0
            dfWeightSum += dfWeight;
3880
0
        }
3881
3882
0
        if (pabyChunkNodataMask == nullptr)
3883
0
        {
3884
            // For floating-point data types, we must scale down a bit values
3885
            // if input values are close to +/- std::numeric_limits<T>::max()
3886
#ifdef OLD_CPPCHECK
3887
            constexpr double mulFactor = 1;
3888
#else
3889
0
            constexpr double mulFactor =
3890
0
                (bNeedRescale &&
3891
0
                 (std::is_same_v<T, float> || std::is_same_v<T, double>))
3892
0
                    ? 2
3893
0
                    : 1;
3894
0
#endif
3895
3896
0
            if (dfWeightSum != 0)
3897
0
            {
3898
0
                const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3899
0
                for (int i = 0; i < nSrcLineCount; ++i)
3900
0
                    padfWeights[i] *= dfInvWeightSum;
3901
0
            }
3902
3903
0
            int iFilteredPixelOff = 0;  // Used after for.
3904
            // j used after for.
3905
0
            size_t j =
3906
0
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3907
0
#ifdef USE_SSE2
3908
            if constexpr ((!bNeedRescale ||
3909
                           !std::is_same_v<T, float>)&&eWrkDataType ==
3910
                          GDT_Float32)
3911
0
            {
3912
#ifdef __AVX__
3913
                for (; iFilteredPixelOff < nDstXSize - 15;
3914
                     iFilteredPixelOff += 16, j += 16)
3915
                {
3916
                    GDALResampleConvolutionVertical_16cols(
3917
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3918
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3919
                    if (bHasNoData)
3920
                    {
3921
                        for (int k = 0; k < 16; k++)
3922
                        {
3923
                            pafDstScanline[iFilteredPixelOff + k] =
3924
                                replaceValIfNodata(
3925
                                    pafDstScanline[iFilteredPixelOff + k]);
3926
                        }
3927
                    }
3928
                }
3929
#else
3930
0
                for (; iFilteredPixelOff < nDstXSize - 7;
3931
0
                     iFilteredPixelOff += 8, j += 8)
3932
0
                {
3933
0
                    GDALResampleConvolutionVertical_8cols(
3934
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3935
0
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3936
0
                    if (bHasNoData)
3937
0
                    {
3938
0
                        for (int k = 0; k < 8; k++)
3939
0
                        {
3940
0
                            pafDstScanline[iFilteredPixelOff + k] =
3941
0
                                replaceValIfNodata(
3942
0
                                    pafDstScanline[iFilteredPixelOff + k]);
3943
0
                        }
3944
0
                    }
3945
0
                }
3946
0
#endif
3947
3948
0
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3949
0
                {
3950
0
                    const Twork fVal =
3951
0
                        static_cast<Twork>(GDALResampleConvolutionVertical(
3952
0
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
3953
0
                            nSrcLineCount));
3954
0
                    pafDstScanline[iFilteredPixelOff] =
3955
0
                        replaceValIfNodata(fVal);
3956
0
                }
3957
            }
3958
            else
3959
#endif
3960
0
            {
3961
0
                const auto ScaleValue = [
3962
#ifdef _MSC_VER
3963
                                            mulFactor
3964
#endif
3965
0
                ](double dfVal, [[maybe_unused]] const double *inputValues,
3966
0
                                        [[maybe_unused]] int nStride,
3967
0
                                        [[maybe_unused]] int nInputValues)
3968
0
                {
3969
0
                    constexpr bool isFloat =
3970
0
                        std::is_same_v<T, float> || std::is_same_v<T, double>;
3971
                    if constexpr (isFloat)
3972
0
                    {
3973
0
                        if (std::isfinite(dfVal))
3974
0
                        {
3975
0
                            return std::clamp(
3976
0
                                       dfVal,
3977
0
                                       static_cast<double>(
3978
0
                                           -std::numeric_limits<Twork>::max()) /
3979
0
                                           mulFactor,
3980
0
                                       static_cast<double>(
3981
0
                                           std::numeric_limits<Twork>::max()) /
3982
0
                                           mulFactor) *
3983
0
                                   mulFactor;
3984
0
                        }
3985
                        else if constexpr (bKernelWithNegativeWeights)
3986
0
                        {
3987
0
                            if (std::isnan(dfVal))
3988
0
                            {
3989
                                // Either one of the input value is NaN or they are +/-Inf
3990
0
                                const bool isPositive = inputValues[0] >= 0;
3991
0
                                for (int i = 0; i < nInputValues; ++i)
3992
0
                                {
3993
0
                                    if (std::isnan(inputValues[i * nStride]))
3994
0
                                        return dfVal;
3995
                                    // cppcheck-suppress knownConditionTrueFalse
3996
0
                                    if ((inputValues[i] >= 0) != isPositive)
3997
0
                                        return dfVal;
3998
0
                                }
3999
                                // All values are positive or negative infinity
4000
0
                                return inputValues[0];
4001
0
                            }
4002
0
                        }
4003
0
                    }
4004
4005
0
                    return dfVal;
4006
0
                };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
4007
4008
0
                for (; iFilteredPixelOff < nDstXSize - 1;
4009
0
                     iFilteredPixelOff += 2, j += 2)
4010
0
                {
4011
0
                    double dfVal1 = 0.0;
4012
0
                    double dfVal2 = 0.0;
4013
0
                    GDALResampleConvolutionVertical_2cols(
4014
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
4015
0
                        nSrcLineCount, dfVal1, dfVal2);
4016
0
                    pafDstScanline[iFilteredPixelOff] =
4017
0
                        replaceValIfNodata(static_cast<Twork>(
4018
0
                            ScaleValue(dfVal1, padfHorizontalFiltered + j,
4019
0
                                       nDstXSize, nSrcLineCount)));
4020
0
                    pafDstScanline[iFilteredPixelOff + 1] =
4021
0
                        replaceValIfNodata(static_cast<Twork>(
4022
0
                            ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4023
0
                                       nDstXSize, nSrcLineCount)));
4024
0
                }
4025
0
                if (iFilteredPixelOff < nDstXSize)
4026
0
                {
4027
0
                    const double dfVal = GDALResampleConvolutionVertical(
4028
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
4029
0
                        nSrcLineCount);
4030
0
                    pafDstScanline[iFilteredPixelOff] =
4031
0
                        replaceValIfNodata(static_cast<Twork>(
4032
0
                            ScaleValue(dfVal, padfHorizontalFiltered + j,
4033
0
                                       nDstXSize, nSrcLineCount)));
4034
0
                }
4035
0
            }
4036
0
        }
4037
0
        else
4038
0
        {
4039
0
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4040
0
                 ++iFilteredPixelOff)
4041
0
            {
4042
0
                double dfVal = 0.0;
4043
0
                dfWeightSum = 0.0;
4044
0
                size_t j = (nSrcLineStart - nChunkYOff) *
4045
0
                               static_cast<size_t>(nDstXSize) +
4046
0
                           iFilteredPixelOff;
4047
0
                if (bKernelWithNegativeWeights)
4048
0
                {
4049
0
                    int nConsecutiveValid = 0;
4050
0
                    int nMaxConsecutiveValid = 0;
4051
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4052
0
                    {
4053
0
                        const double dfWeight =
4054
0
                            padfWeights[i] *
4055
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
4056
0
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
4057
0
                        {
4058
0
                            nConsecutiveValid++;
4059
0
                        }
4060
0
                        else if (nConsecutiveValid)
4061
0
                        {
4062
0
                            nMaxConsecutiveValid = std::max(
4063
0
                                nMaxConsecutiveValid, nConsecutiveValid);
4064
0
                            nConsecutiveValid = 0;
4065
0
                        }
4066
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
4067
0
                        dfWeightSum += dfWeight;
4068
0
                    }
4069
0
                    nMaxConsecutiveValid =
4070
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
4071
0
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
4072
0
                    {
4073
0
                        pafDstScanline[iFilteredPixelOff] =
4074
0
                            static_cast<Twork>(dfNoDataValue);
4075
0
                        continue;
4076
0
                    }
4077
0
                }
4078
0
                else
4079
0
                {
4080
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4081
0
                    {
4082
0
                        const double dfWeight =
4083
0
                            padfWeights[i] *
4084
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
4085
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
4086
0
                        dfWeightSum += dfWeight;
4087
0
                    }
4088
0
                }
4089
0
                if (dfWeightSum > 0.0)
4090
0
                {
4091
0
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4092
0
                        static_cast<Twork>(dfVal / dfWeightSum));
4093
0
                }
4094
0
                else
4095
0
                {
4096
0
                    pafDstScanline[iFilteredPixelOff] =
4097
0
                        static_cast<Twork>(dfNoDataValue);
4098
0
                }
4099
0
            }
4100
0
        }
4101
4102
0
        if (fMaxVal != 0.0f)
4103
0
        {
4104
            if constexpr (std::is_same_v<T, double>)
4105
0
            {
4106
0
                for (int i = 0; i < nDstXSize; ++i)
4107
0
                {
4108
0
                    if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4109
0
                        pafDstScanline[i] = static_cast<double>(fMaxVal);
4110
0
                }
4111
            }
4112
            else
4113
0
            {
4114
0
                for (int i = 0; i < nDstXSize; ++i)
4115
0
                {
4116
0
                    if (pafDstScanline[i] > fMaxVal)
4117
0
                        pafDstScanline[i] = fMaxVal;
4118
0
                }
4119
0
            }
4120
0
        }
4121
4122
0
        if (pafWrkScanline)
4123
0
        {
4124
0
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4125
0
                            static_cast<GByte *>(pDstBuffer) +
4126
0
                                static_cast<size_t>(iDstLine - nDstYOff) *
4127
0
                                    nDstXSize * nDstDataTypeSize,
4128
0
                            dstDataType, nDstDataTypeSize, nDstXSize);
4129
0
        }
4130
0
    }
4131
4132
0
    VSIFree(pafWrkScanline);
4133
0
    VSIFreeAligned(padfWeights);
4134
0
    VSIFree(padfHorizontalFiltered);
4135
0
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4136
4137
0
    return CE_None;
4138
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
4139
4140
template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4141
static CPLErr
4142
GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4143
                                      const void *pChunk, void **ppDstBuffer,
4144
                                      GDALDataType *peDstBufferDataType)
4145
0
{
4146
0
    GDALResampleAlg eResample;
4147
0
    if (EQUAL(args.pszResampling, "BILINEAR"))
4148
0
        eResample = GRA_Bilinear;
4149
0
    else if (EQUAL(args.pszResampling, "CUBIC"))
4150
0
        eResample = GRA_Cubic;
4151
0
    else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4152
0
        eResample = GRA_CubicSpline;
4153
0
    else if (EQUAL(args.pszResampling, "LANCZOS"))
4154
0
        eResample = GRA_Lanczos;
4155
0
    else
4156
0
    {
4157
0
        CPLAssert(false);
4158
0
        return CE_Failure;
4159
0
    }
4160
0
    const int nKernelRadius = GWKGetFilterRadius(eResample);
4161
0
    FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4162
0
    const FilterFunc4ValuesType pfnFilterFunc4Values =
4163
0
        GWKGetFilterFunc4Values(eResample);
4164
4165
0
    float fMaxVal = 0.f;
4166
    // Cubic, etc... can have overshoots, so make sure we clamp values to the
4167
    // maximum value if NBITS is set.
4168
0
    if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4169
0
        (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
4170
0
         args.eOvrDataType == GDT_UInt32))
4171
0
    {
4172
0
        int nBits = args.nOvrNBITS;
4173
0
        if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4174
0
            nBits = 0;
4175
0
        if (nBits > 0 && nBits < 32)
4176
0
            fMaxVal = static_cast<float>((1U << nBits) - 1);
4177
0
    }
4178
4179
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4180
0
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4181
0
        GDALGetDataTypeSizeBytes(args.eOvrDataType));
4182
0
    if (*ppDstBuffer == nullptr)
4183
0
    {
4184
0
        return CE_Failure;
4185
0
    }
4186
0
    *peDstBufferDataType = args.eOvrDataType;
4187
4188
0
    switch (args.eWrkDataType)
4189
0
    {
4190
0
        case GDT_Byte:
4191
0
        {
4192
0
            return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4193
0
                                                  bKernelWithNegativeWeights,
4194
0
                                                  bNeedRescale>(
4195
0
                args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4196
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4197
0
        }
4198
4199
0
        case GDT_UInt16:
4200
0
        {
4201
0
            return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4202
0
                                                  bKernelWithNegativeWeights,
4203
0
                                                  bNeedRescale>(
4204
0
                args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4205
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4206
0
        }
4207
4208
0
        case GDT_Float32:
4209
0
        {
4210
0
            return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4211
0
                                                  bKernelWithNegativeWeights,
4212
0
                                                  bNeedRescale>(
4213
0
                args, static_cast<const float *>(pChunk), *ppDstBuffer,
4214
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4215
0
        }
4216
4217
0
        case GDT_Float64:
4218
0
        {
4219
0
            return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4220
0
                                                  bKernelWithNegativeWeights,
4221
0
                                                  bNeedRescale>(
4222
0
                args, static_cast<const double *>(pChunk), *ppDstBuffer,
4223
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4224
0
        }
4225
4226
0
        default:
4227
0
            break;
4228
0
    }
4229
4230
0
    CPLAssert(false);
4231
0
    return CE_Failure;
4232
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<true, true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<false, true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<false, false>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
4233
4234
/** Entry point for convolution-based overview resampling of one chunk.
 *
 * Chooses the GDALResampleChunk_ConvolutionInternal instantiation from the
 * resampling name:
 *  - CUBIC and LANCZOS: bKernelWithNegativeWeights = true,
 *    bNeedRescale = true;
 *  - CUBICSPLINE: bKernelWithNegativeWeights = false, bNeedRescale = true;
 *  - any other name: both flags false.
 *
 * All four arguments are passed through untouched and the callee's return
 * value is returned as is.
 */
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    const bool bCubicOrLanczos = EQUAL(args.pszResampling, "CUBIC") ||
                                 EQUAL(args.pszResampling, "LANCZOS");
    if (bCubicOrLanczos)
    {
        return GDALResampleChunk_ConvolutionInternal<
            /* bKernelWithNegativeWeights = */ true,
            /* bNeedRescale = */ true>(args, pChunk, ppDstBuffer,
                                       peDstBufferDataType);
    }

    if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    {
        return GDALResampleChunk_ConvolutionInternal<
            /* bKernelWithNegativeWeights = */ false,
            /* bNeedRescale = */ true>(args, pChunk, ppDstBuffer,
                                       peDstBufferDataType);
    }

    return GDALResampleChunk_ConvolutionInternal<
        /* bKernelWithNegativeWeights = */ false,
        /* bNeedRescale = */ false>(args, pChunk, ppDstBuffer,
                                    peDstBufferDataType);
}
4251
4252
/************************************************************************/
4253
/*                       GDALResampleChunkC32R()                        */
4254
/************************************************************************/
4255
4256
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4257
                                    const float *pafChunk, const int nChunkYOff,
4258
                                    const int nChunkYSize, const int nDstYOff,
4259
                                    const int nDstYOff2, const int nOvrXSize,
4260
                                    const int nOvrYSize, void **ppDstBuffer,
4261
                                    GDALDataType *peDstBufferDataType,
4262
                                    const char *pszResampling)
4263
4264
0
{
4265
0
    enum Method
4266
0
    {
4267
0
        NEAR,
4268
0
        AVERAGE,
4269
0
        AVERAGE_MAGPHASE,
4270
0
        RMS,
4271
0
    };
4272
4273
0
    Method eMethod = NEAR;
4274
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
4275
0
    {
4276
0
        eMethod = NEAR;
4277
0
    }
4278
0
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4279
0
    {
4280
0
        eMethod = AVERAGE_MAGPHASE;
4281
0
    }
4282
0
    else if (EQUAL(pszResampling, "RMS"))
4283
0
    {
4284
0
        eMethod = RMS;
4285
0
    }
4286
0
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
4287
0
    {
4288
0
        eMethod = AVERAGE;
4289
0
    }
4290
0
    else
4291
0
    {
4292
0
        CPLError(
4293
0
            CE_Failure, CPLE_NotSupported,
4294
0
            "Resampling method %s is not supported for complex data types. "
4295
0
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4296
0
            pszResampling);
4297
0
        return CE_Failure;
4298
0
    }
4299
4300
0
    const int nOXSize = nOvrXSize;
4301
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4302
0
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
4303
0
    if (*ppDstBuffer == nullptr)
4304
0
    {
4305
0
        return CE_Failure;
4306
0
    }
4307
0
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4308
0
    *peDstBufferDataType = GDT_CFloat32;
4309
4310
0
    const int nOYSize = nOvrYSize;
4311
0
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4312
0
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4313
4314
    /* ==================================================================== */
4315
    /*      Loop over destination scanlines.                                */
4316
    /* ==================================================================== */
4317
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4318
0
    {
4319
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4320
0
        if (nSrcYOff < nChunkYOff)
4321
0
            nSrcYOff = nChunkYOff;
4322
4323
0
        int nSrcYOff2 =
4324
0
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4325
0
        if (nSrcYOff2 == nSrcYOff)
4326
0
            nSrcYOff2++;
4327
4328
0
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4329
0
        {
4330
0
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4331
0
                nSrcYOff = nSrcHeight - 1;
4332
0
            nSrcYOff2 = nSrcHeight;
4333
0
        }
4334
0
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4335
0
            nSrcYOff2 = nChunkYOff + nChunkYSize;
4336
4337
0
        const float *const pafSrcScanline =
4338
0
            pafChunk +
4339
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4340
0
        float *const pafDstScanline =
4341
0
            pafDstBuffer +
4342
0
            static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4343
4344
        /* --------------------------------------------------------------------
4345
         */
4346
        /*      Loop over destination pixels */
4347
        /* --------------------------------------------------------------------
4348
         */
4349
0
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4350
0
        {
4351
0
            const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4352
0
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4353
0
            int nSrcXOff2 =
4354
0
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4355
0
            if (nSrcXOff2 == nSrcXOff)
4356
0
                nSrcXOff2++;
4357
0
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4358
0
            {
4359
0
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4360
0
                    nSrcXOff = nSrcWidth - 1;
4361
0
                nSrcXOff2 = nSrcWidth;
4362
0
            }
4363
0
            const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4364
4365
0
            if (eMethod == NEAR)
4366
0
            {
4367
0
                pafDstScanline[iDstPixelSZ * 2] =
4368
0
                    pafSrcScanline[nSrcXOffSZ * 2];
4369
0
                pafDstScanline[iDstPixelSZ * 2 + 1] =
4370
0
                    pafSrcScanline[nSrcXOffSZ * 2 + 1];
4371
0
            }
4372
0
            else if (eMethod == AVERAGE_MAGPHASE)
4373
0
            {
4374
0
                double dfTotalR = 0.0;
4375
0
                double dfTotalI = 0.0;
4376
0
                double dfTotalM = 0.0;
4377
0
                size_t nCount = 0;
4378
4379
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4380
0
                {
4381
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4382
0
                    {
4383
0
                        const double dfR = double(
4384
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4385
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4386
0
                                               nSrcWidth * 2]);
4387
0
                        const double dfI = double(
4388
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4389
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4390
0
                                               nSrcWidth * 2 +
4391
0
                                           1]);
4392
0
                        dfTotalR += dfR;
4393
0
                        dfTotalI += dfI;
4394
0
                        dfTotalM += std::hypot(dfR, dfI);
4395
0
                        ++nCount;
4396
0
                    }
4397
0
                }
4398
4399
0
                CPLAssert(nCount > 0);
4400
0
                if (nCount == 0)
4401
0
                {
4402
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4403
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4404
0
                }
4405
0
                else
4406
0
                {
4407
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4408
0
                        dfTotalR / static_cast<double>(nCount));
4409
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4410
0
                        dfTotalI / static_cast<double>(nCount));
4411
0
                    const double dfM =
4412
0
                        double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4413
0
                                          pafDstScanline[iDstPixelSZ * 2 + 1]));
4414
0
                    const double dfDesiredM =
4415
0
                        dfTotalM / static_cast<double>(nCount);
4416
0
                    double dfRatio = 1.0;
4417
0
                    if (dfM != 0.0)
4418
0
                        dfRatio = dfDesiredM / dfM;
4419
4420
0
                    pafDstScanline[iDstPixelSZ * 2] *=
4421
0
                        static_cast<float>(dfRatio);
4422
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] *=
4423
0
                        static_cast<float>(dfRatio);
4424
0
                }
4425
0
            }
4426
0
            else if (eMethod == RMS)
4427
0
            {
4428
0
                double dfTotalR = 0.0;
4429
0
                double dfTotalI = 0.0;
4430
0
                size_t nCount = 0;
4431
4432
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4433
0
                {
4434
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4435
0
                    {
4436
0
                        const double dfR = double(
4437
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4438
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4439
0
                                               nSrcWidth * 2]);
4440
0
                        const double dfI = double(
4441
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4442
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4443
0
                                               nSrcWidth * 2 +
4444
0
                                           1]);
4445
4446
0
                        dfTotalR += SQUARE(dfR);
4447
0
                        dfTotalI += SQUARE(dfI);
4448
4449
0
                        ++nCount;
4450
0
                    }
4451
0
                }
4452
4453
0
                CPLAssert(nCount > 0);
4454
0
                if (nCount == 0)
4455
0
                {
4456
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4457
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4458
0
                }
4459
0
                else
4460
0
                {
4461
                    /* compute RMS */
4462
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4463
0
                        sqrt(dfTotalR / static_cast<double>(nCount)));
4464
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4465
0
                        sqrt(dfTotalI / static_cast<double>(nCount)));
4466
0
                }
4467
0
            }
4468
0
            else if (eMethod == AVERAGE)
4469
0
            {
4470
0
                double dfTotalR = 0.0;
4471
0
                double dfTotalI = 0.0;
4472
0
                size_t nCount = 0;
4473
4474
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4475
0
                {
4476
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4477
0
                    {
4478
                        // TODO(schwehr): Maybe use std::complex?
4479
0
                        dfTotalR += double(
4480
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4481
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4482
0
                                               nSrcWidth * 2]);
4483
0
                        dfTotalI += double(
4484
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4485
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4486
0
                                               nSrcWidth * 2 +
4487
0
                                           1]);
4488
0
                        ++nCount;
4489
0
                    }
4490
0
                }
4491
4492
0
                CPLAssert(nCount > 0);
4493
0
                if (nCount == 0)
4494
0
                {
4495
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4496
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4497
0
                }
4498
0
                else
4499
0
                {
4500
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4501
0
                        dfTotalR / static_cast<double>(nCount));
4502
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4503
0
                        dfTotalI / static_cast<double>(nCount));
4504
0
                }
4505
0
            }
4506
0
        }
4507
0
    }
4508
4509
0
    return CE_None;
4510
0
}
4511
4512
/************************************************************************/
4513
/*                  GDALRegenerateCascadingOverviews()                  */
4514
/*                                                                      */
4515
/*      Generate a list of overviews in order from largest to           */
4516
/*      smallest, computing each from the next larger.                  */
4517
/************************************************************************/
4518
4519
/**
 * Regenerate several overview levels in cascade.
 *
 * The overview bands are first reordered in place, from the largest to the
 * smallest pixel count. The largest one is then computed from poSrcBand, and
 * each following one from the overview that precedes it in that order, by
 * calling GDALRegenerateOverviewsEx() with a single target band each time.
 *
 * @param poSrcBand source (base level) band.
 * @param nOverviews number of entries in papoOvrBands.
 * @param papoOvrBands overview bands to generate. The array is sorted in
 * place by decreasing pixel count.
 * @param pszResampling resampling method name. When it starts with
 * "AVERAGE_BIT2G", it is only used as such for the first level, and "AVERAGE"
 * is used for the following ones.
 * @param pfnProgress progress callback, scaled per level proportionally to
 * the pixel count of that level.
 * @param pProgressData user data for pfnProgress.
 * @param papszOptions options forwarded to GDALRegenerateOverviewsEx().
 * @return CE_None on success, or the first error returned by
 * GDALRegenerateOverviewsEx().
 */
static CPLErr GDALRegenerateCascadingOverviews(
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)

{
    // Exact pixel count of a band. A 64-bit integer is used on purpose: a
    // float product only keeps 24 significant bits, so two large overviews
    // of nearly identical size could compare as equal and be left in the
    // wrong order.
    const auto GetPixelCount = [](GDALRasterBand *poBand)
    {
        return static_cast<int64_t>(poBand->GetXSize()) *
               static_cast<int64_t>(poBand->GetYSize());
    };

    /* -------------------------------------------------------------------- */
    /*      First, we must put the overviews in order from largest to       */
    /*      smallest.                                                       */
    /* -------------------------------------------------------------------- */
    if (nOverviews > 1)
    {
        // Stable sort, so that bands of identical size keep their relative
        // order, as with the adjacent-swap sort previously used here.
        std::stable_sort(papoOvrBands, papoOvrBands + nOverviews,
                         [&GetPixelCount](GDALRasterBand *poA,
                                          GDALRasterBand *poB)
                         { return GetPixelCount(poA) > GetPixelCount(poB); });
    }

    /* -------------------------------------------------------------------- */
    /*      Count total pixels so we can prepare appropriate scaled         */
    /*      progress functions.                                             */
    /* -------------------------------------------------------------------- */
    double dfTotalPixels = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        dfTotalPixels += static_cast<double>(GetPixelCount(papoOvrBands[i]));
    }

    /* -------------------------------------------------------------------- */
    /*      Generate all the bands.                                         */
    /* -------------------------------------------------------------------- */
    double dfPixelsProcessed = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        // The first (largest) overview is computed from the source band,
        // the others from the overview generated just before.
        GDALRasterBand *poBaseBand =
            (i == 0) ? poSrcBand : papoOvrBands[i - 1];

        const double dfPixels =
            static_cast<double>(GetPixelCount(papoOvrBands[i]));

        // Map the progress of this level onto its share of the whole job.
        void *pScaledProgressData = GDALCreateScaledProgress(
            dfPixelsProcessed / dfTotalPixels,
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
            pProgressData);

        const CPLErr eErr = GDALRegenerateOverviewsEx(
            poBaseBand, 1,
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
            pszResampling, GDALScaledProgress, pScaledProgressData,
            papszOptions);
        GDALDestroyScaledProgress(pScaledProgressData);

        if (eErr != CE_None)
            return eErr;

        dfPixelsProcessed += dfPixels;

        // Only do the bit2grayscale promotion on the base band.
        if (STARTS_WITH_CI(pszResampling,
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
            pszResampling = "AVERAGE";
    }

    return CE_None;
}
4596
4597
/************************************************************************/
4598
/*                    GDALGetResampleFunction()                         */
4599
/************************************************************************/
4600
4601
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
                                             int *pnRadius)
{
    // Map a resampling method name to the chunk resampling function that
    // implements it. When pnRadius is not null, it receives the kernel
    // radius of the method (0 unless set otherwise below).
    if (pnRadius != nullptr)
        *pnRadius = 0;

    // Shared exit for the kernels handled by GDALResampleChunk_Convolution:
    // the radius is only queried when the caller asked for it.
    const auto UseConvolution = [pnRadius](auto eKernel) -> GDALResampleFunction
    {
        if (pnRadius != nullptr)
            *pnRadius = GWKGetFilterRadius(eKernel);
        return GDALResampleChunk_Convolution;
    };

    // Prefix matches: any name starting with NEAR or AVER.
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
        return GDALResampleChunk_Near;

    if (STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS"))
        return GDALResampleChunk_AverageOrRMS;

    if (EQUAL(pszResampling, "GAUSS"))
    {
        if (pnRadius != nullptr)
            *pnRadius = 1;
        return GDALResampleChunk_Gauss;
    }

    if (EQUAL(pszResampling, "MODE"))
        return GDALResampleChunk_Mode;

    if (EQUAL(pszResampling, "CUBIC"))
        return UseConvolution(GRA_Cubic);

    if (EQUAL(pszResampling, "CUBICSPLINE"))
        return UseConvolution(GRA_CubicSpline);

    if (EQUAL(pszResampling, "LANCZOS"))
        return UseConvolution(GRA_Lanczos);

    if (EQUAL(pszResampling, "BILINEAR"))
        return UseConvolution(GRA_Bilinear);

    // Nothing matched: report it and let the caller deal with a null
    // function pointer.
    CPLError(CE_Failure, CPLE_AppDefined,
             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
             pszResampling);
    return nullptr;
}
4652
4653
/************************************************************************/
4654
/*                      GDALGetOvrWorkDataType()                        */
4655
/************************************************************************/
4656
4657
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
                                    GDALDataType eSrcDataType)
{
    // Pick the data type to work with for a given resampling method and
    // source data type.

    // NEAR* and MODE always keep the source type as is.
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
        return eSrcDataType;

    // Byte and UInt16 sources are kept in their own type for the averaging
    // and convolution-style methods.
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_UInt16)
    {
        const bool bKeepSourceType = STARTS_WITH_CI(pszResampling, "AVER") ||
                                     EQUAL(pszResampling, "RMS") ||
                                     EQUAL(pszResampling, "CUBIC") ||
                                     EQUAL(pszResampling, "CUBICSPLINE") ||
                                     EQUAL(pszResampling, "LANCZOS") ||
                                     EQUAL(pszResampling, "BILINEAR");
        if (bKeepSourceType)
            return eSrcDataType;
    }

    // GAUSS always works in double precision.
    if (EQUAL(pszResampling, "GAUSS"))
        return GDT_Float64;

    // Otherwise: Float32 for the source types listed here, Float64 for
    // everything else.
    const bool bUseFloat32 =
        eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
        eSrcDataType == GDT_Float32;
    return bUseFloat32 ? GDT_Float32 : GDT_Float64;
}
4693
4694
namespace
4695
{
4696
// Structure to hold a pointer to free with CPLFree()
4697
struct PointerHolder
4698
{
4699
    void *ptr = nullptr;
4700
4701
0
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4702
0
    {
4703
0
    }
4704
4705
    ~PointerHolder()
4706
0
    {
4707
0
        CPLFree(ptr);
4708
0
    }
4709
4710
    PointerHolder(const PointerHolder &) = delete;
4711
    PointerHolder &operator=(const PointerHolder &) = delete;
4712
};
4713
}  // namespace
4714
4715
/************************************************************************/
4716
/*                      GDALRegenerateOverviews()                       */
4717
/************************************************************************/
4718
4719
/**
4720
 * \brief Generate downsampled overviews.
4721
 *
4722
 * This function will generate one or more overview images from a base image
4723
 * using the requested downsampling algorithm.  Its primary use is for
4724
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4725
 * used to generate downsampled images in one file from another outside the
4726
 * overview architecture.
4727
 *
4728
 * The output bands need to exist in advance.
4729
 *
4730
 * The full set of resampling algorithms is documented in
4731
 * GDALDataset::BuildOverviews().
4732
 *
4733
 * This function will honour properly NODATA_VALUES tuples (special dataset
4734
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4735
 * considered as the nodata value and not each value of the triplet
4736
 * independently per band.
4737
 *
4738
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4739
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4740
 * overview computation.
4741
 *
4742
 * @param hSrcBand the source (base level) band.
4743
 * @param nOverviewCount the number of downsampled bands being generated.
4744
 * @param pahOvrBands the list of downsampled bands to be generated.
4745
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4746
 * @param pfnProgress progress report function.
4747
 * @param pProgressData progress function callback data.
4748
 * @return CE_None on success or CE_Failure on failure.
4749
 */
4750
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
                               GDALRasterBandH *pahOvrBands,
                               const char *pszResampling,
                               GDALProgressFunc pfnProgress,
                               void *pProgressData)

{
    // Same as GDALRegenerateOverviewsEx(), called without an option list.
    const CPLErr eErr = GDALRegenerateOverviewsEx(
        hSrcBand, nOverviewCount, pahOvrBands, pszResampling, pfnProgress,
        pProgressData, /* papszOptions = */ nullptr);
    return eErr;
}
4761
4762
/************************************************************************/
4763
/*                     GDALRegenerateOverviewsEx()                      */
4764
/************************************************************************/
4765
4766
// Multiplier turning a radius into a diameter (diameter = 2 * radius).
constexpr int RADIUS_TO_DIAMETER = 2;
4767
4768
/**
4769
 * \brief Generate downsampled overviews.
4770
 *
4771
 * This function will generate one or more overview images from a base image
4772
 * using the requested downsampling algorithm.  Its primary use is for
4773
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4774
 * used to generate downsampled images in one file from another outside the
4775
 * overview architecture.
4776
 *
4777
 * The output bands need to exist in advance.
4778
 *
4779
 * The full set of resampling algorithms is documented in
4780
 * GDALDataset::BuildOverviews().
4781
 *
4782
 * This function will honour properly NODATA_VALUES tuples (special dataset
4783
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4784
 * considered as the nodata value and not each value of the triplet
4785
 * independently per band.
4786
 *
4787
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4788
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4789
 * overview computation.
4790
 *
4791
 * @param hSrcBand the source (base level) band.
4792
 * @param nOverviewCount the number of downsampled bands being generated.
4793
 * @param pahOvrBands the list of downsampled bands to be generated.
4794
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4795
 * @param pfnProgress progress report function.
4796
 * @param pProgressData progress function callback data.
4797
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4798
 * NULL
4799
 * @return CE_None on success or CE_Failure on failure.
4800
 * @since GDAL 3.6
4801
 */
4802
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4803
                                 GDALRasterBandH *pahOvrBands,
4804
                                 const char *pszResampling,
4805
                                 GDALProgressFunc pfnProgress,
4806
                                 void *pProgressData, CSLConstList papszOptions)
4807
4808
0
{
4809
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4810
0
    GDALRasterBand **papoOvrBands =
4811
0
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4812
4813
0
    if (pfnProgress == nullptr)
4814
0
        pfnProgress = GDALDummyProgress;
4815
4816
0
    if (EQUAL(pszResampling, "NONE"))
4817
0
        return CE_None;
4818
4819
0
    int nKernelRadius = 0;
4820
0
    GDALResampleFunction pfnResampleFn =
4821
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
4822
4823
0
    if (pfnResampleFn == nullptr)
4824
0
        return CE_Failure;
4825
4826
    /* -------------------------------------------------------------------- */
4827
    /*      Check color tables...                                           */
4828
    /* -------------------------------------------------------------------- */
4829
0
    GDALColorTable *poColorTable = nullptr;
4830
4831
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4832
0
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4833
0
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4834
0
    {
4835
0
        poColorTable = poSrcBand->GetColorTable();
4836
0
        if (poColorTable != nullptr)
4837
0
        {
4838
0
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4839
0
            {
4840
0
                CPLError(CE_Warning, CPLE_AppDefined,
4841
0
                         "Computing overviews on palette index raster bands "
4842
0
                         "with a palette whose color interpretation is not RGB "
4843
0
                         "will probably lead to unexpected results.");
4844
0
                poColorTable = nullptr;
4845
0
            }
4846
0
            else if (poColorTable->IsIdentity())
4847
0
            {
4848
0
                poColorTable = nullptr;
4849
0
            }
4850
0
        }
4851
0
        else
4852
0
        {
4853
0
            CPLError(CE_Warning, CPLE_AppDefined,
4854
0
                     "Computing overviews on palette index raster bands "
4855
0
                     "without a palette will probably lead to unexpected "
4856
0
                     "results.");
4857
0
        }
4858
0
    }
4859
    // Not ready yet
4860
0
    else if ((EQUAL(pszResampling, "CUBIC") ||
4861
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4862
0
              EQUAL(pszResampling, "LANCZOS") ||
4863
0
              EQUAL(pszResampling, "BILINEAR")) &&
4864
0
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4865
0
    {
4866
0
        CPLError(CE_Warning, CPLE_AppDefined,
4867
0
                 "Computing %s overviews on palette index raster bands "
4868
0
                 "will probably lead to unexpected results.",
4869
0
                 pszResampling);
4870
0
    }
4871
4872
    // If we have a nodata mask and we are doing something more complicated
4873
    // than nearest neighbouring, we have to fetch the nodata mask.
4874
4875
0
    GDALRasterBand *poMaskBand = nullptr;
4876
0
    bool bUseNoDataMask = false;
4877
0
    bool bCanUseCascaded = true;
4878
4879
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4880
0
    {
4881
        // Special case if we are an alpha/mask band. We want it to be
4882
        // considered as the mask band to avoid alpha=0 to be taken into account
4883
        // in average computation.
4884
0
        if (poSrcBand->IsMaskBand())
4885
0
        {
4886
0
            poMaskBand = poSrcBand;
4887
0
            bUseNoDataMask = true;
4888
0
        }
4889
0
        else
4890
0
        {
4891
0
            poMaskBand = poSrcBand->GetMaskBand();
4892
0
            const int nMaskFlags = poSrcBand->GetMaskFlags();
4893
0
            bCanUseCascaded =
4894
0
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4895
0
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4896
0
        }
4897
0
    }
4898
4899
    /* -------------------------------------------------------------------- */
4900
    /*      If we are operating on multiple overviews, and using            */
4901
    /*      averaging, lets do them in cascading order to reduce the        */
4902
    /*      amount of computation.                                          */
4903
    /* -------------------------------------------------------------------- */
4904
4905
    // In case the mask may be computed from another band of the dataset,
4906
    // we can't use cascaded generation, as the computation of the overviews
4907
    // of the band used for the mask band may not have yet occurred (#3033).
4908
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4909
0
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4910
0
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4911
0
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4912
0
         EQUAL(pszResampling, "MODE")) &&
4913
0
        nOverviewCount > 1 && bCanUseCascaded)
4914
0
        return GDALRegenerateCascadingOverviews(
4915
0
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4916
0
            pProgressData, papszOptions);
4917
4918
    /* -------------------------------------------------------------------- */
4919
    /*      Setup one horizontal swath to read from the raw buffer.         */
4920
    /* -------------------------------------------------------------------- */
4921
0
    int nFRXBlockSize = 0;
4922
0
    int nFRYBlockSize = 0;
4923
0
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4924
4925
0
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4926
0
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4927
0
                                       EQUAL(pszResampling, "MODE") ||
4928
0
                                       !GDALDataTypeIsComplex(eSrcDataType);
4929
0
    const GDALDataType eWrkDataType =
4930
0
        bUseGenericResampleFn
4931
0
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4932
0
            : GDT_CFloat32;
4933
4934
0
    const int nWidth = poSrcBand->GetXSize();
4935
0
    const int nHeight = poSrcBand->GetYSize();
4936
4937
0
    int nMaxOvrFactor = 1;
4938
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4939
0
    {
4940
0
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4941
0
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4942
0
        nMaxOvrFactor = std::max(
4943
0
            nMaxOvrFactor,
4944
0
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4945
0
        nMaxOvrFactor = std::max(
4946
0
            nMaxOvrFactor,
4947
0
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4948
0
    }
4949
4950
0
    int nFullResYChunk = nFRYBlockSize;
4951
0
    int nMaxChunkYSizeQueried = 0;
4952
4953
0
    const auto UpdateChunkHeightAndGetChunkSize =
4954
0
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4955
0
         eWrkDataType, nWidth]()
4956
0
    {
4957
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4958
        // + nFullResYChunk) / nMaxOvrFactor)
4959
0
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4960
0
        {
4961
0
            return GINTBIG_MAX;
4962
0
        }
4963
0
        nFullResYChunk =
4964
0
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4965
0
        if ((nKernelRadius > 0 &&
4966
0
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4967
0
            nFullResYChunk >
4968
0
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4969
0
        {
4970
0
            return GINTBIG_MAX;
4971
0
        }
4972
0
        nMaxChunkYSizeQueried =
4973
0
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4974
0
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4975
0
            std::numeric_limits<int64_t>::max() /
4976
0
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4977
0
        {
4978
0
            return GINTBIG_MAX;
4979
0
        }
4980
0
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4981
0
               nMaxChunkYSizeQueried * nWidth;
4982
0
    };
4983
4984
0
    const char *pszChunkYSize =
4985
0
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4986
0
#ifndef __COVERITY__
4987
    // Only configurable for debug / testing
4988
0
    if (pszChunkYSize)
4989
0
    {
4990
0
        nFullResYChunk = atoi(pszChunkYSize);
4991
0
    }
4992
0
#endif
4993
4994
    // Only configurable for debug / testing
4995
0
    const int nChunkMaxSize =
4996
0
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4997
4998
0
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4999
0
    if (nChunkSize > nChunkMaxSize)
5000
0
    {
5001
0
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5002
0
            !GDALDataTypeIsComplex(eSrcDataType) &&
5003
0
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
5004
0
             EQUAL(pszResampling, "AVERAGE")))
5005
0
        {
5006
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5007
            // which use a block based strategy, which is much less memory
5008
            // hungry.
5009
0
            return GDALRegenerateOverviewsMultiBand(
5010
0
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5011
0
                pfnProgress, pProgressData, papszOptions);
5012
0
        }
5013
0
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5014
0
        {
5015
0
            return GDALRegenerateCascadingOverviews(
5016
0
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5017
0
                pfnProgress, pProgressData, papszOptions);
5018
0
        }
5019
0
    }
5020
0
    else if (pszChunkYSize == nullptr)
5021
0
    {
5022
        // Try to get as close as possible to nChunkMaxSize
5023
0
        while (nChunkSize < nChunkMaxSize / 2)
5024
0
        {
5025
0
            nFullResYChunk *= 2;
5026
0
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
5027
0
        }
5028
0
    }
5029
5030
0
    int nHasNoData = 0;
5031
0
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
5032
0
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
5033
0
    const bool bPropagateNoData =
5034
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5035
5036
    // Structure describing a resampling job
5037
0
    struct OvrJob
5038
0
    {
5039
        // Buffers to free when job is finished
5040
0
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5041
0
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5042
0
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
5043
5044
0
        GDALRasterBand *poDstBand = nullptr;
5045
5046
        // Input parameters of pfnResampleFn
5047
0
        GDALResampleFunction pfnResampleFn = nullptr;
5048
0
        int nSrcWidth = 0;
5049
0
        int nSrcHeight = 0;
5050
0
        int nDstWidth = 0;
5051
0
        GDALOverviewResampleArgs args{};
5052
0
        const void *pChunk = nullptr;
5053
0
        bool bUseGenericResampleFn = false;
5054
5055
        // Output values of resampling function
5056
0
        CPLErr eErr = CE_Failure;
5057
0
        void *pDstBuffer = nullptr;
5058
0
        GDALDataType eDstBufferDataType = GDT_Unknown;
5059
5060
0
        void SetSrcMaskBufferHolder(
5061
0
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5062
0
        {
5063
0
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5064
0
        }
5065
5066
0
        void SetSrcBufferHolder(
5067
0
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5068
0
        {
5069
0
            oSrcBufferHolder = oSrcBufferHolderIn;
5070
0
        }
5071
5072
0
        void NotifyFinished()
5073
0
        {
5074
0
            std::lock_guard guard(mutex);
5075
0
            bFinished = true;
5076
0
            cv.notify_one();
5077
0
        }
5078
5079
0
        bool IsFinished()
5080
0
        {
5081
0
            std::lock_guard guard(mutex);
5082
0
            return bFinished;
5083
0
        }
5084
5085
0
        void WaitFinished()
5086
0
        {
5087
0
            std::unique_lock oGuard(mutex);
5088
0
            while (!bFinished)
5089
0
            {
5090
0
                cv.wait(oGuard);
5091
0
            }
5092
0
        }
5093
5094
0
      private:
5095
        // Synchronization
5096
0
        bool bFinished = false;
5097
0
        std::mutex mutex{};
5098
0
        std::condition_variable cv{};
5099
0
    };
5100
5101
    // Thread function to resample
5102
0
    const auto JobResampleFunc = [](void *pData)
5103
0
    {
5104
0
        OvrJob *poJob = static_cast<OvrJob *>(pData);
5105
5106
0
        if (poJob->bUseGenericResampleFn)
5107
0
        {
5108
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5109
0
                                               &(poJob->pDstBuffer),
5110
0
                                               &(poJob->eDstBufferDataType));
5111
0
        }
5112
0
        else
5113
0
        {
5114
0
            poJob->eErr = GDALResampleChunkC32R(
5115
0
                poJob->nSrcWidth, poJob->nSrcHeight,
5116
0
                static_cast<const float *>(poJob->pChunk),
5117
0
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5118
0
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
5119
0
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5120
0
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5121
0
                poJob->args.pszResampling);
5122
0
        }
5123
5124
0
        poJob->oDstBufferHolder =
5125
0
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
5126
5127
0
        poJob->NotifyFinished();
5128
0
    };
5129
5130
    // Function to write resample data to target band
5131
0
    const auto WriteJobData = [](const OvrJob *poDoneJob)
    {
        // The job's destination buffer covers the full band width and the
        // rows [nDstYOff, nDstYOff2) of the target band.
        const auto &sArgs = poDoneJob->args;
        const auto nDstLines = sArgs.nDstYOff2 - sArgs.nDstYOff;
        return poDoneJob->poDstBand->RasterIO(
            GF_Write, 0, sArgs.nDstYOff, poDoneJob->nDstWidth, nDstLines,
            poDoneJob->pDstBuffer, poDoneJob->nDstWidth, nDstLines,
            poDoneJob->eDstBufferDataType, 0, 0, nullptr);
    };
5139
5140
    // Wait for completion of oldest job and serialize it
5141
0
    const auto WaitAndFinalizeOldestJob =
5142
0
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5143
0
    {
5144
0
        auto poOldestJob = jobList.front().get();
5145
0
        poOldestJob->WaitFinished();
5146
0
        CPLErr l_eErr = poOldestJob->eErr;
5147
0
        if (l_eErr == CE_None)
5148
0
        {
5149
0
            l_eErr = WriteJobData(poOldestJob);
5150
0
        }
5151
5152
0
        jobList.pop_front();
5153
0
        return l_eErr;
5154
0
    };
5155
5156
    // Queue of jobs
5157
0
    std::list<std::unique_ptr<OvrJob>> jobList;
5158
5159
0
    GByte *pabyChunkNodataMask = nullptr;
5160
0
    void *pChunk = nullptr;
5161
5162
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5163
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5164
0
                                                       ? CPLGetNumCPUs()
5165
0
                                                       : atoi(pszThreads)));
5166
0
    auto poThreadPool =
5167
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5168
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5169
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5170
5171
    /* -------------------------------------------------------------------- */
5172
    /*      Loop over image operating on chunks.                            */
5173
    /* -------------------------------------------------------------------- */
5174
0
    int nChunkYOff = 0;
5175
0
    CPLErr eErr = CE_None;
5176
5177
0
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5178
0
         nChunkYOff += nFullResYChunk)
5179
0
    {
5180
0
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5181
0
                         pProgressData))
5182
0
        {
5183
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5184
0
            eErr = CE_Failure;
5185
0
        }
5186
5187
0
        if (nFullResYChunk + nChunkYOff > nHeight)
5188
0
            nFullResYChunk = nHeight - nChunkYOff;
5189
5190
0
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5191
0
        int nChunkYSizeQueried =
5192
0
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5193
0
        if (nChunkYOffQueried < 0)
5194
0
        {
5195
0
            nChunkYSizeQueried += nChunkYOffQueried;
5196
0
            nChunkYOffQueried = 0;
5197
0
        }
5198
0
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5199
0
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5200
5201
        // Avoid accumulating too many tasks and exhausting RAM
5202
        // Try to complete already finished jobs
5203
0
        while (eErr == CE_None && !jobList.empty())
5204
0
        {
5205
0
            auto poOldestJob = jobList.front().get();
5206
0
            if (!poOldestJob->IsFinished())
5207
0
                break;
5208
0
            eErr = poOldestJob->eErr;
5209
0
            if (eErr == CE_None)
5210
0
            {
5211
0
                eErr = WriteJobData(poOldestJob);
5212
0
            }
5213
5214
0
            jobList.pop_front();
5215
0
        }
5216
5217
        // And in case we have saturated the number of threads,
5218
        // wait for completion of tasks to go below the threshold.
5219
0
        while (eErr == CE_None &&
5220
0
               jobList.size() >= static_cast<size_t>(nThreads))
5221
0
        {
5222
0
            eErr = WaitAndFinalizeOldestJob(jobList);
5223
0
        }
5224
5225
        // (Re)allocate buffers if needed
5226
0
        if (pChunk == nullptr)
5227
0
        {
5228
0
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5229
0
                                         nMaxChunkYSizeQueried, nWidth);
5230
0
        }
5231
0
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5232
0
        {
5233
0
            pabyChunkNodataMask = static_cast<GByte *>(
5234
0
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5235
0
        }
5236
5237
0
        if (pChunk == nullptr ||
5238
0
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5239
0
        {
5240
0
            CPLFree(pChunk);
5241
0
            CPLFree(pabyChunkNodataMask);
5242
0
            return CE_Failure;
5243
0
        }
5244
5245
        // Read chunk.
5246
0
        if (eErr == CE_None)
5247
0
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5248
0
                                       nChunkYSizeQueried, pChunk, nWidth,
5249
0
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
5250
0
                                       nullptr);
5251
0
        if (eErr == CE_None && bUseNoDataMask)
5252
0
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5253
0
                                        nChunkYSizeQueried, pabyChunkNodataMask,
5254
0
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
5255
0
                                        0, nullptr);
5256
5257
        // Special case to promote 1bit data to 8bit 0/255 values.
5258
0
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5259
0
        {
5260
0
            if (eWrkDataType == GDT_Float32)
5261
0
            {
5262
0
                float *pafChunk = static_cast<float *>(pChunk);
5263
0
                for (size_t i = 0;
5264
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5265
0
                {
5266
0
                    if (pafChunk[i] == 1.0f)
5267
0
                        pafChunk[i] = 255.0f;
5268
0
                }
5269
0
            }
5270
0
            else if (eWrkDataType == GDT_Byte)
5271
0
            {
5272
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
5273
0
                for (size_t i = 0;
5274
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5275
0
                {
5276
0
                    if (pabyChunk[i] == 1)
5277
0
                        pabyChunk[i] = 255;
5278
0
                }
5279
0
            }
5280
0
            else if (eWrkDataType == GDT_UInt16)
5281
0
            {
5282
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5283
0
                for (size_t i = 0;
5284
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5285
0
                {
5286
0
                    if (pasChunk[i] == 1)
5287
0
                        pasChunk[i] = 255;
5288
0
                }
5289
0
            }
5290
0
            else if (eWrkDataType == GDT_Float64)
5291
0
            {
5292
0
                double *padfChunk = static_cast<double *>(pChunk);
5293
0
                for (size_t i = 0;
5294
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5295
0
                {
5296
0
                    if (padfChunk[i] == 1.0)
5297
0
                        padfChunk[i] = 255.0;
5298
0
                }
5299
0
            }
5300
0
            else
5301
0
            {
5302
0
                CPLAssert(false);
5303
0
            }
5304
0
        }
5305
0
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5306
0
        {
5307
0
            if (eWrkDataType == GDT_Float32)
5308
0
            {
5309
0
                float *pafChunk = static_cast<float *>(pChunk);
5310
0
                for (size_t i = 0;
5311
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5312
0
                {
5313
0
                    if (pafChunk[i] == 1.0f)
5314
0
                        pafChunk[i] = 0.0f;
5315
0
                    else if (pafChunk[i] == 0.0f)
5316
0
                        pafChunk[i] = 255.0f;
5317
0
                }
5318
0
            }
5319
0
            else if (eWrkDataType == GDT_Byte)
5320
0
            {
5321
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
5322
0
                for (size_t i = 0;
5323
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5324
0
                {
5325
0
                    if (pabyChunk[i] == 1)
5326
0
                        pabyChunk[i] = 0;
5327
0
                    else if (pabyChunk[i] == 0)
5328
0
                        pabyChunk[i] = 255;
5329
0
                }
5330
0
            }
5331
0
            else if (eWrkDataType == GDT_UInt16)
5332
0
            {
5333
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5334
0
                for (size_t i = 0;
5335
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5336
0
                {
5337
0
                    if (pasChunk[i] == 1)
5338
0
                        pasChunk[i] = 0;
5339
0
                    else if (pasChunk[i] == 0)
5340
0
                        pasChunk[i] = 255;
5341
0
                }
5342
0
            }
5343
0
            else if (eWrkDataType == GDT_Float64)
5344
0
            {
5345
0
                double *padfChunk = static_cast<double *>(pChunk);
5346
0
                for (size_t i = 0;
5347
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5348
0
                {
5349
0
                    if (padfChunk[i] == 1.0)
5350
0
                        padfChunk[i] = 0.0;
5351
0
                    else if (padfChunk[i] == 0.0)
5352
0
                        padfChunk[i] = 255.0;
5353
0
                }
5354
0
            }
5355
0
            else
5356
0
            {
5357
0
                CPLAssert(false);
5358
0
            }
5359
0
        }
5360
5361
0
        auto oSrcBufferHolder =
5362
0
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
5363
0
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
5364
0
            poJobQueue ? pabyChunkNodataMask : nullptr);
5365
5366
0
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5367
0
             ++iOverview)
5368
0
        {
5369
0
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5370
0
            const int nDstWidth = poDstBand->GetXSize();
5371
0
            const int nDstHeight = poDstBand->GetYSize();
5372
5373
0
            const double dfXRatioDstToSrc =
5374
0
                static_cast<double>(nWidth) / nDstWidth;
5375
0
            const double dfYRatioDstToSrc =
5376
0
                static_cast<double>(nHeight) / nDstHeight;
5377
5378
            // Figure out the line to start writing to, and the first line
            // to not write to.  In theory this approach should ensure that
            // every output line will be written if all input chunks are
            // processed.
5388
0
            int nDstYOff =
5389
0
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5390
0
            if (nDstYOff == nDstHeight)
5391
0
                continue;
5392
0
            int nDstYOff2 = static_cast<int>(
5393
0
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5394
5395
0
            if (nChunkYOff + nFullResYChunk == nHeight)
5396
0
                nDstYOff2 = nDstHeight;
5397
#if DEBUG_VERBOSE
5398
            CPLDebug("GDAL",
5399
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5400
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5401
                     nDstWidth, nDstYOff2 - nDstYOff);
5402
#endif
5403
5404
0
            auto poJob = std::make_unique<OvrJob>();
5405
0
            poJob->pfnResampleFn = pfnResampleFn;
5406
0
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5407
0
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5408
0
            poJob->args.nOvrXSize = poDstBand->GetXSize();
5409
0
            poJob->args.nOvrYSize = poDstBand->GetYSize();
5410
0
            const char *pszNBITS =
5411
0
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5412
0
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5413
0
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5414
0
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5415
0
            poJob->args.eWrkDataType = eWrkDataType;
5416
0
            poJob->pChunk = pChunk;
5417
0
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5418
0
            poJob->nSrcWidth = nWidth;
5419
0
            poJob->nSrcHeight = nHeight;
5420
0
            poJob->args.nChunkXOff = 0;
5421
0
            poJob->args.nChunkXSize = nWidth;
5422
0
            poJob->args.nChunkYOff = nChunkYOffQueried;
5423
0
            poJob->args.nChunkYSize = nChunkYSizeQueried;
5424
0
            poJob->nDstWidth = nDstWidth;
5425
0
            poJob->args.nDstXOff = 0;
5426
0
            poJob->args.nDstXOff2 = nDstWidth;
5427
0
            poJob->args.nDstYOff = nDstYOff;
5428
0
            poJob->args.nDstYOff2 = nDstYOff2;
5429
0
            poJob->poDstBand = poDstBand;
5430
0
            poJob->args.pszResampling = pszResampling;
5431
0
            poJob->args.bHasNoData = bHasNoData;
5432
0
            poJob->args.dfNoDataValue = dfNoDataValue;
5433
0
            poJob->args.poColorTable = poColorTable;
5434
0
            poJob->args.eSrcDataType = eSrcDataType;
5435
0
            poJob->args.bPropagateNoData = bPropagateNoData;
5436
5437
0
            if (poJobQueue)
5438
0
            {
5439
0
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5440
0
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
5441
0
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5442
0
                jobList.emplace_back(std::move(poJob));
5443
0
            }
5444
0
            else
5445
0
            {
5446
0
                JobResampleFunc(poJob.get());
5447
0
                eErr = poJob->eErr;
5448
0
                if (eErr == CE_None)
5449
0
                {
5450
0
                    eErr = WriteJobData(poJob.get());
5451
0
                }
5452
0
            }
5453
0
        }
5454
5455
0
        if (poJobQueue)
5456
0
        {
5457
0
            pChunk = nullptr;
5458
0
            pabyChunkNodataMask = nullptr;
5459
0
        }
5460
0
    }
5461
5462
0
    VSIFree(pChunk);
5463
0
    VSIFree(pabyChunkNodataMask);
5464
5465
    // Wait for all pending jobs to complete
5466
0
    while (!jobList.empty())
5467
0
    {
5468
0
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5469
0
        if (l_eErr != CE_None && eErr == CE_None)
5470
0
            eErr = l_eErr;
5471
0
    }
5472
5473
    /* -------------------------------------------------------------------- */
5474
    /*      Renormalized overview mean / stddev if needed.                  */
5475
    /* -------------------------------------------------------------------- */
5476
0
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5477
0
    {
5478
0
        GDALOverviewMagnitudeCorrection(
5479
0
            poSrcBand, nOverviewCount,
5480
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5481
0
            GDALDummyProgress, nullptr);
5482
0
    }
5483
5484
    /* -------------------------------------------------------------------- */
5485
    /*      It can be important to flush out data to overviews.             */
5486
    /* -------------------------------------------------------------------- */
5487
0
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5488
0
         ++iOverview)
5489
0
    {
5490
0
        eErr = papoOvrBands[iOverview]->FlushCache(false);
5491
0
    }
5492
5493
0
    if (eErr == CE_None)
5494
0
        pfnProgress(1.0, nullptr, pProgressData);
5495
5496
0
    return eErr;
5497
0
}
5498
5499
/************************************************************************/
5500
/*            GDALRegenerateOverviewsMultiBand()                        */
5501
/************************************************************************/
5502
5503
/**
5504
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5505
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5506
 *
5507
 * This function will generate one or more overview images from a base
5508
 * image using the requested downsampling algorithm.  Its primary use
5509
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5510
 * can also be used to generate downsampled images in one file from another
5511
 * outside the overview architecture.
5512
 *
5513
 * The output bands need to exist in advance and share the same characteristics
5514
 * (type, dimensions)
5515
 *
5516
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5518
 *
5519
 * It does not support color tables or complex data types.
5520
 *
5521
 * The pseudo-algorithm used by the function is :
5522
 *    for each overview
5523
 *       iterate on lines of the source by a step of deltay
5524
 *           iterate on columns of the source  by a step of deltax
5525
 *               read the source data of size deltax * deltay for all the bands
5526
 *               generate the corresponding overview block for all the bands
5527
 *
5528
 * This function will honour properly NODATA_VALUES tuples (special dataset
5529
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5530
 * considered as the nodata value and not each value of the triplet
5531
 * independently per band.
5532
 *
5533
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5534
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5535
 * overview computation.
5536
 *
5537
 * @param nBands the number of bands, size of papoSrcBands and size of
5538
 *               first dimension of papapoOverviewBands
5539
 * @param papoSrcBands the list of source bands to downsample
5540
 * @param nOverviews the number of downsampled overview levels being generated.
5541
 * @param papapoOverviewBands bidimensional array of bands. First dimension is
5542
 *                            indexed by nBands. Second dimension is indexed by
5543
 *                            nOverviews.
5544
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5546
 * @param pfnProgress progress report function.
5547
 * @param pProgressData progress function callback data.
5548
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5549
 *                     key=value pairs, or NULL
5550
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5551
 *                     options can be specified to express that overviews should
5552
 *                     be regenerated only in the specified subset of the source
5553
 *                     dataset.
5554
 * @return CE_None on success or CE_Failure on failure.
5555
 */
5556
5557
CPLErr GDALRegenerateOverviewsMultiBand(
5558
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5559
    GDALRasterBand *const *const *papapoOverviewBands,
5560
    const char *pszResampling, GDALProgressFunc pfnProgress,
5561
    void *pProgressData, CSLConstList papszOptions)
5562
0
{
5563
0
    CPL_IGNORE_RET_VAL(papszOptions);
5564
5565
0
    if (pfnProgress == nullptr)
5566
0
        pfnProgress = GDALDummyProgress;
5567
5568
0
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5569
0
        return CE_None;
5570
5571
    // Sanity checks.
5572
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5573
0
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5574
0
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5575
0
        !EQUAL(pszResampling, "CUBICSPLINE") &&
5576
0
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5577
0
        !EQUAL(pszResampling, "MODE"))
5578
0
    {
5579
0
        CPLError(CE_Failure, CPLE_NotSupported,
5580
0
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5581
0
                 "not supported",
5582
0
                 pszResampling);
5583
0
        return CE_Failure;
5584
0
    }
5585
5586
0
    int nKernelRadius = 0;
5587
0
    GDALResampleFunction pfnResampleFn =
5588
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
5589
0
    if (pfnResampleFn == nullptr)
5590
0
        return CE_Failure;
5591
5592
0
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5593
0
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5594
0
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5595
0
        return CE_None;
5596
0
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5597
0
    for (int iBand = 1; iBand < nBands; ++iBand)
5598
0
    {
5599
0
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5600
0
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5601
0
        {
5602
0
            CPLError(
5603
0
                CE_Failure, CPLE_NotSupported,
5604
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5605
0
                "have the same dimensions");
5606
0
            return CE_Failure;
5607
0
        }
5608
0
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5609
0
        {
5610
0
            CPLError(
5611
0
                CE_Failure, CPLE_NotSupported,
5612
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5613
0
                "have the same data type");
5614
0
            return CE_Failure;
5615
0
        }
5616
0
    }
5617
5618
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5619
0
    {
5620
0
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5621
0
        const int nDstWidth = poOvrFirstBand->GetXSize();
5622
0
        const int nDstHeight = poOvrFirstBand->GetYSize();
5623
0
        for (int iBand = 1; iBand < nBands; ++iBand)
5624
0
        {
5625
0
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5626
0
            if (poOvrBand->GetXSize() != nDstWidth ||
5627
0
                poOvrBand->GetYSize() != nDstHeight)
5628
0
            {
5629
0
                CPLError(
5630
0
                    CE_Failure, CPLE_NotSupported,
5631
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5632
0
                    "of the same level must have the same dimensions");
5633
0
                return CE_Failure;
5634
0
            }
5635
0
            if (poOvrBand->GetRasterDataType() != eDataType)
5636
0
            {
5637
0
                CPLError(
5638
0
                    CE_Failure, CPLE_NotSupported,
5639
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5640
0
                    "must have the same data type as the source bands");
5641
0
                return CE_Failure;
5642
0
            }
5643
0
        }
5644
0
    }
5645
5646
    // First pass to compute the total number of pixels to write.
5647
0
    double dfTotalPixelCount = 0;
5648
0
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5649
0
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5650
0
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
5651
0
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5652
0
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
5653
0
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5654
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5655
0
    {
5656
0
        dfTotalPixelCount +=
5657
0
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5658
0
            papapoOverviewBands[0][iOverview]->GetXSize() *
5659
0
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5660
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5661
0
    }
5662
5663
0
    const GDALDataType eWrkDataType =
5664
0
        GDALGetOvrWorkDataType(pszResampling, eDataType);
5665
0
    const int nWrkDataTypeSize =
5666
0
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5667
5668
0
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5669
5670
    // If we have a nodata mask and we are doing something more complicated
5671
    // than nearest neighbouring, we have to fetch the nodata mask.
5672
0
    const bool bUseNoDataMask =
5673
0
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
5674
0
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5675
5676
0
    std::vector<bool> abHasNoData(nBands);
5677
0
    std::vector<double> adfNoDataValue(nBands);
5678
5679
0
    for (int iBand = 0; iBand < nBands; ++iBand)
5680
0
    {
5681
0
        int nHasNoData = 0;
5682
0
        adfNoDataValue[iBand] =
5683
0
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5684
0
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5685
0
    }
5686
0
    const bool bPropagateNoData =
5687
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5688
5689
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5690
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5691
0
                                                       ? CPLGetNumCPUs()
5692
0
                                                       : atoi(pszThreads)));
5693
0
    auto poThreadPool =
5694
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5695
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5696
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5697
5698
    // Only configurable for debug / testing
5699
0
    const GIntBig nChunkMaxSize = []() -> GIntBig
5700
0
    {
5701
0
        const char *pszVal =
5702
0
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5703
0
        if (pszVal)
5704
0
        {
5705
0
            GIntBig nRet = 0;
5706
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5707
0
            return std::max<GIntBig>(100, nRet);
5708
0
        }
5709
0
        return 10 * 1024 * 1024;
5710
0
    }();
5711
5712
    // Only configurable for debug / testing
5713
0
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5714
0
    {
5715
0
        const char *pszVal = CPLGetConfigOption(
5716
0
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5717
0
        if (pszVal)
5718
0
        {
5719
0
            GIntBig nRet = 0;
5720
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5721
0
            return std::max<GIntBig>(100, nRet);
5722
0
        }
5723
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5724
0
        if (nUsableRAM > 0)
5725
0
            return nUsableRAM / 10;
5726
        // Select a value to be able to at least downsample by 2 for a RGB
5727
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5728
0
        return 100 * 1024 * 1024;
5729
0
    }();
5730
5731
    // Second pass to do the real job.
5732
0
    double dfCurPixelCount = 0;
5733
0
    CPLErr eErr = CE_None;
5734
0
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5735
0
         ++iOverview)
5736
0
    {
5737
0
        int iSrcOverview = -1;  // -1 means the source bands.
5738
5739
0
        const int nDstTotalWidth =
5740
0
            papapoOverviewBands[0][iOverview]->GetXSize();
5741
0
        const int nDstTotalHeight =
5742
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5743
5744
        // Compute the coordinates of the target region to refresh
5745
0
        constexpr double EPS = 1e-8;
5746
0
        const int nDstXOffStart = static_cast<int>(
5747
0
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5748
0
            EPS);
5749
0
        const int nDstXOffEnd =
5750
0
            std::min(static_cast<int>(
5751
0
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5752
0
                                       nToplevelSrcWidth * nDstTotalWidth -
5753
0
                                   EPS)),
5754
0
                     nDstTotalWidth);
5755
0
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5756
0
        const int nDstYOffStart =
5757
0
            static_cast<int>(static_cast<double>(nSrcYOff) /
5758
0
                                 nToplevelSrcHeight * nDstTotalHeight +
5759
0
                             EPS);
5760
0
        const int nDstYOffEnd =
5761
0
            std::min(static_cast<int>(
5762
0
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5763
0
                                       nToplevelSrcHeight * nDstTotalHeight -
5764
0
                                   EPS)),
5765
0
                     nDstTotalHeight);
5766
0
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5767
5768
        // Try to use previous level of overview as the source to compute
5769
        // the next level.
5770
0
        int nSrcWidth = nToplevelSrcWidth;
5771
0
        int nSrcHeight = nToplevelSrcHeight;
5772
0
        if (iOverview > 0 &&
5773
0
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5774
0
        {
5775
0
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5776
0
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5777
0
            iSrcOverview = iOverview - 1;
5778
0
        }
5779
5780
0
        const double dfXRatioDstToSrc =
5781
0
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
5782
0
        const double dfYRatioDstToSrc =
5783
0
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
5784
5785
0
        const int nOvrFactor =
5786
0
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5787
0
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
5788
5789
0
        int nDstChunkXSize = 0;
5790
0
        int nDstChunkYSize = 0;
5791
0
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5792
0
                                                        &nDstChunkYSize);
5793
5794
0
        constexpr int PIXEL_MARGIN = 2;
5795
        // Try to extend the chunk size so that the memory needed to acquire
5796
        // source pixels goes up to 10 MB.
5797
        // This can help for drivers that support multi-threaded reading
5798
0
        const int nFullResYChunk = static_cast<int>(std::min<double>(
5799
0
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5800
0
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5801
0
            nSrcHeight,
5802
0
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5803
0
                                 nKernelRadius * nOvrFactor));
5804
0
        while (nDstChunkXSize < nDstWidth)
5805
0
        {
5806
0
            constexpr int INCREASE_FACTOR = 2;
5807
5808
0
            const int nFullResXChunk = static_cast<int>(std::min<double>(
5809
0
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5810
0
                                              dfXRatioDstToSrc));
5811
5812
0
            const int nFullResXChunkQueried =
5813
0
                static_cast<int>(std::min<int64_t>(
5814
0
                    nSrcWidth,
5815
0
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5816
0
                                         nKernelRadius * nOvrFactor));
5817
5818
0
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5819
0
                             nFullResYChunkQueried / nWrkDataTypeSize)
5820
0
            {
5821
0
                break;
5822
0
            }
5823
5824
0
            nDstChunkXSize *= INCREASE_FACTOR;
5825
0
        }
5826
0
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5827
5828
0
        const int nFullResXChunk = static_cast<int>(std::min<double>(
5829
0
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5830
0
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5831
0
            nSrcWidth,
5832
0
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5833
0
                                 nKernelRadius * nOvrFactor));
5834
5835
        // Make sure that the RAM requirements to acquire the source data do
5836
        // not exceed nChunkMaxSizeForTempFile
5837
        // If so, reduce the destination chunk size, generate overviews in a
5838
        // temporary dataset, and copy that temporary dataset over the target
5839
        // overview bands (to avoid issues with lossy compression)
5840
0
        const bool bOverflowFullResXChunkYChunkQueried =
5841
0
            nBands > std::numeric_limits<int64_t>::max() /
5842
0
                         nFullResXChunkQueried / nFullResYChunkQueried /
5843
0
                         nWrkDataTypeSize;
5844
5845
0
        const auto nMemRequirement =
5846
0
            bOverflowFullResXChunkYChunkQueried
5847
0
                ? 0
5848
0
                : static_cast<GIntBig>(nFullResXChunkQueried) *
5849
0
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5850
        // Use a temporary dataset with a smaller destination chunk size
5851
0
        const auto nOverShootFactor =
5852
0
            nMemRequirement / nChunkMaxSizeForTempFile;
5853
5854
0
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
5855
0
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5856
0
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5857
0
                                      static_cast<double>(nOverShootFactor)))));
5858
0
        constexpr int DEFAULT_CHUNK_SIZE = 256;
5859
0
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5860
0
        const int nReducedDstChunkXSize =
5861
0
            bOverflowFullResXChunkYChunkQueried
5862
0
                ? DEFAULT_CHUNK_SIZE
5863
0
                : std::max(1, static_cast<int>(nDstChunkXSize /
5864
0
                                               nSqrtOverShootFactor) &
5865
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5866
0
        const int nReducedDstChunkYSize =
5867
0
            bOverflowFullResXChunkYChunkQueried
5868
0
                ? DEFAULT_CHUNK_SIZE
5869
0
                : std::max(1, static_cast<int>(nDstChunkYSize /
5870
0
                                               nSqrtOverShootFactor) &
5871
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5872
5873
0
        if (bOverflowFullResXChunkYChunkQueried ||
5874
0
            nMemRequirement > nChunkMaxSizeForTempFile)
5875
0
        {
5876
0
            const auto nDTSize =
5877
0
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5878
0
            const bool bTmpDSMemRequirementOverflow =
5879
0
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5880
0
                             nDstHeight / nDTSize;
5881
0
            const auto nTmpDSMemRequirement =
5882
0
                bTmpDSMemRequirementOverflow
5883
0
                    ? 0
5884
0
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5885
0
                          nDTSize;
5886
5887
            // make sure that one band buffer doesn't overflow size_t
5888
0
            const bool bChunkSizeOverflow =
5889
0
                static_cast<size_t>(nDTSize) >
5890
0
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5891
0
            const size_t nChunkSize =
5892
0
                bChunkSizeOverflow
5893
0
                    ? 0
5894
0
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5895
5896
0
            const auto CreateVRT =
5897
0
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5898
0
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5899
0
                 iSrcOverview, &abHasNoData,
5900
0
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5901
0
            {
5902
0
                auto poVRTDS = std::make_unique<VRTDataset>(
5903
0
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5904
0
                    nVRTBlockYSize);
5905
5906
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5907
0
                {
5908
0
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5909
0
                    poVRTSrc->SetResampling(pszResampling);
5910
0
                    poVRTDS->AddBand(eWrkDataType);
5911
0
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5912
0
                        poVRTDS->GetRasterBand(iBand + 1));
5913
5914
0
                    auto poSrcBand = papoSrcBands[iBand];
5915
0
                    if (iSrcOverview != -1)
5916
0
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5917
0
                    poVRTBand->ConfigureSource(
5918
0
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5919
0
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5920
                    // Add the source to the band
5921
0
                    poVRTBand->AddSource(poVRTSrc.release());
5922
0
                    if (abHasNoData[iBand])
5923
0
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5924
0
                }
5925
5926
0
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5927
0
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5928
0
                {
5929
0
                    VRTSourcedRasterBand *poMaskVRTBand =
5930
0
                        cpl::down_cast<VRTSourcedRasterBand *>(
5931
0
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
5932
0
                    auto poSrcBand = papoSrcBands[0];
5933
0
                    if (iSrcOverview != -1)
5934
0
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
5935
0
                    poMaskVRTBand->AddMaskBandSource(
5936
0
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5937
0
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5938
0
                }
5939
5940
0
                return poVRTDS;
5941
0
            };
5942
5943
            // If the overview accommodates chunking, do so and recurse
5944
            // to avoid generating full size temporary files
5945
0
            if (!bOverflowFullResXChunkYChunkQueried &&
5946
0
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5947
0
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5948
0
            {
5949
                // Create a VRT with the smaller chunk to do the scaling
5950
0
                auto poVRTDS =
5951
0
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5952
5953
0
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
5954
0
                std::vector<GDALRasterBand *> apoDstBand(nBands);
5955
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5956
0
                {
5957
0
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5958
0
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5959
0
                }
5960
5961
                // Use a flag to avoid reading from the overview being built
5962
0
                GDALRasterIOExtraArg sExtraArg;
5963
0
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5964
0
                if (iSrcOverview == -1)
5965
0
                    sExtraArg.bUseOnlyThisScale = true;
5966
5967
                // A single band buffer for data transfer to the overview
5968
0
                std::vector<GByte> abyChunk;
5969
0
                try
5970
0
                {
5971
0
                    abyChunk.resize(nChunkSize);
5972
0
                }
5973
0
                catch (const std::exception &)
5974
0
                {
5975
0
                    CPLError(CE_Failure, CPLE_OutOfMemory,
5976
0
                             "Out of memory allocating temporary buffer");
5977
0
                    return CE_Failure;
5978
0
                }
5979
5980
                // Loop over output height, in chunks
5981
0
                for (int nDstYOff = nDstYOffStart;
5982
0
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
5983
0
                     /* */)
5984
0
                {
5985
0
                    const int nDstYCount =
5986
0
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5987
                    // Loop over output width, in output chunks
5988
0
                    for (int nDstXOff = nDstXOffStart;
5989
0
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
5990
0
                         /* */)
5991
0
                    {
5992
0
                        const int nDstXCount =
5993
0
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5994
                        // Read and transfer the chunk to the overview
5995
0
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
5996
0
                             ++iBand)
5997
0
                        {
5998
0
                            eErr = apoVRTBand[iBand]->RasterIO(
5999
0
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
6000
0
                                nDstYCount, abyChunk.data(), nDstXCount,
6001
0
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
6002
0
                            if (eErr == CE_None)
6003
0
                            {
6004
0
                                eErr = apoDstBand[iBand]->RasterIO(
6005
0
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
6006
0
                                    nDstYCount, abyChunk.data(), nDstXCount,
6007
0
                                    nDstYCount, eDataType, 0, 0, nullptr);
6008
0
                            }
6009
0
                        }
6010
6011
0
                        dfCurPixelCount +=
6012
0
                            static_cast<double>(nDstXCount) * nDstYCount;
6013
6014
0
                        nDstXOff += nDstXCount;
6015
0
                    }  // width
6016
6017
0
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6018
0
                                     nullptr, pProgressData))
6019
0
                    {
6020
0
                        CPLError(CE_Failure, CPLE_UserInterrupt,
6021
0
                                 "User terminated");
6022
0
                        eErr = CE_Failure;
6023
0
                    }
6024
6025
0
                    nDstYOff += nDstYCount;
6026
0
                }  // height
6027
6028
0
                if (CE_None != eErr)
6029
0
                {
6030
0
                    CPLError(CE_Failure, CPLE_AppDefined,
6031
0
                             "Error while writing overview");
6032
0
                    return CE_Failure;
6033
0
                }
6034
6035
0
                pfnProgress(1.0, nullptr, pProgressData);
6036
                // Flush the overviews we just generated
6037
0
                for (int iBand = 0; iBand < nBands; ++iBand)
6038
0
                    apoDstBand[iBand]->FlushCache(false);
6039
6040
0
                continue;  // Next overview
6041
0
            }              // chunking via temporary dataset
6042
6043
0
            std::unique_ptr<GDALDataset> poTmpDS;
6044
            // Config option mostly/only for autotest purposes
6045
0
            const char *pszGDAL_OVR_TEMP_DRIVER =
6046
0
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6047
0
            if ((!bTmpDSMemRequirementOverflow &&
6048
0
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6049
0
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6050
0
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6051
0
            {
6052
0
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6053
0
                if (!poTmpDrv)
6054
0
                {
6055
0
                    eErr = CE_Failure;
6056
0
                    break;
6057
0
                }
6058
0
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6059
0
                                               nDstTotalHeight, nBands,
6060
0
                                               eDataType, nullptr));
6061
0
            }
6062
0
            else
6063
0
            {
6064
                // Create a temporary file for the overview
6065
0
                auto poTmpDrv =
6066
0
                    GetGDALDriverManager()->GetDriverByName("GTiff");
6067
0
                if (!poTmpDrv)
6068
0
                {
6069
0
                    eErr = CE_Failure;
6070
0
                    break;
6071
0
                }
6072
0
                std::string osTmpFilename;
6073
0
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6074
0
                if (poDstDS)
6075
0
                {
6076
0
                    osTmpFilename = poDstDS->GetDescription();
6077
0
                    VSIStatBufL sStatBuf;
6078
0
                    if (!osTmpFilename.empty() &&
6079
0
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6080
0
                        osTmpFilename += "_tmp_ovr.tif";
6081
0
                }
6082
0
                if (osTmpFilename.empty())
6083
0
                {
6084
0
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6085
0
                    osTmpFilename += ".tif";
6086
0
                }
6087
0
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6088
0
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6089
0
                CPLStringList aosCO;
6090
0
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6091
0
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6092
0
                {
6093
0
                    aosCO.SetNameValue("TILED", "YES");
6094
0
                    aosCO.SetNameValue("BLOCKXSIZE",
6095
0
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
6096
0
                    aosCO.SetNameValue("BLOCKYSIZE",
6097
0
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
6098
0
                }
6099
0
                if (const char *pszCOList =
6100
0
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6101
0
                {
6102
0
                    aosCO.SetNameValue(
6103
0
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6104
0
                }
6105
0
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6106
0
                                               nDstHeight, nBands, eDataType,
6107
0
                                               aosCO.List()));
6108
0
                if (poTmpDS)
6109
0
                {
6110
0
                    poTmpDS->MarkSuppressOnClose();
6111
0
                    VSIUnlink(osTmpFilename.c_str());
6112
0
                }
6113
0
            }
6114
0
            if (!poTmpDS)
6115
0
            {
6116
0
                eErr = CE_Failure;
6117
0
                break;
6118
0
            }
6119
6120
            // Create a full size VRT to do the resampling without edge effects
6121
0
            auto poVRTDS =
6122
0
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6123
6124
            // Allocate a band buffer with the overview chunk size
6125
0
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6126
0
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6127
0
                                    nDstChunkYSize));
6128
0
            if (pDstBuffer == nullptr)
6129
0
            {
6130
0
                eErr = CE_Failure;
6131
0
                break;
6132
0
            }
6133
6134
            // Use a flag to avoid reading the overview being built
6135
0
            GDALRasterIOExtraArg sExtraArg;
6136
0
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6137
0
            if (iSrcOverview == -1)
6138
0
                sExtraArg.bUseOnlyThisScale = true;
6139
6140
            // Scale and copy data from the VRT to the temp file
6141
0
            for (int nDstYOff = nDstYOffStart;
6142
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
6143
0
                 /* */)
6144
0
            {
6145
0
                const int nDstYCount =
6146
0
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6147
0
                for (int nDstXOff = nDstXOffStart;
6148
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
6149
0
                     /* */)
6150
0
                {
6151
0
                    const int nDstXCount =
6152
0
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6153
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
6154
0
                         ++iBand)
6155
0
                    {
6156
0
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6157
0
                        eErr = poSrcBand->RasterIO(
6158
0
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6159
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
6160
0
                            eWrkDataType, 0, 0, &sExtraArg);
6161
0
                        if (eErr == CE_None)
6162
0
                        {
6163
                            // Write to the temporary dataset, shifted
6164
0
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6165
0
                            eErr = poOvrBand->RasterIO(
6166
0
                                GF_Write, nDstXOff - nDstXOffStart,
6167
0
                                nDstYOff - nDstYOffStart, nDstXCount,
6168
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
6169
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
6170
0
                        }
6171
0
                    }
6172
0
                    nDstXOff += nDstXCount;
6173
0
                }
6174
0
                nDstYOff += nDstYCount;
6175
0
            }
6176
6177
            // Copy from the temporary to the overview
6178
0
            for (int nDstYOff = nDstYOffStart;
6179
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
6180
0
                 /* */)
6181
0
            {
6182
0
                const int nDstYCount =
6183
0
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6184
0
                for (int nDstXOff = nDstXOffStart;
6185
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
6186
0
                     /* */)
6187
0
                {
6188
0
                    const int nDstXCount =
6189
0
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6190
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
6191
0
                         ++iBand)
6192
0
                    {
6193
0
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6194
0
                        eErr = poSrcBand->RasterIO(
6195
0
                            GF_Read, nDstXOff - nDstXOffStart,
6196
0
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6197
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
6198
0
                            eWrkDataType, 0, 0, nullptr);
6199
0
                        if (eErr == CE_None)
6200
0
                        {
6201
                            // Write to the destination overview bands
6202
0
                            auto poOvrBand =
6203
0
                                papapoOverviewBands[iBand][iOverview];
6204
0
                            eErr = poOvrBand->RasterIO(
6205
0
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
6206
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
6207
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
6208
0
                        }
6209
0
                    }
6210
0
                    nDstXOff += nDstXCount;
6211
0
                }
6212
0
                nDstYOff += nDstYCount;
6213
0
            }
6214
6215
0
            if (eErr != CE_None)
6216
0
            {
6217
0
                CPLError(CE_Failure, CPLE_AppDefined,
6218
0
                         "Failed to write overview %d", iOverview);
6219
0
                return eErr;
6220
0
            }
6221
6222
            // Flush the data to overviews.
6223
0
            for (int iBand = 0; iBand < nBands; ++iBand)
6224
0
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6225
6226
0
            continue;
6227
0
        }
6228
6229
        // Structure describing a resampling job
6230
0
        struct OvrJob
6231
0
        {
6232
            // Buffers to free when job is finished
6233
0
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6234
0
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6235
0
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
6236
6237
0
            GDALRasterBand *poDstBand = nullptr;
6238
6239
            // Input parameters of pfnResampleFn
6240
0
            GDALResampleFunction pfnResampleFn = nullptr;
6241
0
            GDALOverviewResampleArgs args{};
6242
0
            const void *pChunk = nullptr;
6243
6244
            // Output values of resampling function
6245
0
            CPLErr eErr = CE_Failure;
6246
0
            void *pDstBuffer = nullptr;
6247
0
            GDALDataType eDstBufferDataType = GDT_Unknown;
6248
6249
0
            void NotifyFinished()
6250
0
            {
6251
0
                std::lock_guard guard(mutex);
6252
0
                bFinished = true;
6253
0
                cv.notify_one();
6254
0
            }
6255
6256
0
            bool IsFinished()
6257
0
            {
6258
0
                std::lock_guard guard(mutex);
6259
0
                return bFinished;
6260
0
            }
6261
6262
0
            void WaitFinished()
6263
0
            {
6264
0
                std::unique_lock oGuard(mutex);
6265
0
                while (!bFinished)
6266
0
                {
6267
0
                    cv.wait(oGuard);
6268
0
                }
6269
0
            }
6270
6271
0
          private:
6272
            // Synchronization
6273
0
            bool bFinished = false;
6274
0
            std::mutex mutex{};
6275
0
            std::condition_variable cv{};
6276
0
        };
6277
6278
        // Thread function to resample
6279
0
        const auto JobResampleFunc = [](void *pData)
6280
0
        {
6281
0
            OvrJob *poJob = static_cast<OvrJob *>(pData);
6282
6283
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6284
0
                                               &(poJob->pDstBuffer),
6285
0
                                               &(poJob->eDstBufferDataType));
6286
6287
0
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
6288
6289
0
            poJob->NotifyFinished();
6290
0
        };
6291
6292
        // Function to write resample data to target band
6293
0
        const auto WriteJobData = [](const OvrJob *poJob)
        {
            // Writes the job's destination buffer into its destination band,
            // over the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2),
            // and returns the result of that RasterIO() call.
            const auto &sArgs = poJob->args;
            const auto nWinXSize = sArgs.nDstXOff2 - sArgs.nDstXOff;
            const auto nWinYSize = sArgs.nDstYOff2 - sArgs.nDstYOff;

            // Buffer dimensions equal the window dimensions
            return poJob->poDstBand->RasterIO(
                GF_Write, sArgs.nDstXOff, sArgs.nDstYOff, nWinXSize, nWinYSize,
                poJob->pDstBuffer, nWinXSize, nWinYSize,
                poJob->eDstBufferDataType, 0, 0, nullptr);
        };
6303
6304
        // Wait for completion of oldest job and serialize it
6305
0
        const auto WaitAndFinalizeOldestJob =
6306
0
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6307
0
        {
6308
0
            auto poOldestJob = jobList.front().get();
6309
0
            poOldestJob->WaitFinished();
6310
0
            CPLErr l_eErr = poOldestJob->eErr;
6311
0
            if (l_eErr == CE_None)
6312
0
            {
6313
0
                l_eErr = WriteJobData(poOldestJob);
6314
0
            }
6315
6316
0
            jobList.pop_front();
6317
0
            return l_eErr;
6318
0
        };
6319
6320
        // Queue of jobs
6321
0
        std::list<std::unique_ptr<OvrJob>> jobList;
6322
6323
0
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6324
0
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6325
0
            apabyChunkNoDataMask(nBands);
6326
6327
        // Iterate on destination overview, block by block.
6328
0
        for (int nDstYOff = nDstYOffStart;
6329
0
             nDstYOff < nDstYOffEnd && eErr == CE_None;
6330
0
             nDstYOff += nDstChunkYSize)
6331
0
        {
6332
0
            int nDstYCount;
6333
0
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6334
0
                nDstYCount = nDstChunkYSize;
6335
0
            else
6336
0
                nDstYCount = nDstYOffEnd - nDstYOff;
6337
6338
0
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6339
0
            int nChunkYOff2 = static_cast<int>(
6340
0
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6341
0
            if (nChunkYOff2 > nSrcHeight ||
6342
0
                nDstYOff + nDstYCount == nDstTotalHeight)
6343
0
                nChunkYOff2 = nSrcHeight;
6344
0
            int nYCount = nChunkYOff2 - nChunkYOff;
6345
0
            CPLAssert(nYCount <= nFullResYChunk);
6346
6347
0
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6348
0
            int nChunkYSizeQueried =
6349
0
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6350
0
            if (nChunkYOffQueried < 0)
6351
0
            {
6352
0
                nChunkYSizeQueried += nChunkYOffQueried;
6353
0
                nChunkYOffQueried = 0;
6354
0
            }
6355
0
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6356
0
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6357
0
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6358
6359
0
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6360
0
                             nullptr, pProgressData))
6361
0
            {
6362
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6363
0
                eErr = CE_Failure;
6364
0
            }
6365
6366
            // Iterate on destination overview, block by block.
6367
0
            for (int nDstXOff = nDstXOffStart;
6368
0
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
6369
0
                 nDstXOff += nDstChunkXSize)
6370
0
            {
6371
0
                int nDstXCount = 0;
6372
0
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6373
0
                    nDstXCount = nDstChunkXSize;
6374
0
                else
6375
0
                    nDstXCount = nDstXOffEnd - nDstXOff;
6376
6377
0
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6378
6379
0
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6380
0
                int nChunkXOff2 = static_cast<int>(
6381
0
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6382
0
                if (nChunkXOff2 > nSrcWidth ||
6383
0
                    nDstXOff + nDstXCount == nDstTotalWidth)
6384
0
                    nChunkXOff2 = nSrcWidth;
6385
0
                const int nXCount = nChunkXOff2 - nChunkXOff;
6386
0
                CPLAssert(nXCount <= nFullResXChunk);
6387
6388
0
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6389
0
                int nChunkXSizeQueried =
6390
0
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6391
0
                if (nChunkXOffQueried < 0)
6392
0
                {
6393
0
                    nChunkXSizeQueried += nChunkXOffQueried;
6394
0
                    nChunkXOffQueried = 0;
6395
0
                }
6396
0
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6397
0
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6398
0
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6399
#if DEBUG_VERBOSE
6400
                CPLDebug("GDAL",
6401
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6402
                         nChunkXOffQueried, nChunkYOffQueried,
6403
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6404
                         nDstYOff, nDstXCount, nDstYCount);
6405
#endif
6406
6407
                // Avoid accumulating too many tasks and exhaust RAM
6408
6409
                // Try to complete already finished jobs
6410
0
                while (eErr == CE_None && !jobList.empty())
6411
0
                {
6412
0
                    auto poOldestJob = jobList.front().get();
6413
0
                    if (!poOldestJob->IsFinished())
6414
0
                        break;
6415
0
                    eErr = poOldestJob->eErr;
6416
0
                    if (eErr == CE_None)
6417
0
                    {
6418
0
                        eErr = WriteJobData(poOldestJob);
6419
0
                    }
6420
6421
0
                    jobList.pop_front();
6422
0
                }
6423
6424
                // And in case we have saturated the number of threads,
6425
                // wait for completion of tasks to go below the threshold.
6426
0
                while (eErr == CE_None &&
6427
0
                       jobList.size() >= static_cast<size_t>(nThreads))
6428
0
                {
6429
0
                    eErr = WaitAndFinalizeOldestJob(jobList);
6430
0
                }
6431
6432
                // Read the source buffers for all the bands.
6433
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6434
0
                {
6435
                    // (Re)allocate buffers if needed
6436
0
                    if (apaChunk[iBand] == nullptr)
6437
0
                    {
6438
0
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6439
0
                            nFullResXChunkQueried, nFullResYChunkQueried,
6440
0
                            nWrkDataTypeSize));
6441
0
                        if (apaChunk[iBand] == nullptr)
6442
0
                        {
6443
0
                            eErr = CE_Failure;
6444
0
                        }
6445
0
                    }
6446
0
                    if (bUseNoDataMask &&
6447
0
                        apabyChunkNoDataMask[iBand] == nullptr)
6448
0
                    {
6449
0
                        apabyChunkNoDataMask[iBand].reset(
6450
0
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6451
0
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6452
0
                        if (apabyChunkNoDataMask[iBand] == nullptr)
6453
0
                        {
6454
0
                            eErr = CE_Failure;
6455
0
                        }
6456
0
                    }
6457
6458
0
                    if (eErr == CE_None)
6459
0
                    {
6460
0
                        GDALRasterBand *poSrcBand = nullptr;
6461
0
                        if (iSrcOverview == -1)
6462
0
                            poSrcBand = papoSrcBands[iBand];
6463
0
                        else
6464
0
                            poSrcBand =
6465
0
                                papapoOverviewBands[iBand][iSrcOverview];
6466
0
                        eErr = poSrcBand->RasterIO(
6467
0
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6468
0
                            nChunkXSizeQueried, nChunkYSizeQueried,
6469
0
                            apaChunk[iBand].get(), nChunkXSizeQueried,
6470
0
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6471
6472
0
                        if (bUseNoDataMask && eErr == CE_None)
6473
0
                        {
6474
0
                            auto poMaskBand = poSrcBand->IsMaskBand()
6475
0
                                                  ? poSrcBand
6476
0
                                                  : poSrcBand->GetMaskBand();
6477
0
                            eErr = poMaskBand->RasterIO(
6478
0
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6479
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6480
0
                                apabyChunkNoDataMask[iBand].get(),
6481
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6482
0
                                GDT_Byte, 0, 0, nullptr);
6483
0
                        }
6484
0
                    }
6485
0
                }
6486
6487
                // Compute the resulting overview block.
6488
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6489
0
                {
6490
0
                    auto poJob = std::make_unique<OvrJob>();
6491
0
                    poJob->pfnResampleFn = pfnResampleFn;
6492
0
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6493
0
                    poJob->args.eOvrDataType =
6494
0
                        poJob->poDstBand->GetRasterDataType();
6495
0
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6496
0
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6497
0
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6498
0
                        "NBITS", "IMAGE_STRUCTURE");
6499
0
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6500
0
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6501
0
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6502
0
                    poJob->args.eWrkDataType = eWrkDataType;
6503
0
                    poJob->pChunk = apaChunk[iBand].get();
6504
0
                    poJob->args.pabyChunkNodataMask =
6505
0
                        apabyChunkNoDataMask[iBand].get();
6506
0
                    poJob->args.nChunkXOff = nChunkXOffQueried;
6507
0
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
6508
0
                    poJob->args.nChunkYOff = nChunkYOffQueried;
6509
0
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
6510
0
                    poJob->args.nDstXOff = nDstXOff;
6511
0
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6512
0
                    poJob->args.nDstYOff = nDstYOff;
6513
0
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6514
0
                    poJob->args.pszResampling = pszResampling;
6515
0
                    poJob->args.bHasNoData = abHasNoData[iBand];
6516
0
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6517
0
                    poJob->args.eSrcDataType = eDataType;
6518
0
                    poJob->args.bPropagateNoData = bPropagateNoData;
6519
6520
0
                    if (poJobQueue)
6521
0
                    {
6522
0
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6523
0
                            apabyChunkNoDataMask[iBand].release()));
6524
6525
0
                        poJob->oSrcBufferHolder.reset(
6526
0
                            new PointerHolder(apaChunk[iBand].release()));
6527
6528
0
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6529
0
                        jobList.emplace_back(std::move(poJob));
6530
0
                    }
6531
0
                    else
6532
0
                    {
6533
0
                        JobResampleFunc(poJob.get());
6534
0
                        eErr = poJob->eErr;
6535
0
                        if (eErr == CE_None)
6536
0
                        {
6537
0
                            eErr = WriteJobData(poJob.get());
6538
0
                        }
6539
0
                    }
6540
0
                }
6541
0
            }
6542
0
        }
6543
6544
        // Wait for all pending jobs to complete
6545
0
        while (!jobList.empty())
6546
0
        {
6547
0
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6548
0
            if (l_eErr != CE_None && eErr == CE_None)
6549
0
                eErr = l_eErr;
6550
0
        }
6551
6552
        // Flush the data to overviews.
6553
0
        for (int iBand = 0; iBand < nBands; ++iBand)
6554
0
        {
6555
0
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6556
0
                CE_None)
6557
0
                eErr = CE_Failure;
6558
0
        }
6559
0
    }
6560
6561
0
    if (eErr == CE_None)
6562
0
        pfnProgress(1.0, nullptr, pProgressData);
6563
6564
0
    return eErr;
6565
0
}
6566
6567
/************************************************************************/
6568
/*            GDALRegenerateOverviewsMultiBand()                        */
6569
/************************************************************************/
6570
6571
/**
6572
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6573
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6574
 *
6575
 * This function will generate one or more overview images from a base
6576
 * image using the requested downsampling algorithm.  Its primary use
6577
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6578
 * can also be used to generate downsampled images in one file from another
6579
 * outside the overview architecture.
6580
 *
6581
 * The output bands need to exist in advance and share the same characteristics
6582
 * (type, dimensions)
6583
 *
6584
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6585
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6586
 *
6587
 * It does not support color tables or complex data types.
6588
 *
6589
 * The pseudo-algorithm used by the function is :
6590
 *    for each overview
6591
 *       iterate on lines of the source by a step of deltay
6592
 *           iterate on columns of the source  by a step of deltax
6593
 *               read the source data of size deltax * deltay for all the bands
6594
 *               generate the corresponding overview block for all the bands
6595
 *
6596
 * This function will properly honour NODATA_VALUES tuples (special dataset
6597
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6598
 * considered as the nodata value and not each value of the triplet
6599
 * independently per band.
6600
 *
6601
 * The GDAL_NUM_THREADS configuration option can be set
6602
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6603
 * overview computation.
6604
 *
6605
 * @param apoSrcBands the list of source bands to downsample
6606
 * @param aapoOverviewBands bidimensional array of bands. First dimension is
6607
 *                          indexed by bands. Second dimension is indexed by
6608
 *                          overview levels. All aapoOverviewBands[i] arrays
6609
 *                          must have the same size (i.e. same number of
6610
 *                          overviews)
6611
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6612
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6613
 * @param pfnProgress progress report function.
6614
 * @param pProgressData progress function callback data.
6615
 * @param papszOptions NULL terminated list of options as
6616
 *                     key=value pairs, or NULL
6617
 *                     The XOFF, YOFF, XSIZE and YSIZE
6618
 *                     options can be specified to express that overviews should
6619
 *                     be regenerated only in the specified subset of the source
6620
 *                     dataset.
6621
 * @return CE_None on success or CE_Failure on failure.
6622
 * @since 3.10
6623
 */
6624
6625
// std::vector-based convenience wrapper: validates the shape of the inputs,
// converts them to the array-of-pointers layout and forwards the call to the
// pointer-based GDALRegenerateOverviewsMultiBand() overload.
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // No overview bands at all: nothing to regenerate.
    if (aapoOverviewBands.empty())
        return CE_None;

    // The shape requirements below used to be checked with CPLAssert() only,
    // which is compiled out of non-DEBUG builds. Enforce them at runtime so
    // that a malformed input is reported instead of causing out-of-bounds
    // accesses in the pointer-based overload.
    if (apoSrcBands.size() != aapoOverviewBands.size())
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "GDALRegenerateOverviewsMultiBand(): apoSrcBands and "
                 "aapoOverviewBands must have the same number of bands");
        return CE_Failure;
    }

    const size_t nOverviews = aapoOverviewBands[0].size();
    for (const auto &apoOverviewBands : aapoOverviewBands)
    {
        if (apoOverviewBands.size() != nOverviews)
        {
            CPLError(CE_Failure, CPLE_AppDefined,
                     "GDALRegenerateOverviewsMultiBand(): all bands must "
                     "have the same number of overviews");
            return CE_Failure;
        }
    }

    // Work on private copies of each row so that mutable GDALRasterBand**
    // row pointers can be handed over, as the previous malloc-based
    // implementation did. The vectors own the memory, so nothing has to be
    // released by hand on any exit path.
    std::vector<std::vector<GDALRasterBand *>> aapoOverviewBandsCopy(
        aapoOverviewBands);
    std::vector<GDALRasterBand **> apapoOverviewBands;
    apapoOverviewBands.reserve(aapoOverviewBandsCopy.size());
    for (auto &apoOverviewBands : aapoOverviewBandsCopy)
        apapoOverviewBands.push_back(apoOverviewBands.data());

    return GDALRegenerateOverviewsMultiBand(
        static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
        static_cast<int>(nOverviews), apapoOverviewBands.data(),
        pszResampling, pfnProgress, pProgressData, papszOptions);
}
6660
6661
/************************************************************************/
6662
/*                        GDALComputeBandStats()                        */
6663
/************************************************************************/
6664
6665
/** Undocumented
6666
 * @param hSrcBand undocumented.
6667
 * @param nSampleStep Step between scanlines used to compute statistics.
6668
 *                    When nSampleStep is equal to 1, all scanlines will
6669
 *                    be processed.
6670
 * @param pdfMean undocumented.
6671
 * @param pdfStdDev undocumented.
6672
 * @param pfnProgress undocumented.
6673
 * @param pProgressData undocumented.
6674
 * @return undocumented
6675
 */
6676
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6677
                                        int nSampleStep, double *pdfMean,
6678
                                        double *pdfStdDev,
6679
                                        GDALProgressFunc pfnProgress,
6680
                                        void *pProgressData)
6681
6682
0
{
6683
0
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6684
6685
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6686
6687
0
    if (pfnProgress == nullptr)
6688
0
        pfnProgress = GDALDummyProgress;
6689
6690
0
    const int nWidth = poSrcBand->GetXSize();
6691
0
    const int nHeight = poSrcBand->GetYSize();
6692
6693
0
    if (nSampleStep >= nHeight || nSampleStep < 1)
6694
0
        nSampleStep = 1;
6695
6696
0
    GDALDataType eWrkType = GDT_Unknown;
6697
0
    float *pafData = nullptr;
6698
0
    GDALDataType eType = poSrcBand->GetRasterDataType();
6699
0
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6700
0
    if (bComplex)
6701
0
    {
6702
0
        pafData = static_cast<float *>(
6703
0
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6704
0
        eWrkType = GDT_CFloat32;
6705
0
    }
6706
0
    else
6707
0
    {
6708
0
        pafData =
6709
0
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6710
0
        eWrkType = GDT_Float32;
6711
0
    }
6712
6713
0
    if (nWidth == 0 || pafData == nullptr)
6714
0
    {
6715
0
        VSIFree(pafData);
6716
0
        return CE_Failure;
6717
0
    }
6718
6719
    /* -------------------------------------------------------------------- */
6720
    /*      Loop over all sample lines.                                     */
6721
    /* -------------------------------------------------------------------- */
6722
0
    double dfSum = 0.0;
6723
0
    double dfSum2 = 0.0;
6724
0
    int iLine = 0;
6725
0
    GIntBig nSamples = 0;
6726
6727
0
    do
6728
0
    {
6729
0
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6730
0
                         pProgressData))
6731
0
        {
6732
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6733
0
            CPLFree(pafData);
6734
0
            return CE_Failure;
6735
0
        }
6736
6737
0
        const CPLErr eErr =
6738
0
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6739
0
                                1, eWrkType, 0, 0, nullptr);
6740
0
        if (eErr != CE_None)
6741
0
        {
6742
0
            CPLFree(pafData);
6743
0
            return eErr;
6744
0
        }
6745
6746
0
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6747
0
        {
6748
0
            float fValue = 0.0f;
6749
6750
0
            if (bComplex)
6751
0
            {
6752
                // Compute the magnitude of the complex value.
6753
0
                fValue =
6754
0
                    std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6755
0
                               pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6756
0
            }
6757
0
            else
6758
0
            {
6759
0
                fValue = pafData[iPixel];
6760
0
            }
6761
6762
0
            dfSum += static_cast<double>(fValue);
6763
0
            dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6764
0
        }
6765
6766
0
        nSamples += nWidth;
6767
0
        iLine += nSampleStep;
6768
0
    } while (iLine < nHeight);
6769
6770
0
    if (!pfnProgress(1.0, nullptr, pProgressData))
6771
0
    {
6772
0
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6773
0
        CPLFree(pafData);
6774
0
        return CE_Failure;
6775
0
    }
6776
6777
    /* -------------------------------------------------------------------- */
6778
    /*      Produce the result values.                                      */
6779
    /* -------------------------------------------------------------------- */
6780
0
    if (pdfMean != nullptr)
6781
0
        *pdfMean = dfSum / nSamples;
6782
6783
0
    if (pdfStdDev != nullptr)
6784
0
    {
6785
0
        const double dfMean = dfSum / nSamples;
6786
6787
0
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6788
0
    }
6789
6790
0
    CPLFree(pafData);
6791
6792
0
    return CE_None;
6793
0
}
6794
6795
/************************************************************************/
6796
/*                  GDALOverviewMagnitudeCorrection()                   */
6797
/*                                                                      */
6798
/*      Correct the mean and standard deviation of the overviews of     */
6799
/*      the given band to match the base layer approximately.           */
6800
/************************************************************************/
6801
6802
/** Undocumented
6803
 * @param hBaseBand undocumented.
6804
 * @param nOverviewCount undocumented.
6805
 * @param pahOverviews undocumented.
6806
 * @param pfnProgress undocumented.
6807
 * @param pProgressData undocumented.
6808
 * @return undocumented
6809
 */
6810
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6811
                                       int nOverviewCount,
6812
                                       GDALRasterBandH *pahOverviews,
6813
                                       GDALProgressFunc pfnProgress,
6814
                                       void *pProgressData)
6815
6816
0
{
6817
0
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6818
6819
    /* -------------------------------------------------------------------- */
6820
    /*      Compute mean/stddev for source raster.                          */
6821
    /* -------------------------------------------------------------------- */
6822
0
    double dfOrigMean = 0.0;
6823
0
    double dfOrigStdDev = 0.0;
6824
0
    {
6825
0
        const CPLErr eErr =
6826
0
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6827
0
                                 pfnProgress, pProgressData);
6828
6829
0
        if (eErr != CE_None)
6830
0
            return eErr;
6831
0
    }
6832
6833
    /* -------------------------------------------------------------------- */
6834
    /*      Loop on overview bands.                                         */
6835
    /* -------------------------------------------------------------------- */
6836
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6837
0
    {
6838
0
        GDALRasterBand *poOverview =
6839
0
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6840
0
        double dfOverviewMean, dfOverviewStdDev;
6841
6842
0
        const CPLErr eErr =
6843
0
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6844
0
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6845
6846
0
        if (eErr != CE_None)
6847
0
            return eErr;
6848
6849
0
        double dfGain = 1.0;
6850
0
        if (dfOrigStdDev >= 0.0001)
6851
0
            dfGain = dfOrigStdDev / dfOverviewStdDev;
6852
6853
        /* --------------------------------------------------------------------
6854
         */
6855
        /*      Apply gain and offset. */
6856
        /* --------------------------------------------------------------------
6857
         */
6858
0
        const int nWidth = poOverview->GetXSize();
6859
0
        const int nHeight = poOverview->GetYSize();
6860
6861
0
        GDALDataType eWrkType = GDT_Unknown;
6862
0
        float *pafData = nullptr;
6863
0
        const GDALDataType eType = poOverview->GetRasterDataType();
6864
0
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6865
0
        if (bComplex)
6866
0
        {
6867
0
            pafData = static_cast<float *>(
6868
0
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6869
0
            eWrkType = GDT_CFloat32;
6870
0
        }
6871
0
        else
6872
0
        {
6873
0
            pafData = static_cast<float *>(
6874
0
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6875
0
            eWrkType = GDT_Float32;
6876
0
        }
6877
6878
0
        if (pafData == nullptr)
6879
0
        {
6880
0
            return CE_Failure;
6881
0
        }
6882
6883
0
        for (int iLine = 0; iLine < nHeight; ++iLine)
6884
0
        {
6885
0
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6886
0
                             pProgressData))
6887
0
            {
6888
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6889
0
                CPLFree(pafData);
6890
0
                return CE_Failure;
6891
0
            }
6892
6893
0
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6894
0
                                     nWidth, 1, eWrkType, 0, 0,
6895
0
                                     nullptr) != CE_None)
6896
0
            {
6897
0
                CPLFree(pafData);
6898
0
                return CE_Failure;
6899
0
            }
6900
6901
0
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6902
0
            {
6903
0
                if (bComplex)
6904
0
                {
6905
0
                    pafData[static_cast<size_t>(iPixel) * 2] *=
6906
0
                        static_cast<float>(dfGain);
6907
0
                    pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6908
0
                        static_cast<float>(dfGain);
6909
0
                }
6910
0
                else
6911
0
                {
6912
0
                    pafData[iPixel] = static_cast<float>(
6913
0
                        (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
6914
0
                        dfOrigMean);
6915
0
                }
6916
0
            }
6917
6918
0
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6919
0
                                     nWidth, 1, eWrkType, 0, 0,
6920
0
                                     nullptr) != CE_None)
6921
0
            {
6922
0
                CPLFree(pafData);
6923
0
                return CE_Failure;
6924
0
            }
6925
0
        }
6926
6927
0
        if (!pfnProgress(1.0, nullptr, pProgressData))
6928
0
        {
6929
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6930
0
            CPLFree(pafData);
6931
0
            return CE_Failure;
6932
0
        }
6933
6934
0
        CPLFree(pafData);
6935
0
    }
6936
6937
0
    return CE_None;
6938
0
}