Coverage Report

Created: 2026-02-14 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/gcore/overview.cpp
Line
Count
Source
1
2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14
15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17
18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21
22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30
31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "cpl_worker_thread_pool.h"
37
#include "gdal.h"
38
#include "gdal_thread_pool.h"
39
#include "gdalwarper.h"
40
#include "gdal_vrt.h"
41
#include "vrtdataset.h"
42
43
#ifdef USE_NEON_OPTIMIZATIONS
44
#include "include_sse2neon.h"
45
46
#if (!defined(__aarch64__) && !defined(_M_ARM64))
47
#define ARM_V7
48
#endif
49
50
#define USE_SSE2
51
52
#include "gdalsse_priv.h"
53
54
// Restrict to 64bit processors because they are guaranteed to have SSE2,
55
// or if __AVX2__ is defined.
56
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
57
#define USE_SSE2
58
59
#include "gdalsse_priv.h"
60
61
#ifdef __SSE3__
62
#include <pmmintrin.h>
63
#endif
64
#ifdef __SSSE3__
65
#include <tmmintrin.h>
66
#endif
67
#ifdef __SSE4_1__
68
#include <smmintrin.h>
69
#endif
70
#ifdef __AVX2__
71
#include <immintrin.h>
72
#endif
73
74
#endif
75
76
// To be included after above USE_SSE2 and include gdalsse_priv.h
77
// to avoid build issue on Windows x86
78
#include "gdal_priv_templates.hpp"
79
80
/************************************************************************/
81
/*                       GDALResampleChunk_Near()                       */
82
/************************************************************************/
83
84
template <class T>
85
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
86
                                      const T *pChunk, T **ppDstBuffer)
87
88
0
{
89
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
90
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
91
0
    const GDALDataType eWrkDataType = args.eWrkDataType;
92
0
    const int nChunkXOff = args.nChunkXOff;
93
0
    const int nChunkXSize = args.nChunkXSize;
94
0
    const int nChunkYOff = args.nChunkYOff;
95
0
    const int nDstXOff = args.nDstXOff;
96
0
    const int nDstXOff2 = args.nDstXOff2;
97
0
    const int nDstYOff = args.nDstYOff;
98
0
    const int nDstYOff2 = args.nDstYOff2;
99
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
100
101
    /* -------------------------------------------------------------------- */
102
    /*      Allocate buffers.                                               */
103
    /* -------------------------------------------------------------------- */
104
0
    *ppDstBuffer = static_cast<T *>(
105
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
106
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
107
0
    if (*ppDstBuffer == nullptr)
108
0
    {
109
0
        return CE_Failure;
110
0
    }
111
0
    T *const pDstBuffer = *ppDstBuffer;
112
113
0
    int *panSrcXOff =
114
0
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
115
116
0
    if (panSrcXOff == nullptr)
117
0
    {
118
0
        return CE_Failure;
119
0
    }
120
121
    /* ==================================================================== */
122
    /*      Precompute inner loop constants.                                */
123
    /* ==================================================================== */
124
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
125
0
    {
126
0
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
127
0
        if (nSrcXOff < nChunkXOff)
128
0
            nSrcXOff = nChunkXOff;
129
130
0
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
131
0
    }
132
133
    /* ==================================================================== */
134
    /*      Loop over destination scanlines.                                */
135
    /* ==================================================================== */
136
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
137
0
    {
138
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
139
0
        if (nSrcYOff < nChunkYOff)
140
0
            nSrcYOff = nChunkYOff;
141
142
0
        const T *const pSrcScanline =
143
0
            pChunk +
144
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
145
0
            nChunkXOff;
146
147
        /* --------------------------------------------------------------------
148
         */
149
        /*      Loop over destination pixels */
150
        /* --------------------------------------------------------------------
151
         */
152
0
        T *pDstScanline =
153
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
154
0
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
155
0
        {
156
0
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
157
0
        }
158
0
    }
159
160
0
    CPLFree(panSrcXOff);
161
162
0
    return CE_None;
163
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>**)
164
165
/**
 * Type-erased entry point for nearest-neighbour resampling.
 *
 * Reports args.eWrkDataType through peDstBufferDataType and forwards to the
 * GDALResampleChunk_NearT instantiation whose element type has the same
 * size as the working data type: values are only copied, never computed on,
 * so the element size is all that matters.
 *
 * @return The result of GDALResampleChunk_NearT, or CE_Failure for
 *         GDT_Unknown / GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    switch (eType)
    {
        // 1-byte types
        case GDT_UInt8:
        case GDT_Int8:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            const auto *pSrc = static_cast<const uint8_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint8_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 2-byte types
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            const auto *pSrc = static_cast<const uint16_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint16_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 4-byte types
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            const auto *pSrc = static_cast<const uint32_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint32_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 8-byte types
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            const auto *pSrc = static_cast<const uint64_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint64_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // Complex double has no same-sized unsigned integer stand-in here.
        case GDT_CFloat64:
        {
            using ComplexDouble = std::complex<double>;
            const auto *pSrc = static_cast<const ComplexDouble *>(pChunk);
            auto **ppDst = reinterpret_cast<ComplexDouble **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a resamplable data type.
    CPLAssert(false);
    return CE_Failure;
}
231
232
namespace
233
{
234
235
// Find in the color table the entry whose RGB value is the closest
236
// (using quadratic distance) to the test color, ignoring transparent entries.
237
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
238
                   const GDALColorEntry &test)
239
0
{
240
0
    int nMinDist = std::numeric_limits<int>::max();
241
0
    size_t bestEntry = 0;
242
0
    for (size_t i = 0; i < entries.size(); ++i)
243
0
    {
244
0
        const GDALColorEntry &entry = entries[i];
245
        // Ignore transparent entries
246
0
        if (entry.c4 == 0)
247
0
            continue;
248
249
0
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
250
0
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
251
0
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
252
0
        if (nDist < nMinDist)
253
0
        {
254
0
            nMinDist = nDist;
255
0
            bestEntry = i;
256
0
        }
257
0
    }
258
0
    return static_cast<int>(bestEntry);
259
0
}
260
261
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
262
                                           int &transparentIdx)
263
0
{
264
0
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
265
266
0
    transparentIdx = -1;
267
0
    int i = 0;
268
0
    for (auto &entry : entries)
269
0
    {
270
0
        table.GetColorEntryAsRGB(i, &entry);
271
0
        if (transparentIdx < 0 && entry.c4 == 0)
272
0
            transparentIdx = i;
273
0
        ++i;
274
0
    }
275
0
    return entries;
276
0
}
277
278
}  // unnamed  namespace
279
280
/************************************************************************/
281
/*                               SQUARE()                               */
282
/************************************************************************/
283
284
// Return val * val. The first operand is converted to Tsquare before the
// multiplication, so that a wider Tsquare can be requested to hold squares
// that would not fit in T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
Unexecuted instantiation: int SQUARE<int, int>(int)
Unexecuted instantiation: double SQUARE<double, double>(double)
Unexecuted instantiation: float SQUARE<float, float>(float)
288
289
/************************************************************************/
290
/*                         ComputeIntegerRMS()                          */
291
/************************************************************************/
292
// Return the integer rms that minimizes abs(rms**2 - sumSquares / weight),
// i.e. sqrt(sumSquares / weight) rounded so that its square is the closest
// to the mean of squares. On a tie the lower candidate is kept.
// T is the result type, Twork the integer type in which the comparison
// term 2 * rms * (rms + 1) + 1 is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanOfSquares = sumSquares / weight;
    const T floorRms = static_cast<T>(std::sqrt(meanOfSquares));

    // Is floorRms**2 or (floorRms+1)**2 the closest to sumSquares / weight ?
    // Naive version:
    //   weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2
    // which, divided by weight and simplified, becomes:
    //   2 * rms * (rms + 1) + 1 < 2 * sumSquares / weight
    const double candidateTerm = static_cast<double>(
        static_cast<Twork>(2) * floorRms * (floorRms + 1) + 1);
    const bool roundUp = candidateTerm < 2 * meanOfSquares;

    return roundUp ? static_cast<T>(floorRms + 1) : floorRms;
}
Unexecuted instantiation: unsigned char ComputeIntegerRMS<unsigned char, int>(double, double)
Unexecuted instantiation: unsigned short ComputeIntegerRMS<unsigned short, unsigned long>(double, double)
308
309
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
310
{
311
    CPLAssert(false);
312
    return 0;
313
}
314
315
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
316
0
{
317
    // It has been verified that given the correction on rms below, using
318
    // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
319
    // is equivalent, so use the former as it is used twice.
320
0
    const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
321
0
    const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
322
0
    GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
323
324
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
325
    // Naive version:
326
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
327
    // Optimized version for integer case and weight == 4
328
0
    if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
329
0
        rms += 1;
330
0
    return rms;
331
0
}
332
333
template <>
334
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
335
0
{
336
0
    const double sumDivWeight = sumSquares * 0.25;
337
0
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
338
339
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
340
    // Naive version:
341
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
342
    // Optimized version for integer case and weight == 4
343
0
    if (static_cast<GUInt32>(rms) * (rms + 1) <
344
0
        static_cast<GUInt32>(sumDivWeight + 0.25))
345
0
        rms += 1;
346
0
    return rms;
347
0
}
348
349
#ifdef USE_SSE2
350
351
/************************************************************************/
352
/*                    QuadraticMeanByteSSE2OrAVX2()                     */
353
/************************************************************************/
354
355
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
356
#define sse2_packus_epi32 _mm_packus_epi32
357
#else
358
// Fallback for _mm_packus_epi32() when that intrinsic is not available:
// pack the 32-bit lanes of a and b into 16-bit lanes using only SSE2.
// The only 32 -> 16 bit pack of SSE2, _mm_packs_epi32(), saturates to the
// signed range, so the inputs are biased by -32768 before packing and the
// bias is removed afterwards on the 16-bit lanes.
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    const __m128i bias32 = _mm_set1_epi32(-32768);
    const __m128i bias16 = _mm_set1_epi16(-32768);

    const __m128i packedBiased = _mm_packs_epi32(_mm_add_epi32(a, bias32),
                                                 _mm_add_epi32(b, bias32));
    return _mm_sub_epi16(packedBiased, bias16);
}
368
#endif
369
370
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
371
#define sse2_hadd_epi16 _mm_hadd_epi16
372
#else
373
// Fallback for _mm_hadd_epi16() using only SSE2: sum adjacent pairs of
// 16-bit lanes of a, then of b, and return the 8 sums as 16-bit lanes
// (sums from a in the low half, sums from b in the high half).
// The 16-bit lanes are zero-extended before being added, and the 32-bit
// sums are packed back with _mm_packs_epi32(), i.e. with signed saturation.
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i lowHalfMask = _mm_set1_epi32(0xFFFF);

    // For each 32-bit lane: (low 16 bits) + (high 16 bits)
    const auto SumAdjacentPairs = [&lowHalfMask](__m128i v)
    {
        return _mm_add_epi32(_mm_and_si128(v, lowHalfMask),
                             _mm_srli_epi32(v, 16));
    };

    // Recombine low and high parts
    return _mm_packs_epi32(SumAdjacentPairs(a), SumAdjacentPairs(b));
}
385
#endif
386
387
// Width-agnostic aliases of the integer/float SIMD intrinsics used by the
// kernels below: they map to the 256-bit AVX2 flavor when __AVX2__ is
// defined, and to the 128-bit SSE2 flavor (or its sse2_xxx fallback)
// otherwise.
#ifdef __AVX2__

// Constants
#define setzero _mm256_setzero_si256
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define set1_ps _mm256_set1_ps

// Unaligned load
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))

// 8 -> 16 bit interleaving
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8

// 16-bit integer arithmetic
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define mullo_epi16 _mm256_mullo_epi16
#define madd_epi16 _mm256_madd_epi16
#define hadd_epi16 _mm256_hadd_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm256_add_epi32
#define srli_epi32 _mm256_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm256_cvtepi32_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define mul_ps _mm256_mul_ps
#define sqrt_ps _mm256_sqrt_ps

// Narrowing
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define packus_epi16 _mm256_packus_epi16

/* Most AVX2 instructions operate on 2 separate 128-bit lanes, so the 64-bit */
/* elements have to be shuffled to get the layout that a true 256-bit wide   */
/* operation would have produced: the two middle elements are swapped.       */
inline __m256i FIXUP_LANES(__m256i x)
{
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
}

// Stores: lanes are fixed up first. store_lo() writes the low 128 bits.
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))

#else

// Constants
#define setzero _mm_setzero_si128
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define set1_ps _mm_set1_ps

// Unaligned load
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))

// 8 -> 16 bit interleaving
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8

// 16-bit integer arithmetic
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define mullo_epi16 _mm_mullo_epi16
#define madd_epi16 _mm_madd_epi16
#define hadd_epi16 sse2_hadd_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16

// 32-bit integer arithmetic
#define add_epi32 _mm_add_epi32
#define srli_epi32 _mm_srli_epi32

// Single precision floating point
#define cvtepi32_ps _mm_cvtepi32_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define mul_ps _mm_mul_ps
#define sqrt_ps _mm_sqrt_ps

// Narrowing
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define packus_epi16 _mm_packus_epi16

// Stores. store_lo() writes the low 64 bits.
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))

#endif
454
455
template <class T>
456
static int
457
#if defined(__GNUC__)
458
    __attribute__((noinline))
459
#endif
460
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
461
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
462
                                T *CPL_RESTRICT pDstScanline)
463
0
{
464
    // Optimized implementation for RMS on Byte by
465
    // processing by group of 8 output pixels, so as to use
466
    // a single _mm_sqrt_ps() call for 4 output pixels
467
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
468
469
0
    int iDstPixel = 0;
470
0
    const auto one16 = set1_epi16(1);
471
0
    const auto one32 = set1_epi32(1);
472
0
    const auto zero = setzero();
473
0
    const auto minus32768 = set1_epi16(-32768);
474
475
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
476
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
477
0
    {
478
        // Load 2 * DEST_ELTS bytes from each line
479
0
        auto firstLine = loadu_int(pSrcScanlineShifted);
480
0
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
481
        // Extend those Bytes as UInt16s
482
0
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
483
0
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
484
0
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
485
0
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
486
487
        // Multiplication of 16 bit values and horizontal
488
        // addition of 32 bit results
489
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
490
0
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
491
0
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
492
0
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
493
0
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
494
495
        // Vertical addition
496
0
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
497
0
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
498
499
0
        const auto sumSquaresPlusOneDiv4Lo =
500
0
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
501
0
        const auto sumSquaresPlusOneDiv4Hi =
502
0
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
503
504
        // Take square root and truncate/floor to int32
505
0
        const auto rmsLo =
506
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
507
0
        const auto rmsHi =
508
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
509
510
        // Merge back low and high registers with each RMS value
511
        // as a 16 bit value.
512
0
        auto rms = packs_epi32(rmsLo, rmsHi);
513
514
        // Round to upper value if it minimizes the
515
        // error |rms^2 - sumSquares/4|
516
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
517
        //    rms += 1;
518
        // which is equivalent to:
519
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
520
        //    rms += 1;
521
        // And both left and right parts fit on 16 (unsigned) bits
522
0
        const auto sumSquaresPlusOneDiv4 =
523
0
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
524
        // cmpgt_epi16 operates on signed int16, but here
525
        // we have unsigned values, so shift them by -32768 before
526
0
        const auto mask = cmpgt_epi16(
527
0
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
528
0
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
529
        // The value of the mask will be -1 when the correction needs to be
530
        // applied
531
0
        rms = sub_epi16(rms, mask);
532
533
        // Pack each 16 bit RMS value to 8 bits
534
0
        rms = packus_epi16(rms, rms /* could be anything */);
535
0
        store_lo(&pDstScanline[iDstPixel], rms);
536
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
537
0
    }
538
539
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
540
0
    return iDstPixel;
541
0
}
542
543
/************************************************************************/
544
/*                       AverageByteSSE2OrAVX2()                        */
545
/************************************************************************/
546
547
static int
548
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
549
                      const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
550
                      GByte *CPL_RESTRICT pDstScanline)
551
0
{
552
    // Optimized implementation for average on Byte by
553
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
554
555
0
    const auto zero = setzero();
556
0
    const auto two16 = set1_epi16(2);
557
0
    const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
558
559
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
560
0
    int iDstPixel = 0;
561
0
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
562
0
         iDstPixel += 2 * DEST_ELTS)
563
0
    {
564
0
        decltype(setzero()) average0;
565
0
        {
566
            // Load 2 * DEST_ELTS bytes from each line
567
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
568
0
            const auto secondLine =
569
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
570
            // Extend those Bytes as UInt16s
571
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
572
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
573
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
574
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
575
576
            // Vertical addition
577
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
578
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
579
580
            // Horizontal addition of adjacent pairs, and recombine low and high
581
            // parts
582
0
            const auto sum = hadd_epi16(sumLo, sumHi);
583
584
            // average = (sum + 2) / 4
585
0
            average0 = srli_epi16(add_epi16(sum, two16), 2);
586
587
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
588
0
        }
589
590
0
        decltype(setzero()) average1;
591
0
        {
592
            // Load 2 * DEST_ELTS bytes from each line
593
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
594
0
            const auto secondLine =
595
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
596
            // Extend those Bytes as UInt16s
597
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
598
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
599
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
600
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
601
602
            // Vertical addition
603
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
604
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
605
606
            // Horizontal addition of adjacent pairs, and recombine low and high
607
            // parts
608
0
            const auto sum = hadd_epi16(sumLo, sumHi);
609
610
            // average = (sum + 2) / 4
611
0
            average1 = srli_epi16(add_epi16(sum, two16), 2);
612
613
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
614
0
        }
615
616
        // Pack each 16 bit average value to 8 bits
617
0
        const auto average = packus_epi16(average0, average1);
618
0
        storeu_int(&pDstScanline[iDstPixel], average);
619
0
    }
620
621
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
622
0
    return iDstPixel;
623
0
}
624
625
/************************************************************************/
626
/*                      QuadraticMeanUInt16SSE2()                       */
627
/************************************************************************/
628
629
#ifdef __SSE3__
630
#define sse2_hadd_pd _mm_hadd_pd
631
#else
632
// Fallback for _mm_hadd_pd() when SSE3 is not available:
// returns (aLo + aHi, bLo + bHi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    // The shuffles used only exist for single precision vectors, hence the
    // casts.
    const __m128 aAsPs = _mm_castpd_ps(a);
    const __m128 bAsPs = _mm_castpd_ps(b);

    const __m128d lowParts =
        _mm_castps_pd(_mm_movelh_ps(aAsPs, bAsPs));  // (aLo, bLo)
    const __m128d highParts =
        _mm_castps_pd(_mm_movehl_ps(bAsPs, aAsPs));  // (aHi, bHi)

    return _mm_add_pd(lowParts, highParts);
}
640
#endif
641
642
// Element-wise square of a vector of 2 doubles.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}

#ifdef __AVX2__

// Element-wise square of a vector of 4 doubles.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Swap the two middle 64-bit elements of a vector of 4 doubles.
inline __m256d FIXUP_LANES(__m256d x)
{
    constexpr int SWAP_MIDDLE_ELTS = _MM_SHUFFLE(3, 1, 2, 0);
    return _mm256_permute4x64_pd(x, SWAP_MIDDLE_ELTS);
}

// Same for a vector of 8 floats, handled as 4 elements of 64 bits.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
665
666
static int
667
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
668
                        const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
669
                        uint16_t *CPL_RESTRICT pDstScanline)
670
0
{
671
    // Optimized implementation for RMS on UInt16 by
672
    // processing by group of 4 output pixels.
673
0
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
674
675
0
    int iDstPixel = 0;
676
0
    const auto zero = _mm_setzero_si128();
677
678
#ifdef __AVX2__
679
    const auto zeroDot25 = _mm256_set1_pd(0.25);
680
    const auto zeroDot5 = _mm256_set1_pd(0.5);
681
682
    // The first four 0's could be anything, as we only take the bottom
683
    // 128 bits.
684
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
685
#else
686
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
687
0
    const auto zeroDot5 = _mm_set1_pd(0.5);
688
0
#endif
689
690
0
    constexpr int DEST_ELTS =
691
0
        static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
692
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
693
0
    {
694
        // Load 8 UInt16 from each line
695
0
        const auto firstLine = _mm_loadu_si128(
696
0
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
697
0
        const auto secondLine =
698
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
699
0
                pSrcScanlineShifted + nChunkXSize));
700
701
        // Detect if all of the source values fit in 14 bits.
702
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
703
        // and we can do a much faster implementation.
704
0
        const auto maskTmp =
705
0
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
706
#if defined(__i386__) || defined(_M_IX86)
707
        uint64_t nMaskFitsIn14Bits = 0;
708
        _mm_storel_epi64(
709
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
710
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
711
#else
712
0
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
713
0
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
714
0
#endif
715
0
        if (nMaskFitsIn14Bits == 0)
716
0
        {
717
            // Multiplication of 16 bit values and horizontal
718
            // addition of 32 bit results
719
0
            const auto firstLineHSumSquare =
720
0
                _mm_madd_epi16(firstLine, firstLine);
721
0
            const auto secondLineHSumSquare =
722
0
                _mm_madd_epi16(secondLine, secondLine);
723
            // Vertical addition
724
0
            const auto sumSquares =
725
0
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
726
            // In theory we should take sqrt(sumSquares * 0.25f)
727
            // but given the rounding we do, this is equivalent to
728
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
729
            // sumSquares <= 4 * 16383^2
730
0
            const auto one32 = _mm_set1_epi32(1);
731
0
            const auto sumSquaresPlusOneDiv4 =
732
0
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
733
            // Take square root and truncate/floor to int32
734
0
            auto rms = _mm_cvttps_epi32(
735
0
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
736
737
            // Round to upper value if it minimizes the
738
            // error |rms^2 - sumSquares/4|
739
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
740
            //    rms += 1;
741
            // which is equivalent to:
742
            // if( rms * rms + rms < (sumSquares+1) / 4 )
743
            //    rms += 1;
744
0
            auto mask =
745
0
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
746
0
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
747
0
            rms = _mm_sub_epi32(rms, mask);
748
            // Pack each 32 bit RMS value to 16 bits
749
0
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
750
0
            _mm_storel_epi64(
751
0
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
752
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
753
0
            continue;
754
0
        }
755
756
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
757
        // to 32 bit would result in 4 multiplications instead of 8, but
758
        // mullo/mulhi have a worse throughput than mul_pd.
759
760
        // Extend those UInt16s as UInt32s
761
0
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
762
0
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
763
0
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
764
0
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
765
766
#ifdef __AVX2__
767
        // Multiplication of 32 bit values previously converted to 64 bit double
768
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
769
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
770
        const auto secondLineLoDbl =
771
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
772
        const auto secondLineHiDbl =
773
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
774
775
        // Vertical addition of squares
776
        const auto sumSquaresLo =
777
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
778
        const auto sumSquaresHi =
779
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
780
781
        // Horizontal addition of squares
782
        const auto sumSquares =
783
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
784
785
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
786
787
        // Take square root and truncate/floor to int32
788
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
789
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
790
        const auto right = _mm256_sub_pd(
791
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
792
793
        auto mask =
794
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
795
        // Extract 32-bit from each of the 4 64-bit masks
796
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
797
        // _MM_SHUFFLE(2,0,2,0)));
798
        mask = _mm256_permutevar8x32_ps(mask, permutation);
799
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
800
801
        // Apply the correction
802
        rms = _mm_sub_epi32(rms, maskI);
803
804
        // Pack each 32 bit RMS value to 16 bits
805
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
806
#else
807
        // Multiplication of 32 bit values previously converted to 64 bit double
808
0
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
809
0
        const auto firstLineLoHi =
810
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
811
0
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
812
0
        const auto firstLineHiHi =
813
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
814
815
0
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
816
0
        const auto secondLineLoHi =
817
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
818
0
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
819
0
        const auto secondLineHiHi =
820
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
821
822
        // Vertical addition of squares
823
0
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
824
0
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
825
0
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
826
0
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
827
828
        // Horizontal addition of squares
829
0
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
830
0
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
831
832
0
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
833
0
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
834
        // Take square root and truncate/floor to int32
835
0
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
836
0
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
837
838
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
839
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
840
        //     rms += 1;
841
0
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
842
0
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
843
0
        const auto rightLo = _mm_sub_pd(
844
0
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
845
0
        const auto rightHi = _mm_sub_pd(
846
0
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
847
848
0
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
849
0
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
850
        // The value of the mask will be -1 when the correction needs to be
851
        // applied
852
0
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
853
0
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
854
855
0
        auto rms = _mm_castps_si128(
856
0
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
857
        // Apply the correction
858
0
        rms = _mm_sub_epi32(rms, mask);
859
860
        // Pack each 32 bit RMS value to 16 bits
861
0
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
862
0
#endif
863
864
0
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
865
0
                         rms);
866
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
867
0
    }
868
869
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
870
0
    return iDstPixel;
871
0
}
872
873
/************************************************************************/
874
/*                         AverageUInt16SSE2()                          */
875
/************************************************************************/
876
877
static int
878
AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
879
                  const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
880
                  uint16_t *CPL_RESTRICT pDstScanline)
881
0
{
882
    // Optimized implementation for average on UInt16 by
883
    // processing by group of 8 output pixels.
884
885
0
    const auto mask = _mm_set1_epi32(0xFFFF);
886
0
    const auto two = _mm_set1_epi32(2);
887
0
    const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
888
889
0
    int iDstPixel = 0;
890
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
891
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
892
0
    {
893
0
        __m128i averageLow;
894
        // Load 8 UInt16 from each line
895
0
        {
896
0
            const auto firstLine = _mm_loadu_si128(
897
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
898
0
            const auto secondLine =
899
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
900
0
                    pSrcScanlineShifted + nChunkXSize));
901
902
            // Horizontal addition and extension to 32 bit
903
0
            const auto horizAddFirstLine = _mm_add_epi32(
904
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
905
0
            const auto horizAddSecondLine =
906
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
907
0
                              _mm_srli_epi32(secondLine, 16));
908
909
            // Vertical addition and average computation
910
            // average = (sum + 2) >> 2
911
0
            const auto sum = _mm_add_epi32(
912
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
913
0
            averageLow = _mm_srli_epi32(sum, 2);
914
0
        }
915
        // Load 8 UInt16 from each line
916
0
        __m128i averageHigh;
917
0
        {
918
0
            const auto firstLine =
919
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
920
0
                    pSrcScanlineShifted + DEST_ELTS));
921
0
            const auto secondLine =
922
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
923
0
                    pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
924
925
            // Horizontal addition and extension to 32 bit
926
0
            const auto horizAddFirstLine = _mm_add_epi32(
927
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
928
0
            const auto horizAddSecondLine =
929
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
930
0
                              _mm_srli_epi32(secondLine, 16));
931
932
            // Vertical addition and average computation
933
            // average = (sum + 2) >> 2
934
0
            const auto sum = _mm_add_epi32(
935
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
936
0
            averageHigh = _mm_srli_epi32(sum, 2);
937
0
        }
938
939
        // Pack each 32 bit average value to 16 bits
940
0
        auto average = sse2_packus_epi32(averageLow, averageHigh);
941
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
942
0
                         average);
943
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
944
0
    }
945
946
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
947
0
    return iDstPixel;
948
0
}
949
950
/************************************************************************/
951
/*                       QuadraticMeanFloatSSE2()                       */
952
/************************************************************************/
953
954
#if !defined(ARM_V7)
955
956
#ifdef __SSE3__
957
#define sse2_hadd_ps _mm_hadd_ps
958
#else
959
// SSE2-only replacement for _mm_hadd_ps(): returns
// (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 pairSums = _mm_add_ps(evenLanes, oddLanes);
    return pairSums;
}
965
#endif
966
967
#ifdef __AVX2__
968
// AVX2 build: map the width-neutral *_ps names used by
// QuadraticMeanFloatSSE2() onto the 256-bit (8 x float) intrinsics.
// cmpeq_ps needs an explicit predicate with _mm256_cmp_ps(); _CMP_EQ_OQ is
// the ordered, non-signaling equality comparison.
#define set1_ps _mm256_set1_ps
969
#define loadu_ps _mm256_loadu_ps
970
#define andnot_ps _mm256_andnot_ps
971
#define and_ps _mm256_and_ps
972
#define max_ps _mm256_max_ps
973
#define shuffle_ps _mm256_shuffle_ps
974
#define div_ps _mm256_div_ps
975
#define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
976
#define mul_ps _mm256_mul_ps
977
#define add_ps _mm256_add_ps
978
#define hadd_ps _mm256_hadd_ps
979
#define sqrt_ps _mm256_sqrt_ps
980
#define or_ps _mm256_or_ps
981
#define unpacklo_ps _mm256_unpacklo_ps
982
#define unpackhi_ps _mm256_unpackhi_ps
983
#define storeu_ps _mm256_storeu_ps
984
#define blendv_ps _mm256_blendv_ps
985
986
// Returns the lane-wise square of the 8 floats in x.
inline __m256 SQUARE_PS(__m256 x)
{
    const __m256 squared = _mm256_mul_ps(x, x);
    return squared;
}
990
991
#else
992
993
0
// Non-AVX2 build: map the width-neutral *_ps names used by
// QuadraticMeanFloatSSE2() onto the 128-bit (4 x float) intrinsics.
// hadd_ps goes through sse2_hadd_ps so that it also works without SSE3.
#define set1_ps _mm_set1_ps
994
0
#define loadu_ps _mm_loadu_ps
995
0
#define andnot_ps _mm_andnot_ps
996
#define and_ps _mm_and_ps
997
0
#define max_ps _mm_max_ps
998
0
#define shuffle_ps _mm_shuffle_ps
999
0
#define div_ps _mm_div_ps
1000
0
#define cmpeq_ps _mm_cmpeq_ps
1001
0
#define mul_ps _mm_mul_ps
1002
0
#define add_ps _mm_add_ps
1003
#define hadd_ps sse2_hadd_ps
1004
0
#define sqrt_ps _mm_sqrt_ps
1005
#define or_ps _mm_or_ps
1006
#define unpacklo_ps _mm_unpacklo_ps
1007
#define unpackhi_ps _mm_unpackhi_ps
1008
0
#define storeu_ps _mm_storeu_ps
1009
1010
// Per-lane select: yields b where mask is set and a elsewhere.
// Uses _mm_blendv_ps() when the target provides it, otherwise a bitwise
// (a AND NOT mask) OR (b AND mask) fallback.
// NOTE(review): the two paths agree only when each mask lane is all-ones or
// all-zeros (e.g. the result of a comparison) - confirm for any new caller.
inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_blendv_ps(a, b, mask);
#else
    const __m128 keptFromA = _mm_andnot_ps(mask, a);
    const __m128 takenFromB = _mm_and_ps(mask, b);
    return _mm_or_ps(keptFromA, takenFromB);
#endif
}
1018
1019
// Returns the lane-wise square of the 4 floats in x.
inline __m128 SQUARE_PS(__m128 x)
{
    const __m128 squared = _mm_mul_ps(x, x);
    return squared;
}
1023
1024
// Identity for 128-bit vectors: returns x unchanged.
// NOTE(review): presumably exists so code shared with the 256-bit build can
// call FIXUP_LANES() unconditionally - confirm against the wider overloads.
inline __m128 FIXUP_LANES(__m128 x)
{
    const __m128 unchanged = x;
    return unchanged;
}
1028
1029
#endif
1030
1031
static int
1032
#if defined(__GNUC__)
1033
    __attribute__((noinline))
1034
#endif
1035
    QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1036
                           const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1037
                           float *CPL_RESTRICT pDstScanline)
1038
0
{
1039
    // Optimized implementation for RMS on Float32 by
1040
    // processing by group of output pixels.
1041
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1042
1043
0
    int iDstPixel = 0;
1044
0
    const auto minus_zero = set1_ps(-0.0f);
1045
0
    const auto zeroDot25 = set1_ps(0.25f);
1046
0
    const auto one = set1_ps(1.0f);
1047
0
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1048
0
    constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
1049
1050
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1051
0
    {
1052
        // Load 2*DEST_ELTS Float32 from each line
1053
0
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1054
0
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
1055
0
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1056
0
        auto secondLineHi =
1057
0
            loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
1058
1059
        // Take the absolute value
1060
0
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
1061
0
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
1062
0
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
1063
0
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
1064
1065
0
        auto firstLineEven =
1066
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1067
0
        auto firstLineOdd =
1068
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1069
0
        auto secondLineEven =
1070
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1071
0
        auto secondLineOdd =
1072
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1073
1074
        // Compute the maximum of each DEST_ELTS value to RMS-average
1075
0
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1076
0
                                 max_ps(secondLineEven, secondLineEven));
1077
1078
        // Normalize each value by the maximum of the DEST_ELTS ones.
1079
        // This step is important to avoid that the square evaluates to infinity
1080
        // for sufficiently big input.
1081
0
        auto invMax = div_ps(one, maxV);
1082
        // Deal with 0 being the maximum to correct division by zero
1083
        // note: comparing to -0 leads to identical results as to comparing with
1084
        // 0
1085
0
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1086
1087
0
        firstLineEven = mul_ps(firstLineEven, invMax);
1088
0
        firstLineOdd = mul_ps(firstLineOdd, invMax);
1089
0
        secondLineEven = mul_ps(secondLineEven, invMax);
1090
0
        secondLineOdd = mul_ps(secondLineOdd, invMax);
1091
1092
        // Compute squares
1093
0
        firstLineEven = SQUARE_PS(firstLineEven);
1094
0
        firstLineOdd = SQUARE_PS(firstLineOdd);
1095
0
        secondLineEven = SQUARE_PS(secondLineEven);
1096
0
        secondLineOdd = SQUARE_PS(secondLineOdd);
1097
1098
0
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1099
0
                                       add_ps(secondLineEven, secondLineOdd));
1100
1101
0
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1102
1103
        // Deal with infinity being the maximum
1104
0
        const auto maskIsInf = cmpeq_ps(maxV, infv);
1105
0
        rms = blendv_ps(rms, infv, maskIsInf);
1106
1107
0
        rms = FIXUP_LANES(rms);
1108
1109
0
        storeu_ps(&pDstScanline[iDstPixel], rms);
1110
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1111
0
    }
1112
1113
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1114
0
    return iDstPixel;
1115
0
}
1116
1117
/************************************************************************/
1118
/*                          AverageFloatSSE2()                          */
1119
/************************************************************************/
1120
1121
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1122
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1123
                            float *CPL_RESTRICT pDstScanline)
1124
0
{
1125
    // Optimized implementation for average on Float32 by
1126
    // processing by group of output pixels.
1127
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1128
1129
0
    int iDstPixel = 0;
1130
0
    const auto zeroDot25 = _mm_set1_ps(0.25f);
1131
0
    constexpr int DEST_ELTS =
1132
0
        static_cast<int>(sizeof(zeroDot25) / sizeof(float));
1133
1134
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1135
0
    {
1136
        // Load 2 * DEST_ELTS Float32 from each line
1137
0
        const auto firstLineLo =
1138
0
            _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
1139
0
        const auto firstLineHi = _mm_mul_ps(
1140
0
            _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
1141
0
        const auto secondLineLo = _mm_mul_ps(
1142
0
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
1143
0
        const auto secondLineHi = _mm_mul_ps(
1144
0
            _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
1145
0
            zeroDot25);
1146
1147
        // Vertical addition
1148
0
        const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
1149
0
        const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
1150
1151
        // Horizontal addition
1152
0
        const auto average = sse2_hadd_ps(tmpLo, tmpHi);
1153
1154
0
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1155
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1156
0
    }
1157
1158
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1159
0
    return iDstPixel;
1160
0
}
1161
1162
/************************************************************************/
1163
/*                         AverageDoubleSSE2()                          */
1164
/************************************************************************/
1165
1166
static int
1167
AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
1168
                  const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1169
                  double *CPL_RESTRICT pDstScanline)
1170
0
{
1171
    // Optimized implementation for average on Float64 by
1172
    // processing by group of output pixels.
1173
0
    const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1174
1175
0
    int iDstPixel = 0;
1176
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
1177
0
    constexpr int DEST_ELTS =
1178
0
        static_cast<int>(sizeof(zeroDot25) / sizeof(double));
1179
1180
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
1181
0
    {
1182
        // Load 4 * DEST_ELTS Float64 from each line
1183
0
        const auto firstLine0 = _mm_mul_pd(
1184
0
            _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
1185
0
        const auto firstLine1 = _mm_mul_pd(
1186
0
            _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
1187
0
        const auto secondLine0 = _mm_mul_pd(
1188
0
            _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
1189
0
            zeroDot25);
1190
0
        const auto secondLine1 = _mm_mul_pd(
1191
0
            _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
1192
0
            zeroDot25);
1193
1194
        // Vertical addition
1195
0
        const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
1196
0
        const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
1197
1198
        // Horizontal addition
1199
0
        const auto average0 = sse2_hadd_pd(tmp0, tmp1);
1200
1201
0
        _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
1202
0
        pSrcScanlineShifted += DEST_ELTS * 2;
1203
0
    }
1204
1205
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1206
0
    return iDstPixel;
1207
0
}
1208
1209
#endif
1210
1211
#endif
1212
1213
/************************************************************************/
1214
/*                   GDALResampleChunk_AverageOrRMS()                   */
1215
/************************************************************************/
1216
1217
template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
1218
static CPLErr
1219
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1220
                                 const T *pChunk, void **ppDstBuffer)
1221
0
{
1222
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1223
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1224
0
    const double dfSrcXDelta = args.dfSrcXDelta;
1225
0
    const double dfSrcYDelta = args.dfSrcYDelta;
1226
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1227
0
    const int nChunkXOff = args.nChunkXOff;
1228
0
    const int nChunkYOff = args.nChunkYOff;
1229
0
    const int nChunkXSize = args.nChunkXSize;
1230
0
    const int nChunkYSize = args.nChunkYSize;
1231
0
    const int nDstXOff = args.nDstXOff;
1232
0
    const int nDstXOff2 = args.nDstXOff2;
1233
0
    const int nDstYOff = args.nDstYOff;
1234
0
    const int nDstYOff2 = args.nDstYOff2;
1235
0
    const char *pszResampling = args.pszResampling;
1236
0
    bool bHasNoData = args.bHasNoData;
1237
0
    const double dfNoDataValue = args.dfNoDataValue;
1238
0
    const GDALColorTable *const poColorTable =
1239
0
        !bQuadraticMean &&
1240
                // AVERAGE_BIT2GRAYSCALE
1241
0
                CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
1242
0
            ? nullptr
1243
0
            : args.poColorTable;
1244
0
    const bool bPropagateNoData = args.bPropagateNoData;
1245
1246
0
    T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
1247
0
    const T tReplacementVal =
1248
0
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1249
0
                         args.eOvrDataType, dfNoDataValue))
1250
0
                   : 0;
1251
1252
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
1253
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1254
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
1255
1256
    /* -------------------------------------------------------------------- */
1257
    /*      Allocate buffers.                                               */
1258
    /* -------------------------------------------------------------------- */
1259
0
    *ppDstBuffer = static_cast<T *>(
1260
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1261
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1262
0
    if (*ppDstBuffer == nullptr)
1263
0
    {
1264
0
        return CE_Failure;
1265
0
    }
1266
0
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1267
1268
0
    struct PrecomputedXValue
1269
0
    {
1270
0
        int nLeftXOffShifted;
1271
0
        int nRightXOffShifted;
1272
0
        double dfLeftWeight;
1273
0
        double dfRightWeight;
1274
0
        double dfTotalWeightFullLine;
1275
0
    };
1276
1277
0
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1278
0
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1279
1280
0
    if (pasSrcX == nullptr)
1281
0
    {
1282
0
        return CE_Failure;
1283
0
    }
1284
1285
0
    std::vector<GDALColorEntry> colorEntries;
1286
1287
0
    if (poColorTable)
1288
0
    {
1289
0
        int nTransparentIdx = -1;
1290
0
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1291
1292
        // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1293
        // it as nodata value
1294
0
        if (bHasNoData && dfNoDataValue >= 0.0 &&
1295
0
            tNoDataValue < colorEntries.size())
1296
0
            colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1297
1298
        // Or if we have no explicit nodata, but a color table entry that is
1299
        // transparent, consider it as the nodata value
1300
0
        else if (!bHasNoData && nTransparentIdx >= 0)
1301
0
        {
1302
0
            bHasNoData = true;
1303
0
            tNoDataValue = static_cast<T>(nTransparentIdx);
1304
0
        }
1305
0
    }
1306
1307
    /* ==================================================================== */
1308
    /*      Precompute inner loop constants.                                */
1309
    /* ==================================================================== */
1310
0
    bool bSrcXSpacingIsTwo = true;
1311
0
    int nLastSrcXOff2 = -1;
1312
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1313
0
    {
1314
0
        const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1315
        // Apply some epsilon to avoid numerical precision issues
1316
0
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1317
0
        const double dfSrcXOff2 =
1318
0
            dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1319
0
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1320
1321
0
        if (nSrcXOff < nChunkXOff)
1322
0
            nSrcXOff = nChunkXOff;
1323
0
        if (nSrcXOff2 == nSrcXOff)
1324
0
            nSrcXOff2++;
1325
0
        if (nSrcXOff2 > nChunkRightXOff)
1326
0
            nSrcXOff2 = nChunkRightXOff;
1327
1328
0
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1329
0
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1330
0
            nSrcXOff2 - nChunkXOff;
1331
0
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1332
0
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1333
0
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1334
0
            1 - (nSrcXOff2 - dfSrcXOff2);
1335
0
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1336
0
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1337
0
        if (nSrcXOff + 1 < nSrcXOff2)
1338
0
        {
1339
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1340
0
                nSrcXOff2 - nSrcXOff - 2;
1341
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1342
0
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1343
0
        }
1344
1345
0
        if (nSrcXOff2 - nSrcXOff != 2 ||
1346
0
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1347
0
        {
1348
0
            bSrcXSpacingIsTwo = false;
1349
0
        }
1350
0
        nLastSrcXOff2 = nSrcXOff2;
1351
0
    }
1352
1353
    /* ==================================================================== */
1354
    /*      Loop over destination scanlines.                                */
1355
    /* ==================================================================== */
1356
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1357
0
    {
1358
0
        const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1359
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1360
0
        if (nSrcYOff < nChunkYOff)
1361
0
            nSrcYOff = nChunkYOff;
1362
1363
0
        const double dfSrcYOff2 =
1364
0
            dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1365
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1366
0
        if (nSrcYOff2 == nSrcYOff)
1367
0
            ++nSrcYOff2;
1368
0
        if (nSrcYOff2 > nChunkBottomYOff)
1369
0
            nSrcYOff2 = nChunkBottomYOff;
1370
1371
0
        T *const pDstScanline =
1372
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1373
1374
        /* --------------------------------------------------------------------
1375
         */
1376
        /*      Loop over destination pixels */
1377
        /* --------------------------------------------------------------------
1378
         */
1379
0
        if (poColorTable == nullptr)
1380
0
        {
1381
0
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1382
0
                pabyChunkNodataMask == nullptr)
1383
0
            {
1384
                if constexpr (eWrkDataType == GDT_UInt8 ||
1385
                              eWrkDataType == GDT_UInt16)
1386
0
                {
1387
                    // Optimized case : no nodata, overview by a factor of 2 and
1388
                    // regular x and y src spacing.
1389
0
                    const T *pSrcScanlineShifted =
1390
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1391
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1392
0
                            nChunkXSize;
1393
0
                    int iDstPixel = 0;
1394
0
#ifdef USE_SSE2
1395
                    if constexpr (eWrkDataType == GDT_UInt8)
1396
0
                    {
1397
                        if constexpr (bQuadraticMean)
1398
0
                        {
1399
0
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1400
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1401
0
                                pDstScanline);
1402
                        }
1403
                        else
1404
0
                        {
1405
0
                            iDstPixel = AverageByteSSE2OrAVX2(
1406
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1407
0
                                pDstScanline);
1408
0
                        }
1409
                    }
1410
                    else
1411
0
                    {
1412
0
                        static_assert(eWrkDataType == GDT_UInt16);
1413
                        if constexpr (bQuadraticMean)
1414
0
                        {
1415
0
                            iDstPixel = QuadraticMeanUInt16SSE2(
1416
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1417
0
                                pDstScanline);
1418
                        }
1419
                        else
1420
0
                        {
1421
0
                            iDstPixel = AverageUInt16SSE2(
1422
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1423
0
                                pDstScanline);
1424
0
                        }
1425
0
                    }
1426
0
#endif
1427
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1428
0
                    {
1429
0
                        Tsum nTotal = 0;
1430
0
                        T nVal;
1431
                        if constexpr (bQuadraticMean)
1432
0
                            nTotal =
1433
0
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1434
0
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1435
0
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1436
0
                                SQUARE<Tsum>(
1437
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1438
                        else
1439
0
                            nTotal = pSrcScanlineShifted[0] +
1440
0
                                     pSrcScanlineShifted[1] +
1441
0
                                     pSrcScanlineShifted[nChunkXSize] +
1442
0
                                     pSrcScanlineShifted[1 + nChunkXSize];
1443
1444
0
                        constexpr int nTotalWeight = 4;
1445
                        if constexpr (bQuadraticMean)
1446
0
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
1447
                        else
1448
0
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1449
0
                                                  nTotalWeight);
1450
1451
                        // No need to compare nVal against tNoDataValue as we
1452
                        // are in a case where pabyChunkNodataMask == nullptr
1453
                        // implies the absence of nodata value.
1454
0
                        pDstScanline[iDstPixel] = nVal;
1455
0
                        pSrcScanlineShifted += 2;
1456
0
                    }
1457
                }
1458
                else
1459
0
                {
1460
0
                    static_assert(eWrkDataType == GDT_Float32 ||
1461
0
                                  eWrkDataType == GDT_Float64);
1462
0
                    const T *pSrcScanlineShifted =
1463
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1464
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1465
0
                            nChunkXSize;
1466
0
                    int iDstPixel = 0;
1467
0
#if defined(USE_SSE2) && !defined(ARM_V7)
1468
                    if constexpr (eWrkDataType == GDT_Float32)
1469
0
                    {
1470
0
                        static_assert(std::is_same_v<T, float>);
1471
                        if constexpr (bQuadraticMean)
1472
0
                        {
1473
0
                            iDstPixel = QuadraticMeanFloatSSE2(
1474
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1475
0
                                pDstScanline);
1476
                        }
1477
                        else
1478
0
                        {
1479
0
                            iDstPixel = AverageFloatSSE2(
1480
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1481
0
                                pDstScanline);
1482
0
                        }
1483
                    }
1484
                    else
1485
0
                    {
1486
                        if constexpr (!bQuadraticMean)
1487
0
                        {
1488
0
                            iDstPixel = AverageDoubleSSE2(
1489
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1490
0
                                pDstScanline);
1491
0
                        }
1492
0
                    }
1493
0
#endif
1494
1495
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1496
0
                    {
1497
0
                        T nVal;
1498
1499
                        if constexpr (bQuadraticMean)
1500
0
                        {
1501
                            // Avoid issues with large values by renormalizing
1502
0
                            const auto max = std::max(
1503
0
                                {std::fabs(pSrcScanlineShifted[0]),
1504
0
                                 std::fabs(pSrcScanlineShifted[1]),
1505
0
                                 std::fabs(pSrcScanlineShifted[nChunkXSize]),
1506
0
                                 std::fabs(
1507
0
                                     pSrcScanlineShifted[1 + nChunkXSize])});
1508
0
                            if (max == 0)
1509
0
                            {
1510
0
                                nVal = 0;
1511
0
                            }
1512
0
                            else if (std::isinf(max))
1513
0
                            {
1514
                                // If there is at least one infinity value,
1515
                                // then just summing, and taking the abs
1516
                                // value will give the expected result:
1517
                                // * +inf if all values are +inf
1518
                                // * +inf if all values are -inf
1519
                                // * NaN otherwise
1520
0
                                nVal = std::fabs(
1521
0
                                    pSrcScanlineShifted[0] +
1522
0
                                    pSrcScanlineShifted[1] +
1523
0
                                    pSrcScanlineShifted[nChunkXSize] +
1524
0
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1525
0
                            }
1526
0
                            else
1527
0
                            {
1528
0
                                const auto inv_max = static_cast<T>(1.0) / max;
1529
0
                                nVal =
1530
0
                                    max *
1531
0
                                    std::sqrt(
1532
0
                                        static_cast<T>(0.25) *
1533
0
                                        (SQUARE(pSrcScanlineShifted[0] *
1534
0
                                                inv_max) +
1535
0
                                         SQUARE(pSrcScanlineShifted[1] *
1536
0
                                                inv_max) +
1537
0
                                         SQUARE(
1538
0
                                             pSrcScanlineShifted[nChunkXSize] *
1539
0
                                             inv_max) +
1540
0
                                         SQUARE(
1541
0
                                             pSrcScanlineShifted[1 +
1542
0
                                                                 nChunkXSize] *
1543
0
                                             inv_max)));
1544
0
                            }
1545
                        }
1546
                        else
1547
0
                        {
1548
0
                            constexpr auto weight = static_cast<T>(0.25);
1549
                            // Multiply each value by weight to avoid
1550
                            // potential overflow
1551
0
                            nVal =
1552
0
                                (weight * pSrcScanlineShifted[0] +
1553
0
                                 weight * pSrcScanlineShifted[1] +
1554
0
                                 weight * pSrcScanlineShifted[nChunkXSize] +
1555
0
                                 weight * pSrcScanlineShifted[1 + nChunkXSize]);
1556
0
                        }
1557
1558
                        // No need to compare nVal against tNoDataValue as we
1559
                        // are in a case where pabyChunkNodataMask == nullptr
1560
                        // implies the absence of nodata value.
1561
0
                        pDstScanline[iDstPixel] = nVal;
1562
0
                        pSrcScanlineShifted += 2;
1563
0
                    }
1564
0
                }
1565
0
            }
1566
0
            else
1567
0
            {
1568
0
                const double dfBottomWeight =
1569
0
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1570
0
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
1571
0
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1572
0
                nSrcYOff -= nChunkYOff;
1573
0
                nSrcYOff2 -= nChunkYOff;
1574
1575
0
                double dfTotalWeightFullColumn = dfBottomWeight;
1576
0
                if (nSrcYOff + 1 < nSrcYOff2)
1577
0
                {
1578
0
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1579
0
                    dfTotalWeightFullColumn += dfTopWeight;
1580
0
                }
1581
1582
0
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1583
0
                {
1584
0
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1585
0
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1586
1587
0
                    double dfTotal = 0;
1588
0
                    double dfTotalWeight = 0;
1589
0
                    [[maybe_unused]] double dfMulFactor = 1.0;
1590
0
                    [[maybe_unused]] double dfInvMulFactor = 1.0;
1591
0
                    constexpr bool bUseMulFactor =
1592
0
                        (eWrkDataType == GDT_Float32 ||
1593
0
                         eWrkDataType == GDT_Float64);
1594
0
                    if (pabyChunkNodataMask == nullptr)
1595
0
                    {
1596
                        if constexpr (bUseMulFactor)
1597
0
                        {
1598
                            if constexpr (bQuadraticMean)
1599
0
                            {
1600
0
                                T mulFactor = 0;
1601
0
                                auto pChunkShifted =
1602
0
                                    pChunk +
1603
0
                                    static_cast<size_t>(nSrcYOff) * nChunkXSize;
1604
1605
0
                                for (int iY = nSrcYOff; iY < nSrcYOff2;
1606
0
                                     ++iY, pChunkShifted += nChunkXSize)
1607
0
                                {
1608
0
                                    for (int iX = nSrcXOff; iX < nSrcXOff2;
1609
0
                                         ++iX)
1610
0
                                        mulFactor = std::max(
1611
0
                                            mulFactor,
1612
0
                                            std::fabs(pChunkShifted[iX]));
1613
0
                                }
1614
0
                                dfMulFactor = double(mulFactor);
1615
0
                                dfInvMulFactor =
1616
0
                                    dfMulFactor > 0 &&
1617
0
                                            std::isfinite(dfMulFactor)
1618
0
                                        ? 1.0 / dfMulFactor
1619
0
                                        : 1.0;
1620
                            }
1621
                            else
1622
0
                            {
1623
0
                                dfMulFactor = (nSrcYOff2 - nSrcYOff) *
1624
0
                                              (nSrcXOff2 - nSrcXOff);
1625
0
                                dfInvMulFactor = 1.0 / dfMulFactor;
1626
0
                            }
1627
0
                        }
1628
1629
0
                        auto pChunkShifted =
1630
0
                            pChunk +
1631
0
                            static_cast<size_t>(nSrcYOff) * nChunkXSize;
1632
0
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1633
0
                        double dfWeightY = dfBottomWeight;
1634
0
                        while (true)
1635
0
                        {
1636
0
                            double dfTotalLine;
1637
                            if constexpr (bQuadraticMean)
1638
0
                            {
1639
                                // Left pixel
1640
0
                                {
1641
0
                                    const T val = pChunkShifted[nSrcXOff];
1642
0
                                    dfTotalLine =
1643
0
                                        SQUARE(double(val) * dfInvMulFactor) *
1644
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1645
0
                                }
1646
1647
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1648
0
                                {
1649
                                    // Middle pixels
1650
0
                                    for (int iX = nSrcXOff + 1;
1651
0
                                         iX < nSrcXOff2 - 1; ++iX)
1652
0
                                    {
1653
0
                                        const T val = pChunkShifted[iX];
1654
0
                                        dfTotalLine += SQUARE(double(val) *
1655
0
                                                              dfInvMulFactor);
1656
0
                                    }
1657
1658
                                    // Right pixel
1659
0
                                    {
1660
0
                                        const T val =
1661
0
                                            pChunkShifted[nSrcXOff2 - 1];
1662
0
                                        dfTotalLine +=
1663
0
                                            SQUARE(double(val) *
1664
0
                                                   dfInvMulFactor) *
1665
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1666
0
                                    }
1667
0
                                }
1668
                            }
1669
                            else
1670
0
                            {
1671
                                // Left pixel
1672
0
                                {
1673
0
                                    const T val = pChunkShifted[nSrcXOff];
1674
0
                                    dfTotalLine =
1675
0
                                        double(val) * dfInvMulFactor *
1676
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1677
0
                                }
1678
1679
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1680
0
                                {
1681
                                    // Middle pixels
1682
0
                                    for (int iX = nSrcXOff + 1;
1683
0
                                         iX < nSrcXOff2 - 1; ++iX)
1684
0
                                    {
1685
0
                                        const T val = pChunkShifted[iX];
1686
0
                                        dfTotalLine +=
1687
0
                                            double(val) * dfInvMulFactor;
1688
0
                                    }
1689
1690
                                    // Right pixel
1691
0
                                    {
1692
0
                                        const T val =
1693
0
                                            pChunkShifted[nSrcXOff2 - 1];
1694
0
                                        dfTotalLine +=
1695
0
                                            double(val) * dfInvMulFactor *
1696
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1697
0
                                    }
1698
0
                                }
1699
0
                            }
1700
1701
0
                            dfTotal += dfTotalLine * dfWeightY;
1702
0
                            --nCounterY;
1703
0
                            if (nCounterY < 0)
1704
0
                                break;
1705
0
                            pChunkShifted += nChunkXSize;
1706
0
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1707
0
                        }
1708
1709
0
                        dfTotalWeight =
1710
0
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1711
0
                            dfTotalWeightFullColumn;
1712
0
                    }
1713
0
                    else
1714
0
                    {
1715
0
                        size_t nCount = 0;
1716
0
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1717
0
                        {
1718
0
                            const auto pChunkShifted =
1719
0
                                pChunk + static_cast<size_t>(iY) * nChunkXSize;
1720
1721
0
                            double dfTotalLine = 0;
1722
0
                            double dfTotalWeightLine = 0;
1723
                            // Left pixel
1724
0
                            {
1725
0
                                const int iX = nSrcXOff;
1726
0
                                const T val = pChunkShifted[iX];
1727
0
                                if (pabyChunkNodataMask
1728
0
                                        [iX +
1729
0
                                         static_cast<size_t>(iY) * nChunkXSize])
1730
0
                                {
1731
0
                                    nCount++;
1732
0
                                    const double dfWeightX =
1733
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1734
0
                                    dfTotalWeightLine = dfWeightX;
1735
                                    if constexpr (bQuadraticMean)
1736
0
                                        dfTotalLine =
1737
                                            SQUARE(double(val)) * dfWeightX;
1738
                                    else
1739
0
                                        dfTotalLine = double(val) * dfWeightX;
1740
0
                                }
1741
0
                            }
1742
1743
0
                            if (nSrcXOff < nSrcXOff2 - 1)
1744
0
                            {
1745
                                // Middle pixels
1746
0
                                for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1747
0
                                     ++iX)
1748
0
                                {
1749
0
                                    const T val = pChunkShifted[iX];
1750
0
                                    if (pabyChunkNodataMask
1751
0
                                            [iX + static_cast<size_t>(iY) *
1752
0
                                                      nChunkXSize])
1753
0
                                    {
1754
0
                                        nCount++;
1755
0
                                        dfTotalWeightLine += 1;
1756
                                        if constexpr (bQuadraticMean)
1757
0
                                            dfTotalLine += SQUARE(double(val));
1758
                                        else
1759
0
                                            dfTotalLine += double(val);
1760
0
                                    }
1761
0
                                }
1762
1763
                                // Right pixel
1764
0
                                {
1765
0
                                    const int iX = nSrcXOff2 - 1;
1766
0
                                    const T val = pChunkShifted[iX];
1767
0
                                    if (pabyChunkNodataMask
1768
0
                                            [iX + static_cast<size_t>(iY) *
1769
0
                                                      nChunkXSize])
1770
0
                                    {
1771
0
                                        nCount++;
1772
0
                                        const double dfWeightX =
1773
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1774
0
                                        dfTotalWeightLine += dfWeightX;
1775
                                        if constexpr (bQuadraticMean)
1776
0
                                            dfTotalLine +=
1777
                                                SQUARE(double(val)) * dfWeightX;
1778
                                        else
1779
0
                                            dfTotalLine +=
1780
0
                                                double(val) * dfWeightX;
1781
0
                                    }
1782
0
                                }
1783
0
                            }
1784
1785
0
                            const double dfWeightY =
1786
0
                                (iY == nSrcYOff)        ? dfBottomWeight
1787
0
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
1788
0
                                                        : 1.0;
1789
0
                            dfTotal += dfTotalLine * dfWeightY;
1790
0
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
1791
0
                        }
1792
1793
0
                        if (nCount == 0 ||
1794
0
                            (bPropagateNoData &&
1795
0
                             nCount <
1796
0
                                 static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1797
0
                                     (nSrcXOff2 - nSrcXOff)))
1798
0
                        {
1799
0
                            pDstScanline[iDstPixel] = tNoDataValue;
1800
0
                            continue;
1801
0
                        }
1802
0
                    }
1803
                    if constexpr (eWrkDataType == GDT_UInt8)
1804
0
                    {
1805
0
                        T nVal;
1806
                        if constexpr (bQuadraticMean)
1807
0
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
1808
                                                             dfTotalWeight);
1809
                        else
1810
0
                            nVal =
1811
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1812
0
                        if (bHasNoData && nVal == tNoDataValue)
1813
0
                            nVal = tReplacementVal;
1814
0
                        pDstScanline[iDstPixel] = nVal;
1815
                    }
1816
                    else if constexpr (eWrkDataType == GDT_UInt16)
1817
0
                    {
1818
0
                        T nVal;
1819
                        if constexpr (bQuadraticMean)
1820
0
                            nVal = ComputeIntegerRMS<T, uint64_t>(
1821
                                dfTotal, dfTotalWeight);
1822
                        else
1823
0
                            nVal =
1824
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1825
0
                        if (bHasNoData && nVal == tNoDataValue)
1826
0
                            nVal = tReplacementVal;
1827
0
                        pDstScanline[iDstPixel] = nVal;
1828
                    }
1829
                    else
1830
0
                    {
1831
0
                        T nVal;
1832
                        if constexpr (bQuadraticMean)
1833
0
                        {
1834
                            if constexpr (bUseMulFactor)
1835
0
                                nVal = static_cast<T>(
1836
0
                                    dfMulFactor *
1837
                                    sqrt(dfTotal / dfTotalWeight));
1838
                            else
1839
                                nVal = static_cast<T>(
1840
                                    sqrt(dfTotal / dfTotalWeight));
1841
                        }
1842
                        else
1843
0
                        {
1844
                            if constexpr (bUseMulFactor)
1845
0
                                nVal = static_cast<T>(
1846
                                    dfMulFactor * (dfTotal / dfTotalWeight));
1847
                            else
1848
                                nVal = static_cast<T>(dfTotal / dfTotalWeight);
1849
0
                        }
1850
0
                        if (bHasNoData && nVal == tNoDataValue)
1851
0
                            nVal = tReplacementVal;
1852
0
                        pDstScanline[iDstPixel] = nVal;
1853
0
                    }
1854
0
                }
1855
0
            }
1856
0
        }
1857
0
        else
1858
0
        {
1859
0
            nSrcYOff -= nChunkYOff;
1860
0
            nSrcYOff2 -= nChunkYOff;
1861
1862
0
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1863
0
            {
1864
0
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1865
0
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1866
1867
0
                uint64_t nTotalR = 0;
1868
0
                uint64_t nTotalG = 0;
1869
0
                uint64_t nTotalB = 0;
1870
0
                size_t nCount = 0;
1871
1872
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1873
0
                {
1874
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1875
0
                    {
1876
0
                        const T val =
1877
0
                            pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1878
                        // cppcheck-suppress unsignedLessThanZero
1879
0
                        if (val < 0 || val >= colorEntries.size())
1880
0
                            continue;
1881
0
                        const size_t idx = static_cast<size_t>(val);
1882
0
                        const auto &entry = colorEntries[idx];
1883
0
                        if (entry.c4)
1884
0
                        {
1885
                            if constexpr (bQuadraticMean)
1886
0
                            {
1887
0
                                nTotalR += SQUARE<int>(entry.c1);
1888
0
                                nTotalG += SQUARE<int>(entry.c2);
1889
0
                                nTotalB += SQUARE<int>(entry.c3);
1890
0
                                ++nCount;
1891
                            }
1892
                            else
1893
0
                            {
1894
0
                                nTotalR += entry.c1;
1895
0
                                nTotalG += entry.c2;
1896
0
                                nTotalB += entry.c3;
1897
0
                                ++nCount;
1898
0
                            }
1899
0
                        }
1900
0
                    }
1901
0
                }
1902
1903
0
                if (nCount == 0 ||
1904
0
                    (bPropagateNoData &&
1905
0
                     nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1906
0
                                  (nSrcXOff2 - nSrcXOff)))
1907
0
                {
1908
0
                    pDstScanline[iDstPixel] = tNoDataValue;
1909
0
                }
1910
0
                else
1911
0
                {
1912
0
                    GDALColorEntry color;
1913
                    if constexpr (bQuadraticMean)
1914
0
                    {
1915
0
                        color.c1 =
1916
0
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1917
0
                        color.c2 =
1918
0
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1919
0
                        color.c3 =
1920
0
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1921
                    }
1922
                    else
1923
0
                    {
1924
0
                        color.c1 =
1925
0
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
1926
0
                        color.c2 =
1927
0
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
1928
0
                        color.c3 =
1929
0
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
1930
0
                    }
1931
0
                    pDstScanline[iDstPixel] =
1932
0
                        static_cast<T>(BestColorEntry(colorEntries, color));
1933
0
                }
1934
0
            }
1935
0
        }
1936
0
    }
1937
1938
0
    CPLFree(pasSrcX);
1939
1940
0
    return CE_None;
1941
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1, true>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, double, (GDALDataType)2, true>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6, true>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7, true>(GDALOverviewResampleArgs const&, double const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1, false>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, unsigned int, (GDALDataType)2, false>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6, false>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7, false>(GDALOverviewResampleArgs const&, double const*, void**)
1942
1943
template <bool bQuadraticMean>
1944
static CPLErr
1945
GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
1946
                                       const void *pChunk, void **ppDstBuffer,
1947
                                       GDALDataType *peDstBufferDataType)
1948
0
{
1949
0
    *peDstBufferDataType = args.eWrkDataType;
1950
0
    switch (args.eWrkDataType)
1951
0
    {
1952
0
        case GDT_UInt8:
1953
0
        {
1954
0
            return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
1955
0
                                                    bQuadraticMean>(
1956
0
                args, static_cast<const GByte *>(pChunk), ppDstBuffer);
1957
0
        }
1958
1959
0
        case GDT_UInt16:
1960
0
        {
1961
            if constexpr (bQuadraticMean)
1962
0
            {
1963
                // Use double as accumulation type, because UInt32 could overflow
1964
0
                return GDALResampleChunk_AverageOrRMS_T<
1965
0
                    GUInt16, double, GDT_UInt16, bQuadraticMean>(
1966
0
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1967
            }
1968
            else
1969
0
            {
1970
0
                return GDALResampleChunk_AverageOrRMS_T<
1971
0
                    GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
1972
0
                    args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
1973
0
            }
1974
0
        }
1975
1976
0
        case GDT_Float32:
1977
0
        {
1978
0
            return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
1979
0
                                                    bQuadraticMean>(
1980
0
                args, static_cast<const float *>(pChunk), ppDstBuffer);
1981
0
        }
1982
1983
0
        case GDT_Float64:
1984
0
        {
1985
0
            return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
1986
0
                                                    bQuadraticMean>(
1987
0
                args, static_cast<const double *>(pChunk), ppDstBuffer);
1988
0
        }
1989
1990
0
        default:
1991
0
            break;
1992
0
    }
1993
1994
0
    CPLAssert(false);
1995
0
    return CE_Failure;
1996
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMSInternal<true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMSInternal<false>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
1997
1998
/** Entry point for the average / RMS chunk resamplers.
 *
 * Picks the quadratic-mean variant when args.pszResampling compares equal to
 * "RMS" (using the EQUAL() macro), and the plain averaging variant for any
 * other value. All other arguments are forwarded untouched to
 * GDALResampleChunk_AverageOrRMSInternal().
 *
 * @return the CPLErr reported by the selected implementation.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    const bool bUseRMS = EQUAL(args.pszResampling, "RMS");

    return bUseRMS ? GDALResampleChunk_AverageOrRMSInternal<true>(
                         args, pChunk, ppDstBuffer, peDstBufferDataType)
                   : GDALResampleChunk_AverageOrRMSInternal<false>(
                         args, pChunk, ppDstBuffer, peDstBufferDataType);
}
2010
2011
/************************************************************************/
2012
/*                      GDALResampleChunk_Gauss()                       */
2013
/************************************************************************/
2014
2015
// Resample one chunk of source pixels with a small integer Gaussian kernel.
//
// pChunk is read as an array of double laid out as nChunkYSize rows of
// nChunkXSize values, whose top-left pixel is (nChunkXOff, nChunkYOff) in
// source coordinates. The function allocates *ppDstBuffer itself as a
// (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) array of double, sets
// *peDstBufferDataType to GDT_Float64 and fills one value per destination
// pixel. The buffer is not released here; presumably the caller owns it
// afterwards (TODO confirm against the call site).
//
// The kernel is 3x3, 5x5 or 7x7 depending on the rounded vertical
// downsampling factor. Without a color table the result is the weighted
// mean of the source pixels whose nodata-mask byte is non-zero (or of all
// pixels when there is no mask). With a color table, R/G/B of the entries
// whose c4 is non-zero are averaged and mapped back to an index with
// BestColorEntry(). Pixels for which no weight was accumulated receive
// dfNoDataValue.
//
// Returns CE_Failure if the destination buffer cannot be allocated,
// CE_None otherwise.
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
                                      const void *pChunk, void **ppDstBuffer,
                                      GDALDataType *peDstBufferDataType)

{
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    const int nChunkXOff = args.nChunkXOff;
    const int nChunkXSize = args.nChunkXSize;
    const int nChunkYOff = args.nChunkYOff;
    const int nChunkYSize = args.nChunkYSize;
    const int nDstXOff = args.nDstXOff;
    const int nDstXOff2 = args.nDstXOff2;
    const int nDstYOff = args.nDstYOff;
    const int nDstYOff2 = args.nDstYOff2;
    const bool bHasNoData = args.bHasNoData;
    double dfNoDataValue = args.dfNoDataValue;
    const GDALColorTable *poColorTable = args.poColorTable;

    const double *const padfChunk = static_cast<const double *>(pChunk);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
                            GDALGetDataTypeSizeBytes(GDT_Float64));
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = GDT_Float64;
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);

    /* -------------------------------------------------------------------- */
    /*      Create the filter kernel and allocate scanline buffer.          */
    /* -------------------------------------------------------------------- */
    int nGaussMatrixDim = 3;
    const int *panGaussMatrix;
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
                                        16, 4, 1,  4,  6,  4, 1};
    constexpr int anGaussMatrix7x7[] = {
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};

    const int nOXSize = args.nOvrXSize;
    const int nOYSize = args.nOvrYSize;
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);

    // matrix for gauss filter
    if (nResYFactor <= 2)
    {
        panGaussMatrix = anGaussMatrix3x3;
        nGaussMatrixDim = 3;
    }
    else if (nResYFactor <= 4)
    {
        panGaussMatrix = anGaussMatrix5x5;
        nGaussMatrixDim = 5;
    }
    else
    {
        panGaussMatrix = anGaussMatrix7x7;
        nGaussMatrixDim = 7;
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    // Work on a heap copy of the kernel so that memory checkers can detect
    // reads past its end.
    int *panGaussMatrixDup = static_cast<int *>(
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    memcpy(panGaussMatrixDup, panGaussMatrix,
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    panGaussMatrix = panGaussMatrixDup;
#endif

    if (!bHasNoData)
        dfNoDataValue = 0.0;

    std::vector<GDALColorEntry> colorEntries;
    int nTransparentIdx = -1;
    if (poColorTable)
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);

    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    // it as nodata value.
    if (bHasNoData && dfNoDataValue >= 0.0 &&
        dfNoDataValue < colorEntries.size())
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;

    // Or if we have no explicit nodata, but a color table entry that is
    // transparent, consider it as the nodata value.
    else if (!bHasNoData && nTransparentIdx >= 0)
    {
        dfNoDataValue = nTransparentIdx;
    }

    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    const int nDstXWidth = nDstXOff2 - nDstXOff;

    /* ==================================================================== */
    /*      Loop over destination scanlines.                                */
    /* ==================================================================== */
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    {
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
        int nSrcYOff2 =
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;

        if (nSrcYOff < nChunkYOff)
        {
            nSrcYOff = nChunkYOff;
            nSrcYOff2++;
        }

        // Re-centre a window of nGaussMatrixDim rows on the middle of the
        // source rows covered by this destination line.
        const int iSizeY = nSrcYOff2 - nSrcYOff;
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;

        if (nSrcYOff2 > nChunkBottomYOff ||
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
        {
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
        }

        // When the window starts above the chunk, skip the corresponding
        // first rows of the kernel and start reading at the chunk top.
        int nYShiftGaussMatrix = 0;
        if (nSrcYOff < nChunkYOff)
        {
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
            nSrcYOff = nChunkYOff;
        }

        // Widen before multiplying so that the row offset cannot overflow
        // int on large chunks.
        const GPtrDiff_t nSrcLineOffset =
            static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
        const double *const padfSrcScanline = padfChunk + nSrcLineOffset;
        const GByte *pabySrcScanlineNodataMask = nullptr;
        if (pabyChunkNodataMask != nullptr)
            pabySrcScanlineNodataMask = pabyChunkNodataMask + nSrcLineOffset;

        /* ---------------------------------------------------------------- */
        /*      Loop over destination pixels                                */
        /* ---------------------------------------------------------------- */
        double *const padfDstScanline =
            padfDstBuffer +
            static_cast<GPtrDiff_t>(iDstLine - nDstYOff) * nDstXWidth;
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
        {
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
            int nSrcXOff2 =
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;

            if (nSrcXOff < nChunkXOff)
            {
                nSrcXOff = nChunkXOff;
                nSrcXOff2++;
            }

            // Same re-centring as for rows, applied to columns.
            const int iSizeX = nSrcXOff2 - nSrcXOff;
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;

            if (nSrcXOff2 > nChunkRightXOff ||
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
            {
                nSrcXOff2 =
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
            }

            int nXShiftGaussMatrix = 0;
            if (nSrcXOff < nChunkXOff)
            {
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
                nSrcXOff = nChunkXOff;
            }

            // First kernel weight that applies to (nSrcXOff, nSrcYOff).
            const int *panLineWeight = panGaussMatrix +
                                       nYShiftGaussMatrix * nGaussMatrixDim +
                                       nXShiftGaussMatrix;

            if (poColorTable == nullptr)
            {
                double dfTotal = 0.0;
                GInt64 nCount = 0;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    // Offset such that [iX + iTotYOff] addresses pixel
                    // (iX, iY) relative to padfSrcScanline.
                    const GPtrDiff_t iTotYOff =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
                        nChunkXOff;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val = padfSrcScanline[iX + iTotYOff];
                        if (pabySrcScanlineNodataMask == nullptr ||
                            pabySrcScanlineNodataMask[iX + iTotYOff])
                        {
                            const int nWeight = panLineWeight[i];
                            dfTotal += val * nWeight;
                            nCount += nWeight;
                        }
                    }
                }

                if (nCount == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
                }
            }
            else
            {
                GInt64 nTotalR = 0;
                GInt64 nTotalG = 0;
                GInt64 nTotalB = 0;
                GInt64 nTotalWeight = 0;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    const GPtrDiff_t iTotYOff =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
                        nChunkXOff;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val = padfSrcScanline[iX + iTotYOff];
                        // Ignore values that are not a valid palette index.
                        if (val < 0 || val >= colorEntries.size())
                            continue;

                        size_t idx = static_cast<size_t>(val);
                        if (colorEntries[idx].c4)
                        {
                            const int nWeight = panLineWeight[i];
                            nTotalR +=
                                static_cast<GInt64>(colorEntries[idx].c1) *
                                nWeight;
                            nTotalG +=
                                static_cast<GInt64>(colorEntries[idx].c2) *
                                nWeight;
                            nTotalB +=
                                static_cast<GInt64>(colorEntries[idx].c3) *
                                nWeight;
                            nTotalWeight += nWeight;
                        }
                    }
                }

                if (nTotalWeight == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    GDALColorEntry color;

                    // Rounded weighted mean of each colour component.
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
                                                  nTotalWeight);
                    padfDstScanline[iDstPixel - nDstXOff] =
                        BestColorEntry(colorEntries, color);
                }
            }
        }
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    CPLFree(panGaussMatrixDup);
#endif

    return CE_None;
}
2300
2301
/************************************************************************/
2302
/*                       GDALResampleChunk_Mode()                       */
2303
/************************************************************************/
2304
2305
// Generic "same value" predicate used when counting occurrences for mode
// resampling: two values are the same when operator== says so. Explicit
// specializations refine this for types that can hold NaN.
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bEqual = (a == b);
    return bEqual;
}
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned char>(unsigned char, unsigned char)
Unexecuted instantiation: overview.cpp:bool IsSame<signed char>(signed char, signed char)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned short>(unsigned short, unsigned short)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned int>(unsigned int, unsigned int)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned long>(unsigned long, unsigned long)
2309
2310
// GFloat16 flavour of IsSame(): equal values are the same, and two NaNs are
// also treated as the same value even though they do not compare equal.
template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
{
    if (a == b)
        return true;
    return CPLIsNan(a) && CPLIsNan(b);
}
2314
2315
// float flavour of IsSame(): equal values are the same, and two NaNs are
// also treated as the same value even though they do not compare equal.
template <> bool IsSame<float>(float a, float b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}
2319
2320
// double flavour of IsSame(): equal values are the same, and two NaNs are
// also treated as the same value even though they do not compare equal.
template <> bool IsSame<double>(double a, double b)
{
    if (a == b)
        return true;
    return std::isnan(a) && std::isnan(b);
}
2324
2325
namespace
2326
{
2327
struct ComplexFloat16
2328
{
2329
    GFloat16 r;
2330
    GFloat16 i;
2331
};
2332
}  // namespace
2333
2334
// ComplexFloat16 flavour of IsSame(): values are the same when both parts
// compare equal, or when all four components (both parts of both values)
// are NaN.
template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
{
    if (a.r == b.r && a.i == b.i)
        return true;
    return CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i);
}
2339
2340
template <>
2341
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2342
0
{
2343
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2344
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2345
0
}
2346
2347
template <>
2348
bool IsSame<std::complex<double>>(std::complex<double> a,
2349
                                  std::complex<double> b)
2350
0
{
2351
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2352
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2353
0
}
2354
2355
template <class T>
2356
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2357
                                      const T *pChunk, T *const pDstBuffer)
2358
2359
0
{
2360
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2361
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2362
0
    const double dfSrcXDelta = args.dfSrcXDelta;
2363
0
    const double dfSrcYDelta = args.dfSrcYDelta;
2364
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2365
0
    const int nChunkXOff = args.nChunkXOff;
2366
0
    const int nChunkXSize = args.nChunkXSize;
2367
0
    const int nChunkYOff = args.nChunkYOff;
2368
0
    const int nChunkYSize = args.nChunkYSize;
2369
0
    const int nDstXOff = args.nDstXOff;
2370
0
    const int nDstXOff2 = args.nDstXOff2;
2371
0
    const int nDstYOff = args.nDstYOff;
2372
0
    const int nDstYOff2 = args.nDstYOff2;
2373
0
    const bool bHasNoData = args.bHasNoData;
2374
0
    const GDALColorTable *poColorTable = args.poColorTable;
2375
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
2376
2377
0
    T tNoDataValue;
2378
    if constexpr (std::is_same<T, ComplexFloat16>::value)
2379
0
    {
2380
0
        tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
2381
0
        tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
2382
    }
2383
    else if constexpr (std::is_same<T, std::complex<float>>::value ||
2384
                       std::is_same<T, std::complex<double>>::value)
2385
0
    {
2386
0
        using BaseT = typename T::value_type;
2387
0
        tNoDataValue =
2388
0
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2389
0
                                std::numeric_limits<BaseT>::quiet_NaN());
2390
    }
2391
0
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2392
0
        tNoDataValue = 0;
2393
0
    else
2394
0
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
2395
2396
0
    using CountType = uint32_t;
2397
0
    CountType nMaxNumPx = 0;
2398
0
    T *paVals = nullptr;
2399
0
    CountType *panCounts = nullptr;
2400
2401
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2402
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2403
0
    std::vector<int> anVals(256, 0);
2404
2405
    /* ==================================================================== */
2406
    /*      Loop over destination scanlines.                                */
2407
    /* ==================================================================== */
2408
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2409
0
    {
2410
0
        const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2411
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2412
#ifdef only_pixels_with_more_than_10_pct_participation
2413
        // When oversampling, don't take into account pixels that have a tiny
2414
        // participation in the resulting pixel
2415
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2416
            nSrcYOff < nChunkBottomYOff)
2417
            nSrcYOff++;
2418
#endif
2419
0
        if (nSrcYOff < nChunkYOff)
2420
0
            nSrcYOff = nChunkYOff;
2421
2422
0
        const double dfSrcYOff2 =
2423
0
            dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2424
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2425
#ifdef only_pixels_with_more_than_10_pct_participation
2426
        // When oversampling, don't take into account pixels that have a tiny
2427
        // participation in the resulting pixel
2428
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2429
            nSrcYOff2 > nChunkYOff)
2430
            nSrcYOff2--;
2431
#endif
2432
0
        if (nSrcYOff2 == nSrcYOff)
2433
0
            ++nSrcYOff2;
2434
0
        if (nSrcYOff2 > nChunkBottomYOff)
2435
0
            nSrcYOff2 = nChunkBottomYOff;
2436
2437
0
        const T *const paSrcScanline =
2438
0
            pChunk +
2439
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2440
0
        const GByte *pabySrcScanlineNodataMask = nullptr;
2441
0
        if (pabyChunkNodataMask != nullptr)
2442
0
            pabySrcScanlineNodataMask =
2443
0
                pabyChunkNodataMask +
2444
0
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2445
2446
0
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2447
        /* --------------------------------------------------------------------
2448
         */
2449
        /*      Loop over destination pixels */
2450
        /* --------------------------------------------------------------------
2451
         */
2452
0
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2453
0
        {
2454
0
            const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2455
            // Apply some epsilon to avoid numerical precision issues
2456
0
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2457
#ifdef only_pixels_with_more_than_10_pct_participation
2458
            // When oversampling, don't take into account pixels that have a
2459
            // tiny participation in the resulting pixel
2460
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2461
                nSrcXOff < nChunkRightXOff)
2462
                nSrcXOff++;
2463
#endif
2464
0
            if (nSrcXOff < nChunkXOff)
2465
0
                nSrcXOff = nChunkXOff;
2466
2467
0
            const double dfSrcXOff2 =
2468
0
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2469
0
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2470
#ifdef only_pixels_with_more_than_10_pct_participation
2471
            // When oversampling, don't take into account pixels that have a
2472
            // tiny participation in the resulting pixel
2473
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2474
                nSrcXOff2 > nChunkXOff)
2475
                nSrcXOff2--;
2476
#endif
2477
0
            if (nSrcXOff2 == nSrcXOff)
2478
0
                nSrcXOff2++;
2479
0
            if (nSrcXOff2 > nChunkRightXOff)
2480
0
                nSrcXOff2 = nChunkRightXOff;
2481
2482
0
            bool bRegularProcessing = false;
2483
            if constexpr (!std::is_same<T, GByte>::value)
2484
0
                bRegularProcessing = true;
2485
0
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2486
0
                bRegularProcessing = true;
2487
2488
0
            if (bRegularProcessing)
2489
0
            {
2490
                // Sanity check to make sure the allocation of paVals and
2491
                // panCounts don't overflow.
2492
0
                static_assert(sizeof(CountType) <= sizeof(size_t));
2493
0
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2494
0
                    static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
2495
0
                        (std::numeric_limits<CountType>::max() /
2496
0
                         std::max(sizeof(T), sizeof(CountType))) /
2497
0
                            static_cast<CountType>(nSrcXOff2 - nSrcXOff))
2498
0
                {
2499
0
                    CPLError(CE_Failure, CPLE_NotSupported,
2500
0
                             "Too big downsampling factor");
2501
0
                    CPLFree(paVals);
2502
0
                    CPLFree(panCounts);
2503
0
                    return CE_Failure;
2504
0
                }
2505
0
                const CountType nNumPx =
2506
0
                    static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
2507
0
                    (nSrcXOff2 - nSrcXOff);
2508
0
                CountType iMaxInd = 0;
2509
0
                CountType iMaxVal = 0;
2510
2511
0
                if (paVals == nullptr || nNumPx > nMaxNumPx)
2512
0
                {
2513
0
                    T *paValsNew = static_cast<T *>(
2514
0
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2515
0
                    CountType *panCountsNew =
2516
0
                        static_cast<CountType *>(VSI_REALLOC_VERBOSE(
2517
0
                            panCounts, nNumPx * sizeof(CountType)));
2518
0
                    if (paValsNew != nullptr)
2519
0
                        paVals = paValsNew;
2520
0
                    if (panCountsNew != nullptr)
2521
0
                        panCounts = panCountsNew;
2522
0
                    if (paValsNew == nullptr || panCountsNew == nullptr)
2523
0
                    {
2524
0
                        CPLFree(paVals);
2525
0
                        CPLFree(panCounts);
2526
0
                        return CE_Failure;
2527
0
                    }
2528
0
                    nMaxNumPx = nNumPx;
2529
0
                }
2530
2531
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2532
0
                {
2533
0
                    const GPtrDiff_t iTotYOff =
2534
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2535
0
                        nChunkXOff;
2536
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2537
0
                    {
2538
0
                        if (pabySrcScanlineNodataMask == nullptr ||
2539
0
                            pabySrcScanlineNodataMask[iX + iTotYOff])
2540
0
                        {
2541
0
                            const T val = paSrcScanline[iX + iTotYOff];
2542
0
                            CountType i = 0;  // Used after for.
2543
2544
                            // Check array for existing entry.
2545
0
                            for (; i < iMaxInd; ++i)
2546
0
                            {
2547
0
                                if (IsSame(paVals[i], val))
2548
0
                                {
2549
0
                                    if (++panCounts[i] > panCounts[iMaxVal])
2550
0
                                    {
2551
0
                                        iMaxVal = i;
2552
0
                                    }
2553
0
                                    break;
2554
0
                                }
2555
0
                            }
2556
2557
                            // Add to arr if entry not already there.
2558
0
                            if (i == iMaxInd)
2559
0
                            {
2560
0
                                paVals[iMaxInd] = val;
2561
0
                                panCounts[iMaxInd] = 1;
2562
2563
0
                                if (iMaxInd == 0)
2564
0
                                {
2565
0
                                    iMaxVal = iMaxInd;
2566
0
                                }
2567
2568
0
                                ++iMaxInd;
2569
0
                            }
2570
0
                        }
2571
0
                    }
2572
0
                }
2573
2574
0
                if (iMaxInd == 0)
2575
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2576
0
                else
2577
0
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2578
0
            }
2579
            else if constexpr (std::is_same<T, GByte>::value)
2580
            // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
2581
0
            {
2582
                // So we go here for a paletted or non-paletted byte band.
2583
                // The input values are then between 0 and 255.
2584
0
                int nMaxVal = 0;
2585
0
                int iMaxInd = -1;
2586
2587
                // The cost of this zeroing might be high. Perhaps we should
2588
                // just use the above generic case, and go to this one if the
2589
                // number of source pixels is large enough
2590
0
                std::fill(anVals.begin(), anVals.end(), 0);
2591
2592
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2593
0
                {
2594
0
                    const GPtrDiff_t iTotYOff =
2595
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2596
0
                        nChunkXOff;
2597
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2598
0
                    {
2599
0
                        const T val = paSrcScanline[iX + iTotYOff];
2600
0
                        if (!bHasNoData || val != tNoDataValue)
2601
0
                        {
2602
0
                            int nVal = static_cast<int>(val);
2603
0
                            if (++anVals[nVal] > nMaxVal)
2604
0
                            {
2605
                                // Sum the density.
2606
                                // Is it the most common value so far?
2607
0
                                iMaxInd = nVal;
2608
0
                                nMaxVal = anVals[nVal];
2609
0
                            }
2610
0
                        }
2611
0
                    }
2612
0
                }
2613
2614
0
                if (iMaxInd == -1)
2615
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2616
0
                else
2617
0
                    paDstScanline[iDstPixel - nDstXOff] =
2618
0
                        static_cast<T>(iMaxInd);
2619
0
            }
2620
0
        }
2621
0
    }
2622
2623
0
    CPLFree(paVals);
2624
0
    CPLFree(panCounts);
2625
2626
0
    return CE_None;
2627
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<signed char>(GDALOverviewResampleArgs const&, signed char const*, signed char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<cpl::Float16>(GDALOverviewResampleArgs const&, cpl::Float16 const*, cpl::Float16*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<float>(GDALOverviewResampleArgs const&, float const*, float*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<double>(GDALOverviewResampleArgs const&, double const*, double*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<(anonymous namespace)::ComplexFloat16>(GDALOverviewResampleArgs const&, (anonymous namespace)::ComplexFloat16 const*, (anonymous namespace)::ComplexFloat16*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<float> >(GDALOverviewResampleArgs const&, std::__1::complex<float> const*, std::__1::complex<float>*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>*)
2628
2629
/** Type dispatcher for "mode" resampling of one chunk.
 *
 * Allocates the destination buffer for the window
 * [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) in the working data type,
 * then forwards to GDALResampleChunk_ModeT<>() with the source and
 * destination pointers cast to a type selected from args.eWrkDataType.
 *
 * @param args                 Resampling arguments. args.eSrcDataType is
 *                             asserted to be equal to args.eWrkDataType.
 * @param pChunk               Source chunk.
 * @param ppDstBuffer          Receives the allocated destination buffer
 *                             (nullptr if the allocation failed).
 * @param peDstBufferDataType  Set to args.eWrkDataType once the allocation
 *                             has succeeded.
 * @return the value returned by the typed implementation, or CE_Failure if
 *         the allocation failed or the data type is not handled.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    void *const pDst = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(args.eWrkDataType));
    *ppDstBuffer = pDst;
    if (pDst == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    // Mode resampling performs no arithmetic on the values, so for most
    // types only the element size matters and same-sized types share one
    // instantiation. Byte has special processing, and floating point types
    // get their own instantiations.
    switch (eType)
    {
        case GDT_UInt8:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const GByte *>(pChunk),
                                           static_cast<GByte *>(pDst));

        case GDT_Int8:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const int8_t *>(pChunk),
                                           static_cast<int8_t *>(pDst));

        // 2-byte types
        case GDT_Int16:
        case GDT_UInt16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint16_t *>(pChunk),
                static_cast<uint16_t *>(pDst));

        // 4-byte types
        case GDT_CInt16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint32_t *>(pChunk),
                static_cast<uint32_t *>(pDst));

        // 8-byte types
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return GDALResampleChunk_ModeT(
                args, static_cast<const uint64_t *>(pChunk),
                static_cast<uint64_t *>(pDst));

        case GDT_Float16:
            return GDALResampleChunk_ModeT(
                args, static_cast<const GFloat16 *>(pChunk),
                static_cast<GFloat16 *>(pDst));

        case GDT_Float32:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const float *>(pChunk),
                                           static_cast<float *>(pDst));

        case GDT_Float64:
            return GDALResampleChunk_ModeT(args,
                                           static_cast<const double *>(pChunk),
                                           static_cast<double *>(pDst));

        case GDT_CFloat16:
            return GDALResampleChunk_ModeT(
                args, static_cast<const ComplexFloat16 *>(pChunk),
                static_cast<ComplexFloat16 *>(pDst));

        case GDT_CFloat32:
            return GDALResampleChunk_ModeT(
                args, static_cast<const std::complex<float> *>(pChunk),
                static_cast<std::complex<float> *>(pDst));

        case GDT_CFloat64:
            return GDALResampleChunk_ModeT(
                args, static_cast<const std::complex<double> *>(pChunk),
                static_cast<std::complex<double> *>(pDst));

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a valid working data type.
    CPLAssert(false);
    return CE_Failure;
}
2742
2743
/************************************************************************/
2744
/*                 GDALResampleConvolutionHorizontal()                  */
2745
/************************************************************************/
2746
2747
/** Weighted sum of nSrcPixelCount consecutive pixels of one row.
 *
 * Returns sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
 * The main loop handles 4 pixels per iteration and feeds two separate
 * partial sums (pixels 0-1 and pixels 2-3 of each group); leftover pixels
 * go to the first partial sum.
 *
 * @param pChunk          First source pixel.
 * @param padfWeights     One weight per source pixel.
 * @param nSrcPixelCount  Number of pixels / weights.
 * @return the weighted sum.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPix = 0;  // Shared by the unrolled loop and the remainder loop.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPix < nSrcPixelCount - 3)
    {
        const T *const pSrc = pChunk + iPix;
        const double *const pdfW = padfWeights + iPix;
        dfSumA += double(pSrc[0]) * pdfW[0];
        dfSumA += double(pSrc[1]) * pdfW[1];
        dfSumB += double(pSrc[2]) * pdfW[2];
        dfSumB += double(pSrc[3]) * pdfW[3];
        iPix += 4;
    }
#endif
    while (iPix < nSrcPixelCount)
    {
        dfSumA += double(pChunk[iPix]) * padfWeights[iPix];
        ++iPix;
    }
    return dfSumA + dfSumB;
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<float>(float const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<double>(double const*, double const*, int)
2773
2774
template <class T>
2775
static inline void GDALResampleConvolutionHorizontalWithMask(
2776
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2777
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2778
0
{
2779
0
    dfVal = 0;
2780
0
    dfWeightSum = 0;
2781
0
    int i = 0;
2782
0
    for (; i < nSrcPixelCount - 3; i += 4)
2783
0
    {
2784
0
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
2785
0
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2786
0
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2787
0
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2788
0
        dfVal += double(pChunk[i + 0]) * dfWeight0;
2789
0
        dfVal += double(pChunk[i + 1]) * dfWeight1;
2790
0
        dfVal += double(pChunk[i + 2]) * dfWeight2;
2791
0
        dfVal += double(pChunk[i + 3]) * dfWeight3;
2792
0
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2793
0
    }
2794
0
    for (; i < nSrcPixelCount; ++i)
2795
0
    {
2796
0
        const double dfWeight = padfWeights[i] * pabyMask[i];
2797
0
        dfVal += double(pChunk[i]) * dfWeight;
2798
0
        dfWeightSum += dfWeight;
2799
0
    }
2800
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<float>(float const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<double>(double const*, unsigned char const*, double const*, int, double&, double&)
2801
2802
/** Weighted sum of nSrcPixelCount pixels, computed for 3 rows at once.
 *
 * The same weights are applied to each of the three rows. For every row the
 * main loop handles 4 pixels per iteration with two partial sums (pixels
 * 0-1 and pixels 2-3 of each group); leftover pixels go to the first
 * partial sum of the row.
 *
 * @param pChunkRow1      First source pixel of row 1.
 * @param pChunkRow2      First source pixel of row 2.
 * @param pChunkRow3      First source pixel of row 3.
 * @param padfWeights     One weight per source pixel.
 * @param nSrcPixelCount  Number of pixels / weights.
 * @param dfRes1          Receives the weighted sum of row 1.
 * @param dfRes2          Receives the weighted sum of row 2.
 * @param dfRes3          Receives the weighted sum of row 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // "A" sums take pixels 0-1 of each group (and the remainder),
    // "B" sums take pixels 2-3 of each group.
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int iPix = 0;  // Shared by the unrolled loop and the remainder loop.
    while (iPix < nSrcPixelCount - 3)
    {
        const double *const pdfW = padfWeights + iPix;

        const T *const pR1 = pChunkRow1 + iPix;
        dfRow1A += double(pR1[0]) * pdfW[0];
        dfRow1A += double(pR1[1]) * pdfW[1];
        dfRow1B += double(pR1[2]) * pdfW[2];
        dfRow1B += double(pR1[3]) * pdfW[3];

        const T *const pR2 = pChunkRow2 + iPix;
        dfRow2A += double(pR2[0]) * pdfW[0];
        dfRow2A += double(pR2[1]) * pdfW[1];
        dfRow2B += double(pR2[2]) * pdfW[2];
        dfRow2B += double(pR2[3]) * pdfW[3];

        const T *const pR3 = pChunkRow3 + iPix;
        dfRow3A += double(pR3[0]) * pdfW[0];
        dfRow3A += double(pR3[1]) * pdfW[1];
        dfRow3B += double(pR3[2]) * pdfW[2];
        dfRow3B += double(pR3[3]) * pdfW[3];

        iPix += 4;
    }
    while (iPix < nSrcPixelCount)
    {
        const double dfW = padfWeights[iPix];
        dfRow1A += double(pChunkRow1[iPix]) * dfW;
        dfRow2A += double(pChunkRow2[iPix]) * dfW;
        dfRow3A += double(pChunkRow3[iPix]) * dfW;
        ++iPix;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2840
2841
/** Generic version of the 3-row weighted sum entry point named for pixel
 * counts below 8.
 *
 * This generic template has no dedicated code path: it simply delegates to
 * GDALResampleConvolutionHorizontal_3rows() with the same arguments.
 *
 * @param pChunkRow1      First source pixel of row 1.
 * @param pChunkRow2      First source pixel of row 2.
 * @param pChunkRow3      First source pixel of row 3.
 * @param padfWeights     One weight per source pixel.
 * @param nSrcPixelCount  Number of pixels / weights.
 * @param dfRes1          Receives the weighted sum of row 1.
 * @param dfRes2          Receives the weighted sum of row 2.
 * @param dfRes3          Receives the weighted sum of row 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,  // source rows
        padfWeights, nSrcPixelCount,         // weights and their count
        dfRes1, dfRes2, dfRes3);             // per-row results
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2851
2852
/** Generic version of the 3-row weighted sum for exactly 4 pixels.
 *
 * Delegates to GDALResampleConvolutionHorizontal_3rows() with a pixel count
 * fixed to 4.
 *
 * @param pChunkRow1   First source pixel of row 1.
 * @param pChunkRow2   First source pixel of row 2.
 * @param pChunkRow3   First source pixel of row 3.
 * @param padfWeights  The 4 weights.
 * @param dfRes1       Receives the weighted sum of row 1.
 * @param dfRes2       Receives the weighted sum of row 2.
 * @param dfRes3       Receives the weighted sum of row 3.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nFixedPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
                                            padfWeights, nFixedPixelCount,
                                            dfRes1, dfRes2, dfRes3);
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<float>(float const*, float const*, float const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<double>(double const*, double const*, double const*, double const*, double&, double&, double&)
2861
2862
/************************************************************************/
2863
/*                  GDALResampleConvolutionVertical()                   */
2864
/************************************************************************/
2865
2866
/** Weighted sum of nSrcLineCount values of one column.
 *
 * Returns sum(pChunk[k * nStride] * padfWeights[k]) for k in
 * [0, nSrcLineCount). The main loop handles 4 lines per iteration and
 * feeds two separate partial sums (lines 0-1 and lines 2-3 of each group);
 * leftover lines go to the first partial sum.
 *
 * @param pChunk         Value of the column on the first line.
 * @param nStride        Distance, in elements, between two successive lines.
 * @param padfWeights    One weight per source line.
 * @param nSrcLineCount  Number of lines / weights.
 * @return the weighted sum.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride.

    while (iLine < nSrcLineCount - 3)
    {
        dfSumA += pChunk[nOffset + 0 * nStride] * padfWeights[iLine + 0];
        dfSumA += pChunk[nOffset + 1 * nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumA + dfSumB;
}
2888
2889
/** Weighted sum of nSrcLineCount values, for 2 adjacent columns at once.
 *
 * The first column starts at pChunk[0], the second one at pChunk[1]; both
 * use the same weights. For every column the main loop handles 4 lines per
 * iteration with two partial sums (lines 0-1 and lines 2-3 of each group);
 * leftover lines go to the first partial sum of the column.
 *
 * @param pChunk         Value of the first column on the first line.
 * @param nStride        Distance, in elements, between two successive lines.
 * @param padfWeights    One weight per source line.
 * @param nSrcLineCount  Number of lines / weights.
 * @param dfRes1         Receives the weighted sum of the first column.
 * @param dfRes2         Receives the weighted sum of the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    // "A" sums take lines 0-1 of each group (and the remainder),
    // "B" sums take lines 2-3 of each group.
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // Always equal to iLine * nStride.

    while (iLine < nSrcLineCount - 3)
    {
        const double dfW0 = padfWeights[iLine + 0];
        dfCol0A += pChunk[nOffset + 0 + 0 * nStride] * dfW0;
        dfCol1A += pChunk[nOffset + 1 + 0 * nStride] * dfW0;

        const double dfW1 = padfWeights[iLine + 1];
        dfCol0A += pChunk[nOffset + 0 + 1 * nStride] * dfW1;
        dfCol1A += pChunk[nOffset + 1 + 1 * nStride] * dfW1;

        const double dfW2 = padfWeights[iLine + 2];
        dfCol0B += pChunk[nOffset + 0 + 2 * nStride] * dfW2;
        dfCol1B += pChunk[nOffset + 1 + 2 * nStride] * dfW2;

        const double dfW3 = padfWeights[iLine + 3];
        dfCol0B += pChunk[nOffset + 0 + 3 * nStride] * dfW3;
        dfCol1B += pChunk[nOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nOffset += 4 * nStride;
    }
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2919
2920
#ifdef USE_SSE2
2921
2922
#ifdef __AVX__
2923
/************************************************************************/
2924
/*              GDALResampleConvolutionVertical_16cols<T>               */
2925
/************************************************************************/
2926
2927
template <class T>
2928
static inline void
2929
GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2930
                                       const double *padfWeights,
2931
                                       int nSrcLineCount, float *afDest)
2932
{
2933
    int i = 0;
2934
    size_t j = 0;
2935
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2936
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2937
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2938
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2939
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2940
    {
2941
        XMMReg4Double w0 =
2942
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2943
        XMMReg4Double w1 =
2944
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2945
        XMMReg4Double w2 =
2946
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2947
        XMMReg4Double w3 =
2948
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2949
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2950
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2951
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2952
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2953
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2954
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2955
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2956
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2957
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2958
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2959
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2960
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2961
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2962
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2963
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2964
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2965
    }
2966
    for (; i < nSrcLineCount; ++i, j += nStride)
2967
    {
2968
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2969
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2970
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2971
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2972
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2973
    }
2974
    v_acc0.Store4Val(afDest);
2975
    v_acc1.Store4Val(afDest + 4);
2976
    v_acc2.Store4Val(afDest + 8);
2977
    v_acc3.Store4Val(afDest + 12);
2978
}
2979
2980
template <class T>
2981
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2982
                                                          const double *, int,
2983
                                                          double *)
2984
{
2985
    // Cannot be reached
2986
    CPLAssert(false);
2987
}
2988
2989
#else
2990
2991
/************************************************************************/
2992
/*               GDALResampleConvolutionVertical_8cols<T>               */
2993
/************************************************************************/
2994
2995
template <class T>
2996
static inline void
2997
GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2998
                                      const double *padfWeights,
2999
                                      int nSrcLineCount, float *afDest)
3000
0
{
3001
0
    int i = 0;
3002
0
    size_t j = 0;
3003
0
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
3004
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3005
0
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
3006
0
    {
3007
0
        XMMReg4Double w0 =
3008
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
3009
0
        XMMReg4Double w1 =
3010
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
3011
0
        XMMReg4Double w2 =
3012
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
3013
0
        XMMReg4Double w3 =
3014
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
3015
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
3016
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
3017
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
3018
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
3019
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
3020
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
3021
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
3022
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
3023
0
    }
3024
0
    for (; i < nSrcLineCount; ++i, j += nStride)
3025
0
    {
3026
0
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
3027
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
3028
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
3029
0
    }
3030
0
    v_acc0.Store4Val(afDest);
3031
0
    v_acc1.Store4Val(afDest + 4);
3032
0
}
3033
3034
template <class T>
3035
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
3036
                                                         const double *, int,
3037
                                                         double *)
3038
{
3039
    // Cannot be reached
3040
    CPLAssert(false);
3041
}
3042
3043
#endif  // __AVX__
3044
3045
/************************************************************************/
3046
/*               GDALResampleConvolutionHorizontalSSE2<T>               */
3047
/************************************************************************/
3048
3049
template <class T>
3050
static inline double GDALResampleConvolutionHorizontalSSE2(
3051
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3052
0
{
3053
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3054
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3055
0
    int i = 0;  // Used after for.
3056
0
    for (; i < nSrcPixelCount - 7; i += 8)
3057
0
    {
3058
        // Retrieve the pixel & accumulate
3059
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
3060
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
3061
0
        const XMMReg4Double v_weight1 =
3062
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3063
0
        const XMMReg4Double v_weight2 =
3064
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3065
3066
0
        v_acc1 += v_pixels1 * v_weight1;
3067
0
        v_acc2 += v_pixels2 * v_weight2;
3068
0
    }
3069
3070
0
    v_acc1 += v_acc2;
3071
3072
0
    double dfVal = v_acc1.GetHorizSum();
3073
0
    for (; i < nSrcPixelCount; ++i)
3074
0
    {
3075
0
        dfVal += pChunk[i] * padfWeightsAligned[i];
3076
0
    }
3077
0
    return dfVal;
3078
0
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned char>(unsigned char const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned short>(unsigned short const*, double const*, int)
3079
3080
/************************************************************************/
3081
/*               GDALResampleConvolutionHorizontal<GByte>               */
3082
/************************************************************************/
3083
3084
template <>
3085
inline double GDALResampleConvolutionHorizontal<GByte>(
3086
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3087
0
{
3088
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3089
0
                                                 nSrcPixelCount);
3090
0
}
3091
3092
template <>
3093
inline double GDALResampleConvolutionHorizontal<GUInt16>(
3094
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
3095
0
{
3096
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
3097
0
                                                 nSrcPixelCount);
3098
0
}
3099
3100
/************************************************************************/
3101
/*           GDALResampleConvolutionHorizontalWithMaskSSE2<T>           */
3102
/************************************************************************/
3103
3104
template <class T>
3105
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
3106
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
3107
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
3108
0
{
3109
0
    int i = 0;  // Used after for.
3110
0
    XMMReg4Double v_acc = XMMReg4Double::Zero();
3111
0
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
3112
0
    for (; i < nSrcPixelCount - 3; i += 4)
3113
0
    {
3114
0
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
3115
0
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
3116
0
        XMMReg4Double v_weight =
3117
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3118
0
        v_weight *= v_mask;
3119
0
        v_acc += v_pixels * v_weight;
3120
0
        v_acc_weight += v_weight;
3121
0
    }
3122
3123
0
    dfVal = v_acc.GetHorizSum();
3124
0
    dfWeightSum = v_acc_weight.GetHorizSum();
3125
0
    for (; i < nSrcPixelCount; ++i)
3126
0
    {
3127
0
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
3128
0
        dfVal += pChunk[i] * dfWeight;
3129
0
        dfWeightSum += dfWeight;
3130
0
    }
3131
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned char>(unsigned char const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned short>(unsigned short const*, unsigned char const*, double const*, int, double&, double&)
3132
3133
/************************************************************************/
3134
/*           GDALResampleConvolutionHorizontalWithMask<GByte>           */
3135
/************************************************************************/
3136
3137
template <>
3138
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
3139
    const GByte *pChunk, const GByte *pabyMask,
3140
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3141
    double &dfWeightSum)
3142
0
{
3143
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
3144
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3145
0
        dfWeightSum);
3146
0
}
3147
3148
template <>
3149
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
3150
    const GUInt16 *pChunk, const GByte *pabyMask,
3151
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
3152
    double &dfWeightSum)
3153
0
{
3154
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
3155
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
3156
0
        dfWeightSum);
3157
0
}
3158
3159
/************************************************************************/
3160
/*           GDALResampleConvolutionHorizontal_3rows_SSE2<T>            */
3161
/************************************************************************/
3162
3163
template <class T>
3164
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
3165
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3166
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3167
    double &dfRes2, double &dfRes3)
3168
0
{
3169
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
3170
0
                  v_acc2 = XMMReg4Double::Zero(),
3171
0
                  v_acc3 = XMMReg4Double::Zero();
3172
0
    int i = 0;
3173
0
    for (; i < nSrcPixelCount - 7; i += 8)
3174
0
    {
3175
        // Retrieve the pixel & accumulate.
3176
0
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3177
0
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
3178
0
        const XMMReg4Double v_weight1 =
3179
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3180
0
        const XMMReg4Double v_weight2 =
3181
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
3182
3183
0
        v_acc1 += v_pixels1 * v_weight1;
3184
0
        v_acc1 += v_pixels2 * v_weight2;
3185
3186
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3187
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
3188
0
        v_acc2 += v_pixels1 * v_weight1;
3189
0
        v_acc2 += v_pixels2 * v_weight2;
3190
3191
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3192
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
3193
0
        v_acc3 += v_pixels1 * v_weight1;
3194
0
        v_acc3 += v_pixels2 * v_weight2;
3195
0
    }
3196
3197
0
    dfRes1 = v_acc1.GetHorizSum();
3198
0
    dfRes2 = v_acc2.GetHorizSum();
3199
0
    dfRes3 = v_acc3.GetHorizSum();
3200
0
    for (; i < nSrcPixelCount; ++i)
3201
0
    {
3202
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3203
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3204
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3205
0
    }
3206
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3207
3208
/************************************************************************/
3209
/*            GDALResampleConvolutionHorizontal_3rows<GByte>            */
3210
/************************************************************************/
3211
3212
template <>
3213
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3214
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3215
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3216
    double &dfRes2, double &dfRes3)
3217
0
{
3218
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3219
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3220
0
        dfRes1, dfRes2, dfRes3);
3221
0
}
3222
3223
template <>
3224
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3225
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3226
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3227
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3228
0
{
3229
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3230
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3231
0
        dfRes1, dfRes2, dfRes3);
3232
0
}
3233
3234
/************************************************************************/
3235
/*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>    */
3236
/************************************************************************/
3237
3238
template <class T>
3239
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3240
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3241
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3242
    double &dfRes2, double &dfRes3)
3243
0
{
3244
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3245
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3246
0
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3247
0
    int i = 0;  // Use after for.
3248
0
    for (; i < nSrcPixelCount - 3; i += 4)
3249
0
    {
3250
        // Retrieve the pixel & accumulate.
3251
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3252
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3253
0
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3254
0
        const XMMReg4Double v_weight =
3255
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3256
3257
0
        v_acc1 += v_pixels1 * v_weight;
3258
0
        v_acc2 += v_pixels2 * v_weight;
3259
0
        v_acc3 += v_pixels3 * v_weight;
3260
0
    }
3261
3262
0
    dfRes1 = v_acc1.GetHorizSum();
3263
0
    dfRes2 = v_acc2.GetHorizSum();
3264
0
    dfRes3 = v_acc3.GetHorizSum();
3265
3266
0
    for (; i < nSrcPixelCount; ++i)
3267
0
    {
3268
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3269
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3270
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3271
0
    }
3272
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3273
3274
/************************************************************************/
3275
/*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>     */
3276
/************************************************************************/
3277
3278
template <>
3279
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3280
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3281
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3282
    double &dfRes2, double &dfRes3)
3283
0
{
3284
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3285
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3286
0
        dfRes1, dfRes2, dfRes3);
3287
0
}
3288
3289
template <>
3290
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3291
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3292
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3293
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3294
0
{
3295
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3296
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3297
0
        dfRes1, dfRes2, dfRes3);
3298
0
}
3299
3300
/************************************************************************/
3301
/*      GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>      */
3302
/************************************************************************/
3303
3304
template <class T>
3305
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3306
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3307
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3308
    double &dfRes3)
3309
0
{
3310
0
    const XMMReg4Double v_weight =
3311
0
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3312
3313
    // Retrieve the pixel & accumulate.
3314
0
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3315
0
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3316
0
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3317
3318
0
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3319
0
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3320
0
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3321
3322
0
    dfRes1 = v_acc1.GetHorizSum();
3323
0
    dfRes2 = v_acc2.GetHorizSum();
3324
0
    dfRes3 = v_acc3.GetHorizSum();
3325
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, double&, double&, double&)
3326
3327
/************************************************************************/
3328
/*      GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>       */
3329
/************************************************************************/
3330
3331
template <>
3332
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3333
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3334
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3335
    double &dfRes3)
3336
0
{
3337
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3338
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3339
0
        dfRes3);
3340
0
}
3341
3342
template <>
3343
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3344
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3345
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3346
    double &dfRes2, double &dfRes3)
3347
0
{
3348
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3349
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3350
0
        dfRes3);
3351
0
}
3352
3353
#endif  // USE_SSE2
3354
3355
/************************************************************************/
3356
/*                   GDALResampleChunk_Convolution()                    */
3357
/************************************************************************/
3358
3359
template <class T, class Twork, GDALDataType eWrkDataType,
3360
          bool bKernelWithNegativeWeights, bool bNeedRescale>
3361
static CPLErr GDALResampleChunk_ConvolutionT(
3362
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3363
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3364
    int nKernelRadius, float fMaxVal)
3365
3366
0
{
3367
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3368
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3369
0
    const double dfSrcXDelta = args.dfSrcXDelta;
3370
0
    const double dfSrcYDelta = args.dfSrcYDelta;
3371
0
    constexpr int nBands = 1;
3372
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3373
0
    const int nChunkXOff = args.nChunkXOff;
3374
0
    const int nChunkXSize = args.nChunkXSize;
3375
0
    const int nChunkYOff = args.nChunkYOff;
3376
0
    const int nChunkYSize = args.nChunkYSize;
3377
0
    const int nDstXOff = args.nDstXOff;
3378
0
    const int nDstXOff2 = args.nDstXOff2;
3379
0
    const int nDstYOff = args.nDstYOff;
3380
0
    const int nDstYOff2 = args.nDstYOff2;
3381
0
    const bool bHasNoData = args.bHasNoData;
3382
0
    double dfNoDataValue = args.dfNoDataValue;
3383
3384
0
    if (!bHasNoData)
3385
0
        dfNoDataValue = 0.0;
3386
0
    const auto dstDataType = args.eOvrDataType;
3387
0
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3388
0
    const double dfReplacementVal =
3389
0
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3390
0
                   : dfNoDataValue;
3391
    // cppcheck-suppress unreadVariable
3392
0
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3393
0
    const bool bNoDataValueInt64Valid =
3394
0
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3395
0
    const auto nNodataValueInt64 =
3396
0
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3397
0
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3398
3399
    // TODO: we should have some generic function to do this.
3400
0
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3401
0
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3402
0
    if (dstDataType == GDT_UInt8)
3403
0
    {
3404
0
        fDstMin = std::numeric_limits<GByte>::min();
3405
0
        fDstMax = std::numeric_limits<GByte>::max();
3406
0
    }
3407
0
    else if (dstDataType == GDT_Int8)
3408
0
    {
3409
0
        fDstMin = std::numeric_limits<GInt8>::min();
3410
0
        fDstMax = std::numeric_limits<GInt8>::max();
3411
0
    }
3412
0
    else if (dstDataType == GDT_UInt16)
3413
0
    {
3414
0
        fDstMin = std::numeric_limits<GUInt16>::min();
3415
0
        fDstMax = std::numeric_limits<GUInt16>::max();
3416
0
    }
3417
0
    else if (dstDataType == GDT_Int16)
3418
0
    {
3419
0
        fDstMin = std::numeric_limits<GInt16>::min();
3420
0
        fDstMax = std::numeric_limits<GInt16>::max();
3421
0
    }
3422
0
    else if (dstDataType == GDT_UInt32)
3423
0
    {
3424
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3425
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3426
0
    }
3427
0
    else if (dstDataType == GDT_Int32)
3428
0
    {
3429
        // cppcheck-suppress unreadVariable
3430
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3431
        // cppcheck-suppress unreadVariable
3432
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3433
0
    }
3434
0
    else if (dstDataType == GDT_UInt64)
3435
0
    {
3436
        // cppcheck-suppress unreadVariable
3437
0
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3438
        // cppcheck-suppress unreadVariable
3439
        // (1 << 64) - 2048: largest uint64 value a double can hold
3440
0
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
3441
0
    }
3442
0
    else if (dstDataType == GDT_Int64)
3443
0
    {
3444
        // cppcheck-suppress unreadVariable
3445
0
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3446
        // cppcheck-suppress unreadVariable
3447
        // (1 << 63) - 1024: largest int64 that a double can hold
3448
0
        fDstMax = static_cast<Twork>(9223372036854774784LL);
3449
0
    }
3450
3451
0
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3452
0
                               bNoDataValueInt64Valid, nNodataValueInt64,
3453
0
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3454
0
    {
3455
0
        if (!bHasNoData)
3456
0
            return fVal;
3457
3458
        // Clamp value before comparing to nodata: this is only needed for
3459
        // kernels with negative weights (Lanczos)
3460
0
        Twork fClamped = fVal;
3461
0
        if (fClamped < fDstMin)
3462
0
            fClamped = fDstMin;
3463
0
        else if (fClamped > fDstMax)
3464
0
            fClamped = fDstMax;
3465
0
        if (isIntegerDT)
3466
0
        {
3467
0
            if (bNoDataValueInt64Valid)
3468
0
            {
3469
0
                const double fClampedRounded = double(std::round(fClamped));
3470
0
                if (fClampedRounded >=
3471
0
                        static_cast<double>(static_cast<Twork>(
3472
0
                            std::numeric_limits<int64_t>::min())) &&
3473
0
                    fClampedRounded <= static_cast<double>(static_cast<Twork>(
3474
0
                                           9223372036854774784LL)) &&
3475
0
                    nNodataValueInt64 ==
3476
0
                        static_cast<GInt64>(std::round(fClamped)))
3477
0
                {
3478
                    // Do not use the nodata value
3479
0
                    return static_cast<Twork>(dfReplacementVal);
3480
0
                }
3481
0
            }
3482
0
        }
3483
0
        else if (dfNoDataValue == static_cast<double>(fClamped))
3484
0
        {
3485
            // Do not use the nodata value
3486
0
            return static_cast<Twork>(dfReplacementVal);
3487
0
        }
3488
0
        return fClamped;
3489
0
    };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double)#1}::operator()(double) const
3490
3491
    /* -------------------------------------------------------------------- */
3492
    /*      Allocate work buffers.                                          */
3493
    /* -------------------------------------------------------------------- */
3494
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
3495
0
    Twork *pafWrkScanline = nullptr;
3496
0
    if (dstDataType != eWrkDataType)
3497
0
    {
3498
0
        pafWrkScanline =
3499
0
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3500
0
        if (pafWrkScanline == nullptr)
3501
0
            return CE_Failure;
3502
0
    }
3503
3504
0
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3505
0
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3506
0
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3507
0
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3508
0
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3509
0
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3510
3511
    // Temporary array to store result of horizontal filter.
3512
0
    double *const padfHorizontalFiltered = static_cast<double *>(
3513
0
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3514
3515
    // To store convolution coefficients.
3516
0
    double *const padfWeights =
3517
0
        static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3518
0
            static_cast<int>(
3519
0
                2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
3520
0
            sizeof(double)));
3521
3522
0
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3523
0
    if (pabyChunkNodataMask)
3524
0
        pabyChunkNodataMaskHorizontalFiltered =
3525
0
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3526
0
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3527
0
        (pabyChunkNodataMask != nullptr &&
3528
0
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3529
0
    {
3530
0
        VSIFree(pafWrkScanline);
3531
0
        VSIFree(padfHorizontalFiltered);
3532
0
        VSIFreeAligned(padfWeights);
3533
0
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3534
0
        return CE_Failure;
3535
0
    }
3536
3537
    /* ==================================================================== */
3538
    /*      First pass: horizontal filter                                   */
3539
    /* ==================================================================== */
3540
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3541
0
#ifdef USE_SSE2
3542
0
    const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3543
0
#endif
3544
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3545
0
    {
3546
0
        const double dfSrcPixel =
3547
0
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3548
0
        int nSrcPixelStart =
3549
0
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3550
0
        if (nSrcPixelStart < nChunkXOff)
3551
0
            nSrcPixelStart = nChunkXOff;
3552
0
        int nSrcPixelStop =
3553
0
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3554
0
        if (nSrcPixelStop > nChunkRightXOff)
3555
0
            nSrcPixelStop = nChunkRightXOff;
3556
#if 0
3557
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3558
        {
3559
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3560
        }
3561
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3562
        {
3563
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3564
        }
3565
#endif
3566
0
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3567
0
        double dfWeightSum = 0.0;
3568
3569
        // Compute convolution coefficients.
3570
0
        int nSrcPixel = nSrcPixelStart;
3571
0
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3572
0
        for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3573
0
        {
3574
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3575
0
            dfX += dfXScaleWeight;
3576
0
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3577
0
            dfX += dfXScaleWeight;
3578
0
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3579
0
            dfX += dfXScaleWeight;
3580
0
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3581
0
            dfX += dfXScaleWeight;
3582
0
            dfWeightSum +=
3583
0
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3584
0
        }
3585
0
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3586
0
        {
3587
0
            const double dfWeight = pfnFilterFunc(dfX);
3588
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3589
0
            dfWeightSum += dfWeight;
3590
0
        }
3591
3592
0
        const int nHeight = nChunkYSize * nBands;
3593
0
        if (pabyChunkNodataMask == nullptr)
3594
0
        {
3595
            // For floating-point data types, we must scale down a bit values
3596
            // if input values are close to +/- std::numeric_limits<T>::max()
3597
#ifdef OLD_CPPCHECK
3598
            constexpr double mulFactor = 1;
3599
#else
3600
0
            constexpr double mulFactor =
3601
0
                (bNeedRescale &&
3602
0
                 (std::is_same_v<T, float> || std::is_same_v<T, double>))
3603
0
                    ? 2
3604
0
                    : 1;
3605
0
#endif
3606
3607
0
            if (dfWeightSum != 0)
3608
0
            {
3609
0
                const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3610
0
                for (int i = 0; i < nSrcPixelCount; ++i)
3611
0
                {
3612
0
                    padfWeights[i] *= dfInvWeightSum;
3613
0
                }
3614
0
            }
3615
3616
0
            const auto ScaleValue = [
3617
#ifdef _MSC_VER
3618
                                        mulFactor
3619
#endif
3620
0
            ](double dfVal, [[maybe_unused]] const T *inputValues,
3621
0
                                    [[maybe_unused]] int nInputValues)
3622
0
            {
3623
0
                constexpr bool isFloat =
3624
0
                    std::is_same_v<T, float> || std::is_same_v<T, double>;
3625
                if constexpr (isFloat)
3626
0
                {
3627
0
                    if (std::isfinite(dfVal))
3628
0
                    {
3629
0
                        return std::clamp(dfVal,
3630
0
                                          -std::numeric_limits<double>::max() /
3631
0
                                              mulFactor,
3632
0
                                          std::numeric_limits<double>::max() /
3633
0
                                              mulFactor) *
3634
0
                               mulFactor;
3635
0
                    }
3636
                    else if constexpr (bKernelWithNegativeWeights)
3637
0
                    {
3638
0
                        if (std::isnan(dfVal))
3639
0
                        {
3640
                            // Either one of the input value is NaN or they are +/-Inf
3641
0
                            const bool isPositive = inputValues[0] >= 0;
3642
0
                            for (int i = 0; i < nInputValues; ++i)
3643
0
                            {
3644
0
                                if (std::isnan(inputValues[i]))
3645
0
                                    return dfVal;
3646
                                // cppcheck-suppress knownConditionTrueFalse
3647
0
                                if ((inputValues[i] >= 0) != isPositive)
3648
0
                                    return dfVal;
3649
0
                            }
3650
                            // All values are positive or negative infinity
3651
0
                            return static_cast<double>(inputValues[0]);
3652
0
                        }
3653
0
                    }
3654
0
                }
3655
0
                return dfVal;
3656
0
            };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, float const*, int)#1}::operator()(double, float const*, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int)#1}::operator()(double, double const*, int) const
3657
3658
0
            int iSrcLineOff = 0;
3659
0
#ifdef USE_SSE2
3660
0
            if (nSrcPixelCount == 4)
3661
0
            {
3662
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3663
0
                {
3664
0
                    const size_t j =
3665
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3666
0
                        (nSrcPixelStart - nChunkXOff);
3667
0
                    double dfVal1 = 0.0;
3668
0
                    double dfVal2 = 0.0;
3669
0
                    double dfVal3 = 0.0;
3670
0
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
3671
0
                        pChunk + j, pChunk + j + nChunkXSize,
3672
0
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3673
0
                        dfVal2, dfVal3);
3674
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3675
0
                                               nDstXSize +
3676
0
                                           iDstPixel - nDstXOff] =
3677
0
                        ScaleValue(dfVal1, pChunk + j, 4);
3678
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3679
0
                                            1) *
3680
0
                                               nDstXSize +
3681
0
                                           iDstPixel - nDstXOff] =
3682
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
3683
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3684
0
                                            2) *
3685
0
                                               nDstXSize +
3686
0
                                           iDstPixel - nDstXOff] =
3687
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
3688
0
                }
3689
0
            }
3690
0
            else if (bSrcPixelCountLess8)
3691
0
            {
3692
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3693
0
                {
3694
0
                    const size_t j =
3695
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3696
0
                        (nSrcPixelStart - nChunkXOff);
3697
0
                    double dfVal1 = 0.0;
3698
0
                    double dfVal2 = 0.0;
3699
0
                    double dfVal3 = 0.0;
3700
0
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3701
0
                        pChunk + j, pChunk + j + nChunkXSize,
3702
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3703
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3704
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3705
0
                                               nDstXSize +
3706
0
                                           iDstPixel - nDstXOff] =
3707
0
                        ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3708
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3709
0
                                            1) *
3710
0
                                               nDstXSize +
3711
0
                                           iDstPixel - nDstXOff] =
3712
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3713
0
                                   nSrcPixelCount);
3714
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3715
0
                                            2) *
3716
0
                                               nDstXSize +
3717
0
                                           iDstPixel - nDstXOff] =
3718
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3719
0
                                   nSrcPixelCount);
3720
0
                }
3721
0
            }
3722
0
            else
3723
0
#endif
3724
0
            {
3725
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3726
0
                {
3727
0
                    const size_t j =
3728
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3729
0
                        (nSrcPixelStart - nChunkXOff);
3730
0
                    double dfVal1 = 0.0;
3731
0
                    double dfVal2 = 0.0;
3732
0
                    double dfVal3 = 0.0;
3733
0
                    GDALResampleConvolutionHorizontal_3rows(
3734
0
                        pChunk + j, pChunk + j + nChunkXSize,
3735
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3736
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3737
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3738
0
                                               nDstXSize +
3739
0
                                           iDstPixel - nDstXOff] =
3740
0
                        ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
3741
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3742
0
                                            1) *
3743
0
                                               nDstXSize +
3744
0
                                           iDstPixel - nDstXOff] =
3745
0
                        ScaleValue(dfVal2, pChunk + j + nChunkXSize,
3746
0
                                   nSrcPixelCount);
3747
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3748
0
                                            2) *
3749
0
                                               nDstXSize +
3750
0
                                           iDstPixel - nDstXOff] =
3751
0
                        ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
3752
0
                                   nSrcPixelCount);
3753
0
                }
3754
0
            }
3755
0
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3756
0
            {
3757
0
                const size_t j =
3758
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3759
0
                    (nSrcPixelStart - nChunkXOff);
3760
0
                const double dfVal = GDALResampleConvolutionHorizontal(
3761
0
                    pChunk + j, padfWeights, nSrcPixelCount);
3762
0
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3763
0
                                           nDstXSize +
3764
0
                                       iDstPixel - nDstXOff] =
3765
0
                    ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
3766
0
            }
3767
0
        }
3768
0
        else
3769
0
        {
3770
0
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3771
0
            {
3772
0
                const size_t j =
3773
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3774
0
                    (nSrcPixelStart - nChunkXOff);
3775
3776
0
                if (bKernelWithNegativeWeights)
3777
0
                {
3778
0
                    int nConsecutiveValid = 0;
3779
0
                    int nMaxConsecutiveValid = 0;
3780
0
                    for (int k = 0; k < nSrcPixelCount; k++)
3781
0
                    {
3782
0
                        if (pabyChunkNodataMask[j + k])
3783
0
                            nConsecutiveValid++;
3784
0
                        else if (nConsecutiveValid)
3785
0
                        {
3786
0
                            nMaxConsecutiveValid = std::max(
3787
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3788
0
                            nConsecutiveValid = 0;
3789
0
                        }
3790
0
                    }
3791
0
                    nMaxConsecutiveValid =
3792
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3793
0
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3794
0
                    {
3795
0
                        const size_t nTempOffset =
3796
0
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
3797
0
                            iDstPixel - nDstXOff;
3798
0
                        padfHorizontalFiltered[nTempOffset] = 0.0;
3799
0
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3800
0
                        continue;
3801
0
                    }
3802
0
                }
3803
3804
0
                double dfVal = 0.0;
3805
0
                GDALResampleConvolutionHorizontalWithMask(
3806
0
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
3807
0
                    nSrcPixelCount, dfVal, dfWeightSum);
3808
0
                const size_t nTempOffset =
3809
0
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3810
0
                    nDstXOff;
3811
0
                if (dfWeightSum > 0.0)
3812
0
                {
3813
0
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3814
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3815
0
                }
3816
0
                else
3817
0
                {
3818
0
                    padfHorizontalFiltered[nTempOffset] = 0.0;
3819
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3820
0
                }
3821
0
            }
3822
0
        }
3823
0
    }
3824
3825
    /* ==================================================================== */
3826
    /*      Second pass: vertical filter                                    */
3827
    /* ==================================================================== */
3828
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3829
3830
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3831
0
    {
3832
0
        Twork *const pafDstScanline =
3833
0
            pafWrkScanline
3834
0
                ? pafWrkScanline
3835
0
                : static_cast<Twork *>(pDstBuffer) +
3836
0
                      static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3837
3838
0
        const double dfSrcLine =
3839
0
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3840
0
        int nSrcLineStart =
3841
0
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3842
0
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3843
0
        if (nSrcLineStart < nChunkYOff)
3844
0
            nSrcLineStart = nChunkYOff;
3845
0
        if (nSrcLineStop > nChunkBottomYOff)
3846
0
            nSrcLineStop = nChunkBottomYOff;
3847
#if 0
3848
        if( nSrcLineStart < nChunkYOff &&
3849
            nChunkYOff > 0 )
3850
        {
3851
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3852
        }
3853
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3854
        {
3855
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3856
        }
3857
#endif
3858
0
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3859
0
        double dfWeightSum = 0.0;
3860
3861
        // Compute convolution coefficients.
3862
0
        int nSrcLine = nSrcLineStart;  // Used after for.
3863
0
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3864
0
        for (; nSrcLine < nSrcLineStop - 3;
3865
0
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3866
0
        {
3867
0
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
3868
0
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3869
0
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
3870
0
                dfY + 2 * dfYScaleWeight;
3871
0
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
3872
0
                dfY + 3 * dfYScaleWeight;
3873
0
            dfWeightSum +=
3874
0
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3875
0
        }
3876
0
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3877
0
        {
3878
0
            const double dfWeight = pfnFilterFunc(dfY);
3879
0
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3880
0
            dfWeightSum += dfWeight;
3881
0
        }
3882
3883
0
        if (pabyChunkNodataMask == nullptr)
3884
0
        {
3885
            // For floating-point data types, we must scale down a bit values
3886
            // if input values are close to +/- std::numeric_limits<T>::max()
3887
#ifdef OLD_CPPCHECK
3888
            constexpr double mulFactor = 1;
3889
#else
3890
0
            constexpr double mulFactor =
3891
0
                (bNeedRescale &&
3892
0
                 (std::is_same_v<T, float> || std::is_same_v<T, double>))
3893
0
                    ? 2
3894
0
                    : 1;
3895
0
#endif
3896
3897
0
            if (dfWeightSum != 0)
3898
0
            {
3899
0
                const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
3900
0
                for (int i = 0; i < nSrcLineCount; ++i)
3901
0
                    padfWeights[i] *= dfInvWeightSum;
3902
0
            }
3903
3904
0
            int iFilteredPixelOff = 0;  // Used after for.
3905
            // j used after for.
3906
0
            size_t j =
3907
0
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3908
0
#ifdef USE_SSE2
3909
            if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
3910
                          eWrkDataType == GDT_Float32)
3911
0
            {
3912
#ifdef __AVX__
3913
                for (; iFilteredPixelOff < nDstXSize - 15;
3914
                     iFilteredPixelOff += 16, j += 16)
3915
                {
3916
                    GDALResampleConvolutionVertical_16cols(
3917
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3918
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3919
                    if (bHasNoData)
3920
                    {
3921
                        for (int k = 0; k < 16; k++)
3922
                        {
3923
                            pafDstScanline[iFilteredPixelOff + k] =
3924
                                replaceValIfNodata(
3925
                                    pafDstScanline[iFilteredPixelOff + k]);
3926
                        }
3927
                    }
3928
                }
3929
#else
3930
0
                for (; iFilteredPixelOff < nDstXSize - 7;
3931
0
                     iFilteredPixelOff += 8, j += 8)
3932
0
                {
3933
0
                    GDALResampleConvolutionVertical_8cols(
3934
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3935
0
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3936
0
                    if (bHasNoData)
3937
0
                    {
3938
0
                        for (int k = 0; k < 8; k++)
3939
0
                        {
3940
0
                            pafDstScanline[iFilteredPixelOff + k] =
3941
0
                                replaceValIfNodata(
3942
0
                                    pafDstScanline[iFilteredPixelOff + k]);
3943
0
                        }
3944
0
                    }
3945
0
                }
3946
0
#endif
3947
3948
0
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3949
0
                {
3950
0
                    const Twork fVal =
3951
0
                        static_cast<Twork>(GDALResampleConvolutionVertical(
3952
0
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
3953
0
                            nSrcLineCount));
3954
0
                    pafDstScanline[iFilteredPixelOff] =
3955
0
                        replaceValIfNodata(fVal);
3956
0
                }
3957
            }
3958
            else
3959
#endif
3960
0
            {
3961
0
                const auto ScaleValue = [
3962
#ifdef _MSC_VER
3963
                                            mulFactor
3964
#endif
3965
0
                ](double dfVal, [[maybe_unused]] const double *inputValues,
3966
0
                                        [[maybe_unused]] int nStride,
3967
0
                                        [[maybe_unused]] int nInputValues)
3968
0
                {
3969
0
                    constexpr bool isFloat =
3970
0
                        std::is_same_v<T, float> || std::is_same_v<T, double>;
3971
                    if constexpr (isFloat)
3972
0
                    {
3973
0
                        if (std::isfinite(dfVal))
3974
0
                        {
3975
0
                            return std::clamp(
3976
0
                                       dfVal,
3977
0
                                       static_cast<double>(
3978
0
                                           -std::numeric_limits<Twork>::max()) /
3979
0
                                           mulFactor,
3980
0
                                       static_cast<double>(
3981
0
                                           std::numeric_limits<Twork>::max()) /
3982
0
                                           mulFactor) *
3983
0
                                   mulFactor;
3984
0
                        }
3985
                        else if constexpr (bKernelWithNegativeWeights)
3986
0
                        {
3987
0
                            if (std::isnan(dfVal))
3988
0
                            {
3989
                                // Either one of the input value is NaN or they are +/-Inf
3990
0
                                const bool isPositive = inputValues[0] >= 0;
3991
0
                                for (int i = 0; i < nInputValues; ++i)
3992
0
                                {
3993
0
                                    if (std::isnan(inputValues[i * nStride]))
3994
0
                                        return dfVal;
3995
                                    // cppcheck-suppress knownConditionTrueFalse
3996
0
                                    if ((inputValues[i] >= 0) != isPositive)
3997
0
                                        return dfVal;
3998
0
                                }
3999
                                // All values are positive or negative infinity
4000
0
                                return inputValues[0];
4001
0
                            }
4002
0
                        }
4003
0
                    }
4004
4005
0
                    return dfVal;
4006
0
                };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)::{lambda(double, double const*, int, int)#1}::operator()(double, double const*, int, int) const
4007
4008
0
                for (; iFilteredPixelOff < nDstXSize - 1;
4009
0
                     iFilteredPixelOff += 2, j += 2)
4010
0
                {
4011
0
                    double dfVal1 = 0.0;
4012
0
                    double dfVal2 = 0.0;
4013
0
                    GDALResampleConvolutionVertical_2cols(
4014
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
4015
0
                        nSrcLineCount, dfVal1, dfVal2);
4016
0
                    pafDstScanline[iFilteredPixelOff] =
4017
0
                        replaceValIfNodata(static_cast<Twork>(
4018
0
                            ScaleValue(dfVal1, padfHorizontalFiltered + j,
4019
0
                                       nDstXSize, nSrcLineCount)));
4020
0
                    pafDstScanline[iFilteredPixelOff + 1] =
4021
0
                        replaceValIfNodata(static_cast<Twork>(
4022
0
                            ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
4023
0
                                       nDstXSize, nSrcLineCount)));
4024
0
                }
4025
0
                if (iFilteredPixelOff < nDstXSize)
4026
0
                {
4027
0
                    const double dfVal = GDALResampleConvolutionVertical(
4028
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
4029
0
                        nSrcLineCount);
4030
0
                    pafDstScanline[iFilteredPixelOff] =
4031
0
                        replaceValIfNodata(static_cast<Twork>(
4032
0
                            ScaleValue(dfVal, padfHorizontalFiltered + j,
4033
0
                                       nDstXSize, nSrcLineCount)));
4034
0
                }
4035
0
            }
4036
0
        }
4037
0
        else
4038
0
        {
4039
0
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
4040
0
                 ++iFilteredPixelOff)
4041
0
            {
4042
0
                double dfVal = 0.0;
4043
0
                dfWeightSum = 0.0;
4044
0
                size_t j = (nSrcLineStart - nChunkYOff) *
4045
0
                               static_cast<size_t>(nDstXSize) +
4046
0
                           iFilteredPixelOff;
4047
0
                if (bKernelWithNegativeWeights)
4048
0
                {
4049
0
                    int nConsecutiveValid = 0;
4050
0
                    int nMaxConsecutiveValid = 0;
4051
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4052
0
                    {
4053
0
                        const double dfWeight =
4054
0
                            padfWeights[i] *
4055
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
4056
0
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
4057
0
                        {
4058
0
                            nConsecutiveValid++;
4059
0
                        }
4060
0
                        else if (nConsecutiveValid)
4061
0
                        {
4062
0
                            nMaxConsecutiveValid = std::max(
4063
0
                                nMaxConsecutiveValid, nConsecutiveValid);
4064
0
                            nConsecutiveValid = 0;
4065
0
                        }
4066
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
4067
0
                        dfWeightSum += dfWeight;
4068
0
                    }
4069
0
                    nMaxConsecutiveValid =
4070
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
4071
0
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
4072
0
                    {
4073
0
                        pafDstScanline[iFilteredPixelOff] =
4074
0
                            static_cast<Twork>(dfNoDataValue);
4075
0
                        continue;
4076
0
                    }
4077
0
                }
4078
0
                else
4079
0
                {
4080
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
4081
0
                    {
4082
0
                        const double dfWeight =
4083
0
                            padfWeights[i] *
4084
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
4085
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
4086
0
                        dfWeightSum += dfWeight;
4087
0
                    }
4088
0
                }
4089
0
                if (dfWeightSum > 0.0)
4090
0
                {
4091
0
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
4092
0
                        static_cast<Twork>(dfVal / dfWeightSum));
4093
0
                }
4094
0
                else
4095
0
                {
4096
0
                    pafDstScanline[iFilteredPixelOff] =
4097
0
                        static_cast<Twork>(dfNoDataValue);
4098
0
                }
4099
0
            }
4100
0
        }
4101
4102
0
        if (fMaxVal != 0.0f)
4103
0
        {
4104
            if constexpr (std::is_same_v<T, double>)
4105
0
            {
4106
0
                for (int i = 0; i < nDstXSize; ++i)
4107
0
                {
4108
0
                    if (pafDstScanline[i] > static_cast<double>(fMaxVal))
4109
0
                        pafDstScanline[i] = static_cast<double>(fMaxVal);
4110
0
                }
4111
            }
4112
            else
4113
0
            {
4114
0
                for (int i = 0; i < nDstXSize; ++i)
4115
0
                {
4116
0
                    if (pafDstScanline[i] > fMaxVal)
4117
0
                        pafDstScanline[i] = fMaxVal;
4118
0
                }
4119
0
            }
4120
0
        }
4121
4122
0
        if (pafWrkScanline)
4123
0
        {
4124
0
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
4125
0
                            static_cast<GByte *>(pDstBuffer) +
4126
0
                                static_cast<size_t>(iDstLine - nDstYOff) *
4127
0
                                    nDstXSize * nDstDataTypeSize,
4128
0
                            dstDataType, nDstDataTypeSize, nDstXSize);
4129
0
        }
4130
0
    }
4131
4132
0
    VSIFree(pafWrkScanline);
4133
0
    VSIFreeAligned(padfWeights);
4134
0
    VSIFree(padfHorizontalFiltered);
4135
0
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
4136
4137
0
    return CE_None;
4138
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, true, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, true, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, true>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, true>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6, false, false>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7, false, false>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, float)
4139
4140
template <bool bKernelWithNegativeWeights, bool bNeedRescale>
4141
static CPLErr
4142
GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
4143
                                      const void *pChunk, void **ppDstBuffer,
4144
                                      GDALDataType *peDstBufferDataType)
4145
0
{
4146
0
    GDALResampleAlg eResample;
4147
0
    if (EQUAL(args.pszResampling, "BILINEAR"))
4148
0
        eResample = GRA_Bilinear;
4149
0
    else if (EQUAL(args.pszResampling, "CUBIC"))
4150
0
        eResample = GRA_Cubic;
4151
0
    else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
4152
0
        eResample = GRA_CubicSpline;
4153
0
    else if (EQUAL(args.pszResampling, "LANCZOS"))
4154
0
        eResample = GRA_Lanczos;
4155
0
    else
4156
0
    {
4157
0
        CPLAssert(false);
4158
0
        return CE_Failure;
4159
0
    }
4160
0
    const int nKernelRadius = GWKGetFilterRadius(eResample);
4161
0
    FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
4162
0
    const FilterFunc4ValuesType pfnFilterFunc4Values =
4163
0
        GWKGetFilterFunc4Values(eResample);
4164
4165
0
    float fMaxVal = 0.f;
4166
    // Cubic, etc... can have overshoots, so make sure we clamp values to the
4167
    // maximum value if NBITS is set.
4168
0
    if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
4169
0
        (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
4170
0
         args.eOvrDataType == GDT_UInt32))
4171
0
    {
4172
0
        int nBits = args.nOvrNBITS;
4173
0
        if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
4174
0
            nBits = 0;
4175
0
        if (nBits > 0 && nBits < 32)
4176
0
            fMaxVal = static_cast<float>((1U << nBits) - 1);
4177
0
    }
4178
4179
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(
4180
0
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
4181
0
        GDALGetDataTypeSizeBytes(args.eOvrDataType));
4182
0
    if (*ppDstBuffer == nullptr)
4183
0
    {
4184
0
        return CE_Failure;
4185
0
    }
4186
0
    *peDstBufferDataType = args.eOvrDataType;
4187
4188
0
    switch (args.eWrkDataType)
4189
0
    {
4190
0
        case GDT_UInt8:
4191
0
        {
4192
0
            return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
4193
0
                                                  bKernelWithNegativeWeights,
4194
0
                                                  bNeedRescale>(
4195
0
                args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
4196
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4197
0
        }
4198
4199
0
        case GDT_UInt16:
4200
0
        {
4201
0
            return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
4202
0
                                                  bKernelWithNegativeWeights,
4203
0
                                                  bNeedRescale>(
4204
0
                args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
4205
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4206
0
        }
4207
4208
0
        case GDT_Float32:
4209
0
        {
4210
0
            return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
4211
0
                                                  bKernelWithNegativeWeights,
4212
0
                                                  bNeedRescale>(
4213
0
                args, static_cast<const float *>(pChunk), *ppDstBuffer,
4214
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4215
0
        }
4216
4217
0
        case GDT_Float64:
4218
0
        {
4219
0
            return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
4220
0
                                                  bKernelWithNegativeWeights,
4221
0
                                                  bNeedRescale>(
4222
0
                args, static_cast<const double *>(pChunk), *ppDstBuffer,
4223
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
4224
0
        }
4225
4226
0
        default:
4227
0
            break;
4228
0
    }
4229
4230
0
    CPLAssert(false);
4231
0
    return CE_Failure;
4232
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<true, true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<false, true>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionInternal<false, false>(GDALOverviewResampleArgs const&, void const*, void**, GDALDataType*)
4233
4234
// Entry point for convolution-based resampling: picks the
// GDALResampleChunk_ConvolutionInternal<bKernelWithNegativeWeights,
// bNeedRescale> instantiation from the resampling name and forwards all
// arguments unchanged.
//
//   CUBIC, LANCZOS  -> <true,  true>
//   CUBICSPLINE     -> <false, true>
//   anything else   -> <false, false>
//
// Returns whatever the selected instantiation returns.
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    const char *const pszMethod = args.pszResampling;

    const bool bNegativeWeightKernel =
        EQUAL(pszMethod, "CUBIC") || EQUAL(pszMethod, "LANCZOS");
    if (bNegativeWeightKernel)
    {
        return GDALResampleChunk_ConvolutionInternal<
            /* bKernelWithNegativeWeights = */ true,
            /* bNeedRescale = */ true>(args, pChunk, ppDstBuffer,
                                       peDstBufferDataType);
    }

    if (EQUAL(pszMethod, "CUBICSPLINE"))
    {
        return GDALResampleChunk_ConvolutionInternal<
            /* bKernelWithNegativeWeights = */ false,
            /* bNeedRescale = */ true>(args, pChunk, ppDstBuffer,
                                       peDstBufferDataType);
    }

    return GDALResampleChunk_ConvolutionInternal<
        /* bKernelWithNegativeWeights = */ false,
        /* bNeedRescale = */ false>(args, pChunk, ppDstBuffer,
                                    peDstBufferDataType);
}
4251
4252
/************************************************************************/
4253
/*                       GDALResampleChunkC32R()                        */
4254
/************************************************************************/
4255
4256
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
4257
                                    const float *pafChunk, const int nChunkYOff,
4258
                                    const int nChunkYSize, const int nDstYOff,
4259
                                    const int nDstYOff2, const int nOvrXSize,
4260
                                    const int nOvrYSize, void **ppDstBuffer,
4261
                                    GDALDataType *peDstBufferDataType,
4262
                                    const char *pszResampling)
4263
4264
0
{
4265
0
    enum Method
4266
0
    {
4267
0
        NEAR,
4268
0
        AVERAGE,
4269
0
        AVERAGE_MAGPHASE,
4270
0
        RMS,
4271
0
    };
4272
4273
0
    Method eMethod = NEAR;
4274
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
4275
0
    {
4276
0
        eMethod = NEAR;
4277
0
    }
4278
0
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
4279
0
    {
4280
0
        eMethod = AVERAGE_MAGPHASE;
4281
0
    }
4282
0
    else if (EQUAL(pszResampling, "RMS"))
4283
0
    {
4284
0
        eMethod = RMS;
4285
0
    }
4286
0
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
4287
0
    {
4288
0
        eMethod = AVERAGE;
4289
0
    }
4290
0
    else
4291
0
    {
4292
0
        CPLError(
4293
0
            CE_Failure, CPLE_NotSupported,
4294
0
            "Resampling method %s is not supported for complex data types. "
4295
0
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
4296
0
            pszResampling);
4297
0
        return CE_Failure;
4298
0
    }
4299
4300
0
    const int nOXSize = nOvrXSize;
4301
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
4302
0
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
4303
0
    if (*ppDstBuffer == nullptr)
4304
0
    {
4305
0
        return CE_Failure;
4306
0
    }
4307
0
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
4308
0
    *peDstBufferDataType = GDT_CFloat32;
4309
4310
0
    const int nOYSize = nOvrYSize;
4311
0
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
4312
0
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
4313
4314
    /* ==================================================================== */
4315
    /*      Loop over destination scanlines.                                */
4316
    /* ==================================================================== */
4317
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
4318
0
    {
4319
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
4320
0
        if (nSrcYOff < nChunkYOff)
4321
0
            nSrcYOff = nChunkYOff;
4322
4323
0
        int nSrcYOff2 =
4324
0
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
4325
0
        if (nSrcYOff2 == nSrcYOff)
4326
0
            nSrcYOff2++;
4327
4328
0
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
4329
0
        {
4330
0
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
4331
0
                nSrcYOff = nSrcHeight - 1;
4332
0
            nSrcYOff2 = nSrcHeight;
4333
0
        }
4334
0
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
4335
0
            nSrcYOff2 = nChunkYOff + nChunkYSize;
4336
4337
0
        const float *const pafSrcScanline =
4338
0
            pafChunk +
4339
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
4340
0
        float *const pafDstScanline =
4341
0
            pafDstBuffer +
4342
0
            static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
4343
4344
        /* --------------------------------------------------------------------
4345
         */
4346
        /*      Loop over destination pixels */
4347
        /* --------------------------------------------------------------------
4348
         */
4349
0
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
4350
0
        {
4351
0
            const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
4352
0
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
4353
0
            int nSrcXOff2 =
4354
0
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
4355
0
            if (nSrcXOff2 == nSrcXOff)
4356
0
                nSrcXOff2++;
4357
0
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
4358
0
            {
4359
0
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
4360
0
                    nSrcXOff = nSrcWidth - 1;
4361
0
                nSrcXOff2 = nSrcWidth;
4362
0
            }
4363
0
            const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
4364
4365
0
            if (eMethod == NEAR)
4366
0
            {
4367
0
                pafDstScanline[iDstPixelSZ * 2] =
4368
0
                    pafSrcScanline[nSrcXOffSZ * 2];
4369
0
                pafDstScanline[iDstPixelSZ * 2 + 1] =
4370
0
                    pafSrcScanline[nSrcXOffSZ * 2 + 1];
4371
0
            }
4372
0
            else if (eMethod == AVERAGE_MAGPHASE)
4373
0
            {
4374
0
                double dfTotalR = 0.0;
4375
0
                double dfTotalI = 0.0;
4376
0
                double dfTotalM = 0.0;
4377
0
                size_t nCount = 0;
4378
4379
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4380
0
                {
4381
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4382
0
                    {
4383
0
                        const double dfR = double(
4384
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4385
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4386
0
                                               nSrcWidth * 2]);
4387
0
                        const double dfI = double(
4388
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4389
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4390
0
                                               nSrcWidth * 2 +
4391
0
                                           1]);
4392
0
                        dfTotalR += dfR;
4393
0
                        dfTotalI += dfI;
4394
0
                        dfTotalM += std::hypot(dfR, dfI);
4395
0
                        ++nCount;
4396
0
                    }
4397
0
                }
4398
4399
0
                CPLAssert(nCount > 0);
4400
0
                if (nCount == 0)
4401
0
                {
4402
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4403
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4404
0
                }
4405
0
                else
4406
0
                {
4407
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4408
0
                        dfTotalR / static_cast<double>(nCount));
4409
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4410
0
                        dfTotalI / static_cast<double>(nCount));
4411
0
                    const double dfM =
4412
0
                        double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
4413
0
                                          pafDstScanline[iDstPixelSZ * 2 + 1]));
4414
0
                    const double dfDesiredM =
4415
0
                        dfTotalM / static_cast<double>(nCount);
4416
0
                    double dfRatio = 1.0;
4417
0
                    if (dfM != 0.0)
4418
0
                        dfRatio = dfDesiredM / dfM;
4419
4420
0
                    pafDstScanline[iDstPixelSZ * 2] *=
4421
0
                        static_cast<float>(dfRatio);
4422
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] *=
4423
0
                        static_cast<float>(dfRatio);
4424
0
                }
4425
0
            }
4426
0
            else if (eMethod == RMS)
4427
0
            {
4428
0
                double dfTotalR = 0.0;
4429
0
                double dfTotalI = 0.0;
4430
0
                size_t nCount = 0;
4431
4432
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4433
0
                {
4434
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4435
0
                    {
4436
0
                        const double dfR = double(
4437
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4438
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4439
0
                                               nSrcWidth * 2]);
4440
0
                        const double dfI = double(
4441
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4442
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4443
0
                                               nSrcWidth * 2 +
4444
0
                                           1]);
4445
4446
0
                        dfTotalR += SQUARE(dfR);
4447
0
                        dfTotalI += SQUARE(dfI);
4448
4449
0
                        ++nCount;
4450
0
                    }
4451
0
                }
4452
4453
0
                CPLAssert(nCount > 0);
4454
0
                if (nCount == 0)
4455
0
                {
4456
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4457
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4458
0
                }
4459
0
                else
4460
0
                {
4461
                    /* compute RMS */
4462
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4463
0
                        sqrt(dfTotalR / static_cast<double>(nCount)));
4464
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4465
0
                        sqrt(dfTotalI / static_cast<double>(nCount)));
4466
0
                }
4467
0
            }
4468
0
            else if (eMethod == AVERAGE)
4469
0
            {
4470
0
                double dfTotalR = 0.0;
4471
0
                double dfTotalI = 0.0;
4472
0
                size_t nCount = 0;
4473
4474
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4475
0
                {
4476
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4477
0
                    {
4478
                        // TODO(schwehr): Maybe use std::complex?
4479
0
                        dfTotalR += double(
4480
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4481
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4482
0
                                               nSrcWidth * 2]);
4483
0
                        dfTotalI += double(
4484
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4485
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4486
0
                                               nSrcWidth * 2 +
4487
0
                                           1]);
4488
0
                        ++nCount;
4489
0
                    }
4490
0
                }
4491
4492
0
                CPLAssert(nCount > 0);
4493
0
                if (nCount == 0)
4494
0
                {
4495
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4496
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4497
0
                }
4498
0
                else
4499
0
                {
4500
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4501
0
                        dfTotalR / static_cast<double>(nCount));
4502
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4503
0
                        dfTotalI / static_cast<double>(nCount));
4504
0
                }
4505
0
            }
4506
0
        }
4507
0
    }
4508
4509
0
    return CE_None;
4510
0
}
4511
4512
/************************************************************************/
4513
/*                  GDALRegenerateCascadingOverviews()                  */
4514
/*                                                                      */
4515
/*      Generate a list of overviews in order from largest to           */
4516
/*      smallest, computing each from the next larger.                  */
4517
/************************************************************************/
4518
4519
static CPLErr GDALRegenerateCascadingOverviews(
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)

{
    // Exact number of pixels of a band. The product is computed on 64-bit
    // integers so that it is never rounded, whatever the band dimensions
    // (a single-precision float product is only exact up to 2^24 pixels).
    const auto GetPixelCount = [](GDALRasterBand *poBand)
    {
        return static_cast<int64_t>(poBand->GetXSize()) * poBand->GetYSize();
    };

    /* -------------------------------------------------------------------- */
    /*      First, we must put the overviews in order from largest to       */
    /*      smallest. The caller's array is reordered in place, and bands   */
    /*      with the same pixel count keep their relative order.            */
    /* -------------------------------------------------------------------- */
    if (nOverviews > 1)
    {
        std::stable_sort(
            papoOvrBands, papoOvrBands + nOverviews,
            [&GetPixelCount](GDALRasterBand *poLhs, GDALRasterBand *poRhs)
            { return GetPixelCount(poLhs) > GetPixelCount(poRhs); });
    }

    /* -------------------------------------------------------------------- */
    /*      Count total pixels so we can prepare appropriate scaled         */
    /*      progress functions.                                             */
    /* -------------------------------------------------------------------- */
    double dfTotalPixels = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        dfTotalPixels += static_cast<double>(GetPixelCount(papoOvrBands[i]));
    }

    /* -------------------------------------------------------------------- */
    /*      Generate all the bands.                                         */
    /* -------------------------------------------------------------------- */
    double dfPixelsProcessed = 0.0;

    // Tag the nested calls with CASCADING=YES. GDALRegenerateOverviewsEx()
    // looks for this key before emitting its conflicting-mask-sources warning.
    CPLStringList aosOptions(papszOptions);
    aosOptions.SetNameValue("CASCADING", "YES");
    for (int i = 0; i < nOverviews; ++i)
    {
        // The largest overview is computed from the source band, every
        // following one from the overview generated just before it.
        GDALRasterBand *poBaseBand =
            (i == 0) ? poSrcBand : papoOvrBands[i - 1];

        const double dfPixels =
            static_cast<double>(GetPixelCount(papoOvrBands[i]));

        // Map the progress of this level onto its share of the total work.
        void *pScaledProgressData = GDALCreateScaledProgress(
            dfPixelsProcessed / dfTotalPixels,
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
            pProgressData);

        const CPLErr eErr = GDALRegenerateOverviewsEx(
            poBaseBand, 1,
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
            pszResampling, GDALScaledProgress, pScaledProgressData,
            aosOptions.List());
        GDALDestroyScaledProgress(pScaledProgressData);

        if (eErr != CE_None)
            return eErr;

        dfPixelsProcessed += dfPixels;

        // Only do the bit2grayscale promotion on the base band.
        if (STARTS_WITH_CI(pszResampling,
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
            pszResampling = "AVERAGE";
    }

    return CE_None;
}
4598
4599
/************************************************************************/
4600
/*                      GDALGetResampleFunction()                       */
4601
/************************************************************************/
4602
4603
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
                                             int *pnRadius)
{
    // Look up the chunk resampling routine matching a resampling method name.
    // When pnRadius is not null it receives the kernel radius of the method:
    // 0 by default, 1 for GAUSS, and the value of GWKGetFilterRadius() for the
    // convolution based methods. An unknown name emits a CE_Failure error and
    // yields nullptr.

    // Writes a fixed radius to the optional output parameter.
    const auto StoreRadius = [pnRadius](int nRadius)
    {
        if (pnRadius)
            *pnRadius = nRadius;
    };

    // Same, but the radius is queried from the warper kernel definition, and
    // only when the caller actually asked for it.
    const auto StoreKernelRadius = [pnRadius](auto eKernel)
    {
        if (pnRadius)
            *pnRadius = GWKGetFilterRadius(eKernel);
    };

    StoreRadius(0);

    if (STARTS_WITH_CI(pszResampling, "NEAR"))
        return GDALResampleChunk_Near;

    if (STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS"))
        return GDALResampleChunk_AverageOrRMS;

    if (EQUAL(pszResampling, "GAUSS"))
    {
        StoreRadius(1);
        return GDALResampleChunk_Gauss;
    }

    if (EQUAL(pszResampling, "MODE"))
        return GDALResampleChunk_Mode;

    // All remaining supported methods share the convolution implementation
    // and only differ by their kernel.
    if (EQUAL(pszResampling, "CUBIC"))
    {
        StoreKernelRadius(GRA_Cubic);
        return GDALResampleChunk_Convolution;
    }

    if (EQUAL(pszResampling, "CUBICSPLINE"))
    {
        StoreKernelRadius(GRA_CubicSpline);
        return GDALResampleChunk_Convolution;
    }

    if (EQUAL(pszResampling, "LANCZOS"))
    {
        StoreKernelRadius(GRA_Lanczos);
        return GDALResampleChunk_Convolution;
    }

    if (EQUAL(pszResampling, "BILINEAR"))
    {
        StoreKernelRadius(GRA_Bilinear);
        return GDALResampleChunk_Convolution;
    }

    CPLError(CE_Failure, CPLE_AppDefined,
             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
             pszResampling);
    return nullptr;
}
4654
4655
/************************************************************************/
4656
/*                       GDALGetOvrWorkDataType()                       */
4657
/************************************************************************/
4658
4659
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
                                    GDALDataType eSrcDataType)
{
    // Return the working data type associated with a resampling method name
    // and a source data type.

    // NEAR* and MODE work directly in the source data type.
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
        return eSrcDataType;

    // UInt8 and UInt16 sources keep their own type for the averaging and
    // convolution based methods.
    const bool bIsUInt8OrUInt16 =
        eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_UInt16;
    if (bIsUInt8OrUInt16 &&
        (STARTS_WITH_CI(pszResampling, "AVER") ||
         EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
         EQUAL(pszResampling, "CUBICSPLINE") ||
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR")))
    {
        return eSrcDataType;
    }

    if (EQUAL(pszResampling, "GAUSS"))
        return GDT_Float64;

    // Everything else: Float32 for the listed source types, Float64 for all
    // the others.
    const bool bUseFloat32 =
        eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
        eSrcDataType == GDT_Float32;
    return bUseFloat32 ? GDT_Float32 : GDT_Float64;
}
4695
4696
namespace
4697
{
4698
// Structure to hold a pointer to free with CPLFree()
4699
struct PointerHolder
4700
{
4701
    void *ptr = nullptr;
4702
4703
0
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4704
0
    {
4705
0
    }
4706
4707
    ~PointerHolder()
4708
0
    {
4709
0
        CPLFree(ptr);
4710
0
    }
4711
4712
    PointerHolder(const PointerHolder &) = delete;
4713
    PointerHolder &operator=(const PointerHolder &) = delete;
4714
};
4715
}  // namespace
4716
4717
/************************************************************************/
4718
/*                      GDALRegenerateOverviews()                       */
4719
/************************************************************************/
4720
4721
/**
4722
 * \brief Generate downsampled overviews.
4723
 *
4724
 * This function will generate one or more overview images from a base image
4725
 * using the requested downsampling algorithm.  Its primary use is for
4726
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4727
 * used to generate downsampled images in one file from another outside the
4728
 * overview architecture.
4729
 *
4730
 * The output bands need to exist in advance.
4731
 *
4732
 * The full set of resampling algorithms is documented in
4733
 * GDALDataset::BuildOverviews().
4734
 *
4735
 * This function will honour properly NODATA_VALUES tuples (special dataset
4736
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4737
 * considered as the nodata value and not each value of the triplet
4738
 * independently per band.
4739
 *
4740
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4741
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4742
 * overview computation.
4743
 *
4744
 * @param hSrcBand the source (base level) band.
4745
 * @param nOverviewCount the number of downsampled bands being generated.
4746
 * @param pahOvrBands the list of downsampled bands to be generated.
4747
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4748
 * @param pfnProgress progress report function.
4749
 * @param pProgressData progress function callback data.
4750
 * @return CE_None on success or CE_Failure on failure.
4751
 */
4752
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
                               GDALRasterBandH *pahOvrBands,
                               const char *pszResampling,
                               GDALProgressFunc pfnProgress,
                               void *pProgressData)

{
    // Same as GDALRegenerateOverviewsEx() called without any option.
    const CPLErr eErr = GDALRegenerateOverviewsEx(
        hSrcBand, nOverviewCount, pahOvrBands, pszResampling, pfnProgress,
        pProgressData, /* papszOptions = */ nullptr);
    return eErr;
}
4763
4764
/************************************************************************/
4765
/*                     GDALRegenerateOverviewsEx()                      */
4766
/************************************************************************/
4767
4768
// Factor applied to a radius (e.g. a resampling kernel radius) to obtain the
// corresponding diameter.
constexpr int RADIUS_TO_DIAMETER = 2;
4769
4770
/**
4771
 * \brief Generate downsampled overviews.
4772
 *
4773
 * This function will generate one or more overview images from a base image
4774
 * using the requested downsampling algorithm.  Its primary use is for
4775
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4776
 * used to generate downsampled images in one file from another outside the
4777
 * overview architecture.
4778
 *
4779
 * The output bands need to exist in advance.
4780
 *
4781
 * The full set of resampling algorithms is documented in
4782
 * GDALDataset::BuildOverviews().
4783
 *
4784
 * This function will honour properly NODATA_VALUES tuples (special dataset
4785
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4786
 * considered as the nodata value and not each value of the triplet
4787
 * independently per band.
4788
 *
4789
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4790
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4791
 * overview computation.
4792
 *
4793
 * @param hSrcBand the source (base level) band.
4794
 * @param nOverviewCount the number of downsampled bands being generated.
4795
 * @param pahOvrBands the list of downsampled bands to be generated.
4796
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4797
 * @param pfnProgress progress report function.
4798
 * @param pProgressData progress function callback data.
4799
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4800
 * NULL
4801
 * @return CE_None on success or CE_Failure on failure.
4802
 * @since GDAL 3.6
4803
 */
4804
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4805
                                 GDALRasterBandH *pahOvrBands,
4806
                                 const char *pszResampling,
4807
                                 GDALProgressFunc pfnProgress,
4808
                                 void *pProgressData, CSLConstList papszOptions)
4809
4810
0
{
4811
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4812
0
    GDALRasterBand **papoOvrBands =
4813
0
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4814
4815
0
    if (pfnProgress == nullptr)
4816
0
        pfnProgress = GDALDummyProgress;
4817
4818
0
    if (EQUAL(pszResampling, "NONE"))
4819
0
        return CE_None;
4820
4821
0
    int nKernelRadius = 0;
4822
0
    GDALResampleFunction pfnResampleFn =
4823
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
4824
4825
0
    if (pfnResampleFn == nullptr)
4826
0
        return CE_Failure;
4827
4828
    /* -------------------------------------------------------------------- */
4829
    /*      Check color tables...                                           */
4830
    /* -------------------------------------------------------------------- */
4831
0
    GDALColorTable *poColorTable = nullptr;
4832
4833
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4834
0
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4835
0
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4836
0
    {
4837
0
        poColorTable = poSrcBand->GetColorTable();
4838
0
        if (poColorTable != nullptr)
4839
0
        {
4840
0
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4841
0
            {
4842
0
                CPLError(CE_Warning, CPLE_AppDefined,
4843
0
                         "Computing overviews on palette index raster bands "
4844
0
                         "with a palette whose color interpretation is not RGB "
4845
0
                         "will probably lead to unexpected results.");
4846
0
                poColorTable = nullptr;
4847
0
            }
4848
0
            else if (poColorTable->IsIdentity())
4849
0
            {
4850
0
                poColorTable = nullptr;
4851
0
            }
4852
0
        }
4853
0
        else
4854
0
        {
4855
0
            CPLError(CE_Warning, CPLE_AppDefined,
4856
0
                     "Computing overviews on palette index raster bands "
4857
0
                     "without a palette will probably lead to unexpected "
4858
0
                     "results.");
4859
0
        }
4860
0
    }
4861
    // Not ready yet
4862
0
    else if ((EQUAL(pszResampling, "CUBIC") ||
4863
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4864
0
              EQUAL(pszResampling, "LANCZOS") ||
4865
0
              EQUAL(pszResampling, "BILINEAR")) &&
4866
0
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4867
0
    {
4868
0
        CPLError(CE_Warning, CPLE_AppDefined,
4869
0
                 "Computing %s overviews on palette index raster bands "
4870
0
                 "will probably lead to unexpected results.",
4871
0
                 pszResampling);
4872
0
    }
4873
4874
    // If we have a nodata mask and we are doing something more complicated
4875
    // than nearest neighbouring, we have to fetch the nodata mask.
4876
4877
0
    GDALRasterBand *poMaskBand = nullptr;
4878
0
    bool bUseNoDataMask = false;
4879
0
    bool bCanUseCascaded = true;
4880
4881
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4882
0
    {
4883
        // Special case if we are an alpha/mask band. We want it to be
4884
        // considered as the mask band to avoid alpha=0 to be taken into account
4885
        // in average computation.
4886
0
        if (poSrcBand->IsMaskBand())
4887
0
        {
4888
0
            poMaskBand = poSrcBand;
4889
0
            bUseNoDataMask = true;
4890
0
        }
4891
0
        else
4892
0
        {
4893
0
            poMaskBand = poSrcBand->GetMaskBand();
4894
0
            const int nMaskFlags = poSrcBand->GetMaskFlags();
4895
0
            bCanUseCascaded =
4896
0
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4897
0
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4898
0
        }
4899
0
    }
4900
4901
0
    int nHasNoData = 0;
4902
0
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4903
0
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4904
0
    const bool bPropagateNoData =
4905
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4906
4907
0
    if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
4908
0
        CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
4909
0
    {
4910
0
        std::string osDetailMessage;
4911
0
        if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
4912
0
        {
4913
0
            CPLError(
4914
0
                CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
4915
0
                bHasNoData
4916
0
                    ? "Only the nodata value will be taken into account."
4917
0
                    : "Only the first listed one will be taken into account.");
4918
0
        }
4919
0
    }
4920
4921
    /* -------------------------------------------------------------------- */
4922
    /*      If we are operating on multiple overviews, and using            */
4923
    /*      averaging, lets do them in cascading order to reduce the        */
4924
    /*      amount of computation.                                          */
4925
    /* -------------------------------------------------------------------- */
4926
4927
    // In case the mask may be computed from another band of the dataset,
4928
    // we can't use cascaded generation, as the computation of the overviews
4929
    // of the band used for the mask band may not have yet occurred (#3033).
4930
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4931
0
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4932
0
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4933
0
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4934
0
         EQUAL(pszResampling, "MODE")) &&
4935
0
        nOverviewCount > 1 && bCanUseCascaded)
4936
0
        return GDALRegenerateCascadingOverviews(
4937
0
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4938
0
            pProgressData, papszOptions);
4939
4940
    /* -------------------------------------------------------------------- */
4941
    /*      Setup one horizontal swath to read from the raw buffer.         */
4942
    /* -------------------------------------------------------------------- */
4943
0
    int nFRXBlockSize = 0;
4944
0
    int nFRYBlockSize = 0;
4945
0
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4946
4947
0
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4948
0
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4949
0
                                       EQUAL(pszResampling, "MODE") ||
4950
0
                                       !GDALDataTypeIsComplex(eSrcDataType);
4951
0
    const GDALDataType eWrkDataType =
4952
0
        bUseGenericResampleFn
4953
0
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4954
0
            : GDT_CFloat32;
4955
4956
0
    const int nWidth = poSrcBand->GetXSize();
4957
0
    const int nHeight = poSrcBand->GetYSize();
4958
4959
0
    int nMaxOvrFactor = 1;
4960
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4961
0
    {
4962
0
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4963
0
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4964
0
        nMaxOvrFactor = std::max(
4965
0
            nMaxOvrFactor,
4966
0
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4967
0
        nMaxOvrFactor = std::max(
4968
0
            nMaxOvrFactor,
4969
0
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4970
0
    }
4971
4972
0
    int nFullResYChunk = nFRYBlockSize;
4973
0
    int nMaxChunkYSizeQueried = 0;
4974
4975
0
    const auto UpdateChunkHeightAndGetChunkSize =
4976
0
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4977
0
         eWrkDataType, nWidth]()
4978
0
    {
4979
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4980
        // + nFullResYChunk) / nMaxOvrFactor)
4981
0
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4982
0
        {
4983
0
            return GINTBIG_MAX;
4984
0
        }
4985
0
        nFullResYChunk =
4986
0
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4987
0
        if ((nKernelRadius > 0 &&
4988
0
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4989
0
            nFullResYChunk >
4990
0
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4991
0
        {
4992
0
            return GINTBIG_MAX;
4993
0
        }
4994
0
        nMaxChunkYSizeQueried =
4995
0
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4996
0
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4997
0
            std::numeric_limits<int64_t>::max() /
4998
0
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4999
0
        {
5000
0
            return GINTBIG_MAX;
5001
0
        }
5002
0
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
5003
0
               nMaxChunkYSizeQueried * nWidth;
5004
0
    };
5005
5006
0
    const char *pszChunkYSize =
5007
0
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
5008
0
#ifndef __COVERITY__
5009
    // Only configurable for debug / testing
5010
0
    if (pszChunkYSize)
5011
0
    {
5012
0
        nFullResYChunk = atoi(pszChunkYSize);
5013
0
    }
5014
0
#endif
5015
5016
    // Only configurable for debug / testing
5017
0
    const int nChunkMaxSize =
5018
0
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
5019
5020
0
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
5021
0
    if (nChunkSize > nChunkMaxSize)
5022
0
    {
5023
0
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
5024
0
            !GDALDataTypeIsComplex(eSrcDataType) &&
5025
0
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
5026
0
             EQUAL(pszResampling, "AVERAGE")))
5027
0
        {
5028
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
5029
            // which uses a block-based strategy, which is much less memory
5030
            // hungry.
5031
0
            return GDALRegenerateOverviewsMultiBand(
5032
0
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
5033
0
                pfnProgress, pProgressData, papszOptions);
5034
0
        }
5035
0
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
5036
0
        {
5037
0
            return GDALRegenerateCascadingOverviews(
5038
0
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
5039
0
                pfnProgress, pProgressData, papszOptions);
5040
0
        }
5041
0
    }
5042
0
    else if (pszChunkYSize == nullptr)
5043
0
    {
5044
        // Try to get as close as possible to nChunkMaxSize
5045
0
        while (nChunkSize < nChunkMaxSize / 2)
5046
0
        {
5047
0
            nFullResYChunk *= 2;
5048
0
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
5049
0
        }
5050
0
    }
5051
5052
    // Structure describing a resampling job: one (source chunk, overview
    // band) pair. It carries the inputs of the resampling function, receives
    // its outputs, and provides a small mutex/condition-variable based
    // mechanism so that the submitting thread can poll or wait for the job
    // to be done.
5053
0
    struct OvrJob
5054
0
    {
5055
        // Buffers to free when job is finished
        // The two source holders are shared_ptr because the same source
        // chunk (and mask) is handed to every job created for that chunk.
        // The destination holder is unique to the job and is set by the
        // resampling thread function once pDstBuffer is known.
5056
0
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
5057
0
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
5058
0
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
5059
5060
0
        // Overview band the resampled lines are written to.
        GDALRasterBand *poDstBand = nullptr;
5061
5062
        // Input parameters of pfnResampleFn
5063
0
        GDALResampleFunction pfnResampleFn = nullptr;
5064
0
        int nSrcWidth = 0;
5065
0
        int nSrcHeight = 0;
5066
0
        int nDstWidth = 0;
5067
0
        GDALOverviewResampleArgs args{};
5068
0
        const void *pChunk = nullptr;
5069
0
        // true: call pfnResampleFn; false: call GDALResampleChunkC32R.
        bool bUseGenericResampleFn = false;
5070
5071
        // Output values of resampling function
        // eErr stays at CE_Failure until the resampling function overwrites
        // it.
5072
0
        CPLErr eErr = CE_Failure;
5073
0
        void *pDstBuffer = nullptr;
5074
0
        GDALDataType eDstBufferDataType = GDT_Unknown;
5075
5076
0
        // Keep a reference on the holder of the source mask buffer.
        void SetSrcMaskBufferHolder(
5077
0
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
5078
0
        {
5079
0
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
5080
0
        }
5081
5082
0
        // Keep a reference on the holder of the source data buffer.
        void SetSrcBufferHolder(
5083
0
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
5084
0
        {
5085
0
            oSrcBufferHolder = oSrcBufferHolderIn;
5086
0
        }
5087
5088
0
        // Mark the job as finished and wake up one thread blocked in
        // WaitFinished(). The flag is set with the mutex held.
        void NotifyFinished()
5089
0
        {
5090
0
            std::lock_guard guard(mutex);
5091
0
            bFinished = true;
5092
0
            cv.notify_one();
5093
0
        }
5094
5095
0
        // Non-blocking query of the finished flag (read under the mutex).
        bool IsFinished()
5096
0
        {
5097
0
            std::lock_guard guard(mutex);
5098
0
            return bFinished;
5099
0
        }
5100
5101
0
        // Block until NotifyFinished() has been called. The flag is
        // re-checked after every wake-up of the condition variable.
        void WaitFinished()
5102
0
        {
5103
0
            std::unique_lock oGuard(mutex);
5104
0
            while (!bFinished)
5105
0
            {
5106
0
                cv.wait(oGuard);
5107
0
            }
5108
0
        }
5109
5110
0
      private:
5111
        // Synchronization
        // bFinished is only accessed with mutex held.
5112
0
        bool bFinished = false;
5113
0
        std::mutex mutex{};
5114
0
        std::condition_variable cv{};
5115
0
    };
5116
5117
    // Thread function to resample
    // pData must point to an OvrJob. The function runs the resampling
    // selected by bUseGenericResampleFn, stores the outcome in the job
    // (eErr, pDstBuffer, eDstBufferDataType), and then signals completion.
    // It is either submitted to the job queue or called directly when no
    // job queue is available.
5118
0
    const auto JobResampleFunc = [](void *pData)
5119
0
    {
5120
0
        OvrJob *poJob = static_cast<OvrJob *>(pData);
5121
5122
0
        if (poJob->bUseGenericResampleFn)
5123
0
        {
5124
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5125
0
                                               &(poJob->pDstBuffer),
5126
0
                                               &(poJob->eDstBufferDataType));
5127
0
        }
5128
0
        else
5129
0
        {
5130
0
            // Dedicated path where the chunk is interpreted as an array of
            // float (NOTE(review): presumably the complex-data resampler,
            // judging by its name -- confirm against its definition).
            poJob->eErr = GDALResampleChunkC32R(
5131
0
                poJob->nSrcWidth, poJob->nSrcHeight,
5132
0
                static_cast<const float *>(poJob->pChunk),
5133
0
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
5134
0
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
5135
0
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
5136
0
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
5137
0
                poJob->args.pszResampling);
5138
0
        }
5139
5140
0
        // Hand the output buffer over to a holder owned by the job. This is
        // done whatever the value of eErr.
        poJob->oDstBufferHolder =
5141
0
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
5142
5143
0
        // Must come last: after this call a waiting thread may read the
        // outputs and destroy the job.
        poJob->NotifyFinished();
5144
0
    };
5145
5146
    // Function to write resample data to target band
    // Writes the lines [args.nDstYOff, args.nDstYOff2) of the job, over the
    // full nDstWidth starting at column 0, from pDstBuffer (interpreted as
    // eDstBufferDataType) into poDstBand. Returns the CPLErr of RasterIO().
5147
0
    const auto WriteJobData = [](const OvrJob *poJob)
5148
0
    {
5149
0
        return poJob->poDstBand->RasterIO(
5150
0
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
5151
0
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5152
0
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5153
0
            poJob->eDstBufferDataType, 0, 0, nullptr);
5154
0
    };
5155
5156
    // Wait for completion of oldest job and serialize it
    // jobList must not be empty: its front element is accessed
    // unconditionally. The front job is waited for, its result is written
    // to the target band if resampling succeeded, and it is then removed
    // from the list (which destroys the OvrJob). Returns the resampling
    // error if any, otherwise the result of the write.
5157
0
    const auto WaitAndFinalizeOldestJob =
5158
0
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5159
0
    {
5160
0
        auto poOldestJob = jobList.front().get();
5161
0
        poOldestJob->WaitFinished();
5162
0
        CPLErr l_eErr = poOldestJob->eErr;
5163
0
        if (l_eErr == CE_None)
5164
0
        {
5165
0
            l_eErr = WriteJobData(poOldestJob);
5166
0
        }
5167
5168
0
        // poOldestJob is dangling after this point.
        jobList.pop_front();
5169
0
        return l_eErr;
5170
0
    };
5171
5172
    // Queue of jobs
5173
0
    std::list<std::unique_ptr<OvrJob>> jobList;
5174
5175
0
    GByte *pabyChunkNodataMask = nullptr;
5176
0
    void *pChunk = nullptr;
5177
5178
0
    const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5179
0
                                           /* bDefaultToAllCPUs=*/false);
5180
0
    auto poThreadPool =
5181
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5182
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5183
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5184
5185
    /* -------------------------------------------------------------------- */
5186
    /*      Loop over image operating on chunks.                            */
5187
    /* -------------------------------------------------------------------- */
5188
0
    int nChunkYOff = 0;
5189
0
    CPLErr eErr = CE_None;
5190
5191
0
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
5192
0
         nChunkYOff += nFullResYChunk)
5193
0
    {
5194
0
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
5195
0
                         pProgressData))
5196
0
        {
5197
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5198
0
            eErr = CE_Failure;
5199
0
        }
5200
5201
0
        if (nFullResYChunk + nChunkYOff > nHeight)
5202
0
            nFullResYChunk = nHeight - nChunkYOff;
5203
5204
0
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
5205
0
        int nChunkYSizeQueried =
5206
0
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
5207
0
        if (nChunkYOffQueried < 0)
5208
0
        {
5209
0
            nChunkYSizeQueried += nChunkYOffQueried;
5210
0
            nChunkYOffQueried = 0;
5211
0
        }
5212
0
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
5213
0
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
5214
5215
        // Avoid accumulating too many tasks and exhaust RAM
5216
        // Try to complete already finished jobs
5217
0
        while (eErr == CE_None && !jobList.empty())
5218
0
        {
5219
0
            auto poOldestJob = jobList.front().get();
5220
0
            if (!poOldestJob->IsFinished())
5221
0
                break;
5222
0
            eErr = poOldestJob->eErr;
5223
0
            if (eErr == CE_None)
5224
0
            {
5225
0
                eErr = WriteJobData(poOldestJob);
5226
0
            }
5227
5228
0
            jobList.pop_front();
5229
0
        }
5230
5231
        // And in case we have saturated the number of threads,
5232
        // wait for completion of tasks to go below the threshold.
5233
0
        while (eErr == CE_None &&
5234
0
               jobList.size() >= static_cast<size_t>(nThreads))
5235
0
        {
5236
0
            eErr = WaitAndFinalizeOldestJob(jobList);
5237
0
        }
5238
5239
        // (Re)allocate buffers if needed
5240
0
        if (pChunk == nullptr)
5241
0
        {
5242
0
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
5243
0
                                         nMaxChunkYSizeQueried, nWidth);
5244
0
        }
5245
0
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
5246
0
        {
5247
0
            pabyChunkNodataMask = static_cast<GByte *>(
5248
0
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
5249
0
        }
5250
5251
0
        if (pChunk == nullptr ||
5252
0
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
5253
0
        {
5254
0
            CPLFree(pChunk);
5255
0
            CPLFree(pabyChunkNodataMask);
5256
0
            return CE_Failure;
5257
0
        }
5258
5259
        // Read chunk.
5260
0
        if (eErr == CE_None)
5261
0
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5262
0
                                       nChunkYSizeQueried, pChunk, nWidth,
5263
0
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
5264
0
                                       nullptr);
5265
0
        if (eErr == CE_None && bUseNoDataMask)
5266
0
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
5267
0
                                        nChunkYSizeQueried, pabyChunkNodataMask,
5268
0
                                        nWidth, nChunkYSizeQueried, GDT_UInt8,
5269
0
                                        0, 0, nullptr);
5270
5271
        // Special case to promote 1bit data to 8bit 0/255 values.
5272
0
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
5273
0
        {
5274
0
            if (eWrkDataType == GDT_Float32)
5275
0
            {
5276
0
                float *pafChunk = static_cast<float *>(pChunk);
5277
0
                for (size_t i = 0;
5278
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5279
0
                {
5280
0
                    if (pafChunk[i] == 1.0f)
5281
0
                        pafChunk[i] = 255.0f;
5282
0
                }
5283
0
            }
5284
0
            else if (eWrkDataType == GDT_UInt8)
5285
0
            {
5286
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
5287
0
                for (size_t i = 0;
5288
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5289
0
                {
5290
0
                    if (pabyChunk[i] == 1)
5291
0
                        pabyChunk[i] = 255;
5292
0
                }
5293
0
            }
5294
0
            else if (eWrkDataType == GDT_UInt16)
5295
0
            {
5296
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5297
0
                for (size_t i = 0;
5298
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5299
0
                {
5300
0
                    if (pasChunk[i] == 1)
5301
0
                        pasChunk[i] = 255;
5302
0
                }
5303
0
            }
5304
0
            else if (eWrkDataType == GDT_Float64)
5305
0
            {
5306
0
                double *padfChunk = static_cast<double *>(pChunk);
5307
0
                for (size_t i = 0;
5308
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5309
0
                {
5310
0
                    if (padfChunk[i] == 1.0)
5311
0
                        padfChunk[i] = 255.0;
5312
0
                }
5313
0
            }
5314
0
            else
5315
0
            {
5316
0
                CPLAssert(false);
5317
0
            }
5318
0
        }
5319
0
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
5320
0
        {
5321
0
            if (eWrkDataType == GDT_Float32)
5322
0
            {
5323
0
                float *pafChunk = static_cast<float *>(pChunk);
5324
0
                for (size_t i = 0;
5325
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5326
0
                {
5327
0
                    if (pafChunk[i] == 1.0f)
5328
0
                        pafChunk[i] = 0.0f;
5329
0
                    else if (pafChunk[i] == 0.0f)
5330
0
                        pafChunk[i] = 255.0f;
5331
0
                }
5332
0
            }
5333
0
            else if (eWrkDataType == GDT_UInt8)
5334
0
            {
5335
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
5336
0
                for (size_t i = 0;
5337
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5338
0
                {
5339
0
                    if (pabyChunk[i] == 1)
5340
0
                        pabyChunk[i] = 0;
5341
0
                    else if (pabyChunk[i] == 0)
5342
0
                        pabyChunk[i] = 255;
5343
0
                }
5344
0
            }
5345
0
            else if (eWrkDataType == GDT_UInt16)
5346
0
            {
5347
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
5348
0
                for (size_t i = 0;
5349
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5350
0
                {
5351
0
                    if (pasChunk[i] == 1)
5352
0
                        pasChunk[i] = 0;
5353
0
                    else if (pasChunk[i] == 0)
5354
0
                        pasChunk[i] = 255;
5355
0
                }
5356
0
            }
5357
0
            else if (eWrkDataType == GDT_Float64)
5358
0
            {
5359
0
                double *padfChunk = static_cast<double *>(pChunk);
5360
0
                for (size_t i = 0;
5361
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
5362
0
                {
5363
0
                    if (padfChunk[i] == 1.0)
5364
0
                        padfChunk[i] = 0.0;
5365
0
                    else if (padfChunk[i] == 0.0)
5366
0
                        padfChunk[i] = 255.0;
5367
0
                }
5368
0
            }
5369
0
            else
5370
0
            {
5371
0
                CPLAssert(false);
5372
0
            }
5373
0
        }
5374
5375
0
        auto oSrcBufferHolder =
5376
0
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
5377
0
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
5378
0
            poJobQueue ? pabyChunkNodataMask : nullptr);
5379
5380
0
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
5381
0
             ++iOverview)
5382
0
        {
5383
0
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
5384
0
            const int nDstWidth = poDstBand->GetXSize();
5385
0
            const int nDstHeight = poDstBand->GetYSize();
5386
5387
0
            const double dfXRatioDstToSrc =
5388
0
                static_cast<double>(nWidth) / nDstWidth;
5389
0
            const double dfYRatioDstToSrc =
5390
0
                static_cast<double>(nHeight) / nDstHeight;
5391
5392
            /* --------------------------------------------------------------------
5393
             */
5394
            /*      Figure out the line to start writing to, and the first line
5395
             */
5396
            /*      to not write to.  In theory this approach should ensure that
5397
             */
5398
            /*      every output line will be written if all input chunks are */
5399
            /*      processed. */
5400
            /* --------------------------------------------------------------------
5401
             */
5402
0
            int nDstYOff =
5403
0
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5404
0
            if (nDstYOff == nDstHeight)
5405
0
                continue;
5406
0
            int nDstYOff2 = static_cast<int>(
5407
0
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5408
5409
0
            if (nChunkYOff + nFullResYChunk == nHeight)
5410
0
                nDstYOff2 = nDstHeight;
5411
#if DEBUG_VERBOSE
5412
            CPLDebug("GDAL",
5413
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5414
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5415
                     nDstWidth, nDstYOff2 - nDstYOff);
5416
#endif
5417
5418
0
            auto poJob = std::make_unique<OvrJob>();
5419
0
            poJob->pfnResampleFn = pfnResampleFn;
5420
0
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5421
0
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5422
0
            poJob->args.nOvrXSize = poDstBand->GetXSize();
5423
0
            poJob->args.nOvrYSize = poDstBand->GetYSize();
5424
0
            const char *pszNBITS =
5425
0
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5426
0
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5427
0
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5428
0
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5429
0
            poJob->args.eWrkDataType = eWrkDataType;
5430
0
            poJob->pChunk = pChunk;
5431
0
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5432
0
            poJob->nSrcWidth = nWidth;
5433
0
            poJob->nSrcHeight = nHeight;
5434
0
            poJob->args.nChunkXOff = 0;
5435
0
            poJob->args.nChunkXSize = nWidth;
5436
0
            poJob->args.nChunkYOff = nChunkYOffQueried;
5437
0
            poJob->args.nChunkYSize = nChunkYSizeQueried;
5438
0
            poJob->nDstWidth = nDstWidth;
5439
0
            poJob->args.nDstXOff = 0;
5440
0
            poJob->args.nDstXOff2 = nDstWidth;
5441
0
            poJob->args.nDstYOff = nDstYOff;
5442
0
            poJob->args.nDstYOff2 = nDstYOff2;
5443
0
            poJob->poDstBand = poDstBand;
5444
0
            poJob->args.pszResampling = pszResampling;
5445
0
            poJob->args.bHasNoData = bHasNoData;
5446
0
            poJob->args.dfNoDataValue = dfNoDataValue;
5447
0
            poJob->args.poColorTable = poColorTable;
5448
0
            poJob->args.eSrcDataType = eSrcDataType;
5449
0
            poJob->args.bPropagateNoData = bPropagateNoData;
5450
5451
0
            if (poJobQueue)
5452
0
            {
5453
0
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5454
0
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
5455
0
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5456
0
                jobList.emplace_back(std::move(poJob));
5457
0
            }
5458
0
            else
5459
0
            {
5460
0
                JobResampleFunc(poJob.get());
5461
0
                eErr = poJob->eErr;
5462
0
                if (eErr == CE_None)
5463
0
                {
5464
0
                    eErr = WriteJobData(poJob.get());
5465
0
                }
5466
0
            }
5467
0
        }
5468
5469
0
        if (poJobQueue)
5470
0
        {
5471
0
            pChunk = nullptr;
5472
0
            pabyChunkNodataMask = nullptr;
5473
0
        }
5474
0
    }
5475
5476
0
    VSIFree(pChunk);
5477
0
    VSIFree(pabyChunkNodataMask);
5478
5479
    // Wait for all pending jobs to complete
5480
0
    while (!jobList.empty())
5481
0
    {
5482
0
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5483
0
        if (l_eErr != CE_None && eErr == CE_None)
5484
0
            eErr = l_eErr;
5485
0
    }
5486
5487
    /* -------------------------------------------------------------------- */
5488
    /*      Renormalize overview mean / stddev if needed.                   */
5489
    /* -------------------------------------------------------------------- */
5490
0
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5491
0
    {
5492
0
        GDALOverviewMagnitudeCorrection(
5493
0
            poSrcBand, nOverviewCount,
5494
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5495
0
            GDALDummyProgress, nullptr);
5496
0
    }
5497
5498
    /* -------------------------------------------------------------------- */
5499
    /*      It can be important to flush out data to overviews.             */
5500
    /* -------------------------------------------------------------------- */
5501
0
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5502
0
         ++iOverview)
5503
0
    {
5504
0
        eErr = papoOvrBands[iOverview]->FlushCache(false);
5505
0
    }
5506
5507
0
    if (eErr == CE_None)
5508
0
        pfnProgress(1.0, nullptr, pProgressData);
5509
5510
0
    return eErr;
5511
0
}
5512
5513
/************************************************************************/
5514
/*                  GDALRegenerateOverviewsMultiBand()                  */
5515
/************************************************************************/
5516
5517
/**
5518
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5519
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5520
 *
5521
 * This function will generate one or more overview images from a base
5522
 * image using the requested downsampling algorithm.  Its primary use
5523
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5524
 * can also be used to generate downsampled images in one file from another
5525
 * outside the overview architecture.
5526
 *
5527
 * The output bands need to exist in advance and share the same characteristics
5528
 * (type, dimensions)
5529
 *
5530
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5531
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5532
 *
5533
 * It does not support color tables or complex data types.
5534
 *
5535
 * The pseudo-algorithm used by the function is :
5536
 *    for each overview
5537
 *       iterate on lines of the source by a step of deltay
5538
 *           iterate on columns of the source  by a step of deltax
5539
 *               read the source data of size deltax * deltay for all the bands
5540
 *               generate the corresponding overview block for all the bands
5541
 *
5542
 * This function will honour properly NODATA_VALUES tuples (special dataset
5543
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5544
 * considered as the nodata value and not each value of the triplet
5545
 * independently per band.
5546
 *
5547
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5548
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5549
 * overview computation.
5550
 *
5551
 * @param nBands the number of bands, size of papoSrcBands and size of
5552
 *               first dimension of papapoOverviewBands
5553
 * @param papoSrcBands the list of source bands to downsample
5554
 * @param nOverviews the number of downsampled overview levels being generated.
5555
 * @param papapoOverviewBands bidimensional array of bands. First dimension is
5556
 *                            indexed by nBands. Second dimension is indexed by
5557
 *                            nOverviews.
5558
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5559
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5560
 * @param pfnProgress progress report function.
5561
 * @param pProgressData progress function callback data.
5562
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5563
 *                     key=value pairs, or NULL
5564
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5565
 *                     options can be specified to express that overviews should
5566
 *                     be regenerated only in the specified subset of the source
5567
 *                     dataset.
5568
 * @return CE_None on success or CE_Failure on failure.
5569
 */
5570
5571
CPLErr GDALRegenerateOverviewsMultiBand(
5572
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5573
    GDALRasterBand *const *const *papapoOverviewBands,
5574
    const char *pszResampling, GDALProgressFunc pfnProgress,
5575
    void *pProgressData, CSLConstList papszOptions)
5576
0
{
5577
0
    CPL_IGNORE_RET_VAL(papszOptions);
5578
5579
0
    if (pfnProgress == nullptr)
5580
0
        pfnProgress = GDALDummyProgress;
5581
5582
0
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5583
0
        return CE_None;
5584
5585
    // Sanity checks.
5586
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5587
0
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5588
0
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5589
0
        !EQUAL(pszResampling, "CUBICSPLINE") &&
5590
0
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5591
0
        !EQUAL(pszResampling, "MODE"))
5592
0
    {
5593
0
        CPLError(CE_Failure, CPLE_NotSupported,
5594
0
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5595
0
                 "not supported",
5596
0
                 pszResampling);
5597
0
        return CE_Failure;
5598
0
    }
5599
5600
0
    int nKernelRadius = 0;
5601
0
    GDALResampleFunction pfnResampleFn =
5602
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
5603
0
    if (pfnResampleFn == nullptr)
5604
0
        return CE_Failure;
5605
5606
0
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5607
0
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5608
0
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5609
0
        return CE_None;
5610
0
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5611
0
    for (int iBand = 1; iBand < nBands; ++iBand)
5612
0
    {
5613
0
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5614
0
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5615
0
        {
5616
0
            CPLError(
5617
0
                CE_Failure, CPLE_NotSupported,
5618
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5619
0
                "have the same dimensions");
5620
0
            return CE_Failure;
5621
0
        }
5622
0
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5623
0
        {
5624
0
            CPLError(
5625
0
                CE_Failure, CPLE_NotSupported,
5626
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5627
0
                "have the same data type");
5628
0
            return CE_Failure;
5629
0
        }
5630
0
    }
5631
5632
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5633
0
    {
5634
0
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5635
0
        const int nDstWidth = poOvrFirstBand->GetXSize();
5636
0
        const int nDstHeight = poOvrFirstBand->GetYSize();
5637
0
        for (int iBand = 1; iBand < nBands; ++iBand)
5638
0
        {
5639
0
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5640
0
            if (poOvrBand->GetXSize() != nDstWidth ||
5641
0
                poOvrBand->GetYSize() != nDstHeight)
5642
0
            {
5643
0
                CPLError(
5644
0
                    CE_Failure, CPLE_NotSupported,
5645
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5646
0
                    "of the same level must have the same dimensions");
5647
0
                return CE_Failure;
5648
0
            }
5649
0
            if (poOvrBand->GetRasterDataType() != eDataType)
5650
0
            {
5651
0
                CPLError(
5652
0
                    CE_Failure, CPLE_NotSupported,
5653
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5654
0
                    "must have the same data type as the source bands");
5655
0
                return CE_Failure;
5656
0
            }
5657
0
        }
5658
0
    }
5659
5660
    // First pass to compute the total number of pixels to write.
5661
0
    double dfTotalPixelCount = 0;
5662
0
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5663
0
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5664
0
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
5665
0
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5666
0
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
5667
0
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5668
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5669
0
    {
5670
0
        dfTotalPixelCount +=
5671
0
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5672
0
            papapoOverviewBands[0][iOverview]->GetXSize() *
5673
0
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5674
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5675
0
    }
5676
5677
0
    const GDALDataType eWrkDataType =
5678
0
        GDALGetOvrWorkDataType(pszResampling, eDataType);
5679
0
    const int nWrkDataTypeSize =
5680
0
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5681
5682
0
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5683
5684
    // If we have a nodata mask and we are doing something more complicated
5685
    // than nearest neighbouring, we have to fetch the nodata mask.
5686
0
    const bool bUseNoDataMask =
5687
0
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
5688
0
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5689
5690
0
    std::vector<bool> abHasNoData(nBands);
5691
0
    std::vector<double> adfNoDataValue(nBands);
5692
5693
0
    for (int iBand = 0; iBand < nBands; ++iBand)
5694
0
    {
5695
0
        int nHasNoData = 0;
5696
0
        adfNoDataValue[iBand] =
5697
0
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5698
0
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5699
0
    }
5700
5701
0
    std::string osDetailMessage;
5702
0
    if (bUseNoDataMask &&
5703
0
        papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
5704
0
    {
5705
0
        CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
5706
0
                 abHasNoData[0]
5707
0
                     ? "Only the nodata value will be taken into account."
5708
0
                     : "Only the first listed one will be taken into account.");
5709
0
    }
5710
5711
0
    const bool bPropagateNoData =
5712
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5713
5714
0
    const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
5715
0
                                           /* bDefaultToAllCPUs=*/false);
5716
0
    auto poThreadPool =
5717
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5718
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5719
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5720
5721
    // Only configurable for debug / testing
5722
0
    const GIntBig nChunkMaxSize = []() -> GIntBig
5723
0
    {
5724
0
        const char *pszVal =
5725
0
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5726
0
        if (pszVal)
5727
0
        {
5728
0
            GIntBig nRet = 0;
5729
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5730
0
            return std::max<GIntBig>(100, nRet);
5731
0
        }
5732
0
        return 10 * 1024 * 1024;
5733
0
    }();
5734
5735
    // Only configurable for debug / testing
5736
0
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5737
0
    {
5738
0
        const char *pszVal = CPLGetConfigOption(
5739
0
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5740
0
        if (pszVal)
5741
0
        {
5742
0
            GIntBig nRet = 0;
5743
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5744
0
            return std::max<GIntBig>(100, nRet);
5745
0
        }
5746
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5747
0
        if (nUsableRAM > 0)
5748
0
            return nUsableRAM / 10;
5749
        // Select a value to be able to at least downsample by 2 for a RGB
5750
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5751
0
        return 100 * 1024 * 1024;
5752
0
    }();
5753
5754
    // Second pass to do the real job.
5755
0
    double dfCurPixelCount = 0;
5756
0
    CPLErr eErr = CE_None;
5757
0
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5758
0
         ++iOverview)
5759
0
    {
5760
0
        int iSrcOverview = -1;  // -1 means the source bands.
5761
5762
0
        const int nDstTotalWidth =
5763
0
            papapoOverviewBands[0][iOverview]->GetXSize();
5764
0
        const int nDstTotalHeight =
5765
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5766
5767
        // Compute the coordinates of the target region to refresh
5768
0
        constexpr double EPS = 1e-8;
5769
0
        const int nDstXOffStart = static_cast<int>(
5770
0
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5771
0
            EPS);
5772
0
        const int nDstXOffEnd =
5773
0
            std::min(static_cast<int>(
5774
0
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5775
0
                                       nToplevelSrcWidth * nDstTotalWidth -
5776
0
                                   EPS)),
5777
0
                     nDstTotalWidth);
5778
0
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5779
0
        const int nDstYOffStart =
5780
0
            static_cast<int>(static_cast<double>(nSrcYOff) /
5781
0
                                 nToplevelSrcHeight * nDstTotalHeight +
5782
0
                             EPS);
5783
0
        const int nDstYOffEnd =
5784
0
            std::min(static_cast<int>(
5785
0
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5786
0
                                       nToplevelSrcHeight * nDstTotalHeight -
5787
0
                                   EPS)),
5788
0
                     nDstTotalHeight);
5789
0
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5790
5791
        // Try to use previous level of overview as the source to compute
5792
        // the next level.
5793
0
        int nSrcWidth = nToplevelSrcWidth;
5794
0
        int nSrcHeight = nToplevelSrcHeight;
5795
0
        if (iOverview > 0 &&
5796
0
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5797
0
        {
5798
0
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5799
0
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5800
0
            iSrcOverview = iOverview - 1;
5801
0
        }
5802
5803
0
        const double dfXRatioDstToSrc =
5804
0
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
5805
0
        const double dfYRatioDstToSrc =
5806
0
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
5807
5808
0
        const int nOvrFactor =
5809
0
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5810
0
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
5811
5812
0
        int nDstChunkXSize = 0;
5813
0
        int nDstChunkYSize = 0;
5814
0
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5815
0
                                                        &nDstChunkYSize);
5816
5817
0
        constexpr int PIXEL_MARGIN = 2;
5818
        // Try to extend the chunk size so that the memory needed to acquire
5819
        // source pixels goes up to 10 MB.
5820
        // This can help for drivers that support multi-threaded reading
5821
0
        const int nFullResYChunk = static_cast<int>(std::min<double>(
5822
0
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5823
0
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5824
0
            nSrcHeight,
5825
0
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5826
0
                                 nKernelRadius * nOvrFactor));
5827
0
        while (nDstChunkXSize < nDstWidth)
5828
0
        {
5829
0
            constexpr int INCREASE_FACTOR = 2;
5830
5831
0
            const int nFullResXChunk = static_cast<int>(std::min<double>(
5832
0
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5833
0
                                              dfXRatioDstToSrc));
5834
5835
0
            const int nFullResXChunkQueried =
5836
0
                static_cast<int>(std::min<int64_t>(
5837
0
                    nSrcWidth,
5838
0
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5839
0
                                         nKernelRadius * nOvrFactor));
5840
5841
0
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5842
0
                             nFullResYChunkQueried / nWrkDataTypeSize)
5843
0
            {
5844
0
                break;
5845
0
            }
5846
5847
0
            nDstChunkXSize *= INCREASE_FACTOR;
5848
0
        }
5849
0
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5850
5851
0
        const int nFullResXChunk = static_cast<int>(std::min<double>(
5852
0
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5853
0
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5854
0
            nSrcWidth,
5855
0
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5856
0
                                 nKernelRadius * nOvrFactor));
5857
5858
        // Make sure that the RAM requirement to acquire the source data does
5859
        // not exceed nChunkMaxSizeForTempFile
5860
        // If so, reduce the destination chunk size, generate overviews in a
5861
        // temporary dataset, and copy that temporary dataset over the target
5862
        // overview bands (to avoid issues with lossy compression)
5863
0
        const bool bOverflowFullResXChunkYChunkQueried =
5864
0
            nBands > std::numeric_limits<int64_t>::max() /
5865
0
                         nFullResXChunkQueried / nFullResYChunkQueried /
5866
0
                         nWrkDataTypeSize;
5867
5868
0
        const auto nMemRequirement =
5869
0
            bOverflowFullResXChunkYChunkQueried
5870
0
                ? 0
5871
0
                : static_cast<GIntBig>(nFullResXChunkQueried) *
5872
0
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5873
        // Use a temporary dataset with a smaller destination chunk size
5874
0
        const auto nOverShootFactor =
5875
0
            nMemRequirement / nChunkMaxSizeForTempFile;
5876
5877
0
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
5878
0
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5879
0
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5880
0
                                      static_cast<double>(nOverShootFactor)))));
5881
0
        constexpr int DEFAULT_CHUNK_SIZE = 256;
5882
0
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5883
0
        const int nReducedDstChunkXSize =
5884
0
            bOverflowFullResXChunkYChunkQueried
5885
0
                ? DEFAULT_CHUNK_SIZE
5886
0
                : std::max(1, static_cast<int>(nDstChunkXSize /
5887
0
                                               nSqrtOverShootFactor) &
5888
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5889
0
        const int nReducedDstChunkYSize =
5890
0
            bOverflowFullResXChunkYChunkQueried
5891
0
                ? DEFAULT_CHUNK_SIZE
5892
0
                : std::max(1, static_cast<int>(nDstChunkYSize /
5893
0
                                               nSqrtOverShootFactor) &
5894
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5895
5896
0
        if (bOverflowFullResXChunkYChunkQueried ||
5897
0
            nMemRequirement > nChunkMaxSizeForTempFile)
5898
0
        {
5899
0
            const auto nDTSize =
5900
0
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5901
0
            const bool bTmpDSMemRequirementOverflow =
5902
0
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5903
0
                             nDstHeight / nDTSize;
5904
0
            const auto nTmpDSMemRequirement =
5905
0
                bTmpDSMemRequirementOverflow
5906
0
                    ? 0
5907
0
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5908
0
                          nDTSize;
5909
5910
            // make sure that one band buffer doesn't overflow size_t
5911
0
            const bool bChunkSizeOverflow =
5912
0
                static_cast<size_t>(nDTSize) >
5913
0
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5914
0
            const size_t nChunkSize =
5915
0
                bChunkSizeOverflow
5916
0
                    ? 0
5917
0
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5918
5919
0
            // Helper that builds a VRT dataset of the full overview size
            // (nDstTotalWidth x nDstTotalHeight) with the requested block
            // size. It adds nBands bands of type eWrkDataType, each with one
            // source configured with pszResampling and taken from the source
            // band or, when iSrcOverview != -1, from that overview level.
            // Returns the dataset as a std::unique_ptr<VRTDataset>.
            const auto CreateVRT =
5920
0
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5921
0
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5922
0
                 iSrcOverview, &abHasNoData,
5923
0
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5924
0
            {
5925
0
                auto poVRTDS = std::make_unique<VRTDataset>(
5926
0
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5927
0
                    nVRTBlockYSize);
5928
                // One VRT band per input band, each with a single source
5929
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5930
0
                {
5931
0
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5932
0
                    poVRTSrc->SetResampling(pszResampling);
5933
0
                    poVRTDS->AddBand(eWrkDataType);
5934
0
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5935
0
                        poVRTDS->GetRasterBand(iBand + 1));
5936
                    // Use the selected overview level as input, if any
5937
0
                    auto poSrcBand = papoSrcBands[iBand];
5938
0
                    if (iSrcOverview != -1)
5939
0
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5940
0
                    poVRTBand->ConfigureSource(
5941
0
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5942
0
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5943
                    // Add the source to the band. release() hands over the raw
                    // pointer (presumably the band takes ownership - TODO
                    // confirm against VRTSourcedRasterBand::AddSource).
5944
0
                    poVRTBand->AddSource(poVRTSrc.release());
5945
0
                    if (abHasNoData[iBand])
5946
0
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5947
0
                }
5948
                // Add a per-dataset mask band when the first source band has
                // one. NOTE(review): the flags are read from papoSrcBands[0]
                // even when iSrcOverview != -1, while the mask source itself
                // comes from the overview band - confirm this is intended.
5949
0
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5950
0
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5951
0
                {
5952
0
                    VRTSourcedRasterBand *poMaskVRTBand =
5953
0
                        cpl::down_cast<VRTSourcedRasterBand *>(
5954
0
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
5955
0
                    auto poSrcBand = papoSrcBands[0];
5956
0
                    if (iSrcOverview != -1)
5957
0
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
5958
0
                    poMaskVRTBand->AddMaskBandSource(
5959
0
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5960
0
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5961
0
                }
5962
5963
0
                return poVRTDS;
5964
0
            };
5965
5966
            // If the overview accommodates chunking, do so and recurse
5967
            // to avoid generating full size temporary files
5968
0
            if (!bOverflowFullResXChunkYChunkQueried &&
5969
0
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5970
0
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5971
0
            {
5972
                // Create a VRT with the smaller chunk to do the scaling
5973
0
                auto poVRTDS =
5974
0
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5975
5976
0
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
5977
0
                std::vector<GDALRasterBand *> apoDstBand(nBands);
5978
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5979
0
                {
5980
0
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5981
0
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5982
0
                }
5983
5984
                // Use a flag to avoid reading from the overview being built
5985
0
                GDALRasterIOExtraArg sExtraArg;
5986
0
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5987
0
                if (iSrcOverview == -1)
5988
0
                    sExtraArg.bUseOnlyThisScale = true;
5989
5990
                // A single band buffer for data transfer to the overview
5991
0
                std::vector<GByte> abyChunk;
5992
0
                try
5993
0
                {
5994
0
                    abyChunk.resize(nChunkSize);
5995
0
                }
5996
0
                catch (const std::exception &)
5997
0
                {
5998
0
                    CPLError(CE_Failure, CPLE_OutOfMemory,
5999
0
                             "Out of memory allocating temporary buffer");
6000
0
                    return CE_Failure;
6001
0
                }
6002
6003
                // Loop over output height, in chunks
6004
0
                for (int nDstYOff = nDstYOffStart;
6005
0
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
6006
0
                     /* */)
6007
0
                {
6008
0
                    const int nDstYCount =
6009
0
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6010
                    // Loop over output width, in output chunks
6011
0
                    for (int nDstXOff = nDstXOffStart;
6012
0
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
6013
0
                         /* */)
6014
0
                    {
6015
0
                        const int nDstXCount =
6016
0
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6017
                        // Read and transfer the chunk to the overview
6018
0
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
6019
0
                             ++iBand)
6020
0
                        {
6021
0
                            eErr = apoVRTBand[iBand]->RasterIO(
6022
0
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
6023
0
                                nDstYCount, abyChunk.data(), nDstXCount,
6024
0
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
6025
0
                            if (eErr == CE_None)
6026
0
                            {
6027
0
                                eErr = apoDstBand[iBand]->RasterIO(
6028
0
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
6029
0
                                    nDstYCount, abyChunk.data(), nDstXCount,
6030
0
                                    nDstYCount, eDataType, 0, 0, nullptr);
6031
0
                            }
6032
0
                        }
6033
6034
0
                        dfCurPixelCount +=
6035
0
                            static_cast<double>(nDstXCount) * nDstYCount;
6036
6037
0
                        nDstXOff += nDstXCount;
6038
0
                    }  // width
6039
6040
0
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
6041
0
                                     nullptr, pProgressData))
6042
0
                    {
6043
0
                        CPLError(CE_Failure, CPLE_UserInterrupt,
6044
0
                                 "User terminated");
6045
0
                        eErr = CE_Failure;
6046
0
                    }
6047
6048
0
                    nDstYOff += nDstYCount;
6049
0
                }  // height
6050
6051
0
                if (CE_None != eErr)
6052
0
                {
6053
0
                    CPLError(CE_Failure, CPLE_AppDefined,
6054
0
                             "Error while writing overview");
6055
0
                    return CE_Failure;
6056
0
                }
6057
6058
0
                pfnProgress(1.0, nullptr, pProgressData);
6059
                // Flush the overviews we just generated
6060
0
                for (int iBand = 0; iBand < nBands; ++iBand)
6061
0
                    apoDstBand[iBand]->FlushCache(false);
6062
6063
0
                continue;  // Next overview
6064
0
            }  // chunking via temporary dataset
6065
6066
0
            std::unique_ptr<GDALDataset> poTmpDS;
6067
            // Config option mostly/only for autotest purposes
6068
0
            const char *pszGDAL_OVR_TEMP_DRIVER =
6069
0
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
6070
0
            if ((!bTmpDSMemRequirementOverflow &&
6071
0
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
6072
0
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
6073
0
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
6074
0
            {
6075
0
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
6076
0
                if (!poTmpDrv)
6077
0
                {
6078
0
                    eErr = CE_Failure;
6079
0
                    break;
6080
0
                }
6081
0
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
6082
0
                                               nDstTotalHeight, nBands,
6083
0
                                               eDataType, nullptr));
6084
0
            }
6085
0
            else
6086
0
            {
6087
                // Create a temporary file for the overview
6088
0
                auto poTmpDrv =
6089
0
                    GetGDALDriverManager()->GetDriverByName("GTiff");
6090
0
                if (!poTmpDrv)
6091
0
                {
6092
0
                    eErr = CE_Failure;
6093
0
                    break;
6094
0
                }
6095
0
                std::string osTmpFilename;
6096
0
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
6097
0
                if (poDstDS)
6098
0
                {
6099
0
                    osTmpFilename = poDstDS->GetDescription();
6100
0
                    VSIStatBufL sStatBuf;
6101
0
                    if (!osTmpFilename.empty() &&
6102
0
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
6103
0
                        osTmpFilename += "_tmp_ovr.tif";
6104
0
                }
6105
0
                if (osTmpFilename.empty())
6106
0
                {
6107
0
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
6108
0
                    osTmpFilename += ".tif";
6109
0
                }
6110
0
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
6111
0
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
6112
0
                CPLStringList aosCO;
6113
0
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
6114
0
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
6115
0
                {
6116
0
                    aosCO.SetNameValue("TILED", "YES");
6117
0
                    aosCO.SetNameValue("BLOCKXSIZE",
6118
0
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
6119
0
                    aosCO.SetNameValue("BLOCKYSIZE",
6120
0
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
6121
0
                }
6122
0
                if (const char *pszCOList =
6123
0
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
6124
0
                {
6125
0
                    aosCO.SetNameValue(
6126
0
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
6127
0
                }
6128
0
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
6129
0
                                               nDstHeight, nBands, eDataType,
6130
0
                                               aosCO.List()));
6131
0
                if (poTmpDS)
6132
0
                {
6133
0
                    poTmpDS->MarkSuppressOnClose();
6134
0
                    VSIUnlink(osTmpFilename.c_str());
6135
0
                }
6136
0
            }
6137
0
            if (!poTmpDS)
6138
0
            {
6139
0
                eErr = CE_Failure;
6140
0
                break;
6141
0
            }
6142
6143
            // Create a full size VRT to do the resampling without edge effects
6144
0
            auto poVRTDS =
6145
0
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
6146
6147
            // Allocate a band buffer with the overview chunk size
6148
0
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
6149
0
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
6150
0
                                    nDstChunkYSize));
6151
0
            if (pDstBuffer == nullptr)
6152
0
            {
6153
0
                eErr = CE_Failure;
6154
0
                break;
6155
0
            }
6156
6157
            // Use a flag to avoid reading the overview being built
6158
0
            GDALRasterIOExtraArg sExtraArg;
6159
0
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
6160
0
            if (iSrcOverview == -1)
6161
0
                sExtraArg.bUseOnlyThisScale = true;
6162
6163
            // Scale and copy data from the VRT to the temp file
6164
0
            for (int nDstYOff = nDstYOffStart;
6165
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
6166
0
                 /* */)
6167
0
            {
6168
0
                const int nDstYCount =
6169
0
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
6170
0
                for (int nDstXOff = nDstXOffStart;
6171
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
6172
0
                     /* */)
6173
0
                {
6174
0
                    const int nDstXCount =
6175
0
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
6176
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
6177
0
                         ++iBand)
6178
0
                    {
6179
0
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
6180
0
                        eErr = poSrcBand->RasterIO(
6181
0
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
6182
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
6183
0
                            eWrkDataType, 0, 0, &sExtraArg);
6184
0
                        if (eErr == CE_None)
6185
0
                        {
6186
                            // Write to the temporary dataset, shifted
6187
0
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
6188
0
                            eErr = poOvrBand->RasterIO(
6189
0
                                GF_Write, nDstXOff - nDstXOffStart,
6190
0
                                nDstYOff - nDstYOffStart, nDstXCount,
6191
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
6192
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
6193
0
                        }
6194
0
                    }
6195
0
                    nDstXOff += nDstXCount;
6196
0
                }
6197
0
                nDstYOff += nDstYCount;
6198
0
            }
6199
6200
            // Copy from the temporary to the overview
6201
0
            for (int nDstYOff = nDstYOffStart;
6202
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
6203
0
                 /* */)
6204
0
            {
6205
0
                const int nDstYCount =
6206
0
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
6207
0
                for (int nDstXOff = nDstXOffStart;
6208
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
6209
0
                     /* */)
6210
0
                {
6211
0
                    const int nDstXCount =
6212
0
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
6213
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
6214
0
                         ++iBand)
6215
0
                    {
6216
0
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
6217
0
                        eErr = poSrcBand->RasterIO(
6218
0
                            GF_Read, nDstXOff - nDstXOffStart,
6219
0
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
6220
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
6221
0
                            eWrkDataType, 0, 0, nullptr);
6222
0
                        if (eErr == CE_None)
6223
0
                        {
6224
                            // Write to the destination overview bands
6225
0
                            auto poOvrBand =
6226
0
                                papapoOverviewBands[iBand][iOverview];
6227
0
                            eErr = poOvrBand->RasterIO(
6228
0
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
6229
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
6230
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
6231
0
                        }
6232
0
                    }
6233
0
                    nDstXOff += nDstXCount;
6234
0
                }
6235
0
                nDstYOff += nDstYCount;
6236
0
            }
6237
6238
0
            if (eErr != CE_None)
6239
0
            {
6240
0
                CPLError(CE_Failure, CPLE_AppDefined,
6241
0
                         "Failed to write overview %d", iOverview);
6242
0
                return eErr;
6243
0
            }
6244
6245
            // Flush the data to overviews.
6246
0
            for (int iBand = 0; iBand < nBands; ++iBand)
6247
0
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
6248
6249
0
            continue;
6250
0
        }
6251
6252
        // Structure describing a resampling job: the resampling function and
        // its inputs, the outputs it produces, the buffers tied to the job,
        // and a finished flag guarded by a mutex / condition variable.
6253
0
        struct OvrJob
6254
0
        {
6255
            // Buffers to free when job is finished
6256
0
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
6257
0
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
6258
0
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
6259
            // Band that receives the resampled data
6260
0
            GDALRasterBand *poDstBand = nullptr;
6261
6262
            // Input parameters of pfnResampleFn
6263
0
            GDALResampleFunction pfnResampleFn = nullptr;
6264
0
            GDALOverviewResampleArgs args{};
6265
0
            const void *pChunk = nullptr;
6266
6267
            // Output values of resampling function (eErr stays CE_Failure
            // until the resampling function has stored its result)
6268
0
            CPLErr eErr = CE_Failure;
6269
0
            void *pDstBuffer = nullptr;
6270
0
            GDALDataType eDstBufferDataType = GDT_Unknown;
6271
            // Sets the finished flag under the mutex and wakes one waiter
6272
0
            void NotifyFinished()
6273
0
            {
6274
0
                std::lock_guard guard(mutex);
6275
0
                bFinished = true;
6276
0
                cv.notify_one();
6277
0
            }
6278
            // Non-blocking check of the finished flag (read under the mutex)
6279
0
            bool IsFinished()
6280
0
            {
6281
0
                std::lock_guard guard(mutex);
6282
0
                return bFinished;
6283
0
            }
6284
            // Blocks until NotifyFinished() has set the finished flag
6285
0
            void WaitFinished()
6286
0
            {
6287
0
                std::unique_lock oGuard(mutex);
6288
0
                while (!bFinished)
6289
0
                {
6290
0
                    cv.wait(oGuard);
6291
0
                }
6292
0
            }
6293
6294
0
          private:
6295
            // Synchronization: bFinished is only accessed with mutex held
6296
0
            bool bFinished = false;
6297
0
            std::mutex mutex{};
6298
0
            std::condition_variable cv{};
6299
0
        };
6300
6301
        // Thread function to resample. pData must point to an OvrJob: runs
        // the job's resampling function, stores its error code and output
        // buffer in the job, then marks the job as finished.
6302
0
        const auto JobResampleFunc = [](void *pData)
6303
0
        {
6304
0
            OvrJob *poJob = static_cast<OvrJob *>(pData);
6305
6306
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
6307
0
                                               &(poJob->pDstBuffer),
6308
0
                                               &(poJob->eDstBufferDataType));
6309
            // Tie the output buffer to the job (presumably PointerHolder
            // frees it on destruction - TODO confirm)
6310
0
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
6311
            // Must come last: results above are set before waiters are woken
6312
0
            poJob->NotifyFinished();
6313
0
        };
6314
6315
        // Function to write resampled data to the target band: writes the
        // job's pDstBuffer (typed eDstBufferDataType) to poDstBand over the
        // window starting at (nDstXOff, nDstYOff) and sized
        // (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff). The buffer has the
        // same dimensions as the window. Returns the RasterIO() result.
6316
0
        const auto WriteJobData = [](const OvrJob *poJob)
6317
0
        {
6318
0
            return poJob->poDstBand->RasterIO(
6319
0
                GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
6320
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6321
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
6322
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
6323
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff,
6324
0
                poJob->eDstBufferDataType, 0, 0, nullptr);
6325
0
        };
6326
6327
        // Wait for completion of oldest job and serialize it: blocks on the
        // front job of jobList, writes its data if resampling succeeded,
        // removes it from the list and returns the resulting error code.
        // jobList must not be empty (front() is called unconditionally).
6328
0
        const auto WaitAndFinalizeOldestJob =
6329
0
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
6330
0
        {
6331
0
            auto poOldestJob = jobList.front().get();
6332
0
            poOldestJob->WaitFinished();
6333
0
            CPLErr l_eErr = poOldestJob->eErr;
6334
0
            if (l_eErr == CE_None)
6335
0
            {
6336
0
                l_eErr = WriteJobData(poOldestJob);
6337
0
            }
6338
            // The job is dropped whether or not it succeeded
6339
0
            jobList.pop_front();
6340
0
            return l_eErr;
6341
0
        };
6342
6343
        // Queue of jobs
6344
0
        std::list<std::unique_ptr<OvrJob>> jobList;
6345
6346
0
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
6347
0
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
6348
0
            apabyChunkNoDataMask(nBands);
6349
6350
        // Iterate on destination overview, block by block.
6351
0
        for (int nDstYOff = nDstYOffStart;
6352
0
             nDstYOff < nDstYOffEnd && eErr == CE_None;
6353
0
             nDstYOff += nDstChunkYSize)
6354
0
        {
6355
0
            int nDstYCount;
6356
0
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
6357
0
                nDstYCount = nDstChunkYSize;
6358
0
            else
6359
0
                nDstYCount = nDstYOffEnd - nDstYOff;
6360
6361
0
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
6362
0
            int nChunkYOff2 = static_cast<int>(
6363
0
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
6364
0
            if (nChunkYOff2 > nSrcHeight ||
6365
0
                nDstYOff + nDstYCount == nDstTotalHeight)
6366
0
                nChunkYOff2 = nSrcHeight;
6367
0
            int nYCount = nChunkYOff2 - nChunkYOff;
6368
0
            CPLAssert(nYCount <= nFullResYChunk);
6369
6370
0
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
6371
0
            int nChunkYSizeQueried =
6372
0
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6373
0
            if (nChunkYOffQueried < 0)
6374
0
            {
6375
0
                nChunkYSizeQueried += nChunkYOffQueried;
6376
0
                nChunkYOffQueried = 0;
6377
0
            }
6378
0
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
6379
0
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
6380
0
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
6381
6382
0
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
6383
0
                             nullptr, pProgressData))
6384
0
            {
6385
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6386
0
                eErr = CE_Failure;
6387
0
            }
6388
6389
            // Iterate on destination overview, block by block.
6390
0
            for (int nDstXOff = nDstXOffStart;
6391
0
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
6392
0
                 nDstXOff += nDstChunkXSize)
6393
0
            {
6394
0
                int nDstXCount = 0;
6395
0
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
6396
0
                    nDstXCount = nDstChunkXSize;
6397
0
                else
6398
0
                    nDstXCount = nDstXOffEnd - nDstXOff;
6399
6400
0
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6401
6402
0
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6403
0
                int nChunkXOff2 = static_cast<int>(
6404
0
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6405
0
                if (nChunkXOff2 > nSrcWidth ||
6406
0
                    nDstXOff + nDstXCount == nDstTotalWidth)
6407
0
                    nChunkXOff2 = nSrcWidth;
6408
0
                const int nXCount = nChunkXOff2 - nChunkXOff;
6409
0
                CPLAssert(nXCount <= nFullResXChunk);
6410
6411
0
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6412
0
                int nChunkXSizeQueried =
6413
0
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6414
0
                if (nChunkXOffQueried < 0)
6415
0
                {
6416
0
                    nChunkXSizeQueried += nChunkXOffQueried;
6417
0
                    nChunkXOffQueried = 0;
6418
0
                }
6419
0
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6420
0
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6421
0
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6422
#if DEBUG_VERBOSE
6423
                CPLDebug("GDAL",
6424
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6425
                         nChunkXOffQueried, nChunkYOffQueried,
6426
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6427
                         nDstYOff, nDstXCount, nDstYCount);
6428
#endif
6429
6430
                // Avoid accumulating too many tasks and exhaust RAM
6431
6432
                // Try to complete already finished jobs
6433
0
                while (eErr == CE_None && !jobList.empty())
6434
0
                {
6435
0
                    auto poOldestJob = jobList.front().get();
6436
0
                    if (!poOldestJob->IsFinished())
6437
0
                        break;
6438
0
                    eErr = poOldestJob->eErr;
6439
0
                    if (eErr == CE_None)
6440
0
                    {
6441
0
                        eErr = WriteJobData(poOldestJob);
6442
0
                    }
6443
6444
0
                    jobList.pop_front();
6445
0
                }
6446
6447
                // And in case we have saturated the number of threads,
6448
                // wait for completion of tasks to go below the threshold.
6449
0
                while (eErr == CE_None &&
6450
0
                       jobList.size() >= static_cast<size_t>(nThreads))
6451
0
                {
6452
0
                    eErr = WaitAndFinalizeOldestJob(jobList);
6453
0
                }
6454
6455
                // Read the source buffers for all the bands.
6456
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6457
0
                {
6458
                    // (Re)allocate buffers if needed
6459
0
                    if (apaChunk[iBand] == nullptr)
6460
0
                    {
6461
0
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6462
0
                            nFullResXChunkQueried, nFullResYChunkQueried,
6463
0
                            nWrkDataTypeSize));
6464
0
                        if (apaChunk[iBand] == nullptr)
6465
0
                        {
6466
0
                            eErr = CE_Failure;
6467
0
                        }
6468
0
                    }
6469
0
                    if (bUseNoDataMask &&
6470
0
                        apabyChunkNoDataMask[iBand] == nullptr)
6471
0
                    {
6472
0
                        apabyChunkNoDataMask[iBand].reset(
6473
0
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6474
0
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6475
0
                        if (apabyChunkNoDataMask[iBand] == nullptr)
6476
0
                        {
6477
0
                            eErr = CE_Failure;
6478
0
                        }
6479
0
                    }
6480
6481
0
                    if (eErr == CE_None)
6482
0
                    {
6483
0
                        GDALRasterBand *poSrcBand = nullptr;
6484
0
                        if (iSrcOverview == -1)
6485
0
                            poSrcBand = papoSrcBands[iBand];
6486
0
                        else
6487
0
                            poSrcBand =
6488
0
                                papapoOverviewBands[iBand][iSrcOverview];
6489
0
                        eErr = poSrcBand->RasterIO(
6490
0
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6491
0
                            nChunkXSizeQueried, nChunkYSizeQueried,
6492
0
                            apaChunk[iBand].get(), nChunkXSizeQueried,
6493
0
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6494
6495
0
                        if (bUseNoDataMask && eErr == CE_None)
6496
0
                        {
6497
0
                            auto poMaskBand = poSrcBand->IsMaskBand()
6498
0
                                                  ? poSrcBand
6499
0
                                                  : poSrcBand->GetMaskBand();
6500
0
                            eErr = poMaskBand->RasterIO(
6501
0
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6502
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6503
0
                                apabyChunkNoDataMask[iBand].get(),
6504
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6505
0
                                GDT_UInt8, 0, 0, nullptr);
6506
0
                        }
6507
0
                    }
6508
0
                }
6509
6510
                // Compute the resulting overview block.
6511
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6512
0
                {
6513
0
                    auto poJob = std::make_unique<OvrJob>();
6514
0
                    poJob->pfnResampleFn = pfnResampleFn;
6515
0
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6516
0
                    poJob->args.eOvrDataType =
6517
0
                        poJob->poDstBand->GetRasterDataType();
6518
0
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6519
0
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6520
0
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6521
0
                        "NBITS", "IMAGE_STRUCTURE");
6522
0
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6523
0
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6524
0
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6525
0
                    poJob->args.eWrkDataType = eWrkDataType;
6526
0
                    poJob->pChunk = apaChunk[iBand].get();
6527
0
                    poJob->args.pabyChunkNodataMask =
6528
0
                        apabyChunkNoDataMask[iBand].get();
6529
0
                    poJob->args.nChunkXOff = nChunkXOffQueried;
6530
0
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
6531
0
                    poJob->args.nChunkYOff = nChunkYOffQueried;
6532
0
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
6533
0
                    poJob->args.nDstXOff = nDstXOff;
6534
0
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6535
0
                    poJob->args.nDstYOff = nDstYOff;
6536
0
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6537
0
                    poJob->args.pszResampling = pszResampling;
6538
0
                    poJob->args.bHasNoData = abHasNoData[iBand];
6539
0
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6540
0
                    poJob->args.eSrcDataType = eDataType;
6541
0
                    poJob->args.bPropagateNoData = bPropagateNoData;
6542
6543
0
                    if (poJobQueue)
6544
0
                    {
6545
0
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6546
0
                            apabyChunkNoDataMask[iBand].release()));
6547
6548
0
                        poJob->oSrcBufferHolder.reset(
6549
0
                            new PointerHolder(apaChunk[iBand].release()));
6550
6551
0
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6552
0
                        jobList.emplace_back(std::move(poJob));
6553
0
                    }
6554
0
                    else
6555
0
                    {
6556
0
                        JobResampleFunc(poJob.get());
6557
0
                        eErr = poJob->eErr;
6558
0
                        if (eErr == CE_None)
6559
0
                        {
6560
0
                            eErr = WriteJobData(poJob.get());
6561
0
                        }
6562
0
                    }
6563
0
                }
6564
0
            }
6565
0
        }
6566
6567
        // Wait for all pending jobs to complete
6568
0
        while (!jobList.empty())
6569
0
        {
6570
0
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6571
0
            if (l_eErr != CE_None && eErr == CE_None)
6572
0
                eErr = l_eErr;
6573
0
        }
6574
6575
        // Flush the data to overviews.
6576
0
        for (int iBand = 0; iBand < nBands; ++iBand)
6577
0
        {
6578
0
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6579
0
                CE_None)
6580
0
                eErr = CE_Failure;
6581
0
        }
6582
0
    }
6583
6584
0
    if (eErr == CE_None)
6585
0
        pfnProgress(1.0, nullptr, pProgressData);
6586
6587
0
    return eErr;
6588
0
}
6589
6590
/************************************************************************/
6591
/*                  GDALRegenerateOverviewsMultiBand()                  */
6592
/************************************************************************/
6593
6594
/**
6595
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6596
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6597
 *
6598
 * This function will generate one or more overview images from a base
6599
 * image using the requested downsampling algorithm.  Its primary use
6600
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6601
 * can also be used to generate downsampled images in one file from another
6602
 * outside the overview architecture.
6603
 *
6604
 * The output bands need to exist in advance and share the same characteristics
6605
 * (type, dimensions)
6606
 *
6607
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6608
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6609
 *
6610
 * It does not support color tables or complex data types.
6611
 *
6612
 * The pseudo-algorithm used by the function is :
6613
 *    for each overview
6614
 *       iterate on lines of the source by a step of deltay
6615
 *           iterate on columns of the source  by a step of deltax
6616
 *               read the source data of size deltax * deltay for all the bands
6617
 *               generate the corresponding overview block for all the bands
6618
 *
6619
 * This function will honour properly NODATA_VALUES tuples (special dataset
6620
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6621
 * considered as the nodata value and not each value of the triplet
6622
 * independently per band.
6623
 *
6624
 * The GDAL_NUM_THREADS configuration option can be set
6625
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
6626
 * overview computation.
6627
 *
6628
 * @param apoSrcBands the list of source bands to downsample
6629
 * @param aapoOverviewBands bidimension array of bands. First dimension is
6630
 *                          indexed by bands. Second dimension is indexed by
6631
 *                          overview levels. All aapoOverviewBands[i] arrays
6632
 *                          must have the same size (i.e. same number of
6633
 *                          overviews)
6634
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6635
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6636
 * @param pfnProgress progress report function.
6637
 * @param pProgressData progress function callback data.
6638
 * @param papszOptions NULL terminated list of options as
6639
 *                     key=value pairs, or NULL
6640
 *                     The XOFF, YOFF, XSIZE and YSIZE
6641
 *                     options can be specified to express that overviews should
6642
 *                     be regenerated only in the specified subset of the source
6643
 *                     dataset.
6644
 * @return CE_None on success or CE_Failure on failure.
6645
 * @since 3.10
6646
 */
6647
6648
CPLErr GDALRegenerateOverviewsMultiBand(
6649
    const std::vector<GDALRasterBand *> &apoSrcBands,
6650
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6651
    const char *pszResampling, GDALProgressFunc pfnProgress,
6652
    void *pProgressData, CSLConstList papszOptions)
6653
0
{
6654
0
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6655
0
    for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6656
0
    {
6657
0
        CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6658
0
    }
6659
6660
0
    if (aapoOverviewBands.empty())
6661
0
        return CE_None;
6662
6663
0
    std::vector<GDALRasterBand **> apapoOverviewBands;
6664
0
    for (auto &apoOverviewBands : aapoOverviewBands)
6665
0
    {
6666
0
        auto papoOverviewBands = static_cast<GDALRasterBand **>(
6667
0
            CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6668
0
        for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6669
0
        {
6670
0
            papoOverviewBands[i] = apoOverviewBands[i];
6671
0
        }
6672
0
        apapoOverviewBands.push_back(papoOverviewBands);
6673
0
    }
6674
0
    const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6675
0
        static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6676
0
        static_cast<int>(aapoOverviewBands[0].size()),
6677
0
        apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6678
0
        papszOptions);
6679
0
    for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6680
0
        CPLFree(papoOverviewBands);
6681
0
    return eErr;
6682
0
}
6683
6684
/************************************************************************/
6685
/*                        GDALComputeBandStats()                        */
6686
/************************************************************************/
6687
6688
/** Undocumented
6689
 * @param hSrcBand undocumented.
6690
 * @param nSampleStep Step between scanlines used to compute statistics.
6691
 *                    When nSampleStep is equal to 1, all scanlines will
6692
 *                    be processed.
6693
 * @param pdfMean undocumented.
6694
 * @param pdfStdDev undocumented.
6695
 * @param pfnProgress undocumented.
6696
 * @param pProgressData undocumented.
6697
 * @return undocumented
6698
 */
6699
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6700
                                        int nSampleStep, double *pdfMean,
6701
                                        double *pdfStdDev,
6702
                                        GDALProgressFunc pfnProgress,
6703
                                        void *pProgressData)
6704
6705
0
{
6706
0
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6707
6708
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6709
6710
0
    if (pfnProgress == nullptr)
6711
0
        pfnProgress = GDALDummyProgress;
6712
6713
0
    const int nWidth = poSrcBand->GetXSize();
6714
0
    const int nHeight = poSrcBand->GetYSize();
6715
6716
0
    if (nSampleStep >= nHeight || nSampleStep < 1)
6717
0
        nSampleStep = 1;
6718
6719
0
    GDALDataType eWrkType = GDT_Unknown;
6720
0
    float *pafData = nullptr;
6721
0
    GDALDataType eType = poSrcBand->GetRasterDataType();
6722
0
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6723
0
    if (bComplex)
6724
0
    {
6725
0
        pafData = static_cast<float *>(
6726
0
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6727
0
        eWrkType = GDT_CFloat32;
6728
0
    }
6729
0
    else
6730
0
    {
6731
0
        pafData =
6732
0
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6733
0
        eWrkType = GDT_Float32;
6734
0
    }
6735
6736
0
    if (nWidth == 0 || pafData == nullptr)
6737
0
    {
6738
0
        VSIFree(pafData);
6739
0
        return CE_Failure;
6740
0
    }
6741
6742
    /* -------------------------------------------------------------------- */
6743
    /*      Loop over all sample lines.                                     */
6744
    /* -------------------------------------------------------------------- */
6745
0
    double dfSum = 0.0;
6746
0
    double dfSum2 = 0.0;
6747
0
    int iLine = 0;
6748
0
    GIntBig nSamples = 0;
6749
6750
0
    do
6751
0
    {
6752
0
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6753
0
                         pProgressData))
6754
0
        {
6755
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6756
0
            CPLFree(pafData);
6757
0
            return CE_Failure;
6758
0
        }
6759
6760
0
        const CPLErr eErr =
6761
0
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6762
0
                                1, eWrkType, 0, 0, nullptr);
6763
0
        if (eErr != CE_None)
6764
0
        {
6765
0
            CPLFree(pafData);
6766
0
            return eErr;
6767
0
        }
6768
6769
0
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6770
0
        {
6771
0
            float fValue = 0.0f;
6772
6773
0
            if (bComplex)
6774
0
            {
6775
                // Compute the magnitude of the complex value.
6776
0
                fValue =
6777
0
                    std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6778
0
                               pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6779
0
            }
6780
0
            else
6781
0
            {
6782
0
                fValue = pafData[iPixel];
6783
0
            }
6784
6785
0
            dfSum += static_cast<double>(fValue);
6786
0
            dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
6787
0
        }
6788
6789
0
        nSamples += nWidth;
6790
0
        iLine += nSampleStep;
6791
0
    } while (iLine < nHeight);
6792
6793
0
    if (!pfnProgress(1.0, nullptr, pProgressData))
6794
0
    {
6795
0
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6796
0
        CPLFree(pafData);
6797
0
        return CE_Failure;
6798
0
    }
6799
6800
    /* -------------------------------------------------------------------- */
6801
    /*      Produce the result values.                                      */
6802
    /* -------------------------------------------------------------------- */
6803
0
    if (pdfMean != nullptr)
6804
0
        *pdfMean = dfSum / nSamples;
6805
6806
0
    if (pdfStdDev != nullptr)
6807
0
    {
6808
0
        const double dfMean = dfSum / nSamples;
6809
6810
0
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6811
0
    }
6812
6813
0
    CPLFree(pafData);
6814
6815
0
    return CE_None;
6816
0
}
6817
6818
/************************************************************************/
6819
/*                  GDALOverviewMagnitudeCorrection()                   */
6820
/*                                                                      */
6821
/*      Correct the mean and standard deviation of the overviews of     */
6822
/*      the given band to match the base layer approximately.           */
6823
/************************************************************************/
6824
6825
/** Undocumented
6826
 * @param hBaseBand undocumented.
6827
 * @param nOverviewCount undocumented.
6828
 * @param pahOverviews undocumented.
6829
 * @param pfnProgress undocumented.
6830
 * @param pProgressData undocumented.
6831
 * @return undocumented
6832
 */
6833
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6834
                                       int nOverviewCount,
6835
                                       GDALRasterBandH *pahOverviews,
6836
                                       GDALProgressFunc pfnProgress,
6837
                                       void *pProgressData)
6838
6839
0
{
6840
0
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6841
6842
    /* -------------------------------------------------------------------- */
6843
    /*      Compute mean/stddev for source raster.                          */
6844
    /* -------------------------------------------------------------------- */
6845
0
    double dfOrigMean = 0.0;
6846
0
    double dfOrigStdDev = 0.0;
6847
0
    {
6848
0
        const CPLErr eErr =
6849
0
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6850
0
                                 pfnProgress, pProgressData);
6851
6852
0
        if (eErr != CE_None)
6853
0
            return eErr;
6854
0
    }
6855
6856
    /* -------------------------------------------------------------------- */
6857
    /*      Loop on overview bands.                                         */
6858
    /* -------------------------------------------------------------------- */
6859
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6860
0
    {
6861
0
        GDALRasterBand *poOverview =
6862
0
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6863
0
        double dfOverviewMean, dfOverviewStdDev;
6864
6865
0
        const CPLErr eErr =
6866
0
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6867
0
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6868
6869
0
        if (eErr != CE_None)
6870
0
            return eErr;
6871
6872
0
        double dfGain = 1.0;
6873
0
        if (dfOrigStdDev >= 0.0001)
6874
0
            dfGain = dfOrigStdDev / dfOverviewStdDev;
6875
6876
        /* --------------------------------------------------------------------
6877
         */
6878
        /*      Apply gain and offset. */
6879
        /* --------------------------------------------------------------------
6880
         */
6881
0
        const int nWidth = poOverview->GetXSize();
6882
0
        const int nHeight = poOverview->GetYSize();
6883
6884
0
        GDALDataType eWrkType = GDT_Unknown;
6885
0
        float *pafData = nullptr;
6886
0
        const GDALDataType eType = poOverview->GetRasterDataType();
6887
0
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6888
0
        if (bComplex)
6889
0
        {
6890
0
            pafData = static_cast<float *>(
6891
0
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6892
0
            eWrkType = GDT_CFloat32;
6893
0
        }
6894
0
        else
6895
0
        {
6896
0
            pafData = static_cast<float *>(
6897
0
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6898
0
            eWrkType = GDT_Float32;
6899
0
        }
6900
6901
0
        if (pafData == nullptr)
6902
0
        {
6903
0
            return CE_Failure;
6904
0
        }
6905
6906
0
        for (int iLine = 0; iLine < nHeight; ++iLine)
6907
0
        {
6908
0
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6909
0
                             pProgressData))
6910
0
            {
6911
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6912
0
                CPLFree(pafData);
6913
0
                return CE_Failure;
6914
0
            }
6915
6916
0
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6917
0
                                     nWidth, 1, eWrkType, 0, 0,
6918
0
                                     nullptr) != CE_None)
6919
0
            {
6920
0
                CPLFree(pafData);
6921
0
                return CE_Failure;
6922
0
            }
6923
6924
0
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6925
0
            {
6926
0
                if (bComplex)
6927
0
                {
6928
0
                    pafData[static_cast<size_t>(iPixel) * 2] *=
6929
0
                        static_cast<float>(dfGain);
6930
0
                    pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6931
0
                        static_cast<float>(dfGain);
6932
0
                }
6933
0
                else
6934
0
                {
6935
0
                    pafData[iPixel] = static_cast<float>(
6936
0
                        (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
6937
0
                        dfOrigMean);
6938
0
                }
6939
0
            }
6940
6941
0
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6942
0
                                     nWidth, 1, eWrkType, 0, 0,
6943
0
                                     nullptr) != CE_None)
6944
0
            {
6945
0
                CPLFree(pafData);
6946
0
                return CE_Failure;
6947
0
            }
6948
0
        }
6949
6950
0
        if (!pfnProgress(1.0, nullptr, pProgressData))
6951
0
        {
6952
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6953
0
            CPLFree(pafData);
6954
0
            return CE_Failure;
6955
0
        }
6956
6957
0
        CPLFree(pafData);
6958
0
    }
6959
6960
0
    return CE_None;
6961
0
}