Coverage Report

Created: 2025-06-13 06:18

/src/gdal/gcore/overview.cpp
Line
Count
Source (jump to first uncovered line)
1
2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14
15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17
18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21
22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30
31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39
#include "gdal_vrt.h"
40
#include "vrtdataset.h"
41
42
#ifdef USE_NEON_OPTIMIZATIONS
43
#include "include_sse2neon.h"
44
#define USE_SSE2
45
46
#include "gdalsse_priv.h"
47
48
// Restrict to 64bit processors because they are guaranteed to have SSE2,
49
// or if __AVX2__ is defined.
50
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51
#define USE_SSE2
52
53
#include "gdalsse_priv.h"
54
55
#ifdef __SSE3__
56
#include <pmmintrin.h>
57
#endif
58
#ifdef __SSSE3__
59
#include <tmmintrin.h>
60
#endif
61
#ifdef __SSE4_1__
62
#include <smmintrin.h>
63
#endif
64
#ifdef __AVX2__
65
#include <immintrin.h>
66
#endif
67
68
#endif
69
70
// To be included after above USE_SSE2 and include gdalsse_priv.h
71
// to avoid build issue on Windows x86
72
#include "gdal_priv_templates.hpp"
73
74
/************************************************************************/
75
/*                      GDALResampleChunk_Near()                        */
76
/************************************************************************/
77
78
template <class T>
79
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
80
                                      const T *pChunk, T **ppDstBuffer)
81
82
0
{
83
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
84
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
85
0
    const GDALDataType eWrkDataType = args.eWrkDataType;
86
0
    const int nChunkXOff = args.nChunkXOff;
87
0
    const int nChunkXSize = args.nChunkXSize;
88
0
    const int nChunkYOff = args.nChunkYOff;
89
0
    const int nDstXOff = args.nDstXOff;
90
0
    const int nDstXOff2 = args.nDstXOff2;
91
0
    const int nDstYOff = args.nDstYOff;
92
0
    const int nDstYOff2 = args.nDstYOff2;
93
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
94
95
    /* -------------------------------------------------------------------- */
96
    /*      Allocate buffers.                                               */
97
    /* -------------------------------------------------------------------- */
98
0
    *ppDstBuffer = static_cast<T *>(
99
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
100
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
101
0
    if (*ppDstBuffer == nullptr)
102
0
    {
103
0
        return CE_Failure;
104
0
    }
105
0
    T *const pDstBuffer = *ppDstBuffer;
106
107
0
    int *panSrcXOff =
108
0
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
109
110
0
    if (panSrcXOff == nullptr)
111
0
    {
112
0
        return CE_Failure;
113
0
    }
114
115
    /* ==================================================================== */
116
    /*      Precompute inner loop constants.                                */
117
    /* ==================================================================== */
118
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119
0
    {
120
0
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121
0
        if (nSrcXOff < nChunkXOff)
122
0
            nSrcXOff = nChunkXOff;
123
124
0
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125
0
    }
126
127
    /* ==================================================================== */
128
    /*      Loop over destination scanlines.                                */
129
    /* ==================================================================== */
130
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131
0
    {
132
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133
0
        if (nSrcYOff < nChunkYOff)
134
0
            nSrcYOff = nChunkYOff;
135
136
0
        const T *const pSrcScanline =
137
0
            pChunk +
138
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139
0
            nChunkXOff;
140
141
        /* --------------------------------------------------------------------
142
         */
143
        /*      Loop over destination pixels */
144
        /* --------------------------------------------------------------------
145
         */
146
0
        T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
147
0
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
148
0
        {
149
0
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
150
0
        }
151
0
    }
152
153
0
    CPLFree(panSrcXOff);
154
155
0
    return CE_None;
156
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>**)
157
158
// Untyped entry point for nearest-neighbour resampling of a chunk.
//
// Reports args.eWrkDataType through *peDstBufferDataType, then forwards to
// GDALResampleChunk_NearT() instantiated on an element type that has the
// same size as the working data type. Returns whatever the typed
// implementation returns, or CE_Failure for GDT_Unknown / GDT_TypeCount.
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    *peDstBufferDataType = args.eWrkDataType;

    // Reinterprets both buffers as arrays of the tag's type and runs the
    // typed implementation on them.
    const auto resampleAs = [&](auto elementTag) -> CPLErr
    {
        using Element = decltype(elementTag);
        return GDALResampleChunk_NearT(
            args, static_cast<const Element *>(pChunk),
            reinterpret_cast<Element **>(ppDstBuffer));
    };

    // Nearest resampling only copies values around, so the element width is
    // the only property of the data type that matters here.
    switch (args.eWrkDataType)
    {
        // 1-byte elements
        case GDT_Byte:
        case GDT_Int8:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
            return resampleAs(uint8_t{});

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return resampleAs(uint16_t{});

        // 4-byte elements
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return resampleAs(uint32_t{});

        // 8-byte elements
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return resampleAs(uint64_t{});

        case GDT_CFloat64:
            return resampleAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a usable working data type.
    CPLAssert(false);
    return CE_Failure;
}
224
225
namespace
226
{
227
228
// Find in the color table the entry whose RGB value is the closest
229
// (using quadratic distance) to the test color, ignoring transparent entries.
230
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
231
                   const GDALColorEntry &test)
232
0
{
233
0
    int nMinDist = std::numeric_limits<int>::max();
234
0
    size_t bestEntry = 0;
235
0
    for (size_t i = 0; i < entries.size(); ++i)
236
0
    {
237
0
        const GDALColorEntry &entry = entries[i];
238
        // Ignore transparent entries
239
0
        if (entry.c4 == 0)
240
0
            continue;
241
242
0
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
243
0
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
244
0
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
245
0
        if (nDist < nMinDist)
246
0
        {
247
0
            nMinDist = nDist;
248
0
            bestEntry = i;
249
0
        }
250
0
    }
251
0
    return static_cast<int>(bestEntry);
252
0
}
253
254
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
255
                                           int &transparentIdx)
256
0
{
257
0
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
258
259
0
    transparentIdx = -1;
260
0
    int i = 0;
261
0
    for (auto &entry : entries)
262
0
    {
263
0
        table.GetColorEntryAsRGB(i, &entry);
264
0
        if (transparentIdx < 0 && entry.c4 == 0)
265
0
            transparentIdx = i;
266
0
        ++i;
267
0
    }
268
0
    return entries;
269
0
}
270
271
}  // unnamed  namespace
272
273
/************************************************************************/
274
/*                             SQUARE()                                 */
275
/************************************************************************/
276
277
// Return val * val. The first operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to hold the product.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
Unexecuted instantiation: int SQUARE<int, int>(int)
Unexecuted instantiation: double SQUARE<double, double>(double)
Unexecuted instantiation: unsigned int SQUARE<unsigned int, unsigned int>(unsigned int)
281
282
/************************************************************************/
283
/*                          ComputeIntegerRMS()                         */
284
/************************************************************************/
285
// Integer root-mean-square: returns the integer r for which r**2 is the
// closest to sumSquares / weight. The truncated square root is taken first,
// then bumped by one when (r+1)**2 is strictly closer than r**2.
// Twork is the integer type in which 2 * r * (r + 1) + 1 is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;
    const T flooredRoot = static_cast<T>(std::sqrt(meanSquare));

    // Naive test would be:
    //   weight * (r+1)**2 - sumSquares < sumSquares - weight * r**2
    // which simplifies to 2 * r * (r + 1) + 1 < 2 * sumSquares / weight.
    const double twiceMidpoint = static_cast<double>(
        static_cast<Twork>(2) * flooredRoot * (flooredRoot + 1) + 1);
    const bool bRoundUp = twiceMidpoint < 2 * meanSquare;

    return bRoundUp ? static_cast<T>(flooredRoot + 1) : flooredRoot;
}
Unexecuted instantiation: unsigned char ComputeIntegerRMS<unsigned char, int>(double, double)
Unexecuted instantiation: unsigned short ComputeIntegerRMS<unsigned short, unsigned long>(double, double)
301
302
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
303
0
{
304
0
    CPLAssert(false);
305
0
    return 0;
306
0
}
307
308
// Integer RMS of 4 values from the sum of their squares, Byte flavour:
// returns the integer r for which r**2 is the closest to sumSquares / 4.
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // Given the rounding correction below, it has been verified that
    // sqrt((float)((sumSquares + 1) / 4)) and sqrt((float)sumSquares * 0.25f)
    // give the same result. The former is used because the quotient is also
    // needed for the correction test.
    const int nQuarterRounded = (sumSquares + 1) / 4;
    GByte nRoot =
        static_cast<GByte>(std::sqrt(static_cast<float>(nQuarterRounded)));

    // Should r or r + 1 be returned? Naive test:
    //   weight * (r+1)**2 - sumSquares < sumSquares - weight * r**2
    // Integer-only form of that test for weight == 4:
    const int nRootTimesNext = static_cast<int>(nRoot) * (nRoot + 1);
    if (nRootTimesNext < nQuarterRounded)
        ++nRoot;

    return nRoot;
}
325
326
template <>
327
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
328
0
{
329
0
    const double sumDivWeight = sumSquares * 0.25;
330
0
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
331
332
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
333
    // Naive version:
334
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
335
    // Optimized version for integer case and weight == 4
336
0
    if (static_cast<GUInt32>(rms) * (rms + 1) <
337
0
        static_cast<GUInt32>(sumDivWeight + 0.25))
338
0
        rms += 1;
339
0
    return rms;
340
0
}
341
342
#ifdef USE_SSE2
343
344
/************************************************************************/
345
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
346
/************************************************************************/
347
348
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
349
#define sse2_packus_epi32 _mm_packus_epi32
350
#else
351
// SSE2-only replacement for _mm_packus_epi32(): packs the eight signed
// 32-bit values of a and b into eight 16-bit values with unsigned saturation.
// The inputs are shifted down by 32768 so that the signed-saturating
// _mm_packs_epi32() can be used, and the shift is undone on the 16-bit result.
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    const auto bias32 = _mm_set1_epi32(-32768);
    const auto bias16 = _mm_set1_epi16(-32768);

    const auto packedBiased = _mm_packs_epi32(_mm_add_epi32(a, bias32),
                                              _mm_add_epi32(b, bias32));
    return _mm_sub_epi16(packedBiased, bias16);
}
361
#endif
362
363
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
364
#define sse2_hadd_epi16 _mm_hadd_epi16
365
#else
366
// SSE2-only stand-in for _mm_hadd_epi16(): adds adjacent pairs of 16-bit
// lanes of a (low half of the result) and of b (high half of the result).
// Lanes are zero-extended before being added, and the 32-bit sums go through
// the signed-saturating _mm_packs_epi32().
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const auto lowHalfMask = _mm_set1_epi32(0xFFFF);

    // For each 32-bit lane: (low 16 bits) + (high 16 bits).
    const auto sumAdjacentPairs = [&lowHalfMask](__m128i v)
    {
        return _mm_add_epi32(_mm_and_si128(v, lowHalfMask),
                             _mm_srli_epi32(v, 16));
    };

    // Pairs of a first, then pairs of b.
    return _mm_packs_epi32(sumAdjacentPairs(a), sumAdjacentPairs(b));
}
378
#endif
379
380
#ifdef __AVX2__
381
382
#define DEST_ELTS 16
383
#define set1_epi16 _mm256_set1_epi16
384
#define set1_epi32 _mm256_set1_epi32
385
#define setzero _mm256_setzero_si256
386
#define set1_ps _mm256_set1_ps
387
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
388
#define unpacklo_epi8 _mm256_unpacklo_epi8
389
#define unpackhi_epi8 _mm256_unpackhi_epi8
390
#define madd_epi16 _mm256_madd_epi16
391
#define add_epi32 _mm256_add_epi32
392
#define mul_ps _mm256_mul_ps
393
#define cvtepi32_ps _mm256_cvtepi32_ps
394
#define sqrt_ps _mm256_sqrt_ps
395
#define cvttps_epi32 _mm256_cvttps_epi32
396
#define packs_epi32 _mm256_packs_epi32
397
#define packus_epi32 _mm256_packus_epi32
398
#define srli_epi32 _mm256_srli_epi32
399
#define mullo_epi16 _mm256_mullo_epi16
400
#define srli_epi16 _mm256_srli_epi16
401
#define cmpgt_epi16 _mm256_cmpgt_epi16
402
#define add_epi16 _mm256_add_epi16
403
#define sub_epi16 _mm256_sub_epi16
404
#define packus_epi16 _mm256_packus_epi16
405
406
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
407
/* to get the lower 128-bit bits of what would be a true 256-bit vector register
408
 */
409
410
// Reorder the four 64-bit elements of x as (0, 2, 1, 3), i.e. swap the two
// middle ones. Used by store_lo() / storeu_int() before writing a result.
inline __m256i FIXUP_LANES(__m256i x)
411
{
412
    return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
413
}
414
415
#define store_lo(x, y)                                                         \
416
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
417
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
418
#define storeu_int(x, y)                                                       \
419
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
420
#define hadd_epi16 _mm256_hadd_epi16
421
#define zeroupper() _mm256_zeroupper()
422
#else
423
0
#define DEST_ELTS 8
424
0
#define set1_epi16 _mm_set1_epi16
425
0
#define set1_epi32 _mm_set1_epi32
426
0
#define setzero _mm_setzero_si128
427
#define set1_ps _mm_set1_ps
428
0
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
429
0
#define unpacklo_epi8 _mm_unpacklo_epi8
430
0
#define unpackhi_epi8 _mm_unpackhi_epi8
431
0
#define madd_epi16 _mm_madd_epi16
432
0
#define add_epi32 _mm_add_epi32
433
#define mul_ps _mm_mul_ps
434
0
#define cvtepi32_ps _mm_cvtepi32_ps
435
0
#define sqrt_ps _mm_sqrt_ps
436
0
#define cvttps_epi32 _mm_cvttps_epi32
437
0
#define packs_epi32 _mm_packs_epi32
438
0
#define packus_epi32 sse2_packus_epi32
439
0
#define srli_epi32 _mm_srli_epi32
440
0
#define mullo_epi16 _mm_mullo_epi16
441
0
#define srli_epi16 _mm_srli_epi16
442
0
#define cmpgt_epi16 _mm_cmpgt_epi16
443
0
#define add_epi16 _mm_add_epi16
444
0
#define sub_epi16 _mm_sub_epi16
445
0
#define packus_epi16 _mm_packus_epi16
446
0
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
447
0
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
448
0
#define hadd_epi16 sse2_hadd_epi16
449
0
#define zeroupper() (void)0
450
#endif
451
452
#if defined(__GNUC__) && defined(__AVX2__)
453
// Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
454
// -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
455
// where the register that contains minus_zero is correctly
456
// loaded the first time the function is called (looking at the disassembly,
457
// one sees it is loaded much earlier than the function), but gets corrupted
458
// (zeroed) in following iterations.
459
// It appears the bug is due to the explicit zeroupper() call at the end of
460
// the function.
461
// The bug is at least solved in gcc 10.2.
462
// Inlining doesn't bring much here to performance.
463
// This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
464
// -O3 -mavx2 mode
465
#define NOINLINE __attribute__((noinline))
466
#else
467
#define NOINLINE
468
#endif
469
470
template <class T>
471
static int NOINLINE
472
QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
473
                            const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
474
                            T *CPL_RESTRICT pDstScanline)
475
0
{
476
    // Optimized implementation for RMS on Byte by
477
    // processing by group of 8 output pixels, so as to use
478
    // a single _mm_sqrt_ps() call for 4 output pixels
479
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
480
481
0
    int iDstPixel = 0;
482
0
    const auto one16 = set1_epi16(1);
483
0
    const auto one32 = set1_epi32(1);
484
0
    const auto zero = setzero();
485
0
    const auto minus32768 = set1_epi16(-32768);
486
487
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
488
0
    {
489
        // Load 2 * DEST_ELTS bytes from each line
490
0
        auto firstLine = loadu_int(pSrcScanlineShifted);
491
0
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
492
        // Extend those Bytes as UInt16s
493
0
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
494
0
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
495
0
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
496
0
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
497
498
        // Multiplication of 16 bit values and horizontal
499
        // addition of 32 bit results
500
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
501
0
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
502
0
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
503
0
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
504
0
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
505
506
        // Vertical addition
507
0
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
508
0
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
509
510
0
        const auto sumSquaresPlusOneDiv4Lo =
511
0
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
512
0
        const auto sumSquaresPlusOneDiv4Hi =
513
0
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
514
515
        // Take square root and truncate/floor to int32
516
0
        const auto rmsLo =
517
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
518
0
        const auto rmsHi =
519
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
520
521
        // Merge back low and high registers with each RMS value
522
        // as a 16 bit value.
523
0
        auto rms = packs_epi32(rmsLo, rmsHi);
524
525
        // Round to upper value if it minimizes the
526
        // error |rms^2 - sumSquares/4|
527
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
528
        //    rms += 1;
529
        // which is equivalent to:
530
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
531
        //    rms += 1;
532
        // And both left and right parts fit on 16 (unsigned) bits
533
0
        const auto sumSquaresPlusOneDiv4 =
534
0
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
535
        // cmpgt_epi16 operates on signed int16, but here
536
        // we have unsigned values, so shift them by -32768 before
537
0
        auto mask = cmpgt_epi16(
538
0
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
539
0
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
540
        // The value of the mask will be -1 when the correction needs to be
541
        // applied
542
0
        rms = sub_epi16(rms, mask);
543
544
        // Pack each 16 bit RMS value to 8 bits
545
0
        rms = packus_epi16(rms, rms /* could be anything */);
546
0
        store_lo(&pDstScanline[iDstPixel], rms);
547
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
548
0
    }
549
0
    zeroupper();
550
551
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
552
0
    return iDstPixel;
553
0
}
554
555
/************************************************************************/
556
/*                      AverageByteSSE2OrAVX2()                         */
557
/************************************************************************/
558
559
template <class T>
560
static int
561
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
562
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
563
                      T *CPL_RESTRICT pDstScanline)
564
0
{
565
    // Optimized implementation for average on Byte by
566
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
567
568
0
    const auto zero = setzero();
569
0
    const auto two16 = set1_epi16(2);
570
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
571
572
0
    int iDstPixel = 0;
573
0
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
574
0
         iDstPixel += 2 * DEST_ELTS)
575
0
    {
576
0
        decltype(setzero()) average0;
577
0
        {
578
            // Load 2 * DEST_ELTS bytes from each line
579
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
580
0
            const auto secondLine =
581
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
582
            // Extend those Bytes as UInt16s
583
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
584
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
585
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
586
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
587
588
            // Vertical addition
589
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
590
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
591
592
            // Horizontal addition of adjacent pairs, and recombine low and high
593
            // parts
594
0
            const auto sum = hadd_epi16(sumLo, sumHi);
595
596
            // average = (sum + 2) / 4
597
0
            average0 = srli_epi16(add_epi16(sum, two16), 2);
598
599
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
600
0
        }
601
602
0
        decltype(setzero()) average1;
603
0
        {
604
            // Load 2 * DEST_ELTS bytes from each line
605
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
606
0
            const auto secondLine =
607
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
608
            // Extend those Bytes as UInt16s
609
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
610
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
611
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
612
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
613
614
            // Vertical addition
615
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
616
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
617
618
            // Horizontal addition of adjacent pairs, and recombine low and high
619
            // parts
620
0
            const auto sum = hadd_epi16(sumLo, sumHi);
621
622
            // average = (sum + 2) / 4
623
0
            average1 = srli_epi16(add_epi16(sum, two16), 2);
624
625
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
626
0
        }
627
628
        // Pack each 16 bit average value to 8 bits
629
0
        const auto average = packus_epi16(average0, average1);
630
0
        storeu_int(&pDstScanline[iDstPixel], average);
631
0
    }
632
0
    zeroupper();
633
634
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
635
0
    return iDstPixel;
636
0
}
637
638
/************************************************************************/
639
/*                     QuadraticMeanUInt16SSE2()                        */
640
/************************************************************************/
641
642
#ifdef __SSE3__
643
#define sse2_hadd_pd _mm_hadd_pd
644
#else
645
// SSE2-only stand-in for _mm_hadd_pd(): returns (aLo + aHi, bLo + bHi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    // The 64-bit halves are moved around with the single-precision
    // movelh / movehl shuffles, which only relocate bits.
    const __m128 aBits = _mm_castpd_ps(a);
    const __m128 bBits = _mm_castpd_ps(b);

    const __m128d lowHalves = _mm_castps_pd(_mm_movelh_ps(aBits, bBits));
    const __m128d highHalves = _mm_castps_pd(_mm_movehl_ps(bBits, aBits));

    return _mm_add_pd(lowHalves, highHalves);
}
653
#endif
654
655
// Return the element-wise square of the packed doubles of x (128-bit flavour).
inline __m128d SQUARE_PD(__m128d x)
656
0
{
657
0
    return _mm_mul_pd(x, x);
658
0
}
659
660
#ifdef __AVX2__
661
662
// Return the element-wise square of the packed doubles of x (256-bit flavour).
inline __m256d SQUARE_PD(__m256d x)
663
{
664
    return _mm256_mul_pd(x, x);
665
}
666
667
// Reorder the four doubles of x as (0, 2, 1, 3), i.e. swap the two middle
// ones.
inline __m256d FIXUP_LANES(__m256d x)
668
{
669
    return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
670
}
671
672
// Single-precision flavour: reinterprets x as packed doubles, applies the
// __m256d overload of FIXUP_LANES(), and reinterprets the result back, so
// the reordering operates on 64-bit groups.
inline __m256 FIXUP_LANES(__m256 x)
673
{
674
    return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
675
}
676
677
#endif
678
679
template <class T>
680
static int
681
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
682
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
683
                        T *CPL_RESTRICT pDstScanline)
684
0
{
685
    // Optimized implementation for RMS on UInt16 by
686
    // processing by group of 4 output pixels.
687
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
688
689
0
    int iDstPixel = 0;
690
0
    const auto zero = _mm_setzero_si128();
691
692
#ifdef __AVX2__
693
    const auto zeroDot25 = _mm256_set1_pd(0.25);
694
    const auto zeroDot5 = _mm256_set1_pd(0.5);
695
696
    // The first four 0's could be anything, as we only take the bottom
697
    // 128 bits.
698
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
699
#else
700
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
701
0
    const auto zeroDot5 = _mm_set1_pd(0.5);
702
0
#endif
703
704
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
705
0
    {
706
        // Load 8 UInt16 from each line
707
0
        const auto firstLine = _mm_loadu_si128(
708
0
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
709
0
        const auto secondLine =
710
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
711
0
                pSrcScanlineShifted + nChunkXSize));
712
713
        // Detect if all of the source values fit in 14 bits.
714
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
715
        // and we can do a much faster implementation.
716
0
        const auto maskTmp =
717
0
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
718
#if defined(__i386__) || defined(_M_IX86)
719
        uint64_t nMaskFitsIn14Bits = 0;
720
        _mm_storel_epi64(
721
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
722
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
723
#else
724
0
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
725
0
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
726
0
#endif
727
0
        if (nMaskFitsIn14Bits == 0)
728
0
        {
729
            // Multiplication of 16 bit values and horizontal
730
            // addition of 32 bit results
731
0
            const auto firstLineHSumSquare =
732
0
                _mm_madd_epi16(firstLine, firstLine);
733
0
            const auto secondLineHSumSquare =
734
0
                _mm_madd_epi16(secondLine, secondLine);
735
            // Vertical addition
736
0
            const auto sumSquares =
737
0
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
738
            // In theory we should take sqrt(sumSquares * 0.25f)
739
            // but given the rounding we do, this is equivalent to
740
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
741
            // sumSquares <= 4 * 16383^2
742
0
            const auto one32 = _mm_set1_epi32(1);
743
0
            const auto sumSquaresPlusOneDiv4 =
744
0
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
745
            // Take square root and truncate/floor to int32
746
0
            auto rms = _mm_cvttps_epi32(
747
0
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
748
749
            // Round to upper value if it minimizes the
750
            // error |rms^2 - sumSquares/4|
751
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
752
            //    rms += 1;
753
            // which is equivalent to:
754
            // if( rms * rms + rms < (sumSquares+1) / 4 )
755
            //    rms += 1;
756
0
            auto mask =
757
0
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
758
0
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
759
0
            rms = _mm_sub_epi32(rms, mask);
760
            // Pack each 32 bit RMS value to 16 bits
761
0
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
762
0
            _mm_storel_epi64(
763
0
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
764
0
            pSrcScanlineShifted += 8;
765
0
            continue;
766
0
        }
767
768
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
769
        // to 32 bit would result in 4 multiplications instead of 8, but
770
        // mullo/mulhi have a worse throughput than mul_pd.
771
772
        // Extend those UInt16s as UInt32s
773
0
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
774
0
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
775
0
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
776
0
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
777
778
#ifdef __AVX2__
779
        // Multiplication of 32 bit values previously converted to 64 bit double
780
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
781
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
782
        const auto secondLineLoDbl =
783
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
784
        const auto secondLineHiDbl =
785
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
786
787
        // Vertical addition of squares
788
        const auto sumSquaresLo =
789
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
790
        const auto sumSquaresHi =
791
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
792
793
        // Horizontal addition of squares
794
        const auto sumSquares =
795
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
796
797
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
798
799
        // Take square root and truncate/floor to int32
800
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
801
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
802
        const auto right = _mm256_sub_pd(
803
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
804
805
        auto mask =
806
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
807
        // Extract 32-bit from each of the 4 64-bit masks
808
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
809
        // _MM_SHUFFLE(2,0,2,0)));
810
        mask = _mm256_permutevar8x32_ps(mask, permutation);
811
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
812
813
        // Apply the correction
814
        rms = _mm_sub_epi32(rms, maskI);
815
816
        // Pack each 32 bit RMS value to 16 bits
817
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
818
#else
819
        // Multiplication of 32 bit values previously converted to 64 bit double
820
0
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
821
0
        const auto firstLineLoHi =
822
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
823
0
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
824
0
        const auto firstLineHiHi =
825
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
826
827
0
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
828
0
        const auto secondLineLoHi =
829
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
830
0
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
831
0
        const auto secondLineHiHi =
832
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
833
834
        // Vertical addition of squares
835
0
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
836
0
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
837
0
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
838
0
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
839
840
        // Horizontal addition of squares
841
0
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
842
0
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
843
844
0
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
845
0
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
846
        // Take square root and truncate/floor to int32
847
0
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
848
0
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
849
850
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
851
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
852
        //     rms += 1;
853
0
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
854
0
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
855
0
        const auto rightLo = _mm_sub_pd(
856
0
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
857
0
        const auto rightHi = _mm_sub_pd(
858
0
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
859
860
0
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
861
0
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
862
        // The value of the mask will be -1 when the correction needs to be
863
        // applied
864
0
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
865
0
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
866
867
0
        auto rms = _mm_castps_si128(
868
0
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
869
        // Apply the correction
870
0
        rms = _mm_sub_epi32(rms, mask);
871
872
        // Pack each 32 bit RMS value to 16 bits
873
0
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
874
0
#endif
875
876
0
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
877
0
                         rms);
878
0
        pSrcScanlineShifted += 8;
879
0
    }
880
881
0
    zeroupper();
882
883
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
884
0
    return iDstPixel;
885
0
}
886
887
/************************************************************************/
888
/*                         AverageUInt16SSE2()                          */
889
/************************************************************************/
890
891
template <class T>
892
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
893
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
894
                             T *CPL_RESTRICT pDstScanline)
895
0
{
896
    // Optimized implementation for average on UInt16 by
897
    // processing by group of 8 output pixels.
898
899
0
    const auto mask = _mm_set1_epi32(0xFFFF);
900
0
    const auto two = _mm_set1_epi32(2);
901
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
902
903
0
    int iDstPixel = 0;
904
0
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
905
0
    {
906
0
        __m128i averageLow;
907
        // Load 8 UInt16 from each line
908
0
        {
909
0
            const auto firstLine = _mm_loadu_si128(
910
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
911
0
            const auto secondLine =
912
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
913
0
                    pSrcScanlineShifted + nChunkXSize));
914
915
            // Horizontal addition and extension to 32 bit
916
0
            const auto horizAddFirstLine = _mm_add_epi32(
917
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
918
0
            const auto horizAddSecondLine =
919
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
920
0
                              _mm_srli_epi32(secondLine, 16));
921
922
            // Vertical addition and average computation
923
            // average = (sum + 2) >> 2
924
0
            const auto sum = _mm_add_epi32(
925
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
926
0
            averageLow = _mm_srli_epi32(sum, 2);
927
0
        }
928
        // Load 8 UInt16 from each line
929
0
        __m128i averageHigh;
930
0
        {
931
0
            const auto firstLine = _mm_loadu_si128(
932
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
933
0
            const auto secondLine =
934
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
935
0
                    pSrcScanlineShifted + 8 + nChunkXSize));
936
937
            // Horizontal addition and extension to 32 bit
938
0
            const auto horizAddFirstLine = _mm_add_epi32(
939
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
940
0
            const auto horizAddSecondLine =
941
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
942
0
                              _mm_srli_epi32(secondLine, 16));
943
944
            // Vertical addition and average computation
945
            // average = (sum + 2) >> 2
946
0
            const auto sum = _mm_add_epi32(
947
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
948
0
            averageHigh = _mm_srli_epi32(sum, 2);
949
0
        }
950
951
        // Pack each 32 bit average value to 16 bits
952
0
        auto average = sse2_packus_epi32(averageLow, averageHigh);
953
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
954
0
                         average);
955
0
        pSrcScanlineShifted += 16;
956
0
    }
957
958
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
959
0
    return iDstPixel;
960
0
}
961
962
/************************************************************************/
963
/*                      QuadraticMeanFloatSSE2()                        */
964
/************************************************************************/
965
966
#ifdef __SSE3__
#define sse2_hadd_ps _mm_hadd_ps
#else
// Horizontal pair-wise addition using only _mm_shuffle_ps() and _mm_add_ps():
// returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenElts = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddElts = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenElts, oddElts);
}
#endif
976
977
// Width-agnostic aliases for the Float32 intrinsics used by
// QuadraticMeanFloatSSE2(), resolved to their 256-bit (AVX2 build) or
// 128-bit (otherwise) flavor.
#ifdef __AVX2__

// Number of Float32 elements per vector
#define RMS_FLOAT_ELTS 8

// Constants and memory access
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Bitwise logic and comparison
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Element reordering
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

// Element-wise square of 8 Float32
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

// Number of Float32 elements per vector
#define RMS_FLOAT_ELTS 4

// Constants and memory access
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Bitwise logic and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Element reordering
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

// Element-wise square of 4 Float32
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// Identity: a single 128-bit vector needs no lane reordering
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1032
1033
static int NOINLINE
1034
QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1035
                       const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1036
                       float *CPL_RESTRICT pDstScanline)
1037
0
{
1038
    // Optimized implementation for RMS on Float32 by
1039
    // processing by group of RMS_FLOAT_ELTS output pixels.
1040
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1041
1042
0
    int iDstPixel = 0;
1043
0
    const auto minus_zero = set1_ps(-0.0f);
1044
0
    const auto zeroDot25 = set1_ps(0.25f);
1045
0
    const auto one = set1_ps(1.0f);
1046
0
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1047
1048
0
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1049
0
         iDstPixel += RMS_FLOAT_ELTS)
1050
0
    {
1051
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1052
0
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1053
0
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
1054
0
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1055
0
        auto secondLineHi =
1056
0
            loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
1057
1058
        // Take the absolute value
1059
0
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
1060
0
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
1061
0
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
1062
0
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
1063
1064
0
        auto firstLineEven =
1065
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1066
0
        auto firstLineOdd =
1067
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1068
0
        auto secondLineEven =
1069
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1070
0
        auto secondLineOdd =
1071
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1072
1073
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1074
0
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1075
0
                                 max_ps(secondLineEven, secondLineEven));
1076
1077
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1078
        // This step is important to avoid that the square evaluates to infinity
1079
        // for sufficiently big input.
1080
0
        auto invMax = div_ps(one, maxV);
1081
        // Deal with 0 being the maximum to correct division by zero
1082
        // note: comparing to -0 leads to identical results as to comparing with
1083
        // 0
1084
0
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1085
1086
0
        firstLineEven = mul_ps(firstLineEven, invMax);
1087
0
        firstLineOdd = mul_ps(firstLineOdd, invMax);
1088
0
        secondLineEven = mul_ps(secondLineEven, invMax);
1089
0
        secondLineOdd = mul_ps(secondLineOdd, invMax);
1090
1091
        // Compute squares
1092
0
        firstLineEven = SQUARE_PS(firstLineEven);
1093
0
        firstLineOdd = SQUARE_PS(firstLineOdd);
1094
0
        secondLineEven = SQUARE_PS(secondLineEven);
1095
0
        secondLineOdd = SQUARE_PS(secondLineOdd);
1096
1097
0
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1098
0
                                       add_ps(secondLineEven, secondLineOdd));
1099
1100
0
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1101
1102
        // Deal with infinity being the maximum
1103
0
        const auto maskIsInf = cmpeq_ps(maxV, infv);
1104
0
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1105
1106
0
        rms = FIXUP_LANES(rms);
1107
1108
0
        storeu_ps(&pDstScanline[iDstPixel], rms);
1109
0
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1110
0
    }
1111
1112
0
    zeroupper();
1113
1114
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1115
0
    return iDstPixel;
1116
0
}
1117
1118
/************************************************************************/
1119
/*                        AverageFloatSSE2()                            */
1120
/************************************************************************/
1121
1122
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1123
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1124
                            float *CPL_RESTRICT pDstScanline)
1125
0
{
1126
    // Optimized implementation for average on Float32 by
1127
    // processing by group of 4 output pixels.
1128
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1129
1130
0
    int iDstPixel = 0;
1131
0
    const auto zeroDot25 = _mm_set1_ps(0.25f);
1132
1133
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1134
0
    {
1135
        // Load 8 Float32 from each line
1136
0
        const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
1137
0
        const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
1138
0
        const auto secondLineLo =
1139
0
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
1140
0
        const auto secondLineHi =
1141
0
            _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
1142
1143
        // Vertical addition
1144
0
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1145
0
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1146
1147
        // Horizontal addition
1148
0
        const auto sum = sse2_hadd_ps(sumLo, sumHi);
1149
1150
0
        const auto average = _mm_mul_ps(sum, zeroDot25);
1151
1152
0
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1153
0
        pSrcScanlineShifted += 8;
1154
0
    }
1155
1156
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1157
0
    return iDstPixel;
1158
0
}
1159
1160
#endif
1161
1162
/************************************************************************/
1163
/*                    GDALResampleChunk_AverageOrRMS()                  */
1164
/************************************************************************/
1165
1166
template <class T, class Tsum, GDALDataType eWrkDataType>
1167
static CPLErr
1168
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1169
                                 const T *pChunk, void **ppDstBuffer)
1170
0
{
1171
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1172
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1173
0
    const double dfSrcXDelta = args.dfSrcXDelta;
1174
0
    const double dfSrcYDelta = args.dfSrcYDelta;
1175
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1176
0
    const int nChunkXOff = args.nChunkXOff;
1177
0
    const int nChunkYOff = args.nChunkYOff;
1178
0
    const int nChunkXSize = args.nChunkXSize;
1179
0
    const int nChunkYSize = args.nChunkYSize;
1180
0
    const int nDstXOff = args.nDstXOff;
1181
0
    const int nDstXOff2 = args.nDstXOff2;
1182
0
    const int nDstYOff = args.nDstYOff;
1183
0
    const int nDstYOff2 = args.nDstYOff2;
1184
0
    const char *pszResampling = args.pszResampling;
1185
0
    bool bHasNoData = args.bHasNoData;
1186
0
    const double dfNoDataValue = args.dfNoDataValue;
1187
0
    const GDALColorTable *poColorTable = args.poColorTable;
1188
0
    const bool bPropagateNoData = args.bPropagateNoData;
1189
1190
    // AVERAGE_BIT2GRAYSCALE
1191
0
    const bool bBit2Grayscale =
1192
0
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1193
0
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1194
0
    if (bBit2Grayscale)
1195
0
        poColorTable = nullptr;
1196
1197
0
    T tNoDataValue;
1198
0
    if (!bHasNoData)
1199
0
        tNoDataValue = 0;
1200
0
    else
1201
0
        tNoDataValue = static_cast<T>(dfNoDataValue);
1202
0
    const T tReplacementVal =
1203
0
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1204
0
                         args.eOvrDataType, dfNoDataValue))
1205
0
                   : 0;
1206
1207
0
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
1208
0
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1209
0
    int nDstXWidth = nDstXOff2 - nDstXOff;
1210
1211
    /* -------------------------------------------------------------------- */
1212
    /*      Allocate buffers.                                               */
1213
    /* -------------------------------------------------------------------- */
1214
0
    *ppDstBuffer = static_cast<T *>(
1215
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1216
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1217
0
    if (*ppDstBuffer == nullptr)
1218
0
    {
1219
0
        return CE_Failure;
1220
0
    }
1221
0
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1222
1223
0
    struct PrecomputedXValue
1224
0
    {
1225
0
        int nLeftXOffShifted;
1226
0
        int nRightXOffShifted;
1227
0
        double dfLeftWeight;
1228
0
        double dfRightWeight;
1229
0
        double dfTotalWeightFullLine;
1230
0
    };
1231
1232
0
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1233
0
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1234
1235
0
    if (pasSrcX == nullptr)
1236
0
    {
1237
0
        return CE_Failure;
1238
0
    }
1239
1240
0
    int nTransparentIdx = -1;
1241
0
    std::vector<GDALColorEntry> colorEntries;
1242
0
    if (poColorTable)
1243
0
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1244
1245
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1246
    // it as nodata value
1247
0
    if (bHasNoData && dfNoDataValue >= 0.0f &&
1248
0
        tNoDataValue < colorEntries.size())
1249
0
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1250
1251
    // Or if we have no explicit nodata, but a color table entry that is
1252
    // transparent, consider it as the nodata value
1253
0
    else if (!bHasNoData && nTransparentIdx >= 0)
1254
0
    {
1255
0
        bHasNoData = true;
1256
0
        tNoDataValue = static_cast<T>(nTransparentIdx);
1257
0
    }
1258
1259
    /* ==================================================================== */
1260
    /*      Precompute inner loop constants.                                */
1261
    /* ==================================================================== */
1262
0
    bool bSrcXSpacingIsTwo = true;
1263
0
    int nLastSrcXOff2 = -1;
1264
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1265
0
    {
1266
0
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1267
        // Apply some epsilon to avoid numerical precision issues
1268
0
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1269
0
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1270
0
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1271
1272
0
        if (nSrcXOff < nChunkXOff)
1273
0
            nSrcXOff = nChunkXOff;
1274
0
        if (nSrcXOff2 == nSrcXOff)
1275
0
            nSrcXOff2++;
1276
0
        if (nSrcXOff2 > nChunkRightXOff)
1277
0
            nSrcXOff2 = nChunkRightXOff;
1278
1279
0
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1280
0
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1281
0
            nSrcXOff2 - nChunkXOff;
1282
0
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1283
0
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1284
0
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1285
0
            1 - (nSrcXOff2 - dfSrcXOff2);
1286
0
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1287
0
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1288
0
        if (nSrcXOff + 1 < nSrcXOff2)
1289
0
        {
1290
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1291
0
                nSrcXOff2 - nSrcXOff - 2;
1292
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1293
0
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1294
0
        }
1295
1296
0
        if (nSrcXOff2 - nSrcXOff != 2 ||
1297
0
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1298
0
        {
1299
0
            bSrcXSpacingIsTwo = false;
1300
0
        }
1301
0
        nLastSrcXOff2 = nSrcXOff2;
1302
0
    }
1303
1304
    /* ==================================================================== */
1305
    /*      Loop over destination scanlines.                                */
1306
    /* ==================================================================== */
1307
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1308
0
    {
1309
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1310
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1311
0
        if (nSrcYOff < nChunkYOff)
1312
0
            nSrcYOff = nChunkYOff;
1313
1314
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1315
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1316
0
        if (nSrcYOff2 == nSrcYOff)
1317
0
            ++nSrcYOff2;
1318
0
        if (nSrcYOff2 > nChunkBottomYOff)
1319
0
            nSrcYOff2 = nChunkBottomYOff;
1320
1321
0
        T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1322
1323
        /* --------------------------------------------------------------------
1324
         */
1325
        /*      Loop over destination pixels */
1326
        /* --------------------------------------------------------------------
1327
         */
1328
0
        if (poColorTable == nullptr)
1329
0
        {
1330
0
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1331
0
                pabyChunkNodataMask == nullptr)
1332
0
            {
1333
                if constexpr (eWrkDataType == GDT_Byte ||
1334
                              eWrkDataType == GDT_UInt16)
1335
0
                {
1336
                    // Optimized case : no nodata, overview by a factor of 2 and
1337
                    // regular x and y src spacing.
1338
0
                    const T *pSrcScanlineShifted =
1339
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1340
0
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1341
0
                            nChunkXSize;
1342
0
                    int iDstPixel = 0;
1343
0
#ifdef USE_SSE2
1344
                    if constexpr (eWrkDataType == GDT_Byte)
1345
0
                    {
1346
0
                        if (bQuadraticMean)
1347
0
                        {
1348
0
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1349
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1350
0
                                pDstScanline);
1351
0
                        }
1352
0
                        else
1353
0
                        {
1354
0
                            iDstPixel = AverageByteSSE2OrAVX2(
1355
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1356
0
                                pDstScanline);
1357
0
                        }
1358
                    }
1359
                    else
1360
0
                    {
1361
0
                        static_assert(eWrkDataType == GDT_UInt16);
1362
0
                        if (bQuadraticMean)
1363
0
                        {
1364
0
                            iDstPixel = QuadraticMeanUInt16SSE2(
1365
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1366
0
                                pDstScanline);
1367
0
                        }
1368
0
                        else
1369
0
                        {
1370
0
                            iDstPixel = AverageUInt16SSE2(
1371
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1372
0
                                pDstScanline);
1373
0
                        }
1374
0
                    }
1375
0
#endif
1376
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1377
0
                    {
1378
0
                        Tsum nTotal = 0;
1379
0
                        T nVal;
1380
0
                        if (bQuadraticMean)
1381
0
                            nTotal =
1382
0
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1383
0
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1384
0
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1385
0
                                SQUARE<Tsum>(
1386
0
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1387
0
                        else
1388
0
                            nTotal = pSrcScanlineShifted[0] +
1389
0
                                     pSrcScanlineShifted[1] +
1390
0
                                     pSrcScanlineShifted[nChunkXSize] +
1391
0
                                     pSrcScanlineShifted[1 + nChunkXSize];
1392
1393
0
                        constexpr int nTotalWeight = 4;
1394
0
                        if (bQuadraticMean)
1395
0
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
1396
0
                        else
1397
0
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1398
0
                                                  nTotalWeight);
1399
1400
                        // No need to compare nVal against tNoDataValue as we
1401
                        // are in a case where pabyChunkNodataMask == nullptr
1402
                        // implies the absence of nodata value.
1403
0
                        pDstScanline[iDstPixel] = nVal;
1404
0
                        pSrcScanlineShifted += 2;
1405
0
                    }
1406
                }
1407
                else
1408
0
                {
1409
0
                    static_assert(eWrkDataType == GDT_Float32 ||
1410
0
                                  eWrkDataType == GDT_Float64);
1411
0
                    const T *pSrcScanlineShifted =
1412
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1413
0
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1414
0
                            nChunkXSize;
1415
0
                    int iDstPixel = 0;
1416
0
#ifdef USE_SSE2
1417
                    if constexpr (eWrkDataType == GDT_Float32)
1418
0
                    {
1419
0
                        static_assert(std::is_same_v<T, float>);
1420
0
                        if (bQuadraticMean)
1421
0
                        {
1422
0
                            iDstPixel = QuadraticMeanFloatSSE2(
1423
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1424
0
                                pDstScanline);
1425
0
                        }
1426
0
                        else
1427
0
                        {
1428
0
                            iDstPixel = AverageFloatSSE2(
1429
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1430
0
                                pDstScanline);
1431
0
                        }
1432
0
                    }
1433
0
#endif
1434
1435
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1436
0
                    {
1437
0
                        T nVal;
1438
0
                        if (bQuadraticMean)
1439
0
                        {
1440
                            // Cast to double to avoid overflows
1441
                            // (using std::hypot() is much slower)
1442
0
                            nVal = static_cast<T>(std::sqrt(
1443
0
                                0.25 *
1444
0
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
1445
0
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
1446
0
                                 SQUARE<double>(
1447
0
                                     pSrcScanlineShifted[nChunkXSize]) +
1448
0
                                 SQUARE<double>(
1449
0
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
1450
0
                        }
1451
0
                        else
1452
0
                        {
1453
0
                            nVal = static_cast<T>(
1454
0
                                0.25f * (pSrcScanlineShifted[0] +
1455
0
                                         pSrcScanlineShifted[1] +
1456
0
                                         pSrcScanlineShifted[nChunkXSize] +
1457
0
                                         pSrcScanlineShifted[1 + nChunkXSize]));
1458
0
                        }
1459
1460
                        // No need to compare nVal against tNoDataValue as we
1461
                        // are in a case where pabyChunkNodataMask == nullptr
1462
                        // implies the absence of nodata value.
1463
0
                        pDstScanline[iDstPixel] = nVal;
1464
0
                        pSrcScanlineShifted += 2;
1465
0
                    }
1466
0
                }
1467
0
            }
1468
0
            else
1469
0
            {
1470
0
                const double dfBottomWeight =
1471
0
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1472
0
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
1473
0
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1474
0
                nSrcYOff -= nChunkYOff;
1475
0
                nSrcYOff2 -= nChunkYOff;
1476
1477
0
                double dfTotalWeightFullColumn = dfBottomWeight;
1478
0
                if (nSrcYOff + 1 < nSrcYOff2)
1479
0
                {
1480
0
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1481
0
                    dfTotalWeightFullColumn += dfTopWeight;
1482
0
                }
1483
1484
0
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1485
0
                {
1486
0
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1487
0
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1488
1489
0
                    double dfTotal = 0;
1490
0
                    double dfTotalWeight = 0;
1491
0
                    if (pabyChunkNodataMask == nullptr)
1492
0
                    {
1493
0
                        auto pChunkShifted =
1494
0
                            pChunk +
1495
0
                            static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1496
0
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1497
0
                        double dfWeightY = dfBottomWeight;
1498
0
                        while (true)
1499
0
                        {
1500
0
                            double dfTotalLine;
1501
0
                            if (bQuadraticMean)
1502
0
                            {
1503
                                // Left pixel
1504
0
                                {
1505
0
                                    const T val = pChunkShifted[nSrcXOff];
1506
0
                                    dfTotalLine =
1507
0
                                        SQUARE<double>(val) *
1508
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1509
0
                                }
1510
1511
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1512
0
                                {
1513
                                    // Middle pixels
1514
0
                                    for (int iX = nSrcXOff + 1;
1515
0
                                         iX + 1 < nSrcXOff2; ++iX)
1516
0
                                    {
1517
0
                                        const T val = pChunkShifted[iX];
1518
0
                                        dfTotalLine += SQUARE<double>(val);
1519
0
                                    }
1520
1521
                                    // Right pixel
1522
0
                                    {
1523
0
                                        const T val =
1524
0
                                            pChunkShifted[nSrcXOff2 - 1];
1525
0
                                        dfTotalLine +=
1526
0
                                            SQUARE<double>(val) *
1527
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1528
0
                                    }
1529
0
                                }
1530
0
                            }
1531
0
                            else
1532
0
                            {
1533
                                // Left pixel
1534
0
                                {
1535
0
                                    const T val = pChunkShifted[nSrcXOff];
1536
0
                                    dfTotalLine =
1537
0
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
1538
0
                                }
1539
1540
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1541
0
                                {
1542
                                    // Middle pixels
1543
0
                                    for (int iX = nSrcXOff + 1;
1544
0
                                         iX + 1 < nSrcXOff2; ++iX)
1545
0
                                    {
1546
0
                                        const T val = pChunkShifted[iX];
1547
0
                                        dfTotalLine += val;
1548
0
                                    }
1549
1550
                                    // Right pixel
1551
0
                                    {
1552
0
                                        const T val =
1553
0
                                            pChunkShifted[nSrcXOff2 - 1];
1554
0
                                        dfTotalLine +=
1555
0
                                            val *
1556
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1557
0
                                    }
1558
0
                                }
1559
0
                            }
1560
1561
0
                            dfTotal += dfTotalLine * dfWeightY;
1562
0
                            --nCounterY;
1563
0
                            if (nCounterY < 0)
1564
0
                                break;
1565
0
                            pChunkShifted += nChunkXSize;
1566
0
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1567
0
                        }
1568
1569
0
                        dfTotalWeight =
1570
0
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1571
0
                            dfTotalWeightFullColumn;
1572
0
                    }
1573
0
                    else
1574
0
                    {
1575
0
                        GPtrDiff_t nCount = 0;
1576
0
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1577
0
                        {
1578
0
                            const auto pChunkShifted =
1579
0
                                pChunk +
1580
0
                                static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1581
1582
0
                            double dfTotalLine = 0;
1583
0
                            double dfTotalWeightLine = 0;
1584
                            // Left pixel
1585
0
                            {
1586
0
                                const int iX = nSrcXOff;
1587
0
                                const T val = pChunkShifted[iX];
1588
0
                                if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1589
0
                                {
1590
0
                                    nCount++;
1591
0
                                    const double dfWeightX =
1592
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1593
0
                                    dfTotalWeightLine = dfWeightX;
1594
0
                                    if (bQuadraticMean)
1595
0
                                        dfTotalLine =
1596
0
                                            SQUARE<double>(val) * dfWeightX;
1597
0
                                    else
1598
0
                                        dfTotalLine = val * dfWeightX;
1599
0
                                }
1600
0
                            }
1601
1602
0
                            if (nSrcXOff + 1 < nSrcXOff2)
1603
0
                            {
1604
                                // Middle pixels
1605
0
                                for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1606
0
                                     ++iX)
1607
0
                                {
1608
0
                                    const T val = pChunkShifted[iX];
1609
0
                                    if (pabyChunkNodataMask[iX +
1610
0
                                                            iY * nChunkXSize])
1611
0
                                    {
1612
0
                                        nCount++;
1613
0
                                        dfTotalWeightLine += 1;
1614
0
                                        if (bQuadraticMean)
1615
0
                                            dfTotalLine += SQUARE<double>(val);
1616
0
                                        else
1617
0
                                            dfTotalLine += val;
1618
0
                                    }
1619
0
                                }
1620
1621
                                // Right pixel
1622
0
                                {
1623
0
                                    const int iX = nSrcXOff2 - 1;
1624
0
                                    const T val = pChunkShifted[iX];
1625
0
                                    if (pabyChunkNodataMask[iX +
1626
0
                                                            iY * nChunkXSize])
1627
0
                                    {
1628
0
                                        nCount++;
1629
0
                                        const double dfWeightX =
1630
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1631
0
                                        dfTotalWeightLine += dfWeightX;
1632
0
                                        if (bQuadraticMean)
1633
0
                                            dfTotalLine +=
1634
0
                                                SQUARE<double>(val) * dfWeightX;
1635
0
                                        else
1636
0
                                            dfTotalLine += val * dfWeightX;
1637
0
                                    }
1638
0
                                }
1639
0
                            }
1640
1641
0
                            const double dfWeightY =
1642
0
                                (iY == nSrcYOff)        ? dfBottomWeight
1643
0
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
1644
0
                                                        : 1.0;
1645
0
                            dfTotal += dfTotalLine * dfWeightY;
1646
0
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
1647
0
                        }
1648
1649
0
                        if (nCount == 0 ||
1650
0
                            (bPropagateNoData &&
1651
0
                             nCount <
1652
0
                                 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1653
0
                                     (nSrcXOff2 - nSrcXOff)))
1654
0
                        {
1655
0
                            pDstScanline[iDstPixel] = tNoDataValue;
1656
0
                            continue;
1657
0
                        }
1658
0
                    }
1659
                    if constexpr (eWrkDataType == GDT_Byte)
1660
0
                    {
1661
0
                        T nVal;
1662
0
                        if (bQuadraticMean)
1663
0
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
1664
0
                                                             dfTotalWeight);
1665
0
                        else
1666
0
                            nVal =
1667
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1668
0
                        if (bHasNoData && nVal == tNoDataValue)
1669
0
                            nVal = tReplacementVal;
1670
0
                        pDstScanline[iDstPixel] = nVal;
1671
                    }
1672
                    else if constexpr (eWrkDataType == GDT_UInt16)
1673
0
                    {
1674
0
                        T nVal;
1675
0
                        if (bQuadraticMean)
1676
0
                            nVal = ComputeIntegerRMS<T, uint64_t>(
1677
0
                                dfTotal, dfTotalWeight);
1678
0
                        else
1679
0
                            nVal =
1680
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1681
0
                        if (bHasNoData && nVal == tNoDataValue)
1682
0
                            nVal = tReplacementVal;
1683
0
                        pDstScanline[iDstPixel] = nVal;
1684
                    }
1685
                    else
1686
0
                    {
1687
0
                        T nVal;
1688
0
                        if (bQuadraticMean)
1689
0
                            nVal =
1690
0
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1691
0
                        else
1692
0
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
1693
0
                        if (bHasNoData && nVal == tNoDataValue)
1694
0
                            nVal = tReplacementVal;
1695
0
                        pDstScanline[iDstPixel] = nVal;
1696
0
                    }
1697
0
                }
1698
0
            }
1699
0
        }
1700
0
        else
1701
0
        {
1702
0
            nSrcYOff -= nChunkYOff;
1703
0
            nSrcYOff2 -= nChunkYOff;
1704
1705
0
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1706
0
            {
1707
0
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1708
0
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1709
1710
0
                GPtrDiff_t nTotalR = 0;
1711
0
                GPtrDiff_t nTotalG = 0;
1712
0
                GPtrDiff_t nTotalB = 0;
1713
0
                GPtrDiff_t nCount = 0;
1714
1715
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1716
0
                {
1717
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1718
0
                    {
1719
0
                        const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1720
0
                                                      nChunkXSize];
1721
                        // cppcheck-suppress unsignedLessThanZero
1722
0
                        if (val < 0 || val >= colorEntries.size())
1723
0
                            continue;
1724
0
                        size_t idx = static_cast<size_t>(val);
1725
0
                        const auto &entry = colorEntries[idx];
1726
0
                        if (entry.c4)
1727
0
                        {
1728
0
                            if (bQuadraticMean)
1729
0
                            {
1730
0
                                nTotalR += SQUARE<int>(entry.c1);
1731
0
                                nTotalG += SQUARE<int>(entry.c2);
1732
0
                                nTotalB += SQUARE<int>(entry.c3);
1733
0
                                ++nCount;
1734
0
                            }
1735
0
                            else
1736
0
                            {
1737
0
                                nTotalR += entry.c1;
1738
0
                                nTotalG += entry.c2;
1739
0
                                nTotalB += entry.c3;
1740
0
                                ++nCount;
1741
0
                            }
1742
0
                        }
1743
0
                    }
1744
0
                }
1745
1746
0
                if (nCount == 0 ||
1747
0
                    (bPropagateNoData &&
1748
0
                     nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1749
0
                                  (nSrcXOff2 - nSrcXOff)))
1750
0
                {
1751
0
                    pDstScanline[iDstPixel] = tNoDataValue;
1752
0
                }
1753
0
                else
1754
0
                {
1755
0
                    GDALColorEntry color;
1756
0
                    if (bQuadraticMean)
1757
0
                    {
1758
0
                        color.c1 =
1759
0
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1760
0
                        color.c2 =
1761
0
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1762
0
                        color.c3 =
1763
0
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1764
0
                    }
1765
0
                    else
1766
0
                    {
1767
0
                        color.c1 =
1768
0
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
1769
0
                        color.c2 =
1770
0
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
1771
0
                        color.c3 =
1772
0
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
1773
0
                    }
1774
0
                    pDstScanline[iDstPixel] =
1775
0
                        static_cast<T>(BestColorEntry(colorEntries, color));
1776
0
                }
1777
0
            }
1778
0
        }
1779
0
    }
1780
1781
0
    CPLFree(pasSrcX);
1782
1783
0
    return CE_None;
1784
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, double, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, unsigned int, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void**)
1785
1786
/** Type dispatcher for the AVERAGE / RMS chunk resampler.
 *
 * Picks the GDALResampleChunk_AverageOrRMS_T instantiation that matches
 * args.eWrkDataType and forwards the call to it.
 *
 * @param args                Resampling arguments; eWrkDataType selects the
 *                            instantiation and pszResampling selects the
 *                            accumulator type for UInt16.
 * @param pChunk              Source chunk, reinterpreted as an array of the
 *                            working data type.
 * @param ppDstBuffer         Forwarded untouched to the typed implementation.
 * @param peDstBufferDataType Always set to args.eWrkDataType, even when the
 *                            type turns out to be unsupported.
 * @return The typed implementation's result, or CE_Failure (after a
 *         CPLAssert) when the working type is none of Byte, UInt16,
 *         Float32 or Float64.
 */
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    if (eType == GDT_Byte)
    {
        const GByte *pabySrc = static_cast<const GByte *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
            args, pabySrc, ppDstBuffer);
    }

    if (eType == GDT_UInt16)
    {
        const GUInt16 *panSrc = static_cast<const GUInt16 *>(pChunk);
        if (!EQUAL(args.pszResampling, "RMS"))
        {
            return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                    GDT_UInt16>(
                args, panSrc, ppDstBuffer);
        }
        // RMS sums squared samples: accumulate in double, because a UInt32
        // accumulator could overflow.
        return GDALResampleChunk_AverageOrRMS_T<GUInt16, double, GDT_UInt16>(
            args, panSrc, ppDstBuffer);
    }

    if (eType == GDT_Float32)
    {
        const float *pafSrc = static_cast<const float *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
            args, pafSrc, ppDstBuffer);
    }

    if (eType == GDT_Float64)
    {
        const double *padfSrc = static_cast<const double *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
            args, padfSrc, ppDstBuffer);
    }

    // No instantiation exists for any other working data type.
    CPLAssert(false);
    return CE_Failure;
}
1837
1838
/************************************************************************/
1839
/*                     GDALResampleChunk_Gauss()                        */
1840
/************************************************************************/
1841
1842
/** Resample one chunk of source pixels with a Gaussian-weighted kernel.
 *
 * The source chunk is read as an array of double. For every destination
 * pixel, a 3x3, 5x5 or 7x7 integer kernel (selected from the rounded
 * vertical downsampling factor) is centred on the corresponding source
 * window and clipped to the chunk extent. Without a color table the result
 * is the weighted mean of the samples whose nodata-mask byte is non-zero
 * (all samples when there is no mask). With a color table the R, G, B
 * components of the non-transparent entries are averaged and the closest
 * palette entry is written.
 *
 * @param args                Resampling arguments (ratios, chunk and
 *                            destination windows, nodata, color table).
 * @param pChunk              Source chunk, interpreted as const double *.
 * @param ppDstBuffer         Receives a buffer of
 *                            (nDstXOff2 - nDstXOff) * (nDstYOff2 - nDstYOff)
 *                            doubles allocated with VSI_MALLOC3_VERBOSE.
 *                            Releasing it is presumably the caller's job --
 *                            it is not freed here.
 * @param peDstBufferDataType Set to GDT_Float64 on success; left untouched
 *                            when the allocation fails.
 * @return CE_Failure if the destination buffer cannot be allocated,
 *         CE_None otherwise.
 */
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
                                      const void *pChunk, void **ppDstBuffer,
                                      GDALDataType *peDstBufferDataType)

{
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    const int nChunkXOff = args.nChunkXOff;
    const int nChunkXSize = args.nChunkXSize;
    const int nChunkYOff = args.nChunkYOff;
    const int nChunkYSize = args.nChunkYSize;
    const int nDstXOff = args.nDstXOff;
    const int nDstXOff2 = args.nDstXOff2;
    const int nDstYOff = args.nDstYOff;
    const int nDstYOff2 = args.nDstYOff2;
    const bool bHasNoData = args.bHasNoData;
    double dfNoDataValue = args.dfNoDataValue;
    const GDALColorTable *poColorTable = args.poColorTable;

    const double *const padfChunk = static_cast<const double *>(pChunk);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
                            GDALGetDataTypeSizeBytes(GDT_Float64));
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = GDT_Float64;
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);

    /* -------------------------------------------------------------------- */
    /*      Create the filter kernel and allocate scanline buffer.          */
    /* -------------------------------------------------------------------- */
    int nGaussMatrixDim = 3;
    const int *panGaussMatrix;
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
                                        16, 4, 1,  4,  6,  4, 1};
    constexpr int anGaussMatrix7x7[] = {
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};

    const int nOXSize = args.nOvrXSize;
    const int nOYSize = args.nOvrYSize;
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);

    // Kernel size grows with the (rounded) vertical downsampling factor.
    if (nResYFactor <= 2)
    {
        panGaussMatrix = anGaussMatrix3x3;
        nGaussMatrixDim = 3;
    }
    else if (nResYFactor <= 4)
    {
        panGaussMatrix = anGaussMatrix5x5;
        nGaussMatrixDim = 5;
    }
    else
    {
        panGaussMatrix = anGaussMatrix7x7;
        nGaussMatrixDim = 7;
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    // Work on a heap copy of the kernel, presumably so that memory-checking
    // tools can flag reads outside of it.
    int *panGaussMatrixDup = static_cast<int *>(
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    memcpy(panGaussMatrixDup, panGaussMatrix,
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    panGaussMatrix = panGaussMatrixDup;
#endif

    if (!bHasNoData)
        dfNoDataValue = 0.0;

    std::vector<GDALColorEntry> colorEntries;
    int nTransparentIdx = -1;
    if (poColorTable)
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);

    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    // it as nodata value.
    if (bHasNoData && dfNoDataValue >= 0.0 &&
        dfNoDataValue < colorEntries.size())
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;

    // Or if we have no explicit nodata, but a color table entry that is
    // transparent, consider it as the nodata value.
    else if (!bHasNoData && nTransparentIdx >= 0)
    {
        dfNoDataValue = nTransparentIdx;
    }

    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    const int nDstXWidth = nDstXOff2 - nDstXOff;

    /* ==================================================================== */
    /*      Loop over destination scanlines.                                */
    /* ==================================================================== */
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    {
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
        int nSrcYOff2 =
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;

        if (nSrcYOff < nChunkYOff)
        {
            nSrcYOff = nChunkYOff;
            nSrcYOff2++;
        }

        // Re-centre a window of exactly nGaussMatrixDim rows on the middle
        // of the nominal source window.
        const int iSizeY = nSrcYOff2 - nSrcYOff;
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;

        if (nSrcYOff2 > nChunkBottomYOff ||
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
        {
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
        }

        // When the window starts above the chunk, clip it and skip the
        // corresponding leading kernel rows.
        int nYShiftGaussMatrix = 0;
        if (nSrcYOff < nChunkYOff)
        {
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
            nSrcYOff = nChunkYOff;
        }

        // Widen before multiplying so that the row offset cannot overflow
        // int on very large chunks (same convention as the inner loops).
        const GPtrDiff_t nSrcLineOffset =
            static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
        const double *const padfSrcScanline = padfChunk + nSrcLineOffset;
        const GByte *pabySrcScanlineNodataMask = nullptr;
        if (pabyChunkNodataMask != nullptr)
            pabySrcScanlineNodataMask = pabyChunkNodataMask + nSrcLineOffset;

        /* ---------------------------------------------------------------- */
        /*      Loop over destination pixels                                */
        /* ---------------------------------------------------------------- */
        double *const padfDstScanline =
            padfDstBuffer +
            static_cast<GPtrDiff_t>(iDstLine - nDstYOff) * nDstXWidth;
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
        {
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
            int nSrcXOff2 =
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;

            if (nSrcXOff < nChunkXOff)
            {
                nSrcXOff = nChunkXOff;
                nSrcXOff2++;
            }

            // Same re-centring and clipping as for rows, applied to columns.
            const int iSizeX = nSrcXOff2 - nSrcXOff;
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;

            if (nSrcXOff2 > nChunkRightXOff ||
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
            {
                nSrcXOff2 =
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
            }

            int nXShiftGaussMatrix = 0;
            if (nSrcXOff < nChunkXOff)
            {
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
                nSrcXOff = nChunkXOff;
            }

            if (poColorTable == nullptr)
            {
                double dfTotal = 0.0;
                GInt64 nCount = 0;
                const int *panLineWeight =
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
                    nXShiftGaussMatrix;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const GPtrDiff_t nSrcIdx =
                            iX - nChunkXOff +
                            static_cast<GPtrDiff_t>(iY - nSrcYOff) *
                                nChunkXSize;
                        const double val = padfSrcScanline[nSrcIdx];
                        if (pabySrcScanlineNodataMask == nullptr ||
                            pabySrcScanlineNodataMask[nSrcIdx])
                        {
                            const int nWeight = panLineWeight[i];
                            dfTotal += val * nWeight;
                            nCount += nWeight;
                        }
                    }
                }

                if (nCount == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
                }
            }
            else
            {
                GInt64 nTotalR = 0;
                GInt64 nTotalG = 0;
                GInt64 nTotalB = 0;
                GInt64 nTotalWeight = 0;
                const int *panLineWeight =
                    panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
                    nXShiftGaussMatrix;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val =
                            padfSrcScanline[iX - nChunkXOff +
                                            static_cast<GPtrDiff_t>(iY -
                                                                    nSrcYOff) *
                                                nChunkXSize];
                        // Expressed as a positive range test so that a NaN
                        // sample is rejected too: converting NaN to size_t
                        // would be undefined behaviour.
                        if (!(val >= 0 && val < colorEntries.size()))
                            continue;

                        const size_t idx = static_cast<size_t>(val);
                        const auto &entry = colorEntries[idx];
                        if (entry.c4)
                        {
                            const int nWeight = panLineWeight[i];
                            nTotalR += static_cast<GInt64>(entry.c1) * nWeight;
                            nTotalG += static_cast<GInt64>(entry.c2) * nWeight;
                            nTotalB += static_cast<GInt64>(entry.c3) * nWeight;
                            nTotalWeight += nWeight;
                        }
                    }
                }

                if (nTotalWeight == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    GDALColorEntry color;

                    // Rounded weighted mean of each component.
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
                                                  nTotalWeight);
                    padfDstScanline[iDstPixel - nDstXOff] =
                        BestColorEntry(colorEntries, color);
                }
            }
        }
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    CPLFree(panGaussMatrixDup);
#endif

    return CE_None;
}
2127
2128
/************************************************************************/
2129
/*                      GDALResampleChunk_Mode()                        */
2130
/************************************************************************/
2131
2132
/** Equality predicate used by mode resampling to decide whether two
 *  samples fall into the same bucket.
 *
 *  Generic version (integer types): plain operator==.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    return a == b;
}

/** float flavour: two NaN values are considered identical, so that NaN
 *  samples are counted together instead of each being a distinct value.
 */
template <> bool IsSame<float>(float a, float b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** double flavour: same NaN-aware comparison as for float. */
template <> bool IsSame<double>(double a, double b)
{
    return a == b || (std::isnan(a) && std::isnan(b));
}

/** Complex float flavour: the real and imaginary parts are compared
 *  independently with the NaN-aware scalar comparison. This makes values
 *  with a single NaN component, such as (NaN, 1), compare equal to
 *  themselves, in addition to the all-NaN case.
 */
template <>
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
{
    return IsSame(a.real(), b.real()) && IsSame(a.imag(), b.imag());
}

/** Complex double flavour: component-wise NaN-aware comparison. */
template <>
bool IsSame<std::complex<double>>(std::complex<double> a,
                                  std::complex<double> b)
{
    return IsSame(a.real(), b.real()) && IsSame(a.imag(), b.imag());
}
2161
2162
template <class T>
2163
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2164
                                      const T *pChunk, T *const pDstBuffer)
2165
2166
0
{
2167
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2168
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2169
0
    const double dfSrcXDelta = args.dfSrcXDelta;
2170
0
    const double dfSrcYDelta = args.dfSrcYDelta;
2171
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2172
0
    const int nChunkXOff = args.nChunkXOff;
2173
0
    const int nChunkXSize = args.nChunkXSize;
2174
0
    const int nChunkYOff = args.nChunkYOff;
2175
0
    const int nChunkYSize = args.nChunkYSize;
2176
0
    const int nDstXOff = args.nDstXOff;
2177
0
    const int nDstXOff2 = args.nDstXOff2;
2178
0
    const int nDstYOff = args.nDstYOff;
2179
0
    const int nDstYOff2 = args.nDstYOff2;
2180
0
    const bool bHasNoData = args.bHasNoData;
2181
0
    const GDALColorTable *poColorTable = args.poColorTable;
2182
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
2183
2184
0
    T tNoDataValue;
2185
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2186
                  std::is_same<T, std::complex<double>>::value)
2187
0
    {
2188
0
        using BaseT = typename T::value_type;
2189
0
        tNoDataValue =
2190
0
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2191
0
                                std::numeric_limits<BaseT>::quiet_NaN());
2192
    }
2193
0
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2194
0
        tNoDataValue = 0;
2195
0
    else
2196
0
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
2197
2198
0
    size_t nMaxNumPx = 0;
2199
0
    T *paVals = nullptr;
2200
0
    int *panSums = nullptr;
2201
2202
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2203
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2204
0
    std::vector<int> anVals(256, 0);
2205
2206
    /* ==================================================================== */
2207
    /*      Loop over destination scanlines.                                */
2208
    /* ==================================================================== */
2209
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2210
0
    {
2211
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2212
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2213
#ifdef only_pixels_with_more_than_10_pct_participation
2214
        // When oversampling, don't take into account pixels that have a tiny
2215
        // participation in the resulting pixel
2216
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2217
            nSrcYOff < nChunkBottomYOff)
2218
            nSrcYOff++;
2219
#endif
2220
0
        if (nSrcYOff < nChunkYOff)
2221
0
            nSrcYOff = nChunkYOff;
2222
2223
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2224
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2225
#ifdef only_pixels_with_more_than_10_pct_participation
2226
        // When oversampling, don't take into account pixels that have a tiny
2227
        // participation in the resulting pixel
2228
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2229
            nSrcYOff2 > nChunkYOff)
2230
            nSrcYOff2--;
2231
#endif
2232
0
        if (nSrcYOff2 == nSrcYOff)
2233
0
            ++nSrcYOff2;
2234
0
        if (nSrcYOff2 > nChunkBottomYOff)
2235
0
            nSrcYOff2 = nChunkBottomYOff;
2236
2237
0
        const T *const paSrcScanline =
2238
0
            pChunk +
2239
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2240
0
        const GByte *pabySrcScanlineNodataMask = nullptr;
2241
0
        if (pabyChunkNodataMask != nullptr)
2242
0
            pabySrcScanlineNodataMask =
2243
0
                pabyChunkNodataMask +
2244
0
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2245
2246
0
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2247
        /* --------------------------------------------------------------------
2248
         */
2249
        /*      Loop over destination pixels */
2250
        /* --------------------------------------------------------------------
2251
         */
2252
0
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2253
0
        {
2254
0
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2255
            // Apply some epsilon to avoid numerical precision issues
2256
0
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2257
#ifdef only_pixels_with_more_than_10_pct_participation
2258
            // When oversampling, don't take into account pixels that have a
2259
            // tiny participation in the resulting pixel
2260
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2261
                nSrcXOff < nChunkRightXOff)
2262
                nSrcXOff++;
2263
#endif
2264
0
            if (nSrcXOff < nChunkXOff)
2265
0
                nSrcXOff = nChunkXOff;
2266
2267
0
            double dfSrcXOff2 =
2268
0
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2269
0
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2270
#ifdef only_pixels_with_more_than_10_pct_participation
2271
            // When oversampling, don't take into account pixels that have a
2272
            // tiny participation in the resulting pixel
2273
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2274
                nSrcXOff2 > nChunkXOff)
2275
                nSrcXOff2--;
2276
#endif
2277
0
            if (nSrcXOff2 == nSrcXOff)
2278
0
                nSrcXOff2++;
2279
0
            if (nSrcXOff2 > nChunkRightXOff)
2280
0
                nSrcXOff2 = nChunkRightXOff;
2281
2282
0
            bool bRegularProcessing = false;
2283
            if constexpr (!std::is_same<T, GByte>::value)
2284
0
                bRegularProcessing = true;
2285
0
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2286
0
                bRegularProcessing = true;
2287
2288
0
            if (bRegularProcessing)
2289
0
            {
2290
                // Not sure how much sense it makes to run a majority
2291
                // filter on floating point data, but here it is for the sake
2292
                // of compatibility. It won't look right on RGB images by the
2293
                // nature of the filter.
2294
2295
0
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2296
0
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2297
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2298
0
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2299
0
                        std::numeric_limits<size_t>::max() / sizeof(float))
2300
0
                {
2301
0
                    CPLError(CE_Failure, CPLE_NotSupported,
2302
0
                             "Too big downsampling factor");
2303
0
                    CPLFree(paVals);
2304
0
                    CPLFree(panSums);
2305
0
                    return CE_Failure;
2306
0
                }
2307
0
                const size_t nNumPx =
2308
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2309
0
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2310
0
                size_t iMaxInd = 0;
2311
0
                size_t iMaxVal = 0;
2312
0
                bool biMaxValdValid = false;
2313
2314
0
                if (paVals == nullptr || nNumPx > nMaxNumPx)
2315
0
                {
2316
0
                    T *paValsNew = static_cast<T *>(
2317
0
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2318
0
                    int *panSumsNew = static_cast<int *>(
2319
0
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2320
0
                    if (paValsNew != nullptr)
2321
0
                        paVals = paValsNew;
2322
0
                    if (panSumsNew != nullptr)
2323
0
                        panSums = panSumsNew;
2324
0
                    if (paValsNew == nullptr || panSumsNew == nullptr)
2325
0
                    {
2326
0
                        CPLFree(paVals);
2327
0
                        CPLFree(panSums);
2328
0
                        return CE_Failure;
2329
0
                    }
2330
0
                    nMaxNumPx = nNumPx;
2331
0
                }
2332
2333
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2334
0
                {
2335
0
                    const GPtrDiff_t iTotYOff =
2336
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2337
0
                        nChunkXOff;
2338
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2339
0
                    {
2340
0
                        if (pabySrcScanlineNodataMask == nullptr ||
2341
0
                            pabySrcScanlineNodataMask[iX + iTotYOff])
2342
0
                        {
2343
0
                            const T val = paSrcScanline[iX + iTotYOff];
2344
0
                            size_t i = 0;  // Used after for.
2345
2346
                            // Check array for existing entry.
2347
0
                            for (; i < iMaxInd; ++i)
2348
0
                                if (IsSame(paVals[i], val) &&
2349
0
                                    ++panSums[i] > panSums[iMaxVal])
2350
0
                                {
2351
0
                                    iMaxVal = i;
2352
0
                                    biMaxValdValid = true;
2353
0
                                    break;
2354
0
                                }
2355
2356
                            // Add to arr if entry not already there.
2357
0
                            if (i == iMaxInd)
2358
0
                            {
2359
0
                                paVals[iMaxInd] = val;
2360
0
                                panSums[iMaxInd] = 1;
2361
2362
0
                                if (!biMaxValdValid)
2363
0
                                {
2364
0
                                    iMaxVal = iMaxInd;
2365
0
                                    biMaxValdValid = true;
2366
0
                                }
2367
2368
0
                                ++iMaxInd;
2369
0
                            }
2370
0
                        }
2371
0
                    }
2372
0
                }
2373
2374
0
                if (!biMaxValdValid)
2375
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2376
0
                else
2377
0
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2378
0
            }
2379
            else if constexpr (std::is_same<T, GByte>::value)
2380
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2381
0
            {
2382
                // So we go here for a paletted or non-paletted byte band.
2383
                // The input values are then between 0 and 255.
2384
0
                int nMaxVal = 0;
2385
0
                int iMaxInd = -1;
2386
2387
                // The cost of this zeroing might be high. Perhaps we should
2388
                // just use the above generic case, and go to this one if the
2389
                // number of source pixels is large enough
2390
0
                std::fill(anVals.begin(), anVals.end(), 0);
2391
2392
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2393
0
                {
2394
0
                    const GPtrDiff_t iTotYOff =
2395
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2396
0
                        nChunkXOff;
2397
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2398
0
                    {
2399
0
                        const T val = paSrcScanline[iX + iTotYOff];
2400
0
                        if (!bHasNoData || val != tNoDataValue)
2401
0
                        {
2402
0
                            int nVal = static_cast<int>(val);
2403
0
                            if (++anVals[nVal] > nMaxVal)
2404
0
                            {
2405
                                // Sum the density.
2406
                                // Is it the most common value so far?
2407
0
                                iMaxInd = nVal;
2408
0
                                nMaxVal = anVals[nVal];
2409
0
                            }
2410
0
                        }
2411
0
                    }
2412
0
                }
2413
2414
0
                if (iMaxInd == -1)
2415
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2416
0
                else
2417
0
                    paDstScanline[iDstPixel - nDstXOff] =
2418
0
                        static_cast<T>(iMaxInd);
2419
0
            }
2420
0
        }
2421
0
    }
2422
2423
0
    CPLFree(paVals);
2424
0
    CPLFree(panSums);
2425
2426
0
    return CE_None;
2427
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<signed char>(GDALOverviewResampleArgs const&, signed char const*, signed char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<float>(GDALOverviewResampleArgs const&, float const*, float*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<double>(GDALOverviewResampleArgs const&, double const*, double*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<float> >(GDALOverviewResampleArgs const&, std::__1::complex<float> const*, std::__1::complex<float>*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>*)
2428
2429
/** Entry point of mode resampling for a chunk.
 *
 * Allocates the destination buffer for the working data type, stores it in
 * *ppDstBuffer (the buffer is left there in all cases once allocated),
 * reports the working data type in *peDstBufferDataType and dispatches to
 * the GDALResampleChunk_ModeT() instantiation matching that data type.
 *
 * @return CE_None on success, CE_Failure if the allocation fails, the data
 *         type is not handled, or the templated worker fails.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eWrkDT = args.eWrkDataType;

    void *const pAllocated = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(eWrkDT));
    *ppDstBuffer = pAllocated;
    if (pAllocated == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == eWrkDT);

    *peDstBufferDataType = eWrkDT;

    // Runs the templated worker, interpreting both buffers as arrays of the
    // type of the (otherwise unused) tag argument.
    const auto RunAs = [&args, pChunk, pAllocated](auto tTypeTag)
    {
        using WorkT = decltype(tTypeTag);
        return GDALResampleChunk_ModeT(args,
                                       static_cast<const WorkT *>(pChunk),
                                       static_cast<WorkT *>(pAllocated));
    };

    // For mode resampling, as no computation is done, only the
    // size of the data type matters... except for Byte where we have
    // special processing. And for floating point values
    // NOTE(review): signed integer types are processed through unsigned
    // types of the same size, which also drives how the worker converts
    // args.dfNoDataValue; confirm this is intended for negative nodata.
    switch (eWrkDT)
    {
        case GDT_Byte:
            return RunAs(GByte{});

        case GDT_Int8:
            return RunAs(int8_t{});

        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 2);
            return RunAs(uint16_t{});

        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 4);
            return RunAs(uint32_t{});

        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 4);
            return RunAs(float{});

        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 8);
            return RunAs(uint64_t{});

        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(eWrkDT) == 8);
            return RunAs(double{});

        case GDT_CFloat32:
            return RunAs(std::complex<float>{});

        case GDT_CFloat64:
            return RunAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    CPLAssert(false);
    return CE_Failure;
}
2532
2533
/************************************************************************/
2534
/*                  GDALResampleConvolutionHorizontal()                 */
2535
/************************************************************************/
2536
2537
/** Weighted sum of nSrcPixelCount consecutive samples of a row.
 *
 * Computes sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount),
 * using two independent partial sums in the unrolled part.
 *
 * @param pChunk          First sample of the row.
 * @param padfWeights     One weight per sample.
 * @param nSrcPixelCount  Number of samples/weights to combine.
 * @return the weighted sum (0 when nSrcPixelCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iPix = 0;  // Shared by both loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPix + 3 < nSrcPixelCount)
    {
        dfSumA += pChunk[iPix] * padfWeights[iPix];
        dfSumA += pChunk[iPix + 1] * padfWeights[iPix + 1];
        dfSumB += pChunk[iPix + 2] * padfWeights[iPix + 2];
        dfSumB += pChunk[iPix + 3] * padfWeights[iPix + 3];
        iPix += 4;
    }
#endif
    // Remaining samples (all of them when the unrolled loop is disabled).
    while (iPix < nSrcPixelCount)
    {
        dfSumA += pChunk[iPix] * padfWeights[iPix];
        ++iPix;
    }
    return dfSumA + dfSumB;
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<float>(float const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<double>(double const*, double const*, int)
2563
2564
template <class T>
2565
static inline void GDALResampleConvolutionHorizontalWithMask(
2566
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2567
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2568
0
{
2569
0
    dfVal = 0;
2570
0
    dfWeightSum = 0;
2571
0
    int i = 0;
2572
0
    for (; i + 3 < nSrcPixelCount; i += 4)
2573
0
    {
2574
0
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
2575
0
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2576
0
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2577
0
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2578
0
        dfVal += pChunk[i] * dfWeight0;
2579
0
        dfVal += pChunk[i + 1] * dfWeight1;
2580
0
        dfVal += pChunk[i + 2] * dfWeight2;
2581
0
        dfVal += pChunk[i + 3] * dfWeight3;
2582
0
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2583
0
    }
2584
0
    for (; i < nSrcPixelCount; ++i)
2585
0
    {
2586
0
        const double dfWeight = padfWeights[i] * pabyMask[i];
2587
0
        dfVal += pChunk[i] * dfWeight;
2588
0
        dfWeightSum += dfWeight;
2589
0
    }
2590
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<float>(float const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<double>(double const*, unsigned char const*, double const*, int, double&, double&)
2591
2592
/** Weighted sum of nSrcPixelCount consecutive samples, for 3 rows at once.
 *
 * The same weights are applied to each of the three rows.
 *
 * @param pChunkRow1      First sample of the first row.
 * @param pChunkRow2      First sample of the second row.
 * @param pChunkRow3      First sample of the third row.
 * @param padfWeights     One weight per sample.
 * @param nSrcPixelCount  Number of samples per row to combine.
 * @param[out] dfRes1     Weighted sum for the first row.
 * @param[out] dfRes2     Weighted sum for the second row.
 * @param[out] dfRes3     Weighted sum for the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // Two partial sums per row: "Lo" receives samples 0 and 1 of each group
    // of four (and the tail), "Hi" receives samples 2 and 3.
    double dfRow1Lo = 0.0;
    double dfRow1Hi = 0.0;
    double dfRow2Lo = 0.0;
    double dfRow2Hi = 0.0;
    double dfRow3Lo = 0.0;
    double dfRow3Hi = 0.0;

    // Accumulates the four samples of pRow starting at iStart.
    const auto AccumulateFour =
        [padfWeights](const T *pRow, int iStart, double &dfLo, double &dfHi)
    {
        dfLo += pRow[iStart] * padfWeights[iStart];
        dfLo += pRow[iStart + 1] * padfWeights[iStart + 1];
        dfHi += pRow[iStart + 2] * padfWeights[iStart + 2];
        dfHi += pRow[iStart + 3] * padfWeights[iStart + 3];
    };

    int iPix = 0;  // Shared by both loops.
    while (iPix + 3 < nSrcPixelCount)
    {
        AccumulateFour(pChunkRow1, iPix, dfRow1Lo, dfRow1Hi);
        AccumulateFour(pChunkRow2, iPix, dfRow2Lo, dfRow2Hi);
        AccumulateFour(pChunkRow3, iPix, dfRow3Lo, dfRow3Hi);
        iPix += 4;
    }
    // Tail: fewer than 4 samples left.
    while (iPix < nSrcPixelCount)
    {
        dfRow1Lo += pChunkRow1[iPix] * padfWeights[iPix];
        dfRow2Lo += pChunkRow2[iPix] * padfWeights[iPix];
        dfRow3Lo += pChunkRow3[iPix] * padfWeights[iPix];
        ++iPix;
    }

    dfRes1 = dfRow1Lo + dfRow1Hi;
    dfRes2 = dfRow2Lo + dfRow2Hi;
    dfRes3 = dfRow3Lo + dfRow3Hi;
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2630
2631
template <class T>
2632
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2633
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2634
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2635
    double &dfRes2, double &dfRes3)
2636
0
{
2637
0
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2638
0
                                            padfWeights, nSrcPixelCount, dfRes1,
2639
0
                                            dfRes2, dfRes3);
2640
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2641
2642
template <class T>
2643
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2644
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2645
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2646
0
{
2647
0
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2648
0
                                            padfWeights, 4, dfRes1, dfRes2,
2649
0
                                            dfRes3);
2650
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<float>(float const*, float const*, float const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<double>(double const*, double const*, double const*, double const*, double&, double&, double&)
2651
2652
/************************************************************************/
2653
/*                  GDALResampleConvolutionVertical()                   */
2654
/************************************************************************/
2655
2656
/** Weighted sum of nSrcLineCount samples of a column.
 *
 * Computes sum(pChunk[k * nStride] * padfWeights[k]) for k in
 * [0, nSrcLineCount).
 *
 * @param pChunk         Sample of the column on the first line.
 * @param nStride        Distance, in samples, between two consecutive lines.
 * @param padfWeights    One weight per line.
 * @param nSrcLineCount  Number of lines to combine.
 * @return the weighted sum (0 when nSrcLineCount <= 0).
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumA = 0.0;
    double dfSumB = 0.0;
    int iLine = 0;    // Index in padfWeights.
    int iSample = 0;  // Index in pChunk, always iLine * nStride.
    while (iLine + 3 < nSrcLineCount)
    {
        dfSumA += pChunk[iSample] * padfWeights[iLine];
        dfSumA += pChunk[iSample + nStride] * padfWeights[iLine + 1];
        dfSumB += pChunk[iSample + 2 * nStride] * padfWeights[iLine + 2];
        dfSumB += pChunk[iSample + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        iSample += 4 * nStride;
    }
    // Tail: fewer than 4 lines left.
    while (iLine < nSrcLineCount)
    {
        dfSumA += pChunk[iSample] * padfWeights[iLine];
        ++iLine;
        iSample += nStride;
    }
    return dfSumA + dfSumB;
}
2678
2679
/** Weighted sum of nSrcLineCount samples, for 2 adjacent columns at once.
 *
 * The columns start at pChunk[0] and pChunk[1]; the same weights are applied
 * to both.
 *
 * @param pChunk         Sample of the first column on the first line.
 * @param nStride        Distance, in samples, between two consecutive lines.
 * @param padfWeights    One weight per line.
 * @param nSrcLineCount  Number of lines to combine.
 * @param[out] dfRes1    Weighted sum for the first column.
 * @param[out] dfRes2    Weighted sum for the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // Two partial sums per column: "Lo" receives lines 0 and 1 of each group
    // of four (and the tail), "Hi" receives lines 2 and 3.
    double dfCol0Lo = 0.0;
    double dfCol0Hi = 0.0;
    double dfCol1Lo = 0.0;
    double dfCol1Hi = 0.0;
    int iLine = 0;    // Index in padfWeights.
    int iSample = 0;  // Index in pChunk, always iLine * nStride.
    while (iLine + 3 < nSrcLineCount)
    {
        dfCol0Lo += pChunk[iSample] * padfWeights[iLine];
        dfCol1Lo += pChunk[iSample + 1] * padfWeights[iLine];
        dfCol0Lo += pChunk[iSample + nStride] * padfWeights[iLine + 1];
        dfCol1Lo += pChunk[iSample + 1 + nStride] * padfWeights[iLine + 1];
        dfCol0Hi += pChunk[iSample + 2 * nStride] * padfWeights[iLine + 2];
        dfCol1Hi += pChunk[iSample + 1 + 2 * nStride] * padfWeights[iLine + 2];
        dfCol0Hi += pChunk[iSample + 3 * nStride] * padfWeights[iLine + 3];
        dfCol1Hi += pChunk[iSample + 1 + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        iSample += 4 * nStride;
    }
    // Tail: fewer than 4 lines left.
    while (iLine < nSrcLineCount)
    {
        dfCol0Lo += pChunk[iSample] * padfWeights[iLine];
        dfCol1Lo += pChunk[iSample + 1] * padfWeights[iLine];
        ++iLine;
        iSample += nStride;
    }
    dfRes1 = dfCol0Lo + dfCol0Hi;
    dfRes2 = dfCol1Lo + dfCol1Hi;
}
2709
2710
#ifdef USE_SSE2
2711
2712
#ifdef __AVX__
2713
/************************************************************************/
2714
/*             GDALResampleConvolutionVertical_16cols<T>                */
2715
/************************************************************************/
2716
2717
template <class T>
2718
static inline void
2719
GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2720
                                       const double *padfWeights,
2721
                                       int nSrcLineCount, float *afDest)
2722
{
2723
    int i = 0;
2724
    int j = 0;
2725
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2726
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2727
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2728
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2729
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2730
    {
2731
        XMMReg4Double w0 =
2732
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2733
        XMMReg4Double w1 =
2734
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2735
        XMMReg4Double w2 =
2736
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2737
        XMMReg4Double w3 =
2738
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2739
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2740
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2741
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2742
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2743
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2744
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2745
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2746
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2747
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2748
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2749
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2750
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2751
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2752
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2753
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2754
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2755
    }
2756
    for (; i < nSrcLineCount; ++i, j += nStride)
2757
    {
2758
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2759
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2760
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2761
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2762
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2763
    }
2764
    v_acc0.Store4Val(afDest);
2765
    v_acc1.Store4Val(afDest + 4);
2766
    v_acc2.Store4Val(afDest + 8);
2767
    v_acc3.Store4Val(afDest + 12);
2768
}
2769
2770
// Overload of GDALResampleConvolutionVertical_16cols() whose destination is a
// double* buffer. Every parameter is unnamed and unused: the body only
// asserts, because this variant is not expected to be called.
// NOTE(review): presumably kept so that call sites templated on the work
// type still compile when the work type is double -- confirm against callers.
template <class T>
2771
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2772
                                                          const double *, int,
2773
                                                          double *)
2774
{
2775
    // Cannot be reached
2776
    CPLAssert(false);
2777
}
2778
2779
#else
2780
2781
/************************************************************************/
2782
/*              GDALResampleConvolutionVertical_8cols<T>                */
2783
/************************************************************************/
2784
2785
// Vertical convolution of 8 adjacent columns.
// For each column k in [0, 8), accumulates
//   sum over i in [0, nSrcLineCount) of pChunk[k + i * nStride] * padfWeights[i]
// in two XMMReg4Double accumulators (columns 0-3 and 4-7) and stores the
// results to afDest and afDest + 4.
//
// pChunk        - first of the 8 columns, on the first source line.
// nStride       - offset, in elements of T, between two consecutive lines.
// padfWeights   - one weight per source line; entries [0, nSrcLineCount) are
//                 read.
// nSrcLineCount - number of source lines to combine.
// afDest        - output buffer, written through Store4Val() at offsets 0
//                 and 4.
template <class T>
2786
static inline void
2787
GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2788
                                      const double *padfWeights,
2789
                                      int nSrcLineCount, float *afDest)
2790
0
{
2791
0
    int i = 0;
2792
0
    int j = 0;
2793
0
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2794
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2795
0
    // Main loop: consume 4 source lines per iteration. i indexes the weights,
    // j is the element offset of the current line inside pChunk.
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2796
0
    {
2797
0
        // One weight per line.
        // NOTE(review): Load1ValHighAndLow() presumably replicates the value
        // in all lanes -- confirm in the XMMReg4Double definition.
        XMMReg4Double w0 =
2798
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2799
0
        XMMReg4Double w1 =
2800
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2801
0
        XMMReg4Double w2 =
2802
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2803
0
        XMMReg4Double w3 =
2804
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2805
0
        // v_acc0 handles columns 0-3 and v_acc1 columns 4-7 of each line.
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2806
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2807
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2808
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2809
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2810
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2811
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2812
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2813
0
    }
2814
0
    // Remainder: the last (nSrcLineCount % 4) lines, one at a time.
    for (; i < nSrcLineCount; ++i, j += nStride)
2815
0
    {
2816
0
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2817
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2818
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2819
0
    }
2820
0
    v_acc0.Store4Val(afDest);
2821
0
    v_acc1.Store4Val(afDest + 4);
2822
0
}
2823
2824
// Overload of GDALResampleConvolutionVertical_8cols() whose destination is a
// double* buffer. Every parameter is unnamed and unused: the body only
// asserts, because this variant is not expected to be called.
// NOTE(review): presumably kept so that call sites templated on the work
// type still compile when the work type is double -- confirm against callers.
template <class T>
2825
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2826
                                                         const double *, int,
2827
                                                         double *)
2828
{
2829
    // Cannot be reached
2830
    CPLAssert(false);
2831
}
2832
2833
#endif  // __AVX__
2834
2835
/************************************************************************/
2836
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2837
/************************************************************************/
2838
2839
// Weighted sum of one row of pixels.
// Returns the sum over i in [0, nSrcPixelCount) of
//   pChunk[i] * padfWeightsAligned[i].
//
// pChunk             - first source pixel of the row.
// padfWeightsAligned - one weight per pixel; read with Load4ValAligned() in
//                      the vector loop.
//                      NOTE(review): the name and the aligned load suggest
//                      the buffer must be suitably aligned -- confirm.
// nSrcPixelCount     - number of pixels/weights to combine.
template <class T>
2840
static inline double GDALResampleConvolutionHorizontalSSE2(
2841
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2842
0
{
2843
0
    // Two independent accumulators, one per group of 4 pixels.
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2844
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2845
0
    int i = 0;  // Used after for.
2846
0
    // Vector loop: 8 pixels per iteration.
    for (; i + 7 < nSrcPixelCount; i += 8)
2847
0
    {
2848
        // Retrieve 8 pixels and their weights, then accumulate.
2849
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2850
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2851
0
        const XMMReg4Double v_weight1 =
2852
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2853
0
        const XMMReg4Double v_weight2 =
2854
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2855
2856
0
        v_acc1 += v_pixels1 * v_weight1;
2857
0
        v_acc2 += v_pixels2 * v_weight2;
2858
0
    }
2859
2860
0
    // Merge both accumulators before reducing them to a scalar.
    v_acc1 += v_acc2;
2861
2862
0
    double dfVal = v_acc1.GetHorizSum();
2863
0
    // Scalar tail: the last (nSrcPixelCount % 8) pixels.
    for (; i < nSrcPixelCount; ++i)
2864
0
    {
2865
0
        dfVal += pChunk[i] * padfWeightsAligned[i];
2866
0
    }
2867
0
    return dfVal;
2868
0
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned char>(unsigned char const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned short>(unsigned short const*, double const*, int)
2869
2870
/************************************************************************/
2871
/*              GDALResampleConvolutionHorizontal<GByte>                */
2872
/************************************************************************/
2873
2874
// GByte specialization: forwards unchanged to
// GDALResampleConvolutionHorizontalSSE2() and returns its result.
template <>
2875
inline double GDALResampleConvolutionHorizontal<GByte>(
2876
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2877
0
{
2878
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2879
0
                                                 nSrcPixelCount);
2880
0
}
2881
2882
// GUInt16 specialization: forwards unchanged to
// GDALResampleConvolutionHorizontalSSE2() and returns its result.
template <>
2883
inline double GDALResampleConvolutionHorizontal<GUInt16>(
2884
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2885
0
{
2886
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2887
0
                                                 nSrcPixelCount);
2888
0
}
2889
2890
/************************************************************************/
2891
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2892
/************************************************************************/
2893
2894
// Weighted sum of one row of pixels, with each weight scaled by a per-pixel
// mask value.
// On return:
//   dfWeightSum = sum over i of padfWeightsAligned[i] * pabyMask[i]
//   dfVal       = sum over i of pChunk[i] * padfWeightsAligned[i] * pabyMask[i]
// for i in [0, nSrcPixelCount). Both outputs are overwritten, not added to.
//
// pChunk             - first source pixel of the row.
// pabyMask           - one mask byte per pixel, used as a multiplier.
//                      NOTE(review): presumably 0 for invalid and 1 for valid
//                      pixels -- confirm against the caller.
// padfWeightsAligned - one weight per pixel; read with Load4ValAligned() in
//                      the vector loop.
// nSrcPixelCount     - number of pixels/weights to combine.
// dfVal, dfWeightSum - outputs, see above.
template <class T>
2895
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2896
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2897
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2898
0
{
2899
0
    int i = 0;  // Used after for.
2900
0
    XMMReg4Double v_acc = XMMReg4Double::Zero();
2901
0
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2902
0
    // Vector loop: 4 pixels per iteration.
    for (; i + 3 < nSrcPixelCount; i += 4)
2903
0
    {
2904
0
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2905
0
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2906
0
        XMMReg4Double v_weight =
2907
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2908
0
        // Scale the weights by the mask before using them for both sums.
        v_weight *= v_mask;
2909
0
        v_acc += v_pixels * v_weight;
2910
0
        v_acc_weight += v_weight;
2911
0
    }
2912
2913
0
    dfVal = v_acc.GetHorizSum();
2914
0
    dfWeightSum = v_acc_weight.GetHorizSum();
2915
0
    // Scalar tail: the last (nSrcPixelCount % 4) pixels.
    for (; i < nSrcPixelCount; ++i)
2916
0
    {
2917
0
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2918
0
        dfVal += pChunk[i] * dfWeight;
2919
0
        dfWeightSum += dfWeight;
2920
0
    }
2921
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned char>(unsigned char const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned short>(unsigned short const*, unsigned char const*, double const*, int, double&, double&)
2922
2923
/************************************************************************/
2924
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2925
/************************************************************************/
2926
2927
// GByte specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalWithMaskSSE2(), which sets dfVal and
// dfWeightSum.
template <>
2928
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2929
    const GByte *pChunk, const GByte *pabyMask,
2930
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2931
    double &dfWeightSum)
2932
0
{
2933
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2934
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2935
0
        dfWeightSum);
2936
0
}
2937
2938
// GUInt16 specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalWithMaskSSE2(), which sets dfVal and
// dfWeightSum.
template <>
2939
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2940
    const GUInt16 *pChunk, const GByte *pabyMask,
2941
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2942
    double &dfWeightSum)
2943
0
{
2944
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2945
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2946
0
        dfWeightSum);
2947
0
}
2948
2949
/************************************************************************/
2950
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2951
/************************************************************************/
2952
2953
// Weighted sum of three rows of pixels that share the same weights.
// On return, for r in {1, 2, 3}:
//   dfRes<r> = sum over i in [0, nSrcPixelCount) of
//              pChunkRow<r>[i] * padfWeightsAligned[i]
// The outputs are overwritten, not added to.
//
// pChunkRow1..3      - first source pixel of each of the three rows.
// padfWeightsAligned - one weight per pixel; read with Load4ValAligned() in
//                      the vector loop and shared by all three rows.
// nSrcPixelCount     - number of pixels/weights to combine per row.
// dfRes1..3          - outputs, one per row.
template <class T>
2954
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2955
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2956
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2957
    double &dfRes2, double &dfRes3)
2958
0
{
2959
0
    // One accumulator per row.
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2960
0
                  v_acc2 = XMMReg4Double::Zero(),
2961
0
                  v_acc3 = XMMReg4Double::Zero();
2962
0
    int i = 0;
2963
0
    // Vector loop: 8 pixels per row and per iteration. The weights are
    // loaded once and reused for the three rows.
    for (; i + 7 < nSrcPixelCount; i += 8)
2964
0
    {
2965
        // Retrieve the pixel & accumulate.
2966
0
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2967
0
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2968
0
        const XMMReg4Double v_weight1 =
2969
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2970
0
        const XMMReg4Double v_weight2 =
2971
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2972
2973
0
        v_acc1 += v_pixels1 * v_weight1;
2974
0
        v_acc1 += v_pixels2 * v_weight2;
2975
2976
0
        // Second row: v_pixels1/v_pixels2 are reused as scratch registers.
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2977
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2978
0
        v_acc2 += v_pixels1 * v_weight1;
2979
0
        v_acc2 += v_pixels2 * v_weight2;
2980
2981
0
        // Third row.
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2982
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2983
0
        v_acc3 += v_pixels1 * v_weight1;
2984
0
        v_acc3 += v_pixels2 * v_weight2;
2985
0
    }
2986
2987
0
    dfRes1 = v_acc1.GetHorizSum();
2988
0
    dfRes2 = v_acc2.GetHorizSum();
2989
0
    dfRes3 = v_acc3.GetHorizSum();
2990
0
    // Scalar tail: the last (nSrcPixelCount % 8) pixels of each row.
    for (; i < nSrcPixelCount; ++i)
2991
0
    {
2992
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2993
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2994
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2995
0
    }
2996
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
2997
2998
/************************************************************************/
2999
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
3000
/************************************************************************/
3001
3002
// GByte specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontal_3rows_SSE2(), which sets dfRes1..3.
template <>
3003
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
3004
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3005
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3006
    double &dfRes2, double &dfRes3)
3007
0
{
3008
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3009
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3010
0
        dfRes1, dfRes2, dfRes3);
3011
0
}
3012
3013
// GUInt16 specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontal_3rows_SSE2(), which sets dfRes1..3.
template <>
3014
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3015
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3016
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3017
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3018
0
{
3019
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3020
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3021
0
        dfRes1, dfRes2, dfRes3);
3022
0
}
3023
3024
/************************************************************************/
3025
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
3026
/************************************************************************/
3027
3028
// Weighted sum of three rows of pixels that share the same weights, using a
// 4-pixel vector step instead of an 8-pixel one.
// On return, for r in {1, 2, 3}:
//   dfRes<r> = sum over i in [0, nSrcPixelCount) of
//              pChunkRow<r>[i] * padfWeightsAligned[i]
// The outputs are overwritten, not added to.
// NOTE(review): the name suggests this is meant for nSrcPixelCount < 8, but
// nothing in the body enforces it; the loops handle any count.
//
// pChunkRow1..3      - first source pixel of each of the three rows.
// padfWeightsAligned - one weight per pixel; read with Load4ValAligned() in
//                      the vector loop and shared by all three rows.
// nSrcPixelCount     - number of pixels/weights to combine per row.
// dfRes1..3          - outputs, one per row.
template <class T>
3029
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3030
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3031
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3032
    double &dfRes2, double &dfRes3)
3033
0
{
3034
0
    // One accumulator per row.
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3035
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3036
0
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3037
0
    int i = 0;  // Used after for.
3038
0
    // Vector loop: 4 pixels per row and per iteration.
    for (; i + 3 < nSrcPixelCount; i += 4)
3039
0
    {
3040
        // Retrieve the pixel & accumulate.
3041
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3042
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3043
0
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3044
0
        const XMMReg4Double v_weight =
3045
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3046
3047
0
        v_acc1 += v_pixels1 * v_weight;
3048
0
        v_acc2 += v_pixels2 * v_weight;
3049
0
        v_acc3 += v_pixels3 * v_weight;
3050
0
    }
3051
3052
0
    dfRes1 = v_acc1.GetHorizSum();
3053
0
    dfRes2 = v_acc2.GetHorizSum();
3054
0
    dfRes3 = v_acc3.GetHorizSum();
3055
3056
0
    // Scalar tail: the last (nSrcPixelCount % 4) pixels of each row.
    for (; i < nSrcPixelCount; ++i)
3057
0
    {
3058
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3059
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3060
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3061
0
    }
3062
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3063
3064
/************************************************************************/
3065
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3066
/************************************************************************/
3067
3068
// GByte specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(), which sets
// dfRes1..3.
template <>
3069
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3070
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3071
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3072
    double &dfRes2, double &dfRes3)
3073
0
{
3074
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3075
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3076
0
        dfRes1, dfRes2, dfRes3);
3077
0
}
3078
3079
// GUInt16 specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(), which sets
// dfRes1..3.
template <>
3080
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3081
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3082
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3083
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3084
0
{
3085
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3086
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3087
0
        dfRes1, dfRes2, dfRes3);
3088
0
}
3089
3090
/************************************************************************/
3091
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3092
/************************************************************************/
3093
3094
// Weighted sum of three rows for a fixed window of 4 pixels.
// There is no pixel-count parameter and no loop: a single Load4Val() is done
// on each row and a single Load4ValAligned() on the weights.
// On return, for r in {1, 2, 3}:
//   dfRes<r> = sum over i in [0, 4) of pChunkRow<r>[i] * padfWeightsAligned[i]
//
// pChunkRow1..3      - first of the 4 source pixels of each row.
// padfWeightsAligned - the 4 weights, shared by all three rows.
// dfRes1..3          - outputs, one per row (overwritten).
template <class T>
3095
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3096
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3097
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3098
    double &dfRes3)
3099
0
{
3100
0
    const XMMReg4Double v_weight =
3101
0
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3102
3103
    // Retrieve the 4 pixels of each row.
3104
0
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3105
0
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3106
0
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3107
3108
0
    // Per-lane products; no accumulation across iterations is needed.
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3109
0
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3110
0
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3111
3112
0
    dfRes1 = v_acc1.GetHorizSum();
3113
0
    dfRes2 = v_acc2.GetHorizSum();
3114
0
    dfRes3 = v_acc3.GetHorizSum();
3115
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, double&, double&, double&)
3116
3117
/************************************************************************/
3118
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3119
/************************************************************************/
3120
3121
// GByte specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which sets
// dfRes1..3.
template <>
3122
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3123
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3124
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3125
    double &dfRes3)
3126
0
{
3127
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3128
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3129
0
        dfRes3);
3130
0
}
3131
3132
// GUInt16 specialization: forwards all arguments unchanged to
// GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(), which sets
// dfRes1..3.
template <>
3133
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3134
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3135
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3136
    double &dfRes2, double &dfRes3)
3137
0
{
3138
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3139
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3140
0
        dfRes3);
3141
0
}
3142
3143
#endif  // USE_SSE2
3144
3145
/************************************************************************/
3146
/*                    GDALResampleChunk_Convolution()                   */
3147
/************************************************************************/
3148
3149
template <class T, class Twork, GDALDataType eWrkDataType>
3150
static CPLErr GDALResampleChunk_ConvolutionT(
3151
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3152
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3153
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3154
3155
0
{
3156
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3157
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3158
0
    const double dfSrcXDelta = args.dfSrcXDelta;
3159
0
    const double dfSrcYDelta = args.dfSrcYDelta;
3160
0
    constexpr int nBands = 1;
3161
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3162
0
    const int nChunkXOff = args.nChunkXOff;
3163
0
    const int nChunkXSize = args.nChunkXSize;
3164
0
    const int nChunkYOff = args.nChunkYOff;
3165
0
    const int nChunkYSize = args.nChunkYSize;
3166
0
    const int nDstXOff = args.nDstXOff;
3167
0
    const int nDstXOff2 = args.nDstXOff2;
3168
0
    const int nDstYOff = args.nDstYOff;
3169
0
    const int nDstYOff2 = args.nDstYOff2;
3170
0
    const bool bHasNoData = args.bHasNoData;
3171
0
    double dfNoDataValue = args.dfNoDataValue;
3172
3173
0
    if (!bHasNoData)
3174
0
        dfNoDataValue = 0.0;
3175
0
    const auto dstDataType = args.eOvrDataType;
3176
0
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3177
0
    const double dfReplacementVal =
3178
0
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3179
0
                   : dfNoDataValue;
3180
    // cppcheck-suppress unreadVariable
3181
0
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3182
0
    const bool bNoDataValueInt64Valid =
3183
0
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3184
0
    const auto nNodataValueInt64 =
3185
0
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3186
0
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3187
3188
    // TODO: we should have some generic function to do this.
3189
0
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3190
0
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3191
0
    if (dstDataType == GDT_Byte)
3192
0
    {
3193
0
        fDstMin = std::numeric_limits<GByte>::min();
3194
0
        fDstMax = std::numeric_limits<GByte>::max();
3195
0
    }
3196
0
    else if (dstDataType == GDT_Int8)
3197
0
    {
3198
0
        fDstMin = std::numeric_limits<GInt8>::min();
3199
0
        fDstMax = std::numeric_limits<GInt8>::max();
3200
0
    }
3201
0
    else if (dstDataType == GDT_UInt16)
3202
0
    {
3203
0
        fDstMin = std::numeric_limits<GUInt16>::min();
3204
0
        fDstMax = std::numeric_limits<GUInt16>::max();
3205
0
    }
3206
0
    else if (dstDataType == GDT_Int16)
3207
0
    {
3208
0
        fDstMin = std::numeric_limits<GInt16>::min();
3209
0
        fDstMax = std::numeric_limits<GInt16>::max();
3210
0
    }
3211
0
    else if (dstDataType == GDT_UInt32)
3212
0
    {
3213
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3214
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3215
0
    }
3216
0
    else if (dstDataType == GDT_Int32)
3217
0
    {
3218
        // cppcheck-suppress unreadVariable
3219
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3220
        // cppcheck-suppress unreadVariable
3221
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3222
0
    }
3223
0
    else if (dstDataType == GDT_UInt64)
3224
0
    {
3225
        // cppcheck-suppress unreadVariable
3226
0
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3227
        // cppcheck-suppress unreadVariable
3228
        // (1 << 64) - 2048: largest uint64 value a double can hold
3229
0
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
3230
0
    }
3231
0
    else if (dstDataType == GDT_Int64)
3232
0
    {
3233
        // cppcheck-suppress unreadVariable
3234
0
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3235
        // cppcheck-suppress unreadVariable
3236
        // (1 << 63) - 1024: largest int64 that a double can hold
3237
0
        fDstMax = static_cast<Twork>(9223372036854774784LL);
3238
0
    }
3239
3240
0
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3241
0
                               bNoDataValueInt64Valid, nNodataValueInt64,
3242
0
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3243
0
    {
3244
0
        if (!bHasNoData)
3245
0
            return fVal;
3246
3247
        // Clamp value before comparing to nodata: this is only needed for
3248
        // kernels with negative weights (Lanczos)
3249
0
        Twork fClamped = fVal;
3250
0
        if (fClamped < fDstMin)
3251
0
            fClamped = fDstMin;
3252
0
        else if (fClamped > fDstMax)
3253
0
            fClamped = fDstMax;
3254
0
        if (isIntegerDT)
3255
0
        {
3256
0
            if (bNoDataValueInt64Valid)
3257
0
            {
3258
0
                const double fClampedRounded = std::round(fClamped);
3259
0
                if (fClampedRounded >= fDstMin && fClampedRounded <= fDstMax &&
3260
0
                    nNodataValueInt64 ==
3261
0
                        static_cast<GInt64>(std::round(fClamped)))
3262
0
                {
3263
                    // Do not use the nodata value
3264
0
                    return static_cast<Twork>(dfReplacementVal);
3265
0
                }
3266
0
            }
3267
0
        }
3268
0
        else if (dfNoDataValue == fClamped)
3269
0
        {
3270
            // Do not use the nodata value
3271
0
            return static_cast<Twork>(dfReplacementVal);
3272
0
        }
3273
0
        return fClamped;
3274
0
    };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(double)#1}::operator()(double) const
3275
3276
    /* -------------------------------------------------------------------- */
3277
    /*      Allocate work buffers.                                          */
3278
    /* -------------------------------------------------------------------- */
3279
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
3280
0
    Twork *pafWrkScanline = nullptr;
3281
0
    if (dstDataType != eWrkDataType)
3282
0
    {
3283
0
        pafWrkScanline =
3284
0
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3285
0
        if (pafWrkScanline == nullptr)
3286
0
            return CE_Failure;
3287
0
    }
3288
3289
0
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3290
0
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3291
0
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3292
0
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3293
0
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3294
0
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3295
3296
    // Temporary array to store result of horizontal filter.
3297
0
    double *padfHorizontalFiltered = static_cast<double *>(
3298
0
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3299
3300
    // To store convolution coefficients.
3301
0
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3302
0
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3303
0
                         0.5) *
3304
0
        sizeof(double)));
3305
3306
0
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3307
0
    if (pabyChunkNodataMask)
3308
0
        pabyChunkNodataMaskHorizontalFiltered =
3309
0
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3310
0
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3311
0
        (pabyChunkNodataMask != nullptr &&
3312
0
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3313
0
    {
3314
0
        VSIFree(pafWrkScanline);
3315
0
        VSIFree(padfHorizontalFiltered);
3316
0
        VSIFreeAligned(padfWeights);
3317
0
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3318
0
        return CE_Failure;
3319
0
    }
3320
3321
    /* ==================================================================== */
3322
    /*      First pass: horizontal filter                                   */
3323
    /* ==================================================================== */
3324
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3325
0
#ifdef USE_SSE2
3326
0
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3327
0
#endif
3328
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3329
0
    {
3330
0
        const double dfSrcPixel =
3331
0
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3332
0
        int nSrcPixelStart =
3333
0
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3334
0
        if (nSrcPixelStart < nChunkXOff)
3335
0
            nSrcPixelStart = nChunkXOff;
3336
0
        int nSrcPixelStop =
3337
0
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3338
0
        if (nSrcPixelStop > nChunkRightXOff)
3339
0
            nSrcPixelStop = nChunkRightXOff;
3340
#if 0
3341
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3342
        {
3343
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3344
        }
3345
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3346
        {
3347
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3348
        }
3349
#endif
3350
0
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3351
0
        double dfWeightSum = 0.0;
3352
3353
        // Compute convolution coefficients.
3354
0
        int nSrcPixel = nSrcPixelStart;
3355
0
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3356
0
        for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3357
0
        {
3358
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3359
0
            dfX += dfXScaleWeight;
3360
0
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3361
0
            dfX += dfXScaleWeight;
3362
0
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3363
0
            dfX += dfXScaleWeight;
3364
0
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3365
0
            dfX += dfXScaleWeight;
3366
0
            dfWeightSum +=
3367
0
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3368
0
        }
3369
0
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3370
0
        {
3371
0
            const double dfWeight = pfnFilterFunc(dfX);
3372
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3373
0
            dfWeightSum += dfWeight;
3374
0
        }
3375
3376
0
        const int nHeight = nChunkYSize * nBands;
3377
0
        if (pabyChunkNodataMask == nullptr)
3378
0
        {
3379
0
            if (dfWeightSum != 0)
3380
0
            {
3381
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3382
0
                for (int i = 0; i < nSrcPixelCount; ++i)
3383
0
                    padfWeights[i] *= dfInvWeightSum;
3384
0
            }
3385
0
            int iSrcLineOff = 0;
3386
0
#ifdef USE_SSE2
3387
0
            if (nSrcPixelCount == 4)
3388
0
            {
3389
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3390
0
                {
3391
0
                    const GPtrDiff_t j =
3392
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3393
0
                        (nSrcPixelStart - nChunkXOff);
3394
0
                    double dfVal1 = 0.0;
3395
0
                    double dfVal2 = 0.0;
3396
0
                    double dfVal3 = 0.0;
3397
0
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
3398
0
                        pChunk + j, pChunk + j + nChunkXSize,
3399
0
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3400
0
                        dfVal2, dfVal3);
3401
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3402
0
                                               nDstXSize +
3403
0
                                           iDstPixel - nDstXOff] = dfVal1;
3404
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3405
0
                                            1) *
3406
0
                                               nDstXSize +
3407
0
                                           iDstPixel - nDstXOff] = dfVal2;
3408
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3409
0
                                            2) *
3410
0
                                               nDstXSize +
3411
0
                                           iDstPixel - nDstXOff] = dfVal3;
3412
0
                }
3413
0
            }
3414
0
            else if (bSrcPixelCountLess8)
3415
0
            {
3416
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3417
0
                {
3418
0
                    const GPtrDiff_t j =
3419
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3420
0
                        (nSrcPixelStart - nChunkXOff);
3421
0
                    double dfVal1 = 0.0;
3422
0
                    double dfVal2 = 0.0;
3423
0
                    double dfVal3 = 0.0;
3424
0
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3425
0
                        pChunk + j, pChunk + j + nChunkXSize,
3426
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3427
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3428
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3429
0
                                               nDstXSize +
3430
0
                                           iDstPixel - nDstXOff] = dfVal1;
3431
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3432
0
                                            1) *
3433
0
                                               nDstXSize +
3434
0
                                           iDstPixel - nDstXOff] = dfVal2;
3435
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3436
0
                                            2) *
3437
0
                                               nDstXSize +
3438
0
                                           iDstPixel - nDstXOff] = dfVal3;
3439
0
                }
3440
0
            }
3441
0
            else
3442
0
#endif
3443
0
            {
3444
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3445
0
                {
3446
0
                    const GPtrDiff_t j =
3447
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3448
0
                        (nSrcPixelStart - nChunkXOff);
3449
0
                    double dfVal1 = 0.0;
3450
0
                    double dfVal2 = 0.0;
3451
0
                    double dfVal3 = 0.0;
3452
0
                    GDALResampleConvolutionHorizontal_3rows(
3453
0
                        pChunk + j, pChunk + j + nChunkXSize,
3454
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3455
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3456
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3457
0
                                               nDstXSize +
3458
0
                                           iDstPixel - nDstXOff] = dfVal1;
3459
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3460
0
                                            1) *
3461
0
                                               nDstXSize +
3462
0
                                           iDstPixel - nDstXOff] = dfVal2;
3463
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3464
0
                                            2) *
3465
0
                                               nDstXSize +
3466
0
                                           iDstPixel - nDstXOff] = dfVal3;
3467
0
                }
3468
0
            }
3469
0
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3470
0
            {
3471
0
                const GPtrDiff_t j =
3472
0
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3473
0
                    (nSrcPixelStart - nChunkXOff);
3474
0
                const double dfVal = GDALResampleConvolutionHorizontal(
3475
0
                    pChunk + j, padfWeights, nSrcPixelCount);
3476
0
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3477
0
                                           nDstXSize +
3478
0
                                       iDstPixel - nDstXOff] = dfVal;
3479
0
            }
3480
0
        }
3481
0
        else
3482
0
        {
3483
0
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3484
0
            {
3485
0
                const GPtrDiff_t j =
3486
0
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3487
0
                    (nSrcPixelStart - nChunkXOff);
3488
3489
0
                if (bKernelWithNegativeWeights)
3490
0
                {
3491
0
                    int nConsecutiveValid = 0;
3492
0
                    int nMaxConsecutiveValid = 0;
3493
0
                    for (int k = 0; k < nSrcPixelCount; k++)
3494
0
                    {
3495
0
                        if (pabyChunkNodataMask[j + k])
3496
0
                            nConsecutiveValid++;
3497
0
                        else if (nConsecutiveValid)
3498
0
                        {
3499
0
                            nMaxConsecutiveValid = std::max(
3500
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3501
0
                            nConsecutiveValid = 0;
3502
0
                        }
3503
0
                    }
3504
0
                    nMaxConsecutiveValid =
3505
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3506
0
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3507
0
                    {
3508
0
                        const size_t nTempOffset =
3509
0
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
3510
0
                            iDstPixel - nDstXOff;
3511
0
                        padfHorizontalFiltered[nTempOffset] = 0.0;
3512
0
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3513
0
                        continue;
3514
0
                    }
3515
0
                }
3516
3517
0
                double dfVal = 0.0;
3518
0
                GDALResampleConvolutionHorizontalWithMask(
3519
0
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
3520
0
                    nSrcPixelCount, dfVal, dfWeightSum);
3521
0
                const size_t nTempOffset =
3522
0
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3523
0
                    nDstXOff;
3524
0
                if (dfWeightSum > 0.0)
3525
0
                {
3526
0
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3527
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3528
0
                }
3529
0
                else
3530
0
                {
3531
0
                    padfHorizontalFiltered[nTempOffset] = 0.0;
3532
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3533
0
                }
3534
0
            }
3535
0
        }
3536
0
    }
3537
3538
    /* ==================================================================== */
3539
    /*      Second pass: vertical filter                                    */
3540
    /* ==================================================================== */
3541
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3542
3543
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3544
0
    {
3545
0
        Twork *const pafDstScanline =
3546
0
            pafWrkScanline ? pafWrkScanline
3547
0
                           : static_cast<Twork *>(pDstBuffer) +
3548
0
                                 (iDstLine - nDstYOff) * nDstXSize;
3549
3550
0
        const double dfSrcLine =
3551
0
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3552
0
        int nSrcLineStart =
3553
0
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3554
0
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3555
0
        if (nSrcLineStart < nChunkYOff)
3556
0
            nSrcLineStart = nChunkYOff;
3557
0
        if (nSrcLineStop > nChunkBottomYOff)
3558
0
            nSrcLineStop = nChunkBottomYOff;
3559
#if 0
3560
        if( nSrcLineStart < nChunkYOff &&
3561
            nChunkYOff > 0 )
3562
        {
3563
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3564
        }
3565
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3566
        {
3567
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3568
        }
3569
#endif
3570
0
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3571
0
        double dfWeightSum = 0.0;
3572
3573
        // Compute convolution coefficients.
3574
0
        int nSrcLine = nSrcLineStart;  // Used after for.
3575
0
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3576
0
        for (; nSrcLine + 3 < nSrcLineStop;
3577
0
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3578
0
        {
3579
0
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
3580
0
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3581
0
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
3582
0
                dfY + 2 * dfYScaleWeight;
3583
0
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
3584
0
                dfY + 3 * dfYScaleWeight;
3585
0
            dfWeightSum +=
3586
0
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3587
0
        }
3588
0
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3589
0
        {
3590
0
            const double dfWeight = pfnFilterFunc(dfY);
3591
0
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3592
0
            dfWeightSum += dfWeight;
3593
0
        }
3594
3595
0
        if (pabyChunkNodataMask == nullptr)
3596
0
        {
3597
0
            if (dfWeightSum != 0)
3598
0
            {
3599
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3600
0
                for (int i = 0; i < nSrcLineCount; ++i)
3601
0
                    padfWeights[i] *= dfInvWeightSum;
3602
0
            }
3603
0
        }
3604
3605
0
        if (pabyChunkNodataMask == nullptr)
3606
0
        {
3607
0
            int iFilteredPixelOff = 0;  // Used after for.
3608
            // j used after for.
3609
0
            size_t j =
3610
0
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3611
0
#ifdef USE_SSE2
3612
            if constexpr (eWrkDataType == GDT_Float32)
3613
0
            {
3614
#ifdef __AVX__
3615
                for (; iFilteredPixelOff + 15 < nDstXSize;
3616
                     iFilteredPixelOff += 16, j += 16)
3617
                {
3618
                    GDALResampleConvolutionVertical_16cols(
3619
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3620
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3621
                    if (bHasNoData)
3622
                    {
3623
                        for (int k = 0; k < 16; k++)
3624
                        {
3625
                            pafDstScanline[iFilteredPixelOff + k] =
3626
                                replaceValIfNodata(
3627
                                    pafDstScanline[iFilteredPixelOff + k]);
3628
                        }
3629
                    }
3630
                }
3631
#else
3632
0
                for (; iFilteredPixelOff + 7 < nDstXSize;
3633
0
                     iFilteredPixelOff += 8, j += 8)
3634
0
                {
3635
0
                    GDALResampleConvolutionVertical_8cols(
3636
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3637
0
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3638
0
                    if (bHasNoData)
3639
0
                    {
3640
0
                        for (int k = 0; k < 8; k++)
3641
0
                        {
3642
0
                            pafDstScanline[iFilteredPixelOff + k] =
3643
0
                                replaceValIfNodata(
3644
0
                                    pafDstScanline[iFilteredPixelOff + k]);
3645
0
                        }
3646
0
                    }
3647
0
                }
3648
0
#endif
3649
3650
0
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3651
0
                {
3652
0
                    const Twork fVal =
3653
0
                        static_cast<Twork>(GDALResampleConvolutionVertical(
3654
0
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
3655
0
                            nSrcLineCount));
3656
0
                    pafDstScanline[iFilteredPixelOff] =
3657
0
                        replaceValIfNodata(fVal);
3658
0
                }
3659
            }
3660
            else
3661
#endif
3662
0
            {
3663
0
                for (; iFilteredPixelOff + 1 < nDstXSize;
3664
0
                     iFilteredPixelOff += 2, j += 2)
3665
0
                {
3666
0
                    double dfVal1 = 0.0;
3667
0
                    double dfVal2 = 0.0;
3668
0
                    GDALResampleConvolutionVertical_2cols(
3669
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3670
0
                        nSrcLineCount, dfVal1, dfVal2);
3671
0
                    pafDstScanline[iFilteredPixelOff] =
3672
0
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
3673
0
                    pafDstScanline[iFilteredPixelOff + 1] =
3674
0
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
3675
0
                }
3676
0
                if (iFilteredPixelOff < nDstXSize)
3677
0
                {
3678
0
                    const double dfVal = GDALResampleConvolutionVertical(
3679
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3680
0
                        nSrcLineCount);
3681
0
                    pafDstScanline[iFilteredPixelOff] =
3682
0
                        replaceValIfNodata(static_cast<Twork>(dfVal));
3683
0
                }
3684
0
            }
3685
0
        }
3686
0
        else
3687
0
        {
3688
0
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3689
0
                 ++iFilteredPixelOff)
3690
0
            {
3691
0
                double dfVal = 0.0;
3692
0
                dfWeightSum = 0.0;
3693
0
                size_t j = (nSrcLineStart - nChunkYOff) *
3694
0
                               static_cast<size_t>(nDstXSize) +
3695
0
                           iFilteredPixelOff;
3696
0
                if (bKernelWithNegativeWeights)
3697
0
                {
3698
0
                    int nConsecutiveValid = 0;
3699
0
                    int nMaxConsecutiveValid = 0;
3700
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3701
0
                    {
3702
0
                        const double dfWeight =
3703
0
                            padfWeights[i] *
3704
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3705
0
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
3706
0
                        {
3707
0
                            nConsecutiveValid++;
3708
0
                        }
3709
0
                        else if (nConsecutiveValid)
3710
0
                        {
3711
0
                            nMaxConsecutiveValid = std::max(
3712
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3713
0
                            nConsecutiveValid = 0;
3714
0
                        }
3715
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3716
0
                        dfWeightSum += dfWeight;
3717
0
                    }
3718
0
                    nMaxConsecutiveValid =
3719
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3720
0
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
3721
0
                    {
3722
0
                        pafDstScanline[iFilteredPixelOff] =
3723
0
                            static_cast<Twork>(dfNoDataValue);
3724
0
                        continue;
3725
0
                    }
3726
0
                }
3727
0
                else
3728
0
                {
3729
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3730
0
                    {
3731
0
                        const double dfWeight =
3732
0
                            padfWeights[i] *
3733
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3734
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3735
0
                        dfWeightSum += dfWeight;
3736
0
                    }
3737
0
                }
3738
0
                if (dfWeightSum > 0.0)
3739
0
                {
3740
0
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3741
0
                        static_cast<Twork>(dfVal / dfWeightSum));
3742
0
                }
3743
0
                else
3744
0
                {
3745
0
                    pafDstScanline[iFilteredPixelOff] =
3746
0
                        static_cast<Twork>(dfNoDataValue);
3747
0
                }
3748
0
            }
3749
0
        }
3750
3751
0
        if (fMaxVal != 0.0f)
3752
0
        {
3753
0
            for (int i = 0; i < nDstXSize; ++i)
3754
0
            {
3755
0
                if (pafDstScanline[i] > fMaxVal)
3756
0
                    pafDstScanline[i] = fMaxVal;
3757
0
            }
3758
0
        }
3759
3760
0
        if (pafWrkScanline)
3761
0
        {
3762
0
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3763
0
                            static_cast<GByte *>(pDstBuffer) +
3764
0
                                static_cast<size_t>(iDstLine - nDstYOff) *
3765
0
                                    nDstXSize * nDstDataTypeSize,
3766
0
                            dstDataType, nDstDataTypeSize, nDstXSize);
3767
0
        }
3768
0
    }
3769
3770
0
    VSIFree(pafWrkScanline);
3771
0
    VSIFreeAligned(padfWeights);
3772
0
    VSIFree(padfHorizontalFiltered);
3773
0
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3774
3775
0
    return CE_None;
3776
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)
3777
3778
/************************************************************************/
/*                   GDALResampleChunk_Convolution()                    */
/************************************************************************/

// Front-end for the convolution-based resampling of one chunk.
//
// Steps performed here:
//  - maps args.pszResampling ("BILINEAR", "CUBIC", "CUBICSPLINE" or
//    "LANCZOS", compared with EQUAL()) to a GDALResampleAlg value, and
//    records whether the kernel is flagged as having negative weights
//    (set for CUBIC and LANCZOS only);
//  - fetches the kernel radius and the two filter callbacks from the
//    GWKGetFilter*() helpers;
//  - computes fMaxVal, the optional upper clamp passed to the worker
//    (left at 0 when the conditions below are not met);
//  - allocates *ppDstBuffer and sets *peDstBufferDataType to
//    args.eOvrDataType;
//  - forwards to the GDALResampleChunk_ConvolutionT<> instantiation that
//    matches args.eWrkDataType.
//
// Parameters:
//  args                 resampling arguments; the fields read here are
//                       pszResampling, nOvrNBITS, eOvrDataType,
//                       eWrkDataType, nDstXOff, nDstXOff2, nDstYOff and
//                       nDstYOff2. The whole structure is forwarded to
//                       the worker.
//  pChunk               source chunk, cast to the element type selected
//                       by args.eWrkDataType.
//  ppDstBuffer          output: receives the buffer allocated here.
//  peDstBufferDataType  output: receives args.eOvrDataType.
//
// Returns the worker's CPLErr, or CE_Failure when the resampling name is
// not one of the four above, when the allocation fails, or when
// args.eWrkDataType is not one of the four handled types.
static CPLErr
3779
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
3780
                              const void *pChunk, void **ppDstBuffer,
3781
                              GDALDataType *peDstBufferDataType)
3782
0
{
3783
0
    GDALResampleAlg eResample;
3784
0
    bool bKernelWithNegativeWeights = false;
3785
0
    if (EQUAL(args.pszResampling, "BILINEAR"))
3786
0
        eResample = GRA_Bilinear;
3787
0
    else if (EQUAL(args.pszResampling, "CUBIC"))
3788
0
    {
3789
0
        eResample = GRA_Cubic;
3790
0
        bKernelWithNegativeWeights = true;
3791
0
    }
3792
0
    else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
3793
0
        eResample = GRA_CubicSpline;
3794
0
    else if (EQUAL(args.pszResampling, "LANCZOS"))
3795
0
    {
3796
0
        eResample = GRA_Lanczos;
3797
0
        bKernelWithNegativeWeights = true;
3798
0
    }
3799
0
    else
3800
0
    {
        // Any other resampling name is rejected here, before anything is
        // allocated.
3801
0
        CPLAssert(false);
3802
0
        return CE_Failure;
3803
0
    }
3804
0
    const int nKernelRadius = GWKGetFilterRadius(eResample);
3805
0
    FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
3806
0
    const FilterFunc4ValuesType pfnFilterFunc4Values =
3807
0
        GWKGetFilterFunc4Values(eResample);
3808
3809
0
    float fMaxVal = 0.f;
3810
    // Cubic, etc... can have overshoots, so make sure we clamp values to the
3811
    // maximum value if NBITS is set.
    // Only considered for non-bilinear kernels, a strictly positive
    // nOvrNBITS and a Byte / UInt16 / UInt32 overview data type.
3812
0
    if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
3813
0
        (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
3814
0
         args.eOvrDataType == GDT_UInt32))
3815
0
    {
3816
0
        int nBits = args.nOvrNBITS;
        // NBITS equal to the value GDALGetDataTypeSize() returns for the
        // overview type is handled as "no restriction": nBits is reset so
        // that fMaxVal stays at 0.
3817
0
        if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
3818
0
            nBits = 0;
        // The nBits < 32 guard keeps the shift of the 32-bit 1U below
        // well defined. (1U << nBits) - 1 is the largest value that fits
        // on nBits bits.
3819
0
        if (nBits > 0 && nBits < 32)
3820
0
            fMaxVal = static_cast<float>((1U << nBits) - 1);
3821
0
    }
3822
    // Destination buffer: (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff)
    // elements of GDALGetDataTypeSizeBytes(args.eOvrDataType) bytes each.
3823
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(
3824
0
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
3825
0
        GDALGetDataTypeSizeBytes(args.eOvrDataType));
3826
0
    if (*ppDstBuffer == nullptr)
3827
0
    {
3828
0
        return CE_Failure;
3829
0
    }
3830
0
    *peDstBufferDataType = args.eOvrDataType;
3831
    // Dispatch on the working data type. The Byte, UInt16 and Float32
    // cases run the worker with a float work type (GDT_Float32); the
    // Float64 case runs it with a double work type (GDT_Float64).
3832
0
    switch (args.eWrkDataType)
3833
0
    {
3834
0
        case GDT_Byte:
3835
0
        {
3836
0
            return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
3837
0
                args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
3838
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3839
0
                bKernelWithNegativeWeights, fMaxVal);
3840
0
        }
3841
3842
0
        case GDT_UInt16:
3843
0
        {
3844
0
            return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
3845
0
                args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
3846
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3847
0
                bKernelWithNegativeWeights, fMaxVal);
3848
0
        }
3849
3850
0
        case GDT_Float32:
3851
0
        {
3852
0
            return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
3853
0
                args, static_cast<const float *>(pChunk), *ppDstBuffer,
3854
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3855
0
                bKernelWithNegativeWeights, fMaxVal);
3856
0
        }
3857
3858
0
        case GDT_Float64:
3859
0
        {
3860
0
            return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
3861
0
                args, static_cast<const double *>(pChunk), *ppDstBuffer,
3862
0
                pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
3863
0
                bKernelWithNegativeWeights, fMaxVal);
3864
0
        }
3865
3866
0
        default:
3867
0
            break;
3868
0
    }
3869
    // Reached only for a working data type not handled by the switch.
    // NOTE(review): the buffer allocated above is left in *ppDstBuffer and
    // is not released on this path - presumably the caller frees it;
    // confirm against the call site.
3870
0
    CPLAssert(false);
3871
0
    return CE_Failure;
3872
0
}
3873
3874
/************************************************************************/
3875
/*                       GDALResampleChunkC32R()                        */
3876
/************************************************************************/
3877
3878
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3879
                                    const float *pafChunk, const int nChunkYOff,
3880
                                    const int nChunkYSize, const int nDstYOff,
3881
                                    const int nDstYOff2, const int nOvrXSize,
3882
                                    const int nOvrYSize, void **ppDstBuffer,
3883
                                    GDALDataType *peDstBufferDataType,
3884
                                    const char *pszResampling)
3885
3886
0
{
3887
0
    enum Method
3888
0
    {
3889
0
        NEAR,
3890
0
        AVERAGE,
3891
0
        AVERAGE_MAGPHASE,
3892
0
        RMS,
3893
0
    };
3894
3895
0
    Method eMethod = NEAR;
3896
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
3897
0
    {
3898
0
        eMethod = NEAR;
3899
0
    }
3900
0
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3901
0
    {
3902
0
        eMethod = AVERAGE_MAGPHASE;
3903
0
    }
3904
0
    else if (EQUAL(pszResampling, "RMS"))
3905
0
    {
3906
0
        eMethod = RMS;
3907
0
    }
3908
0
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
3909
0
    {
3910
0
        eMethod = AVERAGE;
3911
0
    }
3912
0
    else
3913
0
    {
3914
0
        CPLError(
3915
0
            CE_Failure, CPLE_NotSupported,
3916
0
            "Resampling method %s is not supported for complex data types. "
3917
0
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3918
0
            pszResampling);
3919
0
        return CE_Failure;
3920
0
    }
3921
3922
0
    const int nOXSize = nOvrXSize;
3923
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3924
0
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3925
0
    if (*ppDstBuffer == nullptr)
3926
0
    {
3927
0
        return CE_Failure;
3928
0
    }
3929
0
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3930
0
    *peDstBufferDataType = GDT_CFloat32;
3931
3932
0
    const int nOYSize = nOvrYSize;
3933
0
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3934
0
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3935
3936
    /* ==================================================================== */
3937
    /*      Loop over destination scanlines.                                */
3938
    /* ==================================================================== */
3939
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3940
0
    {
3941
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3942
0
        if (nSrcYOff < nChunkYOff)
3943
0
            nSrcYOff = nChunkYOff;
3944
3945
0
        int nSrcYOff2 =
3946
0
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3947
0
        if (nSrcYOff2 == nSrcYOff)
3948
0
            nSrcYOff2++;
3949
3950
0
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3951
0
        {
3952
0
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3953
0
                nSrcYOff = nSrcHeight - 1;
3954
0
            nSrcYOff2 = nSrcHeight;
3955
0
        }
3956
0
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3957
0
            nSrcYOff2 = nChunkYOff + nChunkYSize;
3958
3959
0
        const float *const pafSrcScanline =
3960
0
            pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3961
0
        float *const pafDstScanline =
3962
0
            pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3963
3964
        /* --------------------------------------------------------------------
3965
         */
3966
        /*      Loop over destination pixels */
3967
        /* --------------------------------------------------------------------
3968
         */
3969
0
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3970
0
        {
3971
0
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3972
0
            int nSrcXOff2 =
3973
0
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3974
0
            if (nSrcXOff2 == nSrcXOff)
3975
0
                nSrcXOff2++;
3976
0
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3977
0
            {
3978
0
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3979
0
                    nSrcXOff = nSrcWidth - 1;
3980
0
                nSrcXOff2 = nSrcWidth;
3981
0
            }
3982
3983
0
            if (eMethod == NEAR)
3984
0
            {
3985
0
                pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3986
0
                pafDstScanline[iDstPixel * 2 + 1] =
3987
0
                    pafSrcScanline[nSrcXOff * 2 + 1];
3988
0
            }
3989
0
            else if (eMethod == AVERAGE_MAGPHASE)
3990
0
            {
3991
0
                double dfTotalR = 0.0;
3992
0
                double dfTotalI = 0.0;
3993
0
                double dfTotalM = 0.0;
3994
0
                int nCount = 0;
3995
3996
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3997
0
                {
3998
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3999
0
                    {
4000
0
                        const double dfR =
4001
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4002
0
                                                        iY - nSrcYOff) *
4003
0
                                                        nSrcWidth * 2];
4004
0
                        const double dfI =
4005
0
                            pafSrcScanline[iX * 2 +
4006
0
                                           static_cast<GPtrDiff_t>(iY -
4007
0
                                                                   nSrcYOff) *
4008
0
                                               nSrcWidth * 2 +
4009
0
                                           1];
4010
0
                        dfTotalR += dfR;
4011
0
                        dfTotalI += dfI;
4012
0
                        dfTotalM += std::hypot(dfR, dfI);
4013
0
                        ++nCount;
4014
0
                    }
4015
0
                }
4016
4017
0
                CPLAssert(nCount > 0);
4018
0
                if (nCount == 0)
4019
0
                {
4020
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4021
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4022
0
                }
4023
0
                else
4024
0
                {
4025
0
                    pafDstScanline[iDstPixel * 2] =
4026
0
                        static_cast<float>(dfTotalR / nCount);
4027
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4028
0
                        static_cast<float>(dfTotalI / nCount);
4029
0
                    const double dfM =
4030
0
                        std::hypot(pafDstScanline[iDstPixel * 2],
4031
0
                                   pafDstScanline[iDstPixel * 2 + 1]);
4032
0
                    const double dfDesiredM = dfTotalM / nCount;
4033
0
                    double dfRatio = 1.0;
4034
0
                    if (dfM != 0.0)
4035
0
                        dfRatio = dfDesiredM / dfM;
4036
4037
0
                    pafDstScanline[iDstPixel * 2] *=
4038
0
                        static_cast<float>(dfRatio);
4039
0
                    pafDstScanline[iDstPixel * 2 + 1] *=
4040
0
                        static_cast<float>(dfRatio);
4041
0
                }
4042
0
            }
4043
0
            else if (eMethod == RMS)
4044
0
            {
4045
0
                double dfTotalR = 0.0;
4046
0
                double dfTotalI = 0.0;
4047
0
                int nCount = 0;
4048
4049
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4050
0
                {
4051
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4052
0
                    {
4053
0
                        const double dfR =
4054
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4055
0
                                                        iY - nSrcYOff) *
4056
0
                                                        nSrcWidth * 2];
4057
0
                        const double dfI =
4058
0
                            pafSrcScanline[iX * 2 +
4059
0
                                           static_cast<GPtrDiff_t>(iY -
4060
0
                                                                   nSrcYOff) *
4061
0
                                               nSrcWidth * 2 +
4062
0
                                           1];
4063
4064
0
                        dfTotalR += SQUARE(dfR);
4065
0
                        dfTotalI += SQUARE(dfI);
4066
4067
0
                        ++nCount;
4068
0
                    }
4069
0
                }
4070
4071
0
                CPLAssert(nCount > 0);
4072
0
                if (nCount == 0)
4073
0
                {
4074
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4075
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4076
0
                }
4077
0
                else
4078
0
                {
4079
                    /* compute RMS */
4080
0
                    pafDstScanline[iDstPixel * 2] =
4081
0
                        static_cast<float>(sqrt(dfTotalR / nCount));
4082
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4083
0
                        static_cast<float>(sqrt(dfTotalI / nCount));
4084
0
                }
4085
0
            }
4086
0
            else if (eMethod == AVERAGE)
4087
0
            {
4088
0
                double dfTotalR = 0.0;
4089
0
                double dfTotalI = 0.0;
4090
0
                int nCount = 0;
4091
4092
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4093
0
                {
4094
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4095
0
                    {
4096
                        // TODO(schwehr): Maybe use std::complex?
4097
0
                        dfTotalR +=
4098
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4099
0
                                                        iY - nSrcYOff) *
4100
0
                                                        nSrcWidth * 2];
4101
0
                        dfTotalI += pafSrcScanline[iX * 2 +
4102
0
                                                   static_cast<GPtrDiff_t>(
4103
0
                                                       iY - nSrcYOff) *
4104
0
                                                       nSrcWidth * 2 +
4105
0
                                                   1];
4106
0
                        ++nCount;
4107
0
                    }
4108
0
                }
4109
4110
0
                CPLAssert(nCount > 0);
4111
0
                if (nCount == 0)
4112
0
                {
4113
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4114
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4115
0
                }
4116
0
                else
4117
0
                {
4118
0
                    pafDstScanline[iDstPixel * 2] =
4119
0
                        static_cast<float>(dfTotalR / nCount);
4120
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4121
0
                        static_cast<float>(dfTotalI / nCount);
4122
0
                }
4123
0
            }
4124
0
        }
4125
0
    }
4126
4127
0
    return CE_None;
4128
0
}
4129
4130
/************************************************************************/
4131
/*                  GDALRegenerateCascadingOverviews()                  */
4132
/*                                                                      */
4133
/*      Generate a list of overviews in order from largest to           */
4134
/*      smallest, computing each from the next larger.                  */
4135
/************************************************************************/
4136
4137
static CPLErr GDALRegenerateCascadingOverviews(
4138
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
4139
    const char *pszResampling, GDALProgressFunc pfnProgress,
4140
    void *pProgressData, CSLConstList papszOptions)
4141
4142
0
{
4143
    /* -------------------------------------------------------------------- */
4144
    /*      First, we must put the overviews in order from largest to       */
4145
    /*      smallest.                                                       */
4146
    /* -------------------------------------------------------------------- */
4147
0
    for (int i = 0; i < nOverviews - 1; ++i)
4148
0
    {
4149
0
        for (int j = 0; j < nOverviews - i - 1; ++j)
4150
0
        {
4151
0
            if (papoOvrBands[j]->GetXSize() *
4152
0
                    static_cast<float>(papoOvrBands[j]->GetYSize()) <
4153
0
                papoOvrBands[j + 1]->GetXSize() *
4154
0
                    static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
4155
0
            {
4156
0
                GDALRasterBand *poTempBand = papoOvrBands[j];
4157
0
                papoOvrBands[j] = papoOvrBands[j + 1];
4158
0
                papoOvrBands[j + 1] = poTempBand;
4159
0
            }
4160
0
        }
4161
0
    }
4162
4163
    /* -------------------------------------------------------------------- */
4164
    /*      Count total pixels so we can prepare appropriate scaled         */
4165
    /*      progress functions.                                             */
4166
    /* -------------------------------------------------------------------- */
4167
0
    double dfTotalPixels = 0.0;
4168
4169
0
    for (int i = 0; i < nOverviews; ++i)
4170
0
    {
4171
0
        dfTotalPixels += papoOvrBands[i]->GetXSize() *
4172
0
                         static_cast<double>(papoOvrBands[i]->GetYSize());
4173
0
    }
4174
4175
    /* -------------------------------------------------------------------- */
4176
    /*      Generate all the bands.                                         */
4177
    /* -------------------------------------------------------------------- */
4178
0
    double dfPixelsProcessed = 0.0;
4179
4180
0
    for (int i = 0; i < nOverviews; ++i)
4181
0
    {
4182
0
        GDALRasterBand *poBaseBand = poSrcBand;
4183
0
        if (i != 0)
4184
0
            poBaseBand = papoOvrBands[i - 1];
4185
4186
0
        double dfPixels = papoOvrBands[i]->GetXSize() *
4187
0
                          static_cast<double>(papoOvrBands[i]->GetYSize());
4188
4189
0
        void *pScaledProgressData = GDALCreateScaledProgress(
4190
0
            dfPixelsProcessed / dfTotalPixels,
4191
0
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
4192
0
            pProgressData);
4193
4194
0
        const CPLErr eErr = GDALRegenerateOverviewsEx(
4195
0
            poBaseBand, 1,
4196
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
4197
0
            pszResampling, GDALScaledProgress, pScaledProgressData,
4198
0
            papszOptions);
4199
0
        GDALDestroyScaledProgress(pScaledProgressData);
4200
4201
0
        if (eErr != CE_None)
4202
0
            return eErr;
4203
4204
0
        dfPixelsProcessed += dfPixels;
4205
4206
        // Only do the bit2grayscale promotion on the base band.
4207
0
        if (STARTS_WITH_CI(pszResampling,
4208
0
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
4209
0
            pszResampling = "AVERAGE";
4210
0
    }
4211
4212
0
    return CE_None;
4213
0
}
4214
4215
/************************************************************************/
4216
/*                    GDALGetResampleFunction()                         */
4217
/************************************************************************/
4218
4219
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
4220
                                             int *pnRadius)
4221
0
{
4222
0
    if (pnRadius)
4223
0
        *pnRadius = 0;
4224
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
4225
0
        return GDALResampleChunk_Near;
4226
0
    else if (STARTS_WITH_CI(pszResampling, "AVER") ||
4227
0
             EQUAL(pszResampling, "RMS"))
4228
0
        return GDALResampleChunk_AverageOrRMS;
4229
0
    else if (EQUAL(pszResampling, "GAUSS"))
4230
0
    {
4231
0
        if (pnRadius)
4232
0
            *pnRadius = 1;
4233
0
        return GDALResampleChunk_Gauss;
4234
0
    }
4235
0
    else if (EQUAL(pszResampling, "MODE"))
4236
0
        return GDALResampleChunk_Mode;
4237
0
    else if (EQUAL(pszResampling, "CUBIC"))
4238
0
    {
4239
0
        if (pnRadius)
4240
0
            *pnRadius = GWKGetFilterRadius(GRA_Cubic);
4241
0
        return GDALResampleChunk_Convolution;
4242
0
    }
4243
0
    else if (EQUAL(pszResampling, "CUBICSPLINE"))
4244
0
    {
4245
0
        if (pnRadius)
4246
0
            *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
4247
0
        return GDALResampleChunk_Convolution;
4248
0
    }
4249
0
    else if (EQUAL(pszResampling, "LANCZOS"))
4250
0
    {
4251
0
        if (pnRadius)
4252
0
            *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
4253
0
        return GDALResampleChunk_Convolution;
4254
0
    }
4255
0
    else if (EQUAL(pszResampling, "BILINEAR"))
4256
0
    {
4257
0
        if (pnRadius)
4258
0
            *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
4259
0
        return GDALResampleChunk_Convolution;
4260
0
    }
4261
0
    else
4262
0
    {
4263
0
        CPLError(
4264
0
            CE_Failure, CPLE_AppDefined,
4265
0
            "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
4266
0
            pszResampling);
4267
0
        return nullptr;
4268
0
    }
4269
0
}
4270
4271
/************************************************************************/
4272
/*                      GDALGetOvrWorkDataType()                        */
4273
/************************************************************************/
4274
4275
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
4276
                                    GDALDataType eSrcDataType)
4277
0
{
4278
0
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
4279
0
    {
4280
0
        return eSrcDataType;
4281
0
    }
4282
0
    else if (eSrcDataType == GDT_Byte &&
4283
0
             (STARTS_WITH_CI(pszResampling, "AVER") ||
4284
0
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4285
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4286
0
              EQUAL(pszResampling, "LANCZOS") ||
4287
0
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4288
0
    {
4289
0
        return GDT_Byte;
4290
0
    }
4291
0
    else if (eSrcDataType == GDT_UInt16 &&
4292
0
             (STARTS_WITH_CI(pszResampling, "AVER") ||
4293
0
              EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
4294
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4295
0
              EQUAL(pszResampling, "LANCZOS") ||
4296
0
              EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
4297
0
    {
4298
0
        return GDT_UInt16;
4299
0
    }
4300
0
    else if (EQUAL(pszResampling, "GAUSS"))
4301
0
        return GDT_Float64;
4302
4303
0
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
4304
0
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
4305
0
        eSrcDataType == GDT_Float32)
4306
0
    {
4307
0
        return GDT_Float32;
4308
0
    }
4309
0
    return GDT_Float64;
4310
0
}
4311
4312
namespace
4313
{
4314
// Structure to hold a pointer to free with CPLFree()
4315
struct PointerHolder
4316
{
4317
    void *ptr = nullptr;
4318
4319
0
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4320
0
    {
4321
0
    }
4322
4323
    ~PointerHolder()
4324
0
    {
4325
0
        CPLFree(ptr);
4326
0
    }
4327
4328
    PointerHolder(const PointerHolder &) = delete;
4329
    PointerHolder &operator=(const PointerHolder &) = delete;
4330
};
4331
}  // namespace
4332
4333
/************************************************************************/
4334
/*                      GDALRegenerateOverviews()                       */
4335
/************************************************************************/
4336
4337
/**
4338
 * \brief Generate downsampled overviews.
4339
 *
4340
 * This function will generate one or more overview images from a base image
4341
 * using the requested downsampling algorithm.  Its primary use is for
4342
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4343
 * used to generate downsampled images in one file from another outside the
4344
 * overview architecture.
4345
 *
4346
 * The output bands need to exist in advance.
4347
 *
4348
 * The full set of resampling algorithms is documented in
4349
 * GDALDataset::BuildOverviews().
4350
 *
4351
 * This function will honour properly NODATA_VALUES tuples (special dataset
4352
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4353
 * considered as the nodata value and not each value of the triplet
4354
 * independently per band.
4355
 *
4356
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4357
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4358
 * overview computation.
4359
 *
4360
 * @param hSrcBand the source (base level) band.
4361
 * @param nOverviewCount the number of downsampled bands being generated.
4362
 * @param pahOvrBands the list of downsampled bands to be generated.
4363
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4364
 * @param pfnProgress progress report function.
4365
 * @param pProgressData progress function callback data.
4366
 * @return CE_None on success or CE_Failure on failure.
4367
 */
4368
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4369
                               GDALRasterBandH *pahOvrBands,
4370
                               const char *pszResampling,
4371
                               GDALProgressFunc pfnProgress,
4372
                               void *pProgressData)
4373
4374
0
{
4375
0
    return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4376
0
                                     pszResampling, pfnProgress, pProgressData,
4377
0
                                     nullptr);
4378
0
}
4379
4380
/************************************************************************/
4381
/*                     GDALRegenerateOverviewsEx()                      */
4382
/************************************************************************/
4383
4384
// Multiplier turning a radius into the matching diameter (twice the radius).
constexpr int RADIUS_TO_DIAMETER = 2;
4385
4386
/**
4387
 * \brief Generate downsampled overviews.
4388
 *
4389
 * This function will generate one or more overview images from a base image
4390
 * using the requested downsampling algorithm.  Its primary use is for
4391
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4392
 * used to generate downsampled images in one file from another outside the
4393
 * overview architecture.
4394
 *
4395
 * The output bands need to exist in advance.
4396
 *
4397
 * The full set of resampling algorithms is documented in
4398
 * GDALDataset::BuildOverviews().
4399
 *
4400
 * This function will honour properly NODATA_VALUES tuples (special dataset
4401
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4402
 * considered as the nodata value and not each value of the triplet
4403
 * independently per band.
4404
 *
4405
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4406
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4407
 * overview computation.
4408
 *
4409
 * @param hSrcBand the source (base level) band.
4410
 * @param nOverviewCount the number of downsampled bands being generated.
4411
 * @param pahOvrBands the list of downsampled bands to be generated.
4412
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4413
 * @param pfnProgress progress report function.
4414
 * @param pProgressData progress function callback data.
4415
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4416
 * NULL
4417
 * @return CE_None on success or CE_Failure on failure.
4418
 * @since GDAL 3.6
4419
 */
4420
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4421
                                 GDALRasterBandH *pahOvrBands,
4422
                                 const char *pszResampling,
4423
                                 GDALProgressFunc pfnProgress,
4424
                                 void *pProgressData, CSLConstList papszOptions)
4425
4426
0
{
4427
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4428
0
    GDALRasterBand **papoOvrBands =
4429
0
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4430
4431
0
    if (pfnProgress == nullptr)
4432
0
        pfnProgress = GDALDummyProgress;
4433
4434
0
    if (EQUAL(pszResampling, "NONE"))
4435
0
        return CE_None;
4436
4437
0
    int nKernelRadius = 0;
4438
0
    GDALResampleFunction pfnResampleFn =
4439
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
4440
4441
0
    if (pfnResampleFn == nullptr)
4442
0
        return CE_Failure;
4443
4444
    /* -------------------------------------------------------------------- */
4445
    /*      Check color tables...                                           */
4446
    /* -------------------------------------------------------------------- */
4447
0
    GDALColorTable *poColorTable = nullptr;
4448
4449
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4450
0
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4451
0
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4452
0
    {
4453
0
        poColorTable = poSrcBand->GetColorTable();
4454
0
        if (poColorTable != nullptr)
4455
0
        {
4456
0
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4457
0
            {
4458
0
                CPLError(CE_Warning, CPLE_AppDefined,
4459
0
                         "Computing overviews on palette index raster bands "
4460
0
                         "with a palette whose color interpretation is not RGB "
4461
0
                         "will probably lead to unexpected results.");
4462
0
                poColorTable = nullptr;
4463
0
            }
4464
0
            else if (poColorTable->IsIdentity())
4465
0
            {
4466
0
                poColorTable = nullptr;
4467
0
            }
4468
0
        }
4469
0
        else
4470
0
        {
4471
0
            CPLError(CE_Warning, CPLE_AppDefined,
4472
0
                     "Computing overviews on palette index raster bands "
4473
0
                     "without a palette will probably lead to unexpected "
4474
0
                     "results.");
4475
0
        }
4476
0
    }
4477
    // Not ready yet
4478
0
    else if ((EQUAL(pszResampling, "CUBIC") ||
4479
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4480
0
              EQUAL(pszResampling, "LANCZOS") ||
4481
0
              EQUAL(pszResampling, "BILINEAR")) &&
4482
0
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4483
0
    {
4484
0
        CPLError(CE_Warning, CPLE_AppDefined,
4485
0
                 "Computing %s overviews on palette index raster bands "
4486
0
                 "will probably lead to unexpected results.",
4487
0
                 pszResampling);
4488
0
    }
4489
4490
    // If we have a nodata mask and we are doing something more complicated
4491
    // than nearest neighbouring, we have to fetch the nodata mask.
4492
4493
0
    GDALRasterBand *poMaskBand = nullptr;
4494
0
    bool bUseNoDataMask = false;
4495
0
    bool bCanUseCascaded = true;
4496
4497
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4498
0
    {
4499
        // Special case if we are an alpha/mask band. We want it to be
4500
        // considered as the mask band to avoid alpha=0 to be taken into account
4501
        // in average computation.
4502
0
        if (poSrcBand->IsMaskBand())
4503
0
        {
4504
0
            poMaskBand = poSrcBand;
4505
0
            bUseNoDataMask = true;
4506
0
        }
4507
0
        else
4508
0
        {
4509
0
            poMaskBand = poSrcBand->GetMaskBand();
4510
0
            const int nMaskFlags = poSrcBand->GetMaskFlags();
4511
0
            bCanUseCascaded =
4512
0
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4513
0
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4514
0
        }
4515
0
    }
4516
4517
    /* -------------------------------------------------------------------- */
4518
    /*      If we are operating on multiple overviews, and using            */
4519
    /*      averaging, lets do them in cascading order to reduce the        */
4520
    /*      amount of computation.                                          */
4521
    /* -------------------------------------------------------------------- */
4522
4523
    // In case the mask may be computed from another band of the dataset,
4524
    // we can't use cascaded generation, as the computation of the overviews
4525
    // of the band used for the mask band may not have yet occurred (#3033).
4526
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4527
0
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4528
0
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4529
0
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4530
0
         EQUAL(pszResampling, "MODE")) &&
4531
0
        nOverviewCount > 1 && bCanUseCascaded)
4532
0
        return GDALRegenerateCascadingOverviews(
4533
0
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4534
0
            pProgressData, papszOptions);
4535
4536
    /* -------------------------------------------------------------------- */
4537
    /*      Setup one horizontal swath to read from the raw buffer.         */
4538
    /* -------------------------------------------------------------------- */
4539
0
    int nFRXBlockSize = 0;
4540
0
    int nFRYBlockSize = 0;
4541
0
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4542
4543
0
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4544
0
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4545
0
                                       EQUAL(pszResampling, "MODE") ||
4546
0
                                       !GDALDataTypeIsComplex(eSrcDataType);
4547
0
    const GDALDataType eWrkDataType =
4548
0
        bUseGenericResampleFn
4549
0
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4550
0
            : GDT_CFloat32;
4551
4552
0
    const int nWidth = poSrcBand->GetXSize();
4553
0
    const int nHeight = poSrcBand->GetYSize();
4554
4555
0
    int nMaxOvrFactor = 1;
4556
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4557
0
    {
4558
0
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4559
0
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4560
0
        nMaxOvrFactor = std::max(
4561
0
            nMaxOvrFactor,
4562
0
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4563
0
        nMaxOvrFactor = std::max(
4564
0
            nMaxOvrFactor,
4565
0
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4566
0
    }
4567
4568
0
    int nFullResYChunk = nFRYBlockSize;
4569
0
    int nMaxChunkYSizeQueried = 0;
4570
4571
0
    const auto UpdateChunkHeightAndGetChunkSize =
4572
0
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4573
0
         eWrkDataType, nWidth]()
4574
0
    {
4575
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4576
        // + nFullResYChunk) / nMaxOvrFactor)
4577
0
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4578
0
        {
4579
0
            return GINTBIG_MAX;
4580
0
        }
4581
0
        nFullResYChunk =
4582
0
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4583
0
        if ((nKernelRadius > 0 &&
4584
0
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4585
0
            nFullResYChunk >
4586
0
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4587
0
        {
4588
0
            return GINTBIG_MAX;
4589
0
        }
4590
0
        nMaxChunkYSizeQueried =
4591
0
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4592
0
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4593
0
            std::numeric_limits<int64_t>::max() /
4594
0
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4595
0
        {
4596
0
            return GINTBIG_MAX;
4597
0
        }
4598
0
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4599
0
               nMaxChunkYSizeQueried * nWidth;
4600
0
    };
4601
4602
0
    const char *pszChunkYSize =
4603
0
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4604
0
#ifndef __COVERITY__
4605
    // Only configurable for debug / testing
4606
0
    if (pszChunkYSize)
4607
0
    {
4608
0
        nFullResYChunk = atoi(pszChunkYSize);
4609
0
    }
4610
0
#endif
4611
4612
    // Only configurable for debug / testing
4613
0
    const int nChunkMaxSize =
4614
0
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4615
4616
0
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4617
0
    if (nChunkSize > nChunkMaxSize)
4618
0
    {
4619
0
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4620
0
            !GDALDataTypeIsComplex(eSrcDataType) &&
4621
0
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
4622
0
             EQUAL(pszResampling, "AVERAGE")))
4623
0
        {
4624
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4625
            // which use a block based strategy, which is much less memory
4626
            // hungry.
4627
0
            return GDALRegenerateOverviewsMultiBand(
4628
0
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4629
0
                pfnProgress, pProgressData, papszOptions);
4630
0
        }
4631
0
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4632
0
        {
4633
0
            return GDALRegenerateCascadingOverviews(
4634
0
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4635
0
                pfnProgress, pProgressData, papszOptions);
4636
0
        }
4637
0
    }
4638
0
    else if (pszChunkYSize == nullptr)
4639
0
    {
4640
        // Try to get as close as possible to nChunkMaxSize
4641
0
        while (nChunkSize < nChunkMaxSize / 2)
4642
0
        {
4643
0
            nFullResYChunk *= 2;
4644
0
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
4645
0
        }
4646
0
    }
4647
4648
0
    int nHasNoData = 0;
4649
0
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4650
0
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4651
0
    const bool bPropagateNoData =
4652
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4653
4654
    // Structure describing a resampling job
4655
0
    struct OvrJob
4656
0
    {
4657
        // Buffers to free when job is finished
4658
0
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4659
0
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4660
0
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4661
4662
0
        GDALRasterBand *poDstBand = nullptr;
4663
4664
        // Input parameters of pfnResampleFn
4665
0
        GDALResampleFunction pfnResampleFn = nullptr;
4666
0
        int nSrcWidth = 0;
4667
0
        int nSrcHeight = 0;
4668
0
        int nDstWidth = 0;
4669
0
        GDALOverviewResampleArgs args{};
4670
0
        const void *pChunk = nullptr;
4671
0
        bool bUseGenericResampleFn = false;
4672
4673
        // Output values of resampling function
4674
0
        CPLErr eErr = CE_Failure;
4675
0
        void *pDstBuffer = nullptr;
4676
0
        GDALDataType eDstBufferDataType = GDT_Unknown;
4677
4678
0
        void SetSrcMaskBufferHolder(
4679
0
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4680
0
        {
4681
0
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4682
0
        }
4683
4684
0
        void SetSrcBufferHolder(
4685
0
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4686
0
        {
4687
0
            oSrcBufferHolder = oSrcBufferHolderIn;
4688
0
        }
4689
4690
0
        void NotifyFinished()
4691
0
        {
4692
0
            std::lock_guard guard(mutex);
4693
0
            bFinished = true;
4694
0
            cv.notify_one();
4695
0
        }
4696
4697
0
        bool IsFinished()
4698
0
        {
4699
0
            std::lock_guard guard(mutex);
4700
0
            return bFinished;
4701
0
        }
4702
4703
0
        void WaitFinished()
4704
0
        {
4705
0
            std::unique_lock oGuard(mutex);
4706
0
            while (!bFinished)
4707
0
            {
4708
0
                cv.wait(oGuard);
4709
0
            }
4710
0
        }
4711
4712
0
      private:
4713
        // Synchronization
4714
0
        bool bFinished = false;
4715
0
        std::mutex mutex{};
4716
0
        std::condition_variable cv{};
4717
0
    };
4718
4719
    // Thread function to resample
4720
0
    const auto JobResampleFunc = [](void *pData)
4721
0
    {
4722
0
        OvrJob *poJob = static_cast<OvrJob *>(pData);
4723
4724
0
        if (poJob->bUseGenericResampleFn)
4725
0
        {
4726
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4727
0
                                               &(poJob->pDstBuffer),
4728
0
                                               &(poJob->eDstBufferDataType));
4729
0
        }
4730
0
        else
4731
0
        {
4732
0
            poJob->eErr = GDALResampleChunkC32R(
4733
0
                poJob->nSrcWidth, poJob->nSrcHeight,
4734
0
                static_cast<const float *>(poJob->pChunk),
4735
0
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4736
0
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4737
0
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4738
0
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4739
0
                poJob->args.pszResampling);
4740
0
        }
4741
4742
0
        poJob->oDstBufferHolder =
4743
0
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
4744
4745
0
        poJob->NotifyFinished();
4746
0
    };
4747
4748
    // Function to write resample data to target band
4749
0
    const auto WriteJobData = [](const OvrJob *poJob)
4750
0
    {
4751
0
        return poJob->poDstBand->RasterIO(
4752
0
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4753
0
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4754
0
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4755
0
            poJob->eDstBufferDataType, 0, 0, nullptr);
4756
0
    };
4757
4758
    // Wait for completion of oldest job and serialize it
4759
0
    const auto WaitAndFinalizeOldestJob =
4760
0
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4761
0
    {
4762
0
        auto poOldestJob = jobList.front().get();
4763
0
        poOldestJob->WaitFinished();
4764
0
        CPLErr l_eErr = poOldestJob->eErr;
4765
0
        if (l_eErr == CE_None)
4766
0
        {
4767
0
            l_eErr = WriteJobData(poOldestJob);
4768
0
        }
4769
4770
0
        jobList.pop_front();
4771
0
        return l_eErr;
4772
0
    };
4773
4774
    // Queue of jobs
4775
0
    std::list<std::unique_ptr<OvrJob>> jobList;
4776
4777
0
    GByte *pabyChunkNodataMask = nullptr;
4778
0
    void *pChunk = nullptr;
4779
4780
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4781
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4782
0
                                                       ? CPLGetNumCPUs()
4783
0
                                                       : atoi(pszThreads)));
4784
0
    auto poThreadPool =
4785
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4786
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4787
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
4788
4789
    /* -------------------------------------------------------------------- */
4790
    /*      Loop over image operating on chunks.                            */
4791
    /* -------------------------------------------------------------------- */
4792
0
    int nChunkYOff = 0;
4793
0
    CPLErr eErr = CE_None;
4794
4795
0
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4796
0
         nChunkYOff += nFullResYChunk)
4797
0
    {
4798
0
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4799
0
                         pProgressData))
4800
0
        {
4801
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4802
0
            eErr = CE_Failure;
4803
0
        }
4804
4805
0
        if (nFullResYChunk + nChunkYOff > nHeight)
4806
0
            nFullResYChunk = nHeight - nChunkYOff;
4807
4808
0
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4809
0
        int nChunkYSizeQueried =
4810
0
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4811
0
        if (nChunkYOffQueried < 0)
4812
0
        {
4813
0
            nChunkYSizeQueried += nChunkYOffQueried;
4814
0
            nChunkYOffQueried = 0;
4815
0
        }
4816
0
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4817
0
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4818
4819
        // Avoid accumulating too many tasks and exhausting RAM
4820
        // Try to complete already finished jobs
4821
0
        while (eErr == CE_None && !jobList.empty())
4822
0
        {
4823
0
            auto poOldestJob = jobList.front().get();
4824
0
            if (!poOldestJob->IsFinished())
4825
0
                break;
4826
0
            eErr = poOldestJob->eErr;
4827
0
            if (eErr == CE_None)
4828
0
            {
4829
0
                eErr = WriteJobData(poOldestJob);
4830
0
            }
4831
4832
0
            jobList.pop_front();
4833
0
        }
4834
4835
        // And in case we have saturated the number of threads,
4836
        // wait for completion of tasks to go below the threshold.
4837
0
        while (eErr == CE_None &&
4838
0
               jobList.size() >= static_cast<size_t>(nThreads))
4839
0
        {
4840
0
            eErr = WaitAndFinalizeOldestJob(jobList);
4841
0
        }
4842
4843
        // (Re)allocate buffers if needed
4844
0
        if (pChunk == nullptr)
4845
0
        {
4846
0
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4847
0
                                         nMaxChunkYSizeQueried, nWidth);
4848
0
        }
4849
0
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4850
0
        {
4851
0
            pabyChunkNodataMask = static_cast<GByte *>(
4852
0
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4853
0
        }
4854
4855
0
        if (pChunk == nullptr ||
4856
0
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4857
0
        {
4858
0
            CPLFree(pChunk);
4859
0
            CPLFree(pabyChunkNodataMask);
4860
0
            return CE_Failure;
4861
0
        }
4862
4863
        // Read chunk.
4864
0
        if (eErr == CE_None)
4865
0
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4866
0
                                       nChunkYSizeQueried, pChunk, nWidth,
4867
0
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4868
0
                                       nullptr);
4869
0
        if (eErr == CE_None && bUseNoDataMask)
4870
0
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4871
0
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4872
0
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4873
0
                                        0, nullptr);
4874
4875
        // Special case to promote 1bit data to 8bit 0/255 values.
4876
0
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4877
0
        {
4878
0
            if (eWrkDataType == GDT_Float32)
4879
0
            {
4880
0
                float *pafChunk = static_cast<float *>(pChunk);
4881
0
                for (GPtrDiff_t i = 0;
4882
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4883
0
                     i++)
4884
0
                {
4885
0
                    if (pafChunk[i] == 1.0)
4886
0
                        pafChunk[i] = 255.0;
4887
0
                }
4888
0
            }
4889
0
            else if (eWrkDataType == GDT_Byte)
4890
0
            {
4891
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4892
0
                for (GPtrDiff_t i = 0;
4893
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4894
0
                     i++)
4895
0
                {
4896
0
                    if (pabyChunk[i] == 1)
4897
0
                        pabyChunk[i] = 255;
4898
0
                }
4899
0
            }
4900
0
            else if (eWrkDataType == GDT_UInt16)
4901
0
            {
4902
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4903
0
                for (GPtrDiff_t i = 0;
4904
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4905
0
                     i++)
4906
0
                {
4907
0
                    if (pasChunk[i] == 1)
4908
0
                        pasChunk[i] = 255;
4909
0
                }
4910
0
            }
4911
0
            else if (eWrkDataType == GDT_Float64)
4912
0
            {
4913
0
                double *padfChunk = static_cast<double *>(pChunk);
4914
0
                for (GPtrDiff_t i = 0;
4915
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4916
0
                     i++)
4917
0
                {
4918
0
                    if (padfChunk[i] == 1.0)
4919
0
                        padfChunk[i] = 255.0;
4920
0
                }
4921
0
            }
4922
0
            else
4923
0
            {
4924
0
                CPLAssert(false);
4925
0
            }
4926
0
        }
4927
0
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4928
0
        {
4929
0
            if (eWrkDataType == GDT_Float32)
4930
0
            {
4931
0
                float *pafChunk = static_cast<float *>(pChunk);
4932
0
                for (GPtrDiff_t i = 0;
4933
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4934
0
                     i++)
4935
0
                {
4936
0
                    if (pafChunk[i] == 1.0)
4937
0
                        pafChunk[i] = 0.0;
4938
0
                    else if (pafChunk[i] == 0.0)
4939
0
                        pafChunk[i] = 255.0;
4940
0
                }
4941
0
            }
4942
0
            else if (eWrkDataType == GDT_Byte)
4943
0
            {
4944
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4945
0
                for (GPtrDiff_t i = 0;
4946
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4947
0
                     i++)
4948
0
                {
4949
0
                    if (pabyChunk[i] == 1)
4950
0
                        pabyChunk[i] = 0;
4951
0
                    else if (pabyChunk[i] == 0)
4952
0
                        pabyChunk[i] = 255;
4953
0
                }
4954
0
            }
4955
0
            else if (eWrkDataType == GDT_UInt16)
4956
0
            {
4957
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4958
0
                for (GPtrDiff_t i = 0;
4959
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4960
0
                     i++)
4961
0
                {
4962
0
                    if (pasChunk[i] == 1)
4963
0
                        pasChunk[i] = 0;
4964
0
                    else if (pasChunk[i] == 0)
4965
0
                        pasChunk[i] = 255;
4966
0
                }
4967
0
            }
4968
0
            else if (eWrkDataType == GDT_Float64)
4969
0
            {
4970
0
                double *padfChunk = static_cast<double *>(pChunk);
4971
0
                for (GPtrDiff_t i = 0;
4972
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4973
0
                     i++)
4974
0
                {
4975
0
                    if (padfChunk[i] == 1.0)
4976
0
                        padfChunk[i] = 0.0;
4977
0
                    else if (padfChunk[i] == 0.0)
4978
0
                        padfChunk[i] = 255.0;
4979
0
                }
4980
0
            }
4981
0
            else
4982
0
            {
4983
0
                CPLAssert(false);
4984
0
            }
4985
0
        }
4986
4987
0
        auto oSrcBufferHolder =
4988
0
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4989
0
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4990
0
            poJobQueue ? pabyChunkNodataMask : nullptr);
4991
4992
0
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4993
0
             ++iOverview)
4994
0
        {
4995
0
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4996
0
            const int nDstWidth = poDstBand->GetXSize();
4997
0
            const int nDstHeight = poDstBand->GetYSize();
4998
4999
0
            const double dfXRatioDstToSrc =
5000
0
                static_cast<double>(nWidth) / nDstWidth;
5001
0
            const double dfYRatioDstToSrc =
5002
0
                static_cast<double>(nHeight) / nDstHeight;
5003
5004
            /* --------------------------------------------------------------------
5005
             */
5006
            /*      Figure out the line to start writing to, and the first line
5007
             */
5008
            /*      to not write to.  In theory this approach should ensure that
5009
             */
5010
            /*      every output line will be written if all input chunks are */
5011
            /*      processed. */
5012
            /* --------------------------------------------------------------------
5013
             */
5014
0
            int nDstYOff =
5015
0
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5016
0
            if (nDstYOff == nDstHeight)
5017
0
                continue;
5018
0
            int nDstYOff2 = static_cast<int>(
5019
0
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5020
5021
0
            if (nChunkYOff + nFullResYChunk == nHeight)
5022
0
                nDstYOff2 = nDstHeight;
5023
#if DEBUG_VERBOSE
5024
            CPLDebug("GDAL",
5025
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5026
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5027
                     nDstWidth, nDstYOff2 - nDstYOff);
5028
#endif
5029
5030
0
            auto poJob = std::make_unique<OvrJob>();
5031
0
            poJob->pfnResampleFn = pfnResampleFn;
5032
0
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5033
0
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5034
0
            poJob->args.nOvrXSize = poDstBand->GetXSize();
5035
0
            poJob->args.nOvrYSize = poDstBand->GetYSize();
5036
0
            const char *pszNBITS =
5037
0
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5038
0
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5039
0
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5040
0
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5041
0
            poJob->args.eWrkDataType = eWrkDataType;
5042
0
            poJob->pChunk = pChunk;
5043
0
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5044
0
            poJob->nSrcWidth = nWidth;
5045
0
            poJob->nSrcHeight = nHeight;
5046
0
            poJob->args.nChunkXOff = 0;
5047
0
            poJob->args.nChunkXSize = nWidth;
5048
0
            poJob->args.nChunkYOff = nChunkYOffQueried;
5049
0
            poJob->args.nChunkYSize = nChunkYSizeQueried;
5050
0
            poJob->nDstWidth = nDstWidth;
5051
0
            poJob->args.nDstXOff = 0;
5052
0
            poJob->args.nDstXOff2 = nDstWidth;
5053
0
            poJob->args.nDstYOff = nDstYOff;
5054
0
            poJob->args.nDstYOff2 = nDstYOff2;
5055
0
            poJob->poDstBand = poDstBand;
5056
0
            poJob->args.pszResampling = pszResampling;
5057
0
            poJob->args.bHasNoData = bHasNoData;
5058
0
            poJob->args.dfNoDataValue = dfNoDataValue;
5059
0
            poJob->args.poColorTable = poColorTable;
5060
0
            poJob->args.eSrcDataType = eSrcDataType;
5061
0
            poJob->args.bPropagateNoData = bPropagateNoData;
5062
5063
0
            if (poJobQueue)
5064
0
            {
5065
0
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5066
0
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
5067
0
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5068
0
                jobList.emplace_back(std::move(poJob));
5069
0
            }
5070
0
            else
5071
0
            {
5072
0
                JobResampleFunc(poJob.get());
5073
0
                eErr = poJob->eErr;
5074
0
                if (eErr == CE_None)
5075
0
                {
5076
0
                    eErr = WriteJobData(poJob.get());
5077
0
                }
5078
0
            }
5079
0
        }
5080
5081
0
        if (poJobQueue)
5082
0
        {
5083
0
            pChunk = nullptr;
5084
0
            pabyChunkNodataMask = nullptr;
5085
0
        }
5086
0
    }
5087
5088
0
    VSIFree(pChunk);
5089
0
    VSIFree(pabyChunkNodataMask);
5090
5091
    // Wait for all pending jobs to complete
5092
0
    while (!jobList.empty())
5093
0
    {
5094
0
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5095
0
        if (l_eErr != CE_None && eErr == CE_None)
5096
0
            eErr = l_eErr;
5097
0
    }
5098
5099
    /* -------------------------------------------------------------------- */
5100
    /*      Renormalize overview mean / stddev if needed.                   */
5101
    /* -------------------------------------------------------------------- */
5102
0
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5103
0
    {
5104
0
        GDALOverviewMagnitudeCorrection(
5105
0
            poSrcBand, nOverviewCount,
5106
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5107
0
            GDALDummyProgress, nullptr);
5108
0
    }
5109
5110
    /* -------------------------------------------------------------------- */
5111
    /*      It can be important to flush out data to overviews.             */
5112
    /* -------------------------------------------------------------------- */
5113
0
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5114
0
         ++iOverview)
5115
0
    {
5116
0
        eErr = papoOvrBands[iOverview]->FlushCache(false);
5117
0
    }
5118
5119
0
    if (eErr == CE_None)
5120
0
        pfnProgress(1.0, nullptr, pProgressData);
5121
5122
0
    return eErr;
5123
0
}
5124
5125
/************************************************************************/
5126
/*            GDALRegenerateOverviewsMultiBand()                        */
5127
/************************************************************************/
5128
5129
/**
5130
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5131
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5132
 *
5133
 * This function will generate one or more overview images from a base
5134
 * image using the requested downsampling algorithm.  Its primary use
5135
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5136
 * can also be used to generate downsampled images in one file from another
5137
 * outside the overview architecture.
5138
 *
5139
 * The output bands need to exist in advance and share the same characteristics
5140
 * (type, dimensions)
5141
 *
5142
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5143
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE".
5144
 *
5145
 * It does not support color tables or complex data types.
5146
 *
5147
 * The pseudo-algorithm used by the function is :
5148
 *    for each overview
5149
 *       iterate on lines of the source by a step of deltay
5150
 *           iterate on columns of the source  by a step of deltax
5151
 *               read the source data of size deltax * deltay for all the bands
5152
 *               generate the corresponding overview block for all the bands
5153
 *
5154
 * This function will properly honour NODATA_VALUES tuples (special dataset
5155
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5156
 * considered as the nodata value and not each value of the triplet
5157
 * independently per band.
5158
 *
5159
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5160
 * to "ALL_CPUS" or an integer value to specify the number of threads to use
5161
 * for overview computation.
5162
 *
5163
 * @param nBands the number of bands, size of papoSrcBands and size of
5164
 *               first dimension of papapoOverviewBands
5165
 * @param papoSrcBands the list of source bands to downsample
5166
 * @param nOverviews the number of downsampled overview levels being generated.
5167
 * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5168
 *                            indexed by nBands. Second dimension is indexed by
5169
 *                            nOverviews.
5170
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5171
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5172
 * @param pfnProgress progress report function.
5173
 * @param pProgressData progress function callback data.
5174
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5175
 *                     key=value pairs, or NULL
5176
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5177
 *                     options can be specified to express that overviews should
5178
 *                     be regenerated only in the specified subset of the source
5179
 *                     dataset.
5180
 * @return CE_None on success or CE_Failure on failure.
5181
 */
5182
5183
CPLErr GDALRegenerateOverviewsMultiBand(
5184
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5185
    GDALRasterBand *const *const *papapoOverviewBands,
5186
    const char *pszResampling, GDALProgressFunc pfnProgress,
5187
    void *pProgressData, CSLConstList papszOptions)
5188
0
{
5189
0
    CPL_IGNORE_RET_VAL(papszOptions);
5190
5191
0
    if (pfnProgress == nullptr)
5192
0
        pfnProgress = GDALDummyProgress;
5193
5194
0
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5195
0
        return CE_None;
5196
5197
    // Sanity checks.
5198
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5199
0
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5200
0
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5201
0
        !EQUAL(pszResampling, "CUBICSPLINE") &&
5202
0
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5203
0
        !EQUAL(pszResampling, "MODE"))
5204
0
    {
5205
0
        CPLError(CE_Failure, CPLE_NotSupported,
5206
0
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5207
0
                 "not supported",
5208
0
                 pszResampling);
5209
0
        return CE_Failure;
5210
0
    }
5211
5212
0
    int nKernelRadius = 0;
5213
0
    GDALResampleFunction pfnResampleFn =
5214
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
5215
0
    if (pfnResampleFn == nullptr)
5216
0
        return CE_Failure;
5217
5218
0
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5219
0
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5220
0
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5221
0
        return CE_None;
5222
0
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5223
0
    for (int iBand = 1; iBand < nBands; ++iBand)
5224
0
    {
5225
0
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5226
0
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5227
0
        {
5228
0
            CPLError(
5229
0
                CE_Failure, CPLE_NotSupported,
5230
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5231
0
                "have the same dimensions");
5232
0
            return CE_Failure;
5233
0
        }
5234
0
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5235
0
        {
5236
0
            CPLError(
5237
0
                CE_Failure, CPLE_NotSupported,
5238
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5239
0
                "have the same data type");
5240
0
            return CE_Failure;
5241
0
        }
5242
0
    }
5243
5244
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5245
0
    {
5246
0
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5247
0
        const int nDstWidth = poOvrFirstBand->GetXSize();
5248
0
        const int nDstHeight = poOvrFirstBand->GetYSize();
5249
0
        for (int iBand = 1; iBand < nBands; ++iBand)
5250
0
        {
5251
0
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5252
0
            if (poOvrBand->GetXSize() != nDstWidth ||
5253
0
                poOvrBand->GetYSize() != nDstHeight)
5254
0
            {
5255
0
                CPLError(
5256
0
                    CE_Failure, CPLE_NotSupported,
5257
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5258
0
                    "of the same level must have the same dimensions");
5259
0
                return CE_Failure;
5260
0
            }
5261
0
            if (poOvrBand->GetRasterDataType() != eDataType)
5262
0
            {
5263
0
                CPLError(
5264
0
                    CE_Failure, CPLE_NotSupported,
5265
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5266
0
                    "must have the same data type as the source bands");
5267
0
                return CE_Failure;
5268
0
            }
5269
0
        }
5270
0
    }
5271
5272
    // First pass to compute the total number of pixels to write.
5273
0
    double dfTotalPixelCount = 0;
5274
0
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5275
0
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5276
0
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
5277
0
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5278
0
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
5279
0
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5280
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5281
0
    {
5282
0
        dfTotalPixelCount +=
5283
0
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5284
0
            papapoOverviewBands[0][iOverview]->GetXSize() *
5285
0
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5286
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5287
0
    }
5288
5289
0
    const GDALDataType eWrkDataType =
5290
0
        GDALGetOvrWorkDataType(pszResampling, eDataType);
5291
0
    const int nWrkDataTypeSize =
5292
0
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5293
5294
0
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5295
5296
    // If we have a nodata mask and we are doing something more complicated
5297
    // than nearest neighbour, we have to fetch the nodata mask.
5298
0
    const bool bUseNoDataMask =
5299
0
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
5300
0
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5301
5302
0
    std::vector<bool> abHasNoData(nBands);
5303
0
    std::vector<double> adfNoDataValue(nBands);
5304
5305
0
    for (int iBand = 0; iBand < nBands; ++iBand)
5306
0
    {
5307
0
        int nHasNoData = 0;
5308
0
        adfNoDataValue[iBand] =
5309
0
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5310
0
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5311
0
    }
5312
0
    const bool bPropagateNoData =
5313
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5314
5315
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5316
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5317
0
                                                       ? CPLGetNumCPUs()
5318
0
                                                       : atoi(pszThreads)));
5319
0
    auto poThreadPool =
5320
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5321
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5322
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5323
5324
    // Only configurable for debug / testing
5325
0
    const GIntBig nChunkMaxSize = []() -> GIntBig
5326
0
    {
5327
0
        const char *pszVal =
5328
0
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5329
0
        if (pszVal)
5330
0
        {
5331
0
            GIntBig nRet = 0;
5332
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5333
0
            return std::max<GIntBig>(100, nRet);
5334
0
        }
5335
0
        return 10 * 1024 * 1024;
5336
0
    }();
5337
5338
    // Only configurable for debug / testing
5339
0
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5340
0
    {
5341
0
        const char *pszVal = CPLGetConfigOption(
5342
0
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5343
0
        if (pszVal)
5344
0
        {
5345
0
            GIntBig nRet = 0;
5346
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5347
0
            return std::max<GIntBig>(100, nRet);
5348
0
        }
5349
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5350
0
        if (nUsableRAM > 0)
5351
0
            return nUsableRAM / 10;
5352
        // Select a value to be able to at least downsample by 2 for a RGB
5353
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5354
0
        return 100 * 1024 * 1024;
5355
0
    }();
5356
5357
    // Second pass to do the real job.
5358
0
    double dfCurPixelCount = 0;
5359
0
    CPLErr eErr = CE_None;
5360
0
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5361
0
         ++iOverview)
5362
0
    {
5363
0
        int iSrcOverview = -1;  // -1 means the source bands.
5364
5365
0
        const int nDstTotalWidth =
5366
0
            papapoOverviewBands[0][iOverview]->GetXSize();
5367
0
        const int nDstTotalHeight =
5368
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5369
5370
        // Compute the coordinates of the target region to refresh
5371
0
        constexpr double EPS = 1e-8;
5372
0
        const int nDstXOffStart = static_cast<int>(
5373
0
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5374
0
            EPS);
5375
0
        const int nDstXOffEnd =
5376
0
            std::min(static_cast<int>(
5377
0
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5378
0
                                       nToplevelSrcWidth * nDstTotalWidth -
5379
0
                                   EPS)),
5380
0
                     nDstTotalWidth);
5381
0
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5382
0
        const int nDstYOffStart =
5383
0
            static_cast<int>(static_cast<double>(nSrcYOff) /
5384
0
                                 nToplevelSrcHeight * nDstTotalHeight +
5385
0
                             EPS);
5386
0
        const int nDstYOffEnd =
5387
0
            std::min(static_cast<int>(
5388
0
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5389
0
                                       nToplevelSrcHeight * nDstTotalHeight -
5390
0
                                   EPS)),
5391
0
                     nDstTotalHeight);
5392
0
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5393
5394
        // Try to use previous level of overview as the source to compute
5395
        // the next level.
5396
0
        int nSrcWidth = nToplevelSrcWidth;
5397
0
        int nSrcHeight = nToplevelSrcHeight;
5398
0
        if (iOverview > 0 &&
5399
0
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5400
0
        {
5401
0
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5402
0
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5403
0
            iSrcOverview = iOverview - 1;
5404
0
        }
5405
5406
0
        const double dfXRatioDstToSrc =
5407
0
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
5408
0
        const double dfYRatioDstToSrc =
5409
0
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
5410
5411
0
        const int nOvrFactor =
5412
0
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5413
0
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
5414
5415
0
        int nDstChunkXSize = 0;
5416
0
        int nDstChunkYSize = 0;
5417
0
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5418
0
                                                        &nDstChunkYSize);
5419
5420
0
        constexpr int PIXEL_MARGIN = 2;
5421
        // Try to extend the chunk size so that the memory needed to acquire
5422
        // source pixels goes up to 10 MB.
5423
        // This can help for drivers that support multi-threaded reading
5424
0
        const int nFullResYChunk = static_cast<int>(std::min<double>(
5425
0
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5426
0
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5427
0
            nSrcHeight,
5428
0
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5429
0
                                 nKernelRadius * nOvrFactor));
5430
0
        while (nDstChunkXSize < nDstWidth)
5431
0
        {
5432
0
            constexpr int INCREASE_FACTOR = 2;
5433
5434
0
            const int nFullResXChunk = static_cast<int>(std::min<double>(
5435
0
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5436
0
                                              dfXRatioDstToSrc));
5437
5438
0
            const int nFullResXChunkQueried =
5439
0
                static_cast<int>(std::min<int64_t>(
5440
0
                    nSrcWidth,
5441
0
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5442
0
                                         nKernelRadius * nOvrFactor));
5443
5444
0
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5445
0
                             nFullResYChunkQueried / nWrkDataTypeSize)
5446
0
            {
5447
0
                break;
5448
0
            }
5449
5450
0
            nDstChunkXSize *= INCREASE_FACTOR;
5451
0
        }
5452
0
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5453
5454
0
        const int nFullResXChunk = static_cast<int>(std::min<double>(
5455
0
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5456
0
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5457
0
            nSrcWidth,
5458
0
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5459
0
                                 nKernelRadius * nOvrFactor));
5460
5461
        // Make sure that the RAM requirement to acquire the source data does
5462
        // not exceed nChunkMaxSizeForTempFile
5463
        // If so, reduce the destination chunk size, generate overviews in a
5464
        // temporary dataset, and copy that temporary dataset over the target
5465
        // overview bands (to avoid issues with lossy compression)
5466
0
        const bool bOverflowFullResXChunkYChunkQueried =
5467
0
            nBands > std::numeric_limits<int64_t>::max() /
5468
0
                         nFullResXChunkQueried / nFullResYChunkQueried /
5469
0
                         nWrkDataTypeSize;
5470
5471
0
        const auto nMemRequirement =
5472
0
            bOverflowFullResXChunkYChunkQueried
5473
0
                ? 0
5474
0
                : static_cast<GIntBig>(nFullResXChunkQueried) *
5475
0
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5476
        // Use a temporary dataset with a smaller destination chunk size
5477
0
        const auto nOverShootFactor =
5478
0
            nMemRequirement / nChunkMaxSizeForTempFile;
5479
5480
0
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
5481
0
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5482
0
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5483
0
                                      static_cast<double>(nOverShootFactor)))));
5484
0
        constexpr int DEFAULT_CHUNK_SIZE = 256;
5485
0
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5486
0
        const int nReducedDstChunkXSize =
5487
0
            bOverflowFullResXChunkYChunkQueried
5488
0
                ? DEFAULT_CHUNK_SIZE
5489
0
                : std::max(1, static_cast<int>(nDstChunkXSize /
5490
0
                                               nSqrtOverShootFactor) &
5491
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5492
0
        const int nReducedDstChunkYSize =
5493
0
            bOverflowFullResXChunkYChunkQueried
5494
0
                ? DEFAULT_CHUNK_SIZE
5495
0
                : std::max(1, static_cast<int>(nDstChunkYSize /
5496
0
                                               nSqrtOverShootFactor) &
5497
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5498
5499
0
        if (bOverflowFullResXChunkYChunkQueried ||
5500
0
            nMemRequirement > nChunkMaxSizeForTempFile)
5501
0
        {
5502
0
            const auto nDTSize =
5503
0
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5504
0
            const bool bTmpDSMemRequirementOverflow =
5505
0
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5506
0
                             nDstHeight / nDTSize;
5507
0
            const auto nTmpDSMemRequirement =
5508
0
                bTmpDSMemRequirementOverflow
5509
0
                    ? 0
5510
0
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5511
0
                          nDTSize;
5512
5513
            // make sure that one band buffer doesn't overflow size_t
5514
0
            const bool bChunkSizeOverflow =
5515
0
                static_cast<size_t>(nDTSize) >
5516
0
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5517
0
            const size_t nChunkSize =
5518
0
                bChunkSizeOverflow
5519
0
                    ? 0
5520
0
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5521
5522
0
            const auto CreateVRT =
5523
0
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5524
0
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5525
0
                 iSrcOverview, &abHasNoData,
5526
0
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5527
0
            {
5528
0
                auto poVRTDS = std::make_unique<VRTDataset>(
5529
0
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5530
0
                    nVRTBlockYSize);
5531
5532
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5533
0
                {
5534
0
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5535
0
                    poVRTSrc->SetResampling(pszResampling);
5536
0
                    poVRTDS->AddBand(eWrkDataType);
5537
0
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5538
0
                        poVRTDS->GetRasterBand(iBand + 1));
5539
5540
0
                    auto poSrcBand = papoSrcBands[iBand];
5541
0
                    if (iSrcOverview != -1)
5542
0
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5543
0
                    poVRTBand->ConfigureSource(
5544
0
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5545
0
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5546
                    // Add the source to the band
5547
0
                    poVRTBand->AddSource(poVRTSrc.release());
5548
0
                    if (abHasNoData[iBand])
5549
0
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5550
0
                }
5551
5552
0
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5553
0
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5554
0
                {
5555
0
                    VRTSourcedRasterBand *poMaskVRTBand =
5556
0
                        cpl::down_cast<VRTSourcedRasterBand *>(
5557
0
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
5558
0
                    auto poSrcBand = papoSrcBands[0];
5559
0
                    if (iSrcOverview != -1)
5560
0
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
5561
0
                    poMaskVRTBand->AddMaskBandSource(
5562
0
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5563
0
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5564
0
                }
5565
5566
0
                return poVRTDS;
5567
0
            };
5568
5569
            // If the overview accommodates chunking, do so and recurse
5570
            // to avoid generating full size temporary files
5571
0
            if (!bOverflowFullResXChunkYChunkQueried &&
5572
0
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5573
0
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5574
0
            {
5575
                // Create a VRT with the smaller chunk to do the scaling
5576
0
                auto poVRTDS =
5577
0
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5578
5579
0
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
5580
0
                std::vector<GDALRasterBand *> apoDstBand(nBands);
5581
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5582
0
                {
5583
0
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5584
0
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5585
0
                }
5586
5587
                // Use a flag to avoid reading from the overview being built
5588
0
                GDALRasterIOExtraArg sExtraArg;
5589
0
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5590
0
                if (iSrcOverview == -1)
5591
0
                    sExtraArg.bUseOnlyThisScale = true;
5592
5593
                // A single band buffer for data transfer to the overview
5594
0
                std::vector<GByte> abyChunk;
5595
0
                try
5596
0
                {
5597
0
                    abyChunk.resize(nChunkSize);
5598
0
                }
5599
0
                catch (const std::exception &)
5600
0
                {
5601
0
                    CPLError(CE_Failure, CPLE_OutOfMemory,
5602
0
                             "Out of memory allocating temporary buffer");
5603
0
                    return CE_Failure;
5604
0
                }
5605
5606
                // Loop over output height, in chunks
5607
0
                for (int nDstYOff = nDstYOffStart;
5608
0
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
5609
0
                     /* */)
5610
0
                {
5611
0
                    const int nDstYCount =
5612
0
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5613
                    // Loop over output width, in output chunks
5614
0
                    for (int nDstXOff = nDstXOffStart;
5615
0
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
5616
0
                         /* */)
5617
0
                    {
5618
0
                        const int nDstXCount =
5619
0
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5620
                        // Read and transfer the chunk to the overview
5621
0
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
5622
0
                             ++iBand)
5623
0
                        {
5624
0
                            eErr = apoVRTBand[iBand]->RasterIO(
5625
0
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
5626
0
                                nDstYCount, abyChunk.data(), nDstXCount,
5627
0
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
5628
0
                            if (eErr == CE_None)
5629
0
                            {
5630
0
                                eErr = apoDstBand[iBand]->RasterIO(
5631
0
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
5632
0
                                    nDstYCount, abyChunk.data(), nDstXCount,
5633
0
                                    nDstYCount, eDataType, 0, 0, nullptr);
5634
0
                            }
5635
0
                        }
5636
5637
0
                        dfCurPixelCount +=
5638
0
                            static_cast<double>(nDstXCount) * nDstYCount;
5639
5640
0
                        nDstXOff += nDstXCount;
5641
0
                    }  // width
5642
5643
0
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
5644
0
                                     nullptr, pProgressData))
5645
0
                    {
5646
0
                        CPLError(CE_Failure, CPLE_UserInterrupt,
5647
0
                                 "User terminated");
5648
0
                        eErr = CE_Failure;
5649
0
                    }
5650
5651
0
                    nDstYOff += nDstYCount;
5652
0
                }  // height
5653
5654
0
                if (CE_None != eErr)
5655
0
                {
5656
0
                    CPLError(CE_Failure, CPLE_AppDefined,
5657
0
                             "Error while writing overview");
5658
0
                    return CE_Failure;
5659
0
                }
5660
5661
0
                pfnProgress(1.0, nullptr, pProgressData);
5662
                // Flush the overviews we just generated
5663
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5664
0
                    apoDstBand[iBand]->FlushCache(false);
5665
5666
0
                continue;  // Next overview
5667
0
            }              // chunking via temporary dataset
5668
5669
0
            std::unique_ptr<GDALDataset> poTmpDS;
5670
            // Config option mostly/only for autotest purposes
5671
0
            const char *pszGDAL_OVR_TEMP_DRIVER =
5672
0
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5673
0
            if ((!bTmpDSMemRequirementOverflow &&
5674
0
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
5675
0
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5676
0
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5677
0
            {
5678
0
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
5679
0
                if (!poTmpDrv)
5680
0
                {
5681
0
                    eErr = CE_Failure;
5682
0
                    break;
5683
0
                }
5684
0
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5685
0
                                               nDstTotalHeight, nBands,
5686
0
                                               eDataType, nullptr));
5687
0
            }
5688
0
            else
5689
0
            {
5690
                // Create a temporary file for the overview
5691
0
                auto poTmpDrv =
5692
0
                    GetGDALDriverManager()->GetDriverByName("GTiff");
5693
0
                if (!poTmpDrv)
5694
0
                {
5695
0
                    eErr = CE_Failure;
5696
0
                    break;
5697
0
                }
5698
0
                std::string osTmpFilename;
5699
0
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5700
0
                if (poDstDS)
5701
0
                {
5702
0
                    osTmpFilename = poDstDS->GetDescription();
5703
0
                    VSIStatBufL sStatBuf;
5704
0
                    if (!osTmpFilename.empty() &&
5705
0
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5706
0
                        osTmpFilename += "_tmp_ovr.tif";
5707
0
                }
5708
0
                if (osTmpFilename.empty())
5709
0
                {
5710
0
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5711
0
                    osTmpFilename += ".tif";
5712
0
                }
5713
0
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
5714
0
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5715
0
                CPLStringList aosCO;
5716
0
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
5717
0
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
5718
0
                {
5719
0
                    aosCO.SetNameValue("TILED", "YES");
5720
0
                    aosCO.SetNameValue("BLOCKXSIZE",
5721
0
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
5722
0
                    aosCO.SetNameValue("BLOCKYSIZE",
5723
0
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
5724
0
                }
5725
0
                if (const char *pszCOList =
5726
0
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
5727
0
                {
5728
0
                    aosCO.SetNameValue(
5729
0
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
5730
0
                }
5731
0
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
5732
0
                                               nDstHeight, nBands, eDataType,
5733
0
                                               aosCO.List()));
5734
0
                if (poTmpDS)
5735
0
                {
5736
0
                    poTmpDS->MarkSuppressOnClose();
5737
0
                    VSIUnlink(osTmpFilename.c_str());
5738
0
                }
5739
0
            }
5740
0
            if (!poTmpDS)
5741
0
            {
5742
0
                eErr = CE_Failure;
5743
0
                break;
5744
0
            }
5745
5746
            // Create a full size VRT to do the resampling without edge effects
5747
0
            auto poVRTDS =
5748
0
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5749
5750
            // Allocate a band buffer with the overview chunk size
5751
0
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5752
0
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5753
0
                                    nDstChunkYSize));
5754
0
            if (pDstBuffer == nullptr)
5755
0
            {
5756
0
                eErr = CE_Failure;
5757
0
                break;
5758
0
            }
5759
5760
            // Use a flag to avoid reading the overview being built
5761
0
            GDALRasterIOExtraArg sExtraArg;
5762
0
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5763
0
            if (iSrcOverview == -1)
5764
0
                sExtraArg.bUseOnlyThisScale = true;
5765
5766
            // Scale and copy data from the VRT to the temp file
5767
0
            for (int nDstYOff = nDstYOffStart;
5768
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5769
0
                 /* */)
5770
0
            {
5771
0
                const int nDstYCount =
5772
0
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
5773
0
                for (int nDstXOff = nDstXOffStart;
5774
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5775
0
                     /* */)
5776
0
                {
5777
0
                    const int nDstXCount =
5778
0
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
5779
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5780
0
                         ++iBand)
5781
0
                    {
5782
0
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
5783
0
                        eErr = poSrcBand->RasterIO(
5784
0
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5785
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5786
0
                            eWrkDataType, 0, 0, &sExtraArg);
5787
0
                        if (eErr == CE_None)
5788
0
                        {
5789
                            // Write to the temporary dataset, shifted
5790
0
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
5791
0
                            eErr = poOvrBand->RasterIO(
5792
0
                                GF_Write, nDstXOff - nDstXOffStart,
5793
0
                                nDstYOff - nDstYOffStart, nDstXCount,
5794
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5795
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5796
0
                        }
5797
0
                    }
5798
0
                    nDstXOff += nDstXCount;
5799
0
                }
5800
0
                nDstYOff += nDstYCount;
5801
0
            }
5802
5803
            // Copy from the temporary to the overview
5804
0
            for (int nDstYOff = nDstYOffStart;
5805
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5806
0
                 /* */)
5807
0
            {
5808
0
                const int nDstYCount =
5809
0
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5810
0
                for (int nDstXOff = nDstXOffStart;
5811
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5812
0
                     /* */)
5813
0
                {
5814
0
                    const int nDstXCount =
5815
0
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5816
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5817
0
                         ++iBand)
5818
0
                    {
5819
0
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
5820
0
                        eErr = poSrcBand->RasterIO(
5821
0
                            GF_Read, nDstXOff - nDstXOffStart,
5822
0
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5823
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5824
0
                            eWrkDataType, 0, 0, nullptr);
5825
0
                        if (eErr == CE_None)
5826
0
                        {
5827
                            // Write to the destination overview bands
5828
0
                            auto poOvrBand =
5829
0
                                papapoOverviewBands[iBand][iOverview];
5830
0
                            eErr = poOvrBand->RasterIO(
5831
0
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
5832
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5833
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5834
0
                        }
5835
0
                    }
5836
0
                    nDstXOff += nDstXCount;
5837
0
                }
5838
0
                nDstYOff += nDstYCount;
5839
0
            }
5840
5841
0
            if (eErr != CE_None)
5842
0
            {
5843
0
                CPLError(CE_Failure, CPLE_AppDefined,
5844
0
                         "Failed to write overview %d", iOverview);
5845
0
                return eErr;
5846
0
            }
5847
5848
            // Flush the data to overviews.
5849
0
            for (int iBand = 0; iBand < nBands; ++iBand)
5850
0
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5851
5852
0
            continue;
5853
0
        }
5854
5855
        // Structure describing a resampling job
5856
0
        struct OvrJob
5857
0
        {
5858
            // Buffers to free when job is finished
5859
0
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5860
0
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5861
0
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5862
5863
0
            GDALRasterBand *poDstBand = nullptr;
5864
5865
            // Input parameters of pfnResampleFn
5866
0
            GDALResampleFunction pfnResampleFn = nullptr;
5867
0
            GDALOverviewResampleArgs args{};
5868
0
            const void *pChunk = nullptr;
5869
5870
            // Output values of resampling function
5871
0
            CPLErr eErr = CE_Failure;
5872
0
            void *pDstBuffer = nullptr;
5873
0
            GDALDataType eDstBufferDataType = GDT_Unknown;
5874
5875
0
            void NotifyFinished()
5876
0
            {
5877
0
                std::lock_guard guard(mutex);
5878
0
                bFinished = true;
5879
0
                cv.notify_one();
5880
0
            }
5881
5882
0
            bool IsFinished()
5883
0
            {
5884
0
                std::lock_guard guard(mutex);
5885
0
                return bFinished;
5886
0
            }
5887
5888
0
            void WaitFinished()
5889
0
            {
5890
0
                std::unique_lock oGuard(mutex);
5891
0
                while (!bFinished)
5892
0
                {
5893
0
                    cv.wait(oGuard);
5894
0
                }
5895
0
            }
5896
5897
0
          private:
5898
            // Synchronization
5899
0
            bool bFinished = false;
5900
0
            std::mutex mutex{};
5901
0
            std::condition_variable cv{};
5902
0
        };
5903
5904
        // Thread function to resample
5905
0
        const auto JobResampleFunc = [](void *pData)
5906
0
        {
5907
0
            OvrJob *poJob = static_cast<OvrJob *>(pData);
5908
5909
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5910
0
                                               &(poJob->pDstBuffer),
5911
0
                                               &(poJob->eDstBufferDataType));
5912
5913
0
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5914
5915
0
            poJob->NotifyFinished();
5916
0
        };
5917
5918
        // Function to write resample data to target band
5919
0
        const auto WriteJobData = [](const OvrJob *poJob)
5920
0
        {
5921
0
            return poJob->poDstBand->RasterIO(
5922
0
                GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5923
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5924
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5925
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5926
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5927
0
                poJob->eDstBufferDataType, 0, 0, nullptr);
5928
0
        };
5929
5930
        // Wait for completion of oldest job and serialize it
5931
0
        const auto WaitAndFinalizeOldestJob =
5932
0
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5933
0
        {
5934
0
            auto poOldestJob = jobList.front().get();
5935
0
            poOldestJob->WaitFinished();
5936
0
            CPLErr l_eErr = poOldestJob->eErr;
5937
0
            if (l_eErr == CE_None)
5938
0
            {
5939
0
                l_eErr = WriteJobData(poOldestJob);
5940
0
            }
5941
5942
0
            jobList.pop_front();
5943
0
            return l_eErr;
5944
0
        };
5945
5946
        // Queue of jobs
5947
0
        std::list<std::unique_ptr<OvrJob>> jobList;
5948
5949
0
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
5950
0
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5951
0
            apabyChunkNoDataMask(nBands);
5952
5953
        // Iterate on destination overview, block by block.
5954
0
        for (int nDstYOff = nDstYOffStart;
5955
0
             nDstYOff < nDstYOffEnd && eErr == CE_None;
5956
0
             nDstYOff += nDstChunkYSize)
5957
0
        {
5958
0
            int nDstYCount;
5959
0
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5960
0
                nDstYCount = nDstChunkYSize;
5961
0
            else
5962
0
                nDstYCount = nDstYOffEnd - nDstYOff;
5963
5964
0
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5965
0
            int nChunkYOff2 = static_cast<int>(
5966
0
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5967
0
            if (nChunkYOff2 > nSrcHeight ||
5968
0
                nDstYOff + nDstYCount == nDstTotalHeight)
5969
0
                nChunkYOff2 = nSrcHeight;
5970
0
            int nYCount = nChunkYOff2 - nChunkYOff;
5971
0
            CPLAssert(nYCount <= nFullResYChunk);
5972
5973
0
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5974
0
            int nChunkYSizeQueried =
5975
0
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5976
0
            if (nChunkYOffQueried < 0)
5977
0
            {
5978
0
                nChunkYSizeQueried += nChunkYOffQueried;
5979
0
                nChunkYOffQueried = 0;
5980
0
            }
5981
0
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5982
0
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5983
0
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5984
5985
0
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
5986
0
                             nullptr, pProgressData))
5987
0
            {
5988
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5989
0
                eErr = CE_Failure;
5990
0
            }
5991
5992
            // Iterate on destination overview, block by block.
5993
0
            for (int nDstXOff = nDstXOffStart;
5994
0
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
5995
0
                 nDstXOff += nDstChunkXSize)
5996
0
            {
5997
0
                int nDstXCount = 0;
5998
0
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5999
0
                    nDstXCount = nDstChunkXSize;
6000
0
                else
6001
0
                    nDstXCount = nDstXOffEnd - nDstXOff;
6002
6003
0
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
6004
6005
0
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
6006
0
                int nChunkXOff2 = static_cast<int>(
6007
0
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
6008
0
                if (nChunkXOff2 > nSrcWidth ||
6009
0
                    nDstXOff + nDstXCount == nDstTotalWidth)
6010
0
                    nChunkXOff2 = nSrcWidth;
6011
0
                const int nXCount = nChunkXOff2 - nChunkXOff;
6012
0
                CPLAssert(nXCount <= nFullResXChunk);
6013
6014
0
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6015
0
                int nChunkXSizeQueried =
6016
0
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6017
0
                if (nChunkXOffQueried < 0)
6018
0
                {
6019
0
                    nChunkXSizeQueried += nChunkXOffQueried;
6020
0
                    nChunkXOffQueried = 0;
6021
0
                }
6022
0
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6023
0
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6024
0
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6025
#if DEBUG_VERBOSE
6026
                CPLDebug("GDAL",
6027
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6028
                         nChunkXOffQueried, nChunkYOffQueried,
6029
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6030
                         nDstYOff, nDstXCount, nDstYCount);
6031
#endif
6032
6033
                // Avoid accumulating too many tasks and exhaust RAM
6034
6035
                // Try to complete already finished jobs
6036
0
                while (eErr == CE_None && !jobList.empty())
6037
0
                {
6038
0
                    auto poOldestJob = jobList.front().get();
6039
0
                    if (!poOldestJob->IsFinished())
6040
0
                        break;
6041
0
                    eErr = poOldestJob->eErr;
6042
0
                    if (eErr == CE_None)
6043
0
                    {
6044
0
                        eErr = WriteJobData(poOldestJob);
6045
0
                    }
6046
6047
0
                    jobList.pop_front();
6048
0
                }
6049
6050
                // And in case we have saturated the number of threads,
6051
                // wait for completion of tasks to go below the threshold.
6052
0
                while (eErr == CE_None &&
6053
0
                       jobList.size() >= static_cast<size_t>(nThreads))
6054
0
                {
6055
0
                    eErr = WaitAndFinalizeOldestJob(jobList);
6056
0
                }
6057
6058
                // Read the source buffers for all the bands.
6059
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6060
0
                {
6061
                    // (Re)allocate buffers if needed
6062
0
                    if (apaChunk[iBand] == nullptr)
6063
0
                    {
6064
0
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6065
0
                            nFullResXChunkQueried, nFullResYChunkQueried,
6066
0
                            nWrkDataTypeSize));
6067
0
                        if (apaChunk[iBand] == nullptr)
6068
0
                        {
6069
0
                            eErr = CE_Failure;
6070
0
                        }
6071
0
                    }
6072
0
                    if (bUseNoDataMask &&
6073
0
                        apabyChunkNoDataMask[iBand] == nullptr)
6074
0
                    {
6075
0
                        apabyChunkNoDataMask[iBand].reset(
6076
0
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6077
0
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6078
0
                        if (apabyChunkNoDataMask[iBand] == nullptr)
6079
0
                        {
6080
0
                            eErr = CE_Failure;
6081
0
                        }
6082
0
                    }
6083
6084
0
                    if (eErr == CE_None)
6085
0
                    {
6086
0
                        GDALRasterBand *poSrcBand = nullptr;
6087
0
                        if (iSrcOverview == -1)
6088
0
                            poSrcBand = papoSrcBands[iBand];
6089
0
                        else
6090
0
                            poSrcBand =
6091
0
                                papapoOverviewBands[iBand][iSrcOverview];
6092
0
                        eErr = poSrcBand->RasterIO(
6093
0
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6094
0
                            nChunkXSizeQueried, nChunkYSizeQueried,
6095
0
                            apaChunk[iBand].get(), nChunkXSizeQueried,
6096
0
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6097
6098
0
                        if (bUseNoDataMask && eErr == CE_None)
6099
0
                        {
6100
0
                            auto poMaskBand = poSrcBand->IsMaskBand()
6101
0
                                                  ? poSrcBand
6102
0
                                                  : poSrcBand->GetMaskBand();
6103
0
                            eErr = poMaskBand->RasterIO(
6104
0
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6105
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6106
0
                                apabyChunkNoDataMask[iBand].get(),
6107
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6108
0
                                GDT_Byte, 0, 0, nullptr);
6109
0
                        }
6110
0
                    }
6111
0
                }
6112
6113
                // Compute the resulting overview block.
6114
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6115
0
                {
6116
0
                    auto poJob = std::make_unique<OvrJob>();
6117
0
                    poJob->pfnResampleFn = pfnResampleFn;
6118
0
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6119
0
                    poJob->args.eOvrDataType =
6120
0
                        poJob->poDstBand->GetRasterDataType();
6121
0
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6122
0
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6123
0
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6124
0
                        "NBITS", "IMAGE_STRUCTURE");
6125
0
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6126
0
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6127
0
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6128
0
                    poJob->args.eWrkDataType = eWrkDataType;
6129
0
                    poJob->pChunk = apaChunk[iBand].get();
6130
0
                    poJob->args.pabyChunkNodataMask =
6131
0
                        apabyChunkNoDataMask[iBand].get();
6132
0
                    poJob->args.nChunkXOff = nChunkXOffQueried;
6133
0
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
6134
0
                    poJob->args.nChunkYOff = nChunkYOffQueried;
6135
0
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
6136
0
                    poJob->args.nDstXOff = nDstXOff;
6137
0
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6138
0
                    poJob->args.nDstYOff = nDstYOff;
6139
0
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6140
0
                    poJob->args.pszResampling = pszResampling;
6141
0
                    poJob->args.bHasNoData = abHasNoData[iBand];
6142
0
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6143
0
                    poJob->args.eSrcDataType = eDataType;
6144
0
                    poJob->args.bPropagateNoData = bPropagateNoData;
6145
6146
0
                    if (poJobQueue)
6147
0
                    {
6148
0
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6149
0
                            apabyChunkNoDataMask[iBand].release()));
6150
6151
0
                        poJob->oSrcBufferHolder.reset(
6152
0
                            new PointerHolder(apaChunk[iBand].release()));
6153
6154
0
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6155
0
                        jobList.emplace_back(std::move(poJob));
6156
0
                    }
6157
0
                    else
6158
0
                    {
6159
0
                        JobResampleFunc(poJob.get());
6160
0
                        eErr = poJob->eErr;
6161
0
                        if (eErr == CE_None)
6162
0
                        {
6163
0
                            eErr = WriteJobData(poJob.get());
6164
0
                        }
6165
0
                    }
6166
0
                }
6167
0
            }
6168
0
        }
6169
6170
        // Wait for all pending jobs to complete
6171
0
        while (!jobList.empty())
6172
0
        {
6173
0
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6174
0
            if (l_eErr != CE_None && eErr == CE_None)
6175
0
                eErr = l_eErr;
6176
0
        }
6177
6178
        // Flush the data to overviews.
6179
0
        for (int iBand = 0; iBand < nBands; ++iBand)
6180
0
        {
6181
0
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6182
0
                CE_None)
6183
0
                eErr = CE_Failure;
6184
0
        }
6185
0
    }
6186
6187
0
    if (eErr == CE_None)
6188
0
        pfnProgress(1.0, nullptr, pProgressData);
6189
6190
0
    return eErr;
6191
0
}
6192
6193
/************************************************************************/
6194
/*            GDALRegenerateOverviewsMultiBand()                        */
6195
/************************************************************************/
6196
6197
/**
6198
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6199
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6200
 *
6201
 * This function will generate one or more overview images from a base
6202
 * image using the requested downsampling algorithm.  Its primary use
6203
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6204
 * can also be used to generate downsampled images in one file from another
6205
 * outside the overview architecture.
6206
 *
6207
 * The output bands need to exist in advance and share the same characteristics
6208
 * (type, dimensions)
6209
 *
6210
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6211
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6212
 *
6213
 * It does not support color tables or complex data types.
6214
 *
6215
 * The pseudo-algorithm used by the function is :
6216
 *    for each overview
6217
 *       iterate on lines of the source by a step of deltay
6218
 *           iterate on columns of the source  by a step of deltax
6219
 *               read the source data of size deltax * deltay for all the bands
6220
 *               generate the corresponding overview block for all the bands
6221
 *
6222
 * This function will honour properly NODATA_VALUES tuples (special dataset
6223
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6224
 * considered as the nodata value and not each value of the triplet
6225
 * independently per band.
6226
 *
6227
 * The GDAL_NUM_THREADS configuration option can be set
6228
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
6229
 * overview computation.
6230
 *
6231
 * @param apoSrcBands the list of source bands to downsample
6232
 * @param aapoOverviewBands bidimension array of bands. First dimension is
6233
 *                          indexed by bands. Second dimension is indexed by
6234
 *                          overview levels. All aapoOverviewBands[i] arrays
6235
 *                          must have the same size (i.e. same number of
6236
 *                          overviews)
6237
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6238
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6239
 * @param pfnProgress progress report function.
6240
 * @param pProgressData progress function callback data.
6241
 * @param papszOptions NULL terminated list of options as
6242
 *                     key=value pairs, or NULL
6243
 *                     The XOFF, YOFF, XSIZE and YSIZE
6244
 *                     options can be specified to express that overviews should
6245
 *                     be regenerated only in the specified subset of the source
6246
 *                     dataset.
6247
 * @return CE_None on success or CE_Failure on failure.
6248
 * @since 3.10
6249
 */
6250
6251
CPLErr GDALRegenerateOverviewsMultiBand(
6252
    const std::vector<GDALRasterBand *> &apoSrcBands,
6253
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
6254
    const char *pszResampling, GDALProgressFunc pfnProgress,
6255
    void *pProgressData, CSLConstList papszOptions)
6256
0
{
6257
0
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
6258
0
    for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
6259
0
    {
6260
0
        CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
6261
0
    }
6262
6263
0
    if (aapoOverviewBands.empty())
6264
0
        return CE_None;
6265
6266
0
    std::vector<GDALRasterBand **> apapoOverviewBands;
6267
0
    for (auto &apoOverviewBands : aapoOverviewBands)
6268
0
    {
6269
0
        auto papoOverviewBands = static_cast<GDALRasterBand **>(
6270
0
            CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
6271
0
        for (size_t i = 0; i < apoOverviewBands.size(); ++i)
6272
0
        {
6273
0
            papoOverviewBands[i] = apoOverviewBands[i];
6274
0
        }
6275
0
        apapoOverviewBands.push_back(papoOverviewBands);
6276
0
    }
6277
0
    const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
6278
0
        static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
6279
0
        static_cast<int>(aapoOverviewBands[0].size()),
6280
0
        apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
6281
0
        papszOptions);
6282
0
    for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
6283
0
        CPLFree(papoOverviewBands);
6284
0
    return eErr;
6285
0
}
6286
6287
/************************************************************************/
6288
/*                        GDALComputeBandStats()                        */
6289
/************************************************************************/
6290
6291
/** Undocumented
6292
 * @param hSrcBand undocumented.
6293
 * @param nSampleStep Step between scanlines used to compute statistics.
6294
 *                    When nSampleStep is equal to 1, all scanlines will
6295
 *                    be processed.
6296
 * @param pdfMean undocumented.
6297
 * @param pdfStdDev undocumented.
6298
 * @param pfnProgress undocumented.
6299
 * @param pProgressData undocumented.
6300
 * @return undocumented
6301
 */
6302
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6303
                                        int nSampleStep, double *pdfMean,
6304
                                        double *pdfStdDev,
6305
                                        GDALProgressFunc pfnProgress,
6306
                                        void *pProgressData)
6307
6308
0
{
6309
0
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6310
6311
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6312
6313
0
    if (pfnProgress == nullptr)
6314
0
        pfnProgress = GDALDummyProgress;
6315
6316
0
    const int nWidth = poSrcBand->GetXSize();
6317
0
    const int nHeight = poSrcBand->GetYSize();
6318
6319
0
    if (nSampleStep >= nHeight || nSampleStep < 1)
6320
0
        nSampleStep = 1;
6321
6322
0
    GDALDataType eWrkType = GDT_Unknown;
6323
0
    float *pafData = nullptr;
6324
0
    GDALDataType eType = poSrcBand->GetRasterDataType();
6325
0
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6326
0
    if (bComplex)
6327
0
    {
6328
0
        pafData = static_cast<float *>(
6329
0
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6330
0
        eWrkType = GDT_CFloat32;
6331
0
    }
6332
0
    else
6333
0
    {
6334
0
        pafData =
6335
0
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6336
0
        eWrkType = GDT_Float32;
6337
0
    }
6338
6339
0
    if (nWidth == 0 || pafData == nullptr)
6340
0
    {
6341
0
        VSIFree(pafData);
6342
0
        return CE_Failure;
6343
0
    }
6344
6345
    /* -------------------------------------------------------------------- */
6346
    /*      Loop over all sample lines.                                     */
6347
    /* -------------------------------------------------------------------- */
6348
0
    double dfSum = 0.0;
6349
0
    double dfSum2 = 0.0;
6350
0
    int iLine = 0;
6351
0
    GIntBig nSamples = 0;
6352
6353
0
    do
6354
0
    {
6355
0
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6356
0
                         pProgressData))
6357
0
        {
6358
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6359
0
            CPLFree(pafData);
6360
0
            return CE_Failure;
6361
0
        }
6362
6363
0
        const CPLErr eErr =
6364
0
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6365
0
                                1, eWrkType, 0, 0, nullptr);
6366
0
        if (eErr != CE_None)
6367
0
        {
6368
0
            CPLFree(pafData);
6369
0
            return eErr;
6370
0
        }
6371
6372
0
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6373
0
        {
6374
0
            float fValue = 0.0f;
6375
6376
0
            if (bComplex)
6377
0
            {
6378
                // Compute the magnitude of the complex value.
6379
0
                fValue =
6380
0
                    std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6381
0
            }
6382
0
            else
6383
0
            {
6384
0
                fValue = pafData[iPixel];
6385
0
            }
6386
6387
0
            dfSum += fValue;
6388
0
            dfSum2 += static_cast<double>(fValue) * fValue;
6389
0
        }
6390
6391
0
        nSamples += nWidth;
6392
0
        iLine += nSampleStep;
6393
0
    } while (iLine < nHeight);
6394
6395
0
    if (!pfnProgress(1.0, nullptr, pProgressData))
6396
0
    {
6397
0
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6398
0
        CPLFree(pafData);
6399
0
        return CE_Failure;
6400
0
    }
6401
6402
    /* -------------------------------------------------------------------- */
6403
    /*      Produce the result values.                                      */
6404
    /* -------------------------------------------------------------------- */
6405
0
    if (pdfMean != nullptr)
6406
0
        *pdfMean = dfSum / nSamples;
6407
6408
0
    if (pdfStdDev != nullptr)
6409
0
    {
6410
0
        const double dfMean = dfSum / nSamples;
6411
6412
0
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6413
0
    }
6414
6415
0
    CPLFree(pafData);
6416
6417
0
    return CE_None;
6418
0
}
6419
6420
/************************************************************************/
6421
/*                  GDALOverviewMagnitudeCorrection()                   */
6422
/*                                                                      */
6423
/*      Correct the mean and standard deviation of the overviews of     */
6424
/*      the given band to match the base layer approximately.           */
6425
/************************************************************************/
6426
6427
/** Undocumented
6428
 * @param hBaseBand undocumented.
6429
 * @param nOverviewCount undocumented.
6430
 * @param pahOverviews undocumented.
6431
 * @param pfnProgress undocumented.
6432
 * @param pProgressData undocumented.
6433
 * @return undocumented
6434
 */
6435
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6436
                                       int nOverviewCount,
6437
                                       GDALRasterBandH *pahOverviews,
6438
                                       GDALProgressFunc pfnProgress,
6439
                                       void *pProgressData)
6440
6441
0
{
6442
0
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6443
6444
    /* -------------------------------------------------------------------- */
6445
    /*      Compute mean/stddev for source raster.                          */
6446
    /* -------------------------------------------------------------------- */
6447
0
    double dfOrigMean = 0.0;
6448
0
    double dfOrigStdDev = 0.0;
6449
0
    {
6450
0
        const CPLErr eErr =
6451
0
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6452
0
                                 pfnProgress, pProgressData);
6453
6454
0
        if (eErr != CE_None)
6455
0
            return eErr;
6456
0
    }
6457
6458
    /* -------------------------------------------------------------------- */
6459
    /*      Loop on overview bands.                                         */
6460
    /* -------------------------------------------------------------------- */
6461
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6462
0
    {
6463
0
        GDALRasterBand *poOverview =
6464
0
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6465
0
        double dfOverviewMean, dfOverviewStdDev;
6466
6467
0
        const CPLErr eErr =
6468
0
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6469
0
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6470
6471
0
        if (eErr != CE_None)
6472
0
            return eErr;
6473
6474
0
        double dfGain = 1.0;
6475
0
        if (dfOrigStdDev >= 0.0001)
6476
0
            dfGain = dfOrigStdDev / dfOverviewStdDev;
6477
6478
        /* --------------------------------------------------------------------
6479
         */
6480
        /*      Apply gain and offset. */
6481
        /* --------------------------------------------------------------------
6482
         */
6483
0
        const int nWidth = poOverview->GetXSize();
6484
0
        const int nHeight = poOverview->GetYSize();
6485
6486
0
        GDALDataType eWrkType = GDT_Unknown;
6487
0
        float *pafData = nullptr;
6488
0
        const GDALDataType eType = poOverview->GetRasterDataType();
6489
0
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6490
0
        if (bComplex)
6491
0
        {
6492
0
            pafData = static_cast<float *>(
6493
0
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6494
0
            eWrkType = GDT_CFloat32;
6495
0
        }
6496
0
        else
6497
0
        {
6498
0
            pafData = static_cast<float *>(
6499
0
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6500
0
            eWrkType = GDT_Float32;
6501
0
        }
6502
6503
0
        if (pafData == nullptr)
6504
0
        {
6505
0
            return CE_Failure;
6506
0
        }
6507
6508
0
        for (int iLine = 0; iLine < nHeight; ++iLine)
6509
0
        {
6510
0
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6511
0
                             pProgressData))
6512
0
            {
6513
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6514
0
                CPLFree(pafData);
6515
0
                return CE_Failure;
6516
0
            }
6517
6518
0
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6519
0
                                     nWidth, 1, eWrkType, 0, 0,
6520
0
                                     nullptr) != CE_None)
6521
0
            {
6522
0
                CPLFree(pafData);
6523
0
                return CE_Failure;
6524
0
            }
6525
6526
0
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6527
0
            {
6528
0
                if (bComplex)
6529
0
                {
6530
0
                    pafData[iPixel * 2] *= static_cast<float>(dfGain);
6531
0
                    pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6532
0
                }
6533
0
                else
6534
0
                {
6535
0
                    pafData[iPixel] = static_cast<float>(
6536
0
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
6537
0
                        dfOrigMean);
6538
0
                }
6539
0
            }
6540
6541
0
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6542
0
                                     nWidth, 1, eWrkType, 0, 0,
6543
0
                                     nullptr) != CE_None)
6544
0
            {
6545
0
                CPLFree(pafData);
6546
0
                return CE_Failure;
6547
0
            }
6548
0
        }
6549
6550
0
        if (!pfnProgress(1.0, nullptr, pProgressData))
6551
0
        {
6552
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6553
0
            CPLFree(pafData);
6554
0
            return CE_Failure;
6555
0
        }
6556
6557
0
        CPLFree(pafData);
6558
0
    }
6559
6560
0
    return CE_None;
6561
0
}