Coverage Report

Created: 2025-08-28 06:57

/src/gdal/gcore/overview.cpp
Line
Count
Source (jump to first uncovered line)
1
2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14
15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17
18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21
22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30
31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39
#include "gdal_vrt.h"
40
#include "vrtdataset.h"
41
42
#ifdef USE_NEON_OPTIMIZATIONS
43
#include "include_sse2neon.h"
44
#define USE_SSE2
45
46
#include "gdalsse_priv.h"
47
48
// Restrict to 64bit processors because they are guaranteed to have SSE2,
49
// or if __AVX2__ is defined.
50
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51
#define USE_SSE2
52
53
#include "gdalsse_priv.h"
54
55
#ifdef __SSE3__
56
#include <pmmintrin.h>
57
#endif
58
#ifdef __SSSE3__
59
#include <tmmintrin.h>
60
#endif
61
#ifdef __SSE4_1__
62
#include <smmintrin.h>
63
#endif
64
#ifdef __AVX2__
65
#include <immintrin.h>
66
#endif
67
68
#endif
69
70
// To be included after above USE_SSE2 and include gdalsse_priv.h
71
// to avoid build issue on Windows x86
72
#include "gdal_priv_templates.hpp"
73
74
/************************************************************************/
75
/*                      GDALResampleChunk_Near()                        */
76
/************************************************************************/
77
78
template <class T>
79
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
80
                                      const T *pChunk, T **ppDstBuffer)
81
82
0
{
83
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
84
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
85
0
    const GDALDataType eWrkDataType = args.eWrkDataType;
86
0
    const int nChunkXOff = args.nChunkXOff;
87
0
    const int nChunkXSize = args.nChunkXSize;
88
0
    const int nChunkYOff = args.nChunkYOff;
89
0
    const int nDstXOff = args.nDstXOff;
90
0
    const int nDstXOff2 = args.nDstXOff2;
91
0
    const int nDstYOff = args.nDstYOff;
92
0
    const int nDstYOff2 = args.nDstYOff2;
93
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
94
95
    /* -------------------------------------------------------------------- */
96
    /*      Allocate buffers.                                               */
97
    /* -------------------------------------------------------------------- */
98
0
    *ppDstBuffer = static_cast<T *>(
99
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
100
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
101
0
    if (*ppDstBuffer == nullptr)
102
0
    {
103
0
        return CE_Failure;
104
0
    }
105
0
    T *const pDstBuffer = *ppDstBuffer;
106
107
0
    int *panSrcXOff =
108
0
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
109
110
0
    if (panSrcXOff == nullptr)
111
0
    {
112
0
        return CE_Failure;
113
0
    }
114
115
    /* ==================================================================== */
116
    /*      Precompute inner loop constants.                                */
117
    /* ==================================================================== */
118
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119
0
    {
120
0
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121
0
        if (nSrcXOff < nChunkXOff)
122
0
            nSrcXOff = nChunkXOff;
123
124
0
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125
0
    }
126
127
    /* ==================================================================== */
128
    /*      Loop over destination scanlines.                                */
129
    /* ==================================================================== */
130
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131
0
    {
132
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133
0
        if (nSrcYOff < nChunkYOff)
134
0
            nSrcYOff = nChunkYOff;
135
136
0
        const T *const pSrcScanline =
137
0
            pChunk +
138
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139
0
            nChunkXOff;
140
141
        /* --------------------------------------------------------------------
142
         */
143
        /*      Loop over destination pixels */
144
        /* --------------------------------------------------------------------
145
         */
146
0
        T *pDstScanline =
147
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
148
0
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
149
0
        {
150
0
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
151
0
        }
152
0
    }
153
154
0
    CPLFree(panSrcXOff);
155
156
0
    return CE_None;
157
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>**)
158
159
/** Type-erased entry point for nearest-neighbour chunk resampling.
 *
 * Reports args.eWrkDataType through peDstBufferDataType and forwards to
 * GDALResampleChunk_NearT() instantiated on an unsigned integer (or
 * std::complex<double>) type of the same byte size as the working data type:
 * nearest resampling only moves values around, so only the element size
 * matters, not its numeric interpretation.
 *
 * @return the result of the typed implementation, or CE_Failure for
 *         GDT_Unknown / GDT_TypeCount (after asserting in debug builds).
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    // Forward to the implementation instantiated on the type of the tag
    // value; the tag only selects the element type and is otherwise unused.
    const auto resampleAs = [&args, pChunk, ppDstBuffer](auto elementTag)
    {
        using Element = decltype(elementTag);
        return GDALResampleChunk_NearT(
            args, static_cast<const Element *>(pChunk),
            reinterpret_cast<Element **>(ppDstBuffer));
    };

    switch (eType)
    {
        // 1-byte elements
        case GDT_Byte:
        case GDT_Int8:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            return resampleAs(uint8_t{});

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            return resampleAs(uint16_t{});

        // 4-byte elements
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            return resampleAs(uint32_t{});

        // 8-byte elements
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            return resampleAs(uint64_t{});

        case GDT_CFloat64:
            return resampleAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a resamplable data type.
    CPLAssert(false);
    return CE_Failure;
}
225
226
namespace
227
{
228
229
// Find in the color table the entry whose RGB value is the closest
230
// (using quadratic distance) to the test color, ignoring transparent entries.
231
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
232
                   const GDALColorEntry &test)
233
0
{
234
0
    int nMinDist = std::numeric_limits<int>::max();
235
0
    size_t bestEntry = 0;
236
0
    for (size_t i = 0; i < entries.size(); ++i)
237
0
    {
238
0
        const GDALColorEntry &entry = entries[i];
239
        // Ignore transparent entries
240
0
        if (entry.c4 == 0)
241
0
            continue;
242
243
0
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
244
0
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
245
0
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
246
0
        if (nDist < nMinDist)
247
0
        {
248
0
            nMinDist = nDist;
249
0
            bestEntry = i;
250
0
        }
251
0
    }
252
0
    return static_cast<int>(bestEntry);
253
0
}
254
255
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
256
                                           int &transparentIdx)
257
0
{
258
0
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
259
260
0
    transparentIdx = -1;
261
0
    int i = 0;
262
0
    for (auto &entry : entries)
263
0
    {
264
0
        table.GetColorEntryAsRGB(i, &entry);
265
0
        if (transparentIdx < 0 && entry.c4 == 0)
266
0
            transparentIdx = i;
267
0
        ++i;
268
0
    }
269
0
    return entries;
270
0
}
271
272
}  // unnamed  namespace
273
274
/************************************************************************/
275
/*                             SQUARE()                                 */
276
/************************************************************************/
277
278
// Return val * val. The first operand is converted to Tsquare before the
// multiplication, so a wider Tsquare can be requested to hold the square of
// a narrower T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
Unexecuted instantiation: int SQUARE<int, int>(int)
Unexecuted instantiation: double SQUARE<double, double>(double)
Unexecuted instantiation: unsigned int SQUARE<unsigned int, unsigned int>(unsigned int)
282
283
/************************************************************************/
284
/*                          ComputeIntegerRMS()                         */
285
/************************************************************************/
286
// Compute rms = sqrt(sumSquares / weight) in such a way that it is the
287
// integer that minimizes abs(rms**2 - sumSquares / weight)
288
template <class T, class Twork>
289
inline T ComputeIntegerRMS(double sumSquares, double weight)
290
0
{
291
0
    const double sumDivWeight = sumSquares / weight;
292
0
    T rms = static_cast<T>(sqrt(sumDivWeight));
293
294
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
295
    // Naive version:
296
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
297
0
    if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
298
0
        2 * sumDivWeight)
299
0
        rms += 1;
300
0
    return rms;
301
0
}
Unexecuted instantiation: unsigned char ComputeIntegerRMS<unsigned char, int>(double, double)
Unexecuted instantiation: unsigned short ComputeIntegerRMS<unsigned short, unsigned long>(double, double)
302
303
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
304
0
{
305
0
    CPLAssert(false);
306
0
    return 0;
307
0
}
308
309
// Byte specialization: integer root mean square of 4 values given the sum of
// their squares, rounded so that rms**2 is closest to sumSquares / 4.
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
{
    // It has been verified that, given the rounding correction below, taking
    // sqrt((float)((sumSquares + 1) / 4)) or sqrt((float)sumSquares * 0.25f)
    // gives the same result; the former is used as the quotient is needed
    // again for the correction.
    const int nQuarter = (sumSquares + 1) / 4;

    // Lower candidate: truncated single-precision square root.
    const GByte nFloorRoot =
        static_cast<GByte>(std::sqrt(static_cast<float>(nQuarter)));

    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
    // Naive version:
    //   weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2
    // Optimized version for the integer case and weight == 4:
    const bool bRoundUp =
        static_cast<int>(nFloorRoot) * (nFloorRoot + 1) < nQuarter;

    return static_cast<GByte>(bRoundUp ? nFloorRoot + 1 : nFloorRoot);
}
326
327
template <>
328
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
329
0
{
330
0
    const double sumDivWeight = sumSquares * 0.25;
331
0
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
332
333
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
334
    // Naive version:
335
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
336
    // Optimized version for integer case and weight == 4
337
0
    if (static_cast<GUInt32>(rms) * (rms + 1) <
338
0
        static_cast<GUInt32>(sumDivWeight + 0.25))
339
0
        rms += 1;
340
0
    return rms;
341
0
}
342
343
#ifdef USE_SSE2
344
345
/************************************************************************/
346
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
347
/************************************************************************/
348
349
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
350
#define sse2_packus_epi32 _mm_packus_epi32
351
#else
352
// SSE2-only replacement for _mm_packus_epi32(): narrow the eight signed
// 32-bit lanes of a (low half of the result) and b (high half) to unsigned
// 16-bit lanes with saturation to [0, 65535].
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    const __m128i bias32 = _mm_set1_epi32(-32768);
    const __m128i bias16 = _mm_set1_epi16(-32768);

    // Shift the unsigned target range [0, 65535] onto the signed 16-bit range
    // so that the signed saturating pack clamps at the right bounds...
    const __m128i packedBiased = _mm_packs_epi32(_mm_add_epi32(a, bias32),
                                                 _mm_add_epi32(b, bias32));

    // ...then undo the shift on the 16-bit lanes.
    return _mm_sub_epi16(packedBiased, bias16);
}
362
#endif
363
364
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
365
#define sse2_hadd_epi16 _mm_hadd_epi16
366
#else
367
// SSE2-only stand-in for _mm_hadd_epi16(): sum adjacent pairs of 16-bit lanes
// of a (low half of the result) and b (high half). Lanes are read as
// unsigned; each pair sum is formed on 32 bits and then narrowed with signed
// saturation by _mm_packs_epi32().
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i lowHalfMask = _mm_set1_epi32(0xFFFF);

    // Within each 32-bit lane: (even 16-bit element) + (odd 16-bit element).
    const __m128i pairSumsA =
        _mm_add_epi32(_mm_and_si128(a, lowHalfMask), _mm_srli_epi32(a, 16));
    const __m128i pairSumsB =
        _mm_add_epi32(_mm_and_si128(b, lowHalfMask), _mm_srli_epi32(b, 16));

    // Narrow back to 16 bits and concatenate both halves.
    return _mm_packs_epi32(pairSumsA, pairSumsB);
}
379
#endif
380
381
#ifdef __AVX2__
382
383
#define DEST_ELTS 16
384
#define set1_epi16 _mm256_set1_epi16
385
#define set1_epi32 _mm256_set1_epi32
386
#define setzero _mm256_setzero_si256
387
#define set1_ps _mm256_set1_ps
388
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
389
#define unpacklo_epi8 _mm256_unpacklo_epi8
390
#define unpackhi_epi8 _mm256_unpackhi_epi8
391
#define madd_epi16 _mm256_madd_epi16
392
#define add_epi32 _mm256_add_epi32
393
#define mul_ps _mm256_mul_ps
394
#define cvtepi32_ps _mm256_cvtepi32_ps
395
#define sqrt_ps _mm256_sqrt_ps
396
#define cvttps_epi32 _mm256_cvttps_epi32
397
#define packs_epi32 _mm256_packs_epi32
398
#define packus_epi32 _mm256_packus_epi32
399
#define srli_epi32 _mm256_srli_epi32
400
#define mullo_epi16 _mm256_mullo_epi16
401
#define srli_epi16 _mm256_srli_epi16
402
#define cmpgt_epi16 _mm256_cmpgt_epi16
403
#define add_epi16 _mm256_add_epi16
404
#define sub_epi16 _mm256_sub_epi16
405
#define packus_epi16 _mm256_packus_epi16
406
407
/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
408
/* to get the lower 128-bit bits of what would be a true 256-bit vector register
409
 */
410
411
// Reorder the four 64-bit quarters of x from (q0, q1, q2, q3) to
// (q0, q2, q1, q3), i.e. swap the two middle quarters.
inline __m256i FIXUP_LANES(__m256i x)
{
    const __m256i reordered =
        _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}
415
416
#define store_lo(x, y)                                                         \
417
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
418
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
419
#define storeu_int(x, y)                                                       \
420
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
421
#define hadd_epi16 _mm256_hadd_epi16
422
#else
423
0
#define DEST_ELTS 8
424
0
#define set1_epi16 _mm_set1_epi16
425
0
#define set1_epi32 _mm_set1_epi32
426
0
#define setzero _mm_setzero_si128
427
#define set1_ps _mm_set1_ps
428
0
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
429
0
#define unpacklo_epi8 _mm_unpacklo_epi8
430
0
#define unpackhi_epi8 _mm_unpackhi_epi8
431
0
#define madd_epi16 _mm_madd_epi16
432
0
#define add_epi32 _mm_add_epi32
433
#define mul_ps _mm_mul_ps
434
0
#define cvtepi32_ps _mm_cvtepi32_ps
435
0
#define sqrt_ps _mm_sqrt_ps
436
0
#define cvttps_epi32 _mm_cvttps_epi32
437
0
#define packs_epi32 _mm_packs_epi32
438
0
#define packus_epi32 sse2_packus_epi32
439
0
#define srli_epi32 _mm_srli_epi32
440
0
#define mullo_epi16 _mm_mullo_epi16
441
0
#define srli_epi16 _mm_srli_epi16
442
0
#define cmpgt_epi16 _mm_cmpgt_epi16
443
0
#define add_epi16 _mm_add_epi16
444
0
#define sub_epi16 _mm_sub_epi16
445
0
#define packus_epi16 _mm_packus_epi16
446
0
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
447
0
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
448
0
#define hadd_epi16 sse2_hadd_epi16
449
#endif
450
451
template <class T>
452
static int
453
#if defined(__GNUC__)
454
    __attribute__((noinline))
455
#endif
456
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
457
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
458
                                T *CPL_RESTRICT pDstScanline)
459
0
{
460
    // Optimized implementation for RMS on Byte by
461
    // processing by group of 8 output pixels, so as to use
462
    // a single _mm_sqrt_ps() call for 4 output pixels
463
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
464
465
0
    int iDstPixel = 0;
466
0
    const auto one16 = set1_epi16(1);
467
0
    const auto one32 = set1_epi32(1);
468
0
    const auto zero = setzero();
469
0
    const auto minus32768 = set1_epi16(-32768);
470
471
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
472
0
    {
473
        // Load 2 * DEST_ELTS bytes from each line
474
0
        auto firstLine = loadu_int(pSrcScanlineShifted);
475
0
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
476
        // Extend those Bytes as UInt16s
477
0
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
478
0
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
479
0
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
480
0
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
481
482
        // Multiplication of 16 bit values and horizontal
483
        // addition of 32 bit results
484
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
485
0
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
486
0
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
487
0
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
488
0
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
489
490
        // Vertical addition
491
0
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
492
0
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
493
494
0
        const auto sumSquaresPlusOneDiv4Lo =
495
0
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
496
0
        const auto sumSquaresPlusOneDiv4Hi =
497
0
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
498
499
        // Take square root and truncate/floor to int32
500
0
        const auto rmsLo =
501
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
502
0
        const auto rmsHi =
503
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
504
505
        // Merge back low and high registers with each RMS value
506
        // as a 16 bit value.
507
0
        auto rms = packs_epi32(rmsLo, rmsHi);
508
509
        // Round to upper value if it minimizes the
510
        // error |rms^2 - sumSquares/4|
511
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
512
        //    rms += 1;
513
        // which is equivalent to:
514
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
515
        //    rms += 1;
516
        // And both left and right parts fit on 16 (unsigned) bits
517
0
        const auto sumSquaresPlusOneDiv4 =
518
0
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
519
        // cmpgt_epi16 operates on signed int16, but here
520
        // we have unsigned values, so shift them by -32768 before
521
0
        auto mask = cmpgt_epi16(
522
0
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
523
0
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
524
        // The value of the mask will be -1 when the correction needs to be
525
        // applied
526
0
        rms = sub_epi16(rms, mask);
527
528
        // Pack each 16 bit RMS value to 8 bits
529
0
        rms = packus_epi16(rms, rms /* could be anything */);
530
0
        store_lo(&pDstScanline[iDstPixel], rms);
531
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
532
0
    }
533
534
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
535
0
    return iDstPixel;
536
0
}
537
538
/************************************************************************/
539
/*                      AverageByteSSE2OrAVX2()                         */
540
/************************************************************************/
541
542
template <class T>
543
static int
544
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
545
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
546
                      T *CPL_RESTRICT pDstScanline)
547
0
{
548
    // Optimized implementation for average on Byte by
549
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
550
551
0
    const auto zero = setzero();
552
0
    const auto two16 = set1_epi16(2);
553
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
554
555
0
    int iDstPixel = 0;
556
0
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
557
0
         iDstPixel += 2 * DEST_ELTS)
558
0
    {
559
0
        decltype(setzero()) average0;
560
0
        {
561
            // Load 2 * DEST_ELTS bytes from each line
562
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
563
0
            const auto secondLine =
564
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
565
            // Extend those Bytes as UInt16s
566
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
567
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
568
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
569
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
570
571
            // Vertical addition
572
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
573
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
574
575
            // Horizontal addition of adjacent pairs, and recombine low and high
576
            // parts
577
0
            const auto sum = hadd_epi16(sumLo, sumHi);
578
579
            // average = (sum + 2) / 4
580
0
            average0 = srli_epi16(add_epi16(sum, two16), 2);
581
582
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
583
0
        }
584
585
0
        decltype(setzero()) average1;
586
0
        {
587
            // Load 2 * DEST_ELTS bytes from each line
588
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
589
0
            const auto secondLine =
590
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
591
            // Extend those Bytes as UInt16s
592
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
593
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
594
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
595
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
596
597
            // Vertical addition
598
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
599
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
600
601
            // Horizontal addition of adjacent pairs, and recombine low and high
602
            // parts
603
0
            const auto sum = hadd_epi16(sumLo, sumHi);
604
605
            // average = (sum + 2) / 4
606
0
            average1 = srli_epi16(add_epi16(sum, two16), 2);
607
608
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
609
0
        }
610
611
        // Pack each 16 bit average value to 8 bits
612
0
        const auto average = packus_epi16(average0, average1);
613
0
        storeu_int(&pDstScanline[iDstPixel], average);
614
0
    }
615
616
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
617
0
    return iDstPixel;
618
0
}
619
620
/************************************************************************/
621
/*                     QuadraticMeanUInt16SSE2()                        */
622
/************************************************************************/
623
624
#ifdef __SSE3__
625
#define sse2_hadd_pd _mm_hadd_pd
626
#else
627
// SSE2-only stand-in for _mm_hadd_pd(): returns (aLo + aHi, bLo + bHi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    const __m128 aAsPs = _mm_castpd_ps(a);
    const __m128 bAsPs = _mm_castpd_ps(b);

    // (aLo, bLo): low 64 bits of each operand
    const __m128d lows = _mm_castps_pd(_mm_movelh_ps(aAsPs, bAsPs));
    // (aHi, bHi): high 64 bits of each operand
    const __m128d highs = _mm_castps_pd(_mm_movehl_ps(bAsPs, aAsPs));

    return _mm_add_pd(lows, highs);
}
635
#endif
636
637
// Square both double-precision lanes of x.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
641
642
#ifdef __AVX2__

// Square the four double-precision lanes of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Reorder the four 64-bit lanes of x from (d0, d1, d2, d3) to
// (d0, d2, d1, d3), i.e. swap the two middle lanes.
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same reordering applied to a vector of floats, handled as four 64-bit
// units (i.e. pairs of floats move together).
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDoubles = _mm256_castps_pd(x);
    return _mm256_castpd_ps(FIXUP_LANES(asDoubles));
}

#endif
660
661
template <class T>
662
static int
663
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
664
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
665
                        T *CPL_RESTRICT pDstScanline)
666
0
{
667
    // Optimized implementation for RMS on UInt16 by
668
    // processing by group of 4 output pixels.
669
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
670
671
0
    int iDstPixel = 0;
672
0
    const auto zero = _mm_setzero_si128();
673
674
#ifdef __AVX2__
675
    const auto zeroDot25 = _mm256_set1_pd(0.25);
676
    const auto zeroDot5 = _mm256_set1_pd(0.5);
677
678
    // The first four 0's could be anything, as we only take the bottom
679
    // 128 bits.
680
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
681
#else
682
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
683
0
    const auto zeroDot5 = _mm_set1_pd(0.5);
684
0
#endif
685
686
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
687
0
    {
688
        // Load 8 UInt16 from each line
689
0
        const auto firstLine = _mm_loadu_si128(
690
0
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
691
0
        const auto secondLine =
692
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
693
0
                pSrcScanlineShifted + nChunkXSize));
694
695
        // Detect if all of the source values fit in 14 bits.
696
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
697
        // and we can do a much faster implementation.
698
0
        const auto maskTmp =
699
0
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
700
#if defined(__i386__) || defined(_M_IX86)
701
        uint64_t nMaskFitsIn14Bits = 0;
702
        _mm_storel_epi64(
703
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
704
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
705
#else
706
0
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
707
0
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
708
0
#endif
709
0
        if (nMaskFitsIn14Bits == 0)
710
0
        {
711
            // Multiplication of 16 bit values and horizontal
712
            // addition of 32 bit results
713
0
            const auto firstLineHSumSquare =
714
0
                _mm_madd_epi16(firstLine, firstLine);
715
0
            const auto secondLineHSumSquare =
716
0
                _mm_madd_epi16(secondLine, secondLine);
717
            // Vertical addition
718
0
            const auto sumSquares =
719
0
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
720
            // In theory we should take sqrt(sumSquares * 0.25f)
721
            // but given the rounding we do, this is equivalent to
722
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
723
            // sumSquares <= 4 * 16383^2
724
0
            const auto one32 = _mm_set1_epi32(1);
725
0
            const auto sumSquaresPlusOneDiv4 =
726
0
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
727
            // Take square root and truncate/floor to int32
728
0
            auto rms = _mm_cvttps_epi32(
729
0
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
730
731
            // Round to upper value if it minimizes the
732
            // error |rms^2 - sumSquares/4|
733
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
734
            //    rms += 1;
735
            // which is equivalent to:
736
            // if( rms * rms + rms < (sumSquares+1) / 4 )
737
            //    rms += 1;
738
0
            auto mask =
739
0
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
740
0
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
741
0
            rms = _mm_sub_epi32(rms, mask);
742
            // Pack each 32 bit RMS value to 16 bits
743
0
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
744
0
            _mm_storel_epi64(
745
0
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
746
0
            pSrcScanlineShifted += 8;
747
0
            continue;
748
0
        }
749
750
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
751
        // to 32 bit would result in 4 multiplications instead of 8, but
752
        // mullo/mulhi have a worse throughput than mul_pd.
753
754
        // Extend those UInt16s as UInt32s
755
0
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
756
0
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
757
0
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
758
0
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
759
760
#ifdef __AVX2__
761
        // Multiplication of 32 bit values previously converted to 64 bit double
762
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
763
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
764
        const auto secondLineLoDbl =
765
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
766
        const auto secondLineHiDbl =
767
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
768
769
        // Vertical addition of squares
770
        const auto sumSquaresLo =
771
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
772
        const auto sumSquaresHi =
773
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
774
775
        // Horizontal addition of squares
776
        const auto sumSquares =
777
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
778
779
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
780
781
        // Take square root and truncate/floor to int32
782
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
783
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
784
        const auto right = _mm256_sub_pd(
785
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
786
787
        auto mask =
788
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
789
        // Extract 32-bit from each of the 4 64-bit masks
790
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
791
        // _MM_SHUFFLE(2,0,2,0)));
792
        mask = _mm256_permutevar8x32_ps(mask, permutation);
793
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
794
795
        // Apply the correction
796
        rms = _mm_sub_epi32(rms, maskI);
797
798
        // Pack each 32 bit RMS value to 16 bits
799
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
800
#else
801
        // Multiplication of 32 bit values previously converted to 64 bit double
802
0
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
803
0
        const auto firstLineLoHi =
804
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
805
0
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
806
0
        const auto firstLineHiHi =
807
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
808
809
0
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
810
0
        const auto secondLineLoHi =
811
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
812
0
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
813
0
        const auto secondLineHiHi =
814
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
815
816
        // Vertical addition of squares
817
0
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
818
0
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
819
0
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
820
0
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
821
822
        // Horizontal addition of squares
823
0
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
824
0
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
825
826
0
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
827
0
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
828
        // Take square root and truncate/floor to int32
829
0
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
830
0
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
831
832
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
833
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
834
        //     rms += 1;
835
0
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
836
0
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
837
0
        const auto rightLo = _mm_sub_pd(
838
0
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
839
0
        const auto rightHi = _mm_sub_pd(
840
0
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
841
842
0
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
843
0
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
844
        // The value of the mask will be -1 when the correction needs to be
845
        // applied
846
0
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
847
0
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
848
849
0
        auto rms = _mm_castps_si128(
850
0
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
851
        // Apply the correction
852
0
        rms = _mm_sub_epi32(rms, mask);
853
854
        // Pack each 32 bit RMS value to 16 bits
855
0
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
856
0
#endif
857
858
0
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
859
0
                         rms);
860
0
        pSrcScanlineShifted += 8;
861
0
    }
862
863
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
864
0
    return iDstPixel;
865
0
}
866
867
/************************************************************************/
868
/*                         AverageUInt16SSE2()                          */
869
/************************************************************************/
870
871
template <class T>
872
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
873
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
874
                             T *CPL_RESTRICT pDstScanline)
875
0
{
876
    // Optimized implementation for average on UInt16 by
877
    // processing by group of 8 output pixels.
878
879
0
    const auto mask = _mm_set1_epi32(0xFFFF);
880
0
    const auto two = _mm_set1_epi32(2);
881
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
882
883
0
    int iDstPixel = 0;
884
0
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
885
0
    {
886
0
        __m128i averageLow;
887
        // Load 8 UInt16 from each line
888
0
        {
889
0
            const auto firstLine = _mm_loadu_si128(
890
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
891
0
            const auto secondLine =
892
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
893
0
                    pSrcScanlineShifted + nChunkXSize));
894
895
            // Horizontal addition and extension to 32 bit
896
0
            const auto horizAddFirstLine = _mm_add_epi32(
897
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
898
0
            const auto horizAddSecondLine =
899
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
900
0
                              _mm_srli_epi32(secondLine, 16));
901
902
            // Vertical addition and average computation
903
            // average = (sum + 2) >> 2
904
0
            const auto sum = _mm_add_epi32(
905
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
906
0
            averageLow = _mm_srli_epi32(sum, 2);
907
0
        }
908
        // Load 8 UInt16 from each line
909
0
        __m128i averageHigh;
910
0
        {
911
0
            const auto firstLine = _mm_loadu_si128(
912
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
913
0
            const auto secondLine =
914
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
915
0
                    pSrcScanlineShifted + 8 + nChunkXSize));
916
917
            // Horizontal addition and extension to 32 bit
918
0
            const auto horizAddFirstLine = _mm_add_epi32(
919
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
920
0
            const auto horizAddSecondLine =
921
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
922
0
                              _mm_srli_epi32(secondLine, 16));
923
924
            // Vertical addition and average computation
925
            // average = (sum + 2) >> 2
926
0
            const auto sum = _mm_add_epi32(
927
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
928
0
            averageHigh = _mm_srli_epi32(sum, 2);
929
0
        }
930
931
        // Pack each 32 bit average value to 16 bits
932
0
        auto average = sse2_packus_epi32(averageLow, averageHigh);
933
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
934
0
                         average);
935
0
        pSrcScanlineShifted += 16;
936
0
    }
937
938
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
939
0
    return iDstPixel;
940
0
}
941
942
/************************************************************************/
943
/*                      QuadraticMeanFloatSSE2()                        */
944
/************************************************************************/
945
946
#ifdef __SSE3__
// SSE3 provides the horizontal add natively.
#define sse2_hadd_ps _mm_hadd_ps
#else
// SSE2-only replacement for _mm_hadd_ps().
// Returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // (a0, a2, b0, b2)
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // (a1, a3, b1, b3)
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
#endif
956
957
// Width-agnostic aliases used by QuadraticMeanFloatSSE2(): the same source
// is compiled against 256-bit intrinsics (8 floats per register) when AVX2
// is available, and against 128-bit intrinsics (4 floats) otherwise.
#ifdef __AVX2__

#define RMS_FLOAT_ELTS 8

// Constants, loads and stores
#define set1_ps _mm256_set1_ps
#define loadu_ps _mm256_loadu_ps
#define storeu_ps _mm256_storeu_ps

// Arithmetic
#define add_ps _mm256_add_ps
#define hadd_ps _mm256_hadd_ps
#define mul_ps _mm256_mul_ps
#define div_ps _mm256_div_ps
#define sqrt_ps _mm256_sqrt_ps
#define max_ps _mm256_max_ps

// Bitwise logic and comparison
#define and_ps _mm256_and_ps
#define andnot_ps _mm256_andnot_ps
#define or_ps _mm256_or_ps
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)

// Lane rearrangement
#define shuffle_ps _mm256_shuffle_ps
#define unpacklo_ps _mm256_unpacklo_ps
#define unpackhi_ps _mm256_unpackhi_ps

// Element-wise square of 8 floats.
inline __m256 SQUARE_PS(__m256 x)
{
    return _mm256_mul_ps(x, x);
}

#else

#define RMS_FLOAT_ELTS 4

// Constants, loads and stores
#define set1_ps _mm_set1_ps
#define loadu_ps _mm_loadu_ps
#define storeu_ps _mm_storeu_ps

// Arithmetic
#define add_ps _mm_add_ps
#define hadd_ps sse2_hadd_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define sqrt_ps _mm_sqrt_ps
#define max_ps _mm_max_ps

// Bitwise logic and comparison
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define cmpeq_ps _mm_cmpeq_ps

// Lane rearrangement
#define shuffle_ps _mm_shuffle_ps
#define unpacklo_ps _mm_unpacklo_ps
#define unpackhi_ps _mm_unpackhi_ps

// Element-wise square of 4 floats.
inline __m128 SQUARE_PS(__m128 x)
{
    return _mm_mul_ps(x, x);
}

// With a single 128-bit register there is no lane reordering to undo, so
// this overload returns its argument unchanged.
inline __m128 FIXUP_LANES(__m128 x)
{
    return x;
}

#endif
1012
1013
static int
1014
#if defined(__GNUC__)
1015
    __attribute__((noinline))
1016
#endif
1017
    QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1018
                           const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1019
                           float *CPL_RESTRICT pDstScanline)
1020
0
{
1021
    // Optimized implementation for RMS on Float32 by
1022
    // processing by group of RMS_FLOAT_ELTS output pixels.
1023
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1024
1025
0
    int iDstPixel = 0;
1026
0
    const auto minus_zero = set1_ps(-0.0f);
1027
0
    const auto zeroDot25 = set1_ps(0.25f);
1028
0
    const auto one = set1_ps(1.0f);
1029
0
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1030
1031
0
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1032
0
         iDstPixel += RMS_FLOAT_ELTS)
1033
0
    {
1034
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1035
0
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1036
0
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
1037
0
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1038
0
        auto secondLineHi =
1039
0
            loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
1040
1041
        // Take the absolute value
1042
0
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
1043
0
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
1044
0
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
1045
0
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
1046
1047
0
        auto firstLineEven =
1048
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1049
0
        auto firstLineOdd =
1050
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1051
0
        auto secondLineEven =
1052
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1053
0
        auto secondLineOdd =
1054
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1055
1056
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1057
0
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1058
0
                                 max_ps(secondLineEven, secondLineEven));
1059
1060
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1061
        // This step is important to avoid that the square evaluates to infinity
1062
        // for sufficiently big input.
1063
0
        auto invMax = div_ps(one, maxV);
1064
        // Deal with 0 being the maximum to correct division by zero
1065
        // note: comparing to -0 leads to identical results as to comparing with
1066
        // 0
1067
0
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1068
1069
0
        firstLineEven = mul_ps(firstLineEven, invMax);
1070
0
        firstLineOdd = mul_ps(firstLineOdd, invMax);
1071
0
        secondLineEven = mul_ps(secondLineEven, invMax);
1072
0
        secondLineOdd = mul_ps(secondLineOdd, invMax);
1073
1074
        // Compute squares
1075
0
        firstLineEven = SQUARE_PS(firstLineEven);
1076
0
        firstLineOdd = SQUARE_PS(firstLineOdd);
1077
0
        secondLineEven = SQUARE_PS(secondLineEven);
1078
0
        secondLineOdd = SQUARE_PS(secondLineOdd);
1079
1080
0
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1081
0
                                       add_ps(secondLineEven, secondLineOdd));
1082
1083
0
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1084
1085
        // Deal with infinity being the maximum
1086
0
        const auto maskIsInf = cmpeq_ps(maxV, infv);
1087
0
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1088
1089
0
        rms = FIXUP_LANES(rms);
1090
1091
0
        storeu_ps(&pDstScanline[iDstPixel], rms);
1092
0
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1093
0
    }
1094
1095
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1096
0
    return iDstPixel;
1097
0
}
1098
1099
/************************************************************************/
1100
/*                        AverageFloatSSE2()                            */
1101
/************************************************************************/
1102
1103
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1104
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1105
                            float *CPL_RESTRICT pDstScanline)
1106
0
{
1107
    // Optimized implementation for average on Float32 by
1108
    // processing by group of 4 output pixels.
1109
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1110
1111
0
    int iDstPixel = 0;
1112
0
    const auto zeroDot25 = _mm_set1_ps(0.25f);
1113
1114
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1115
0
    {
1116
        // Load 8 Float32 from each line
1117
0
        const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
1118
0
        const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
1119
0
        const auto secondLineLo =
1120
0
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
1121
0
        const auto secondLineHi =
1122
0
            _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
1123
1124
        // Vertical addition
1125
0
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1126
0
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1127
1128
        // Horizontal addition
1129
0
        const auto sum = sse2_hadd_ps(sumLo, sumHi);
1130
1131
0
        const auto average = _mm_mul_ps(sum, zeroDot25);
1132
1133
0
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1134
0
        pSrcScanlineShifted += 8;
1135
0
    }
1136
1137
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1138
0
    return iDstPixel;
1139
0
}
1140
1141
#endif
1142
1143
/************************************************************************/
1144
/*                    GDALResampleChunk_AverageOrRMS()                  */
1145
/************************************************************************/
1146
1147
template <class T, class Tsum, GDALDataType eWrkDataType>
1148
static CPLErr
1149
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1150
                                 const T *pChunk, void **ppDstBuffer)
1151
0
{
1152
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1153
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1154
0
    const double dfSrcXDelta = args.dfSrcXDelta;
1155
0
    const double dfSrcYDelta = args.dfSrcYDelta;
1156
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1157
0
    const int nChunkXOff = args.nChunkXOff;
1158
0
    const int nChunkYOff = args.nChunkYOff;
1159
0
    const int nChunkXSize = args.nChunkXSize;
1160
0
    const int nChunkYSize = args.nChunkYSize;
1161
0
    const int nDstXOff = args.nDstXOff;
1162
0
    const int nDstXOff2 = args.nDstXOff2;
1163
0
    const int nDstYOff = args.nDstYOff;
1164
0
    const int nDstYOff2 = args.nDstYOff2;
1165
0
    const char *pszResampling = args.pszResampling;
1166
0
    bool bHasNoData = args.bHasNoData;
1167
0
    const double dfNoDataValue = args.dfNoDataValue;
1168
0
    const GDALColorTable *poColorTable = args.poColorTable;
1169
0
    const bool bPropagateNoData = args.bPropagateNoData;
1170
1171
    // AVERAGE_BIT2GRAYSCALE
1172
0
    const bool bBit2Grayscale =
1173
0
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1174
0
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1175
0
    if (bBit2Grayscale)
1176
0
        poColorTable = nullptr;
1177
1178
0
    T tNoDataValue;
1179
0
    if (!bHasNoData)
1180
0
        tNoDataValue = 0;
1181
0
    else
1182
0
        tNoDataValue = static_cast<T>(dfNoDataValue);
1183
0
    const T tReplacementVal =
1184
0
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1185
0
                         args.eOvrDataType, dfNoDataValue))
1186
0
                   : 0;
1187
1188
0
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
1189
0
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1190
0
    int nDstXWidth = nDstXOff2 - nDstXOff;
1191
1192
    /* -------------------------------------------------------------------- */
1193
    /*      Allocate buffers.                                               */
1194
    /* -------------------------------------------------------------------- */
1195
0
    *ppDstBuffer = static_cast<T *>(
1196
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1197
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1198
0
    if (*ppDstBuffer == nullptr)
1199
0
    {
1200
0
        return CE_Failure;
1201
0
    }
1202
0
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1203
1204
0
    struct PrecomputedXValue
1205
0
    {
1206
0
        int nLeftXOffShifted;
1207
0
        int nRightXOffShifted;
1208
0
        double dfLeftWeight;
1209
0
        double dfRightWeight;
1210
0
        double dfTotalWeightFullLine;
1211
0
    };
1212
1213
0
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1214
0
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1215
1216
0
    if (pasSrcX == nullptr)
1217
0
    {
1218
0
        return CE_Failure;
1219
0
    }
1220
1221
0
    int nTransparentIdx = -1;
1222
0
    std::vector<GDALColorEntry> colorEntries;
1223
0
    if (poColorTable)
1224
0
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1225
1226
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1227
    // it as nodata value
1228
0
    if (bHasNoData && dfNoDataValue >= 0.0f &&
1229
0
        tNoDataValue < colorEntries.size())
1230
0
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1231
1232
    // Or if we have no explicit nodata, but a color table entry that is
1233
    // transparent, consider it as the nodata value
1234
0
    else if (!bHasNoData && nTransparentIdx >= 0)
1235
0
    {
1236
0
        bHasNoData = true;
1237
0
        tNoDataValue = static_cast<T>(nTransparentIdx);
1238
0
    }
1239
1240
    /* ==================================================================== */
1241
    /*      Precompute inner loop constants.                                */
1242
    /* ==================================================================== */
1243
0
    bool bSrcXSpacingIsTwo = true;
1244
0
    int nLastSrcXOff2 = -1;
1245
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1246
0
    {
1247
0
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1248
        // Apply some epsilon to avoid numerical precision issues
1249
0
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1250
0
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1251
0
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1252
1253
0
        if (nSrcXOff < nChunkXOff)
1254
0
            nSrcXOff = nChunkXOff;
1255
0
        if (nSrcXOff2 == nSrcXOff)
1256
0
            nSrcXOff2++;
1257
0
        if (nSrcXOff2 > nChunkRightXOff)
1258
0
            nSrcXOff2 = nChunkRightXOff;
1259
1260
0
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1261
0
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1262
0
            nSrcXOff2 - nChunkXOff;
1263
0
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1264
0
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1265
0
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1266
0
            1 - (nSrcXOff2 - dfSrcXOff2);
1267
0
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1268
0
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1269
0
        if (nSrcXOff + 1 < nSrcXOff2)
1270
0
        {
1271
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1272
0
                nSrcXOff2 - nSrcXOff - 2;
1273
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1274
0
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1275
0
        }
1276
1277
0
        if (nSrcXOff2 - nSrcXOff != 2 ||
1278
0
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1279
0
        {
1280
0
            bSrcXSpacingIsTwo = false;
1281
0
        }
1282
0
        nLastSrcXOff2 = nSrcXOff2;
1283
0
    }
1284
1285
    /* ==================================================================== */
1286
    /*      Loop over destination scanlines.                                */
1287
    /* ==================================================================== */
1288
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1289
0
    {
1290
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1291
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1292
0
        if (nSrcYOff < nChunkYOff)
1293
0
            nSrcYOff = nChunkYOff;
1294
1295
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1296
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1297
0
        if (nSrcYOff2 == nSrcYOff)
1298
0
            ++nSrcYOff2;
1299
0
        if (nSrcYOff2 > nChunkBottomYOff)
1300
0
            nSrcYOff2 = nChunkBottomYOff;
1301
1302
0
        T *const pDstScanline =
1303
0
            pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
1304
1305
        /* --------------------------------------------------------------------
1306
         */
1307
        /*      Loop over destination pixels */
1308
        /* --------------------------------------------------------------------
1309
         */
1310
0
        if (poColorTable == nullptr)
1311
0
        {
1312
0
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1313
0
                pabyChunkNodataMask == nullptr)
1314
0
            {
1315
                if constexpr (eWrkDataType == GDT_Byte ||
1316
                              eWrkDataType == GDT_UInt16)
1317
0
                {
1318
                    // Optimized case : no nodata, overview by a factor of 2 and
1319
                    // regular x and y src spacing.
1320
0
                    const T *pSrcScanlineShifted =
1321
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1322
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1323
0
                            nChunkXSize;
1324
0
                    int iDstPixel = 0;
1325
0
#ifdef USE_SSE2
1326
                    if constexpr (eWrkDataType == GDT_Byte)
1327
0
                    {
1328
0
                        if (bQuadraticMean)
1329
0
                        {
1330
0
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1331
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1332
0
                                pDstScanline);
1333
0
                        }
1334
0
                        else
1335
0
                        {
1336
0
                            iDstPixel = AverageByteSSE2OrAVX2(
1337
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1338
0
                                pDstScanline);
1339
0
                        }
1340
                    }
1341
                    else
1342
0
                    {
1343
0
                        static_assert(eWrkDataType == GDT_UInt16);
1344
0
                        if (bQuadraticMean)
1345
0
                        {
1346
0
                            iDstPixel = QuadraticMeanUInt16SSE2(
1347
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1348
0
                                pDstScanline);
1349
0
                        }
1350
0
                        else
1351
0
                        {
1352
0
                            iDstPixel = AverageUInt16SSE2(
1353
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1354
0
                                pDstScanline);
1355
0
                        }
1356
0
                    }
1357
0
#endif
1358
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1359
0
                    {
1360
0
                        Tsum nTotal = 0;
1361
0
                        T nVal;
1362
0
                        if (bQuadraticMean)
1363
0
                            nTotal =
1364
0
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1365
0
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1366
0
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1367
0
                                SQUARE<Tsum>(
1368
0
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1369
0
                        else
1370
0
                            nTotal = pSrcScanlineShifted[0] +
1371
0
                                     pSrcScanlineShifted[1] +
1372
0
                                     pSrcScanlineShifted[nChunkXSize] +
1373
0
                                     pSrcScanlineShifted[1 + nChunkXSize];
1374
1375
0
                        constexpr int nTotalWeight = 4;
1376
0
                        if (bQuadraticMean)
1377
0
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
1378
0
                        else
1379
0
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1380
0
                                                  nTotalWeight);
1381
1382
                        // No need to compare nVal against tNoDataValue as we
1383
                        // are in a case where pabyChunkNodataMask == nullptr
1384
                        // implies the absence of nodata value.
1385
0
                        pDstScanline[iDstPixel] = nVal;
1386
0
                        pSrcScanlineShifted += 2;
1387
0
                    }
1388
                }
1389
                else
1390
0
                {
1391
0
                    static_assert(eWrkDataType == GDT_Float32 ||
1392
0
                                  eWrkDataType == GDT_Float64);
1393
0
                    const T *pSrcScanlineShifted =
1394
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1395
0
                        static_cast<size_t>(nSrcYOff - nChunkYOff) *
1396
0
                            nChunkXSize;
1397
0
                    int iDstPixel = 0;
1398
0
#ifdef USE_SSE2
1399
                    if constexpr (eWrkDataType == GDT_Float32)
1400
0
                    {
1401
0
                        static_assert(std::is_same_v<T, float>);
1402
0
                        if (bQuadraticMean)
1403
0
                        {
1404
0
                            iDstPixel = QuadraticMeanFloatSSE2(
1405
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1406
0
                                pDstScanline);
1407
0
                        }
1408
0
                        else
1409
0
                        {
1410
0
                            iDstPixel = AverageFloatSSE2(
1411
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1412
0
                                pDstScanline);
1413
0
                        }
1414
0
                    }
1415
0
#endif
1416
1417
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1418
0
                    {
1419
0
                        T nVal;
1420
0
                        if (bQuadraticMean)
1421
0
                        {
1422
                            // Cast to double to avoid overflows
1423
                            // (using std::hypot() is much slower)
1424
0
                            nVal = static_cast<T>(std::sqrt(
1425
0
                                0.25 *
1426
0
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
1427
0
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
1428
0
                                 SQUARE<double>(
1429
0
                                     pSrcScanlineShifted[nChunkXSize]) +
1430
0
                                 SQUARE<double>(
1431
0
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
1432
0
                        }
1433
0
                        else
1434
0
                        {
1435
0
                            nVal = static_cast<T>(
1436
0
                                0.25f * (pSrcScanlineShifted[0] +
1437
0
                                         pSrcScanlineShifted[1] +
1438
0
                                         pSrcScanlineShifted[nChunkXSize] +
1439
0
                                         pSrcScanlineShifted[1 + nChunkXSize]));
1440
0
                        }
1441
1442
                        // No need to compare nVal against tNoDataValue as we
1443
                        // are in a case where pabyChunkNodataMask == nullptr
1444
                        // implies the absence of nodata value.
1445
0
                        pDstScanline[iDstPixel] = nVal;
1446
0
                        pSrcScanlineShifted += 2;
1447
0
                    }
1448
0
                }
1449
0
            }
1450
0
            else
1451
0
            {
1452
0
                const double dfBottomWeight =
1453
0
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1454
0
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
1455
0
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1456
0
                nSrcYOff -= nChunkYOff;
1457
0
                nSrcYOff2 -= nChunkYOff;
1458
1459
0
                double dfTotalWeightFullColumn = dfBottomWeight;
1460
0
                if (nSrcYOff + 1 < nSrcYOff2)
1461
0
                {
1462
0
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1463
0
                    dfTotalWeightFullColumn += dfTopWeight;
1464
0
                }
1465
1466
0
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1467
0
                {
1468
0
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1469
0
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1470
1471
0
                    double dfTotal = 0;
1472
0
                    double dfTotalWeight = 0;
1473
0
                    if (pabyChunkNodataMask == nullptr)
1474
0
                    {
1475
0
                        auto pChunkShifted =
1476
0
                            pChunk +
1477
0
                            static_cast<size_t>(nSrcYOff) * nChunkXSize;
1478
0
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1479
0
                        double dfWeightY = dfBottomWeight;
1480
0
                        while (true)
1481
0
                        {
1482
0
                            double dfTotalLine;
1483
0
                            if (bQuadraticMean)
1484
0
                            {
1485
                                // Left pixel
1486
0
                                {
1487
0
                                    const T val = pChunkShifted[nSrcXOff];
1488
0
                                    dfTotalLine =
1489
0
                                        SQUARE<double>(val) *
1490
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1491
0
                                }
1492
1493
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1494
0
                                {
1495
                                    // Middle pixels
1496
0
                                    for (int iX = nSrcXOff + 1;
1497
0
                                         iX < nSrcXOff2 - 1; ++iX)
1498
0
                                    {
1499
0
                                        const T val = pChunkShifted[iX];
1500
0
                                        dfTotalLine += SQUARE<double>(val);
1501
0
                                    }
1502
1503
                                    // Right pixel
1504
0
                                    {
1505
0
                                        const T val =
1506
0
                                            pChunkShifted[nSrcXOff2 - 1];
1507
0
                                        dfTotalLine +=
1508
0
                                            SQUARE<double>(val) *
1509
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1510
0
                                    }
1511
0
                                }
1512
0
                            }
1513
0
                            else
1514
0
                            {
1515
                                // Left pixel
1516
0
                                {
1517
0
                                    const T val = pChunkShifted[nSrcXOff];
1518
0
                                    dfTotalLine =
1519
0
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
1520
0
                                }
1521
1522
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1523
0
                                {
1524
                                    // Middle pixels
1525
0
                                    for (int iX = nSrcXOff + 1;
1526
0
                                         iX < nSrcXOff2 - 1; ++iX)
1527
0
                                    {
1528
0
                                        const T val = pChunkShifted[iX];
1529
0
                                        dfTotalLine += val;
1530
0
                                    }
1531
1532
                                    // Right pixel
1533
0
                                    {
1534
0
                                        const T val =
1535
0
                                            pChunkShifted[nSrcXOff2 - 1];
1536
0
                                        dfTotalLine +=
1537
0
                                            val *
1538
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1539
0
                                    }
1540
0
                                }
1541
0
                            }
1542
1543
0
                            dfTotal += dfTotalLine * dfWeightY;
1544
0
                            --nCounterY;
1545
0
                            if (nCounterY < 0)
1546
0
                                break;
1547
0
                            pChunkShifted += nChunkXSize;
1548
0
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1549
0
                        }
1550
1551
0
                        dfTotalWeight =
1552
0
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1553
0
                            dfTotalWeightFullColumn;
1554
0
                    }
1555
0
                    else
1556
0
                    {
1557
0
                        size_t nCount = 0;
1558
0
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1559
0
                        {
1560
0
                            const auto pChunkShifted =
1561
0
                                pChunk + static_cast<size_t>(iY) * nChunkXSize;
1562
1563
0
                            double dfTotalLine = 0;
1564
0
                            double dfTotalWeightLine = 0;
1565
                            // Left pixel
1566
0
                            {
1567
0
                                const int iX = nSrcXOff;
1568
0
                                const T val = pChunkShifted[iX];
1569
0
                                if (pabyChunkNodataMask
1570
0
                                        [iX +
1571
0
                                         static_cast<size_t>(iY) * nChunkXSize])
1572
0
                                {
1573
0
                                    nCount++;
1574
0
                                    const double dfWeightX =
1575
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1576
0
                                    dfTotalWeightLine = dfWeightX;
1577
0
                                    if (bQuadraticMean)
1578
0
                                        dfTotalLine =
1579
0
                                            SQUARE<double>(val) * dfWeightX;
1580
0
                                    else
1581
0
                                        dfTotalLine = val * dfWeightX;
1582
0
                                }
1583
0
                            }
1584
1585
0
                            if (nSrcXOff < nSrcXOff2 - 1)
1586
0
                            {
1587
                                // Middle pixels
1588
0
                                for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
1589
0
                                     ++iX)
1590
0
                                {
1591
0
                                    const T val = pChunkShifted[iX];
1592
0
                                    if (pabyChunkNodataMask
1593
0
                                            [iX + static_cast<size_t>(iY) *
1594
0
                                                      nChunkXSize])
1595
0
                                    {
1596
0
                                        nCount++;
1597
0
                                        dfTotalWeightLine += 1;
1598
0
                                        if (bQuadraticMean)
1599
0
                                            dfTotalLine += SQUARE<double>(val);
1600
0
                                        else
1601
0
                                            dfTotalLine += val;
1602
0
                                    }
1603
0
                                }
1604
1605
                                // Right pixel
1606
0
                                {
1607
0
                                    const int iX = nSrcXOff2 - 1;
1608
0
                                    const T val = pChunkShifted[iX];
1609
0
                                    if (pabyChunkNodataMask
1610
0
                                            [iX + static_cast<size_t>(iY) *
1611
0
                                                      nChunkXSize])
1612
0
                                    {
1613
0
                                        nCount++;
1614
0
                                        const double dfWeightX =
1615
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1616
0
                                        dfTotalWeightLine += dfWeightX;
1617
0
                                        if (bQuadraticMean)
1618
0
                                            dfTotalLine +=
1619
0
                                                SQUARE<double>(val) * dfWeightX;
1620
0
                                        else
1621
0
                                            dfTotalLine += val * dfWeightX;
1622
0
                                    }
1623
0
                                }
1624
0
                            }
1625
1626
0
                            const double dfWeightY =
1627
0
                                (iY == nSrcYOff)        ? dfBottomWeight
1628
0
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
1629
0
                                                        : 1.0;
1630
0
                            dfTotal += dfTotalLine * dfWeightY;
1631
0
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
1632
0
                        }
1633
1634
0
                        if (nCount == 0 ||
1635
0
                            (bPropagateNoData &&
1636
0
                             nCount <
1637
0
                                 static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1638
0
                                     (nSrcXOff2 - nSrcXOff)))
1639
0
                        {
1640
0
                            pDstScanline[iDstPixel] = tNoDataValue;
1641
0
                            continue;
1642
0
                        }
1643
0
                    }
1644
                    if constexpr (eWrkDataType == GDT_Byte)
1645
0
                    {
1646
0
                        T nVal;
1647
0
                        if (bQuadraticMean)
1648
0
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
1649
0
                                                             dfTotalWeight);
1650
0
                        else
1651
0
                            nVal =
1652
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1653
0
                        if (bHasNoData && nVal == tNoDataValue)
1654
0
                            nVal = tReplacementVal;
1655
0
                        pDstScanline[iDstPixel] = nVal;
1656
                    }
1657
                    else if constexpr (eWrkDataType == GDT_UInt16)
1658
0
                    {
1659
0
                        T nVal;
1660
0
                        if (bQuadraticMean)
1661
0
                            nVal = ComputeIntegerRMS<T, uint64_t>(
1662
0
                                dfTotal, dfTotalWeight);
1663
0
                        else
1664
0
                            nVal =
1665
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1666
0
                        if (bHasNoData && nVal == tNoDataValue)
1667
0
                            nVal = tReplacementVal;
1668
0
                        pDstScanline[iDstPixel] = nVal;
1669
                    }
1670
                    else
1671
0
                    {
1672
0
                        T nVal;
1673
0
                        if (bQuadraticMean)
1674
0
                            nVal =
1675
0
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1676
0
                        else
1677
0
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
1678
0
                        if (bHasNoData && nVal == tNoDataValue)
1679
0
                            nVal = tReplacementVal;
1680
0
                        pDstScanline[iDstPixel] = nVal;
1681
0
                    }
1682
0
                }
1683
0
            }
1684
0
        }
1685
0
        else
1686
0
        {
1687
0
            nSrcYOff -= nChunkYOff;
1688
0
            nSrcYOff2 -= nChunkYOff;
1689
1690
0
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1691
0
            {
1692
0
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1693
0
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1694
1695
0
                uint64_t nTotalR = 0;
1696
0
                uint64_t nTotalG = 0;
1697
0
                uint64_t nTotalB = 0;
1698
0
                size_t nCount = 0;
1699
1700
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1701
0
                {
1702
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1703
0
                    {
1704
0
                        const T val =
1705
0
                            pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
1706
                        // cppcheck-suppress unsignedLessThanZero
1707
0
                        if (val < 0 || val >= colorEntries.size())
1708
0
                            continue;
1709
0
                        const size_t idx = static_cast<size_t>(val);
1710
0
                        const auto &entry = colorEntries[idx];
1711
0
                        if (entry.c4)
1712
0
                        {
1713
0
                            if (bQuadraticMean)
1714
0
                            {
1715
0
                                nTotalR += SQUARE<int>(entry.c1);
1716
0
                                nTotalG += SQUARE<int>(entry.c2);
1717
0
                                nTotalB += SQUARE<int>(entry.c3);
1718
0
                                ++nCount;
1719
0
                            }
1720
0
                            else
1721
0
                            {
1722
0
                                nTotalR += entry.c1;
1723
0
                                nTotalG += entry.c2;
1724
0
                                nTotalB += entry.c3;
1725
0
                                ++nCount;
1726
0
                            }
1727
0
                        }
1728
0
                    }
1729
0
                }
1730
1731
0
                if (nCount == 0 ||
1732
0
                    (bPropagateNoData &&
1733
0
                     nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
1734
0
                                  (nSrcXOff2 - nSrcXOff)))
1735
0
                {
1736
0
                    pDstScanline[iDstPixel] = tNoDataValue;
1737
0
                }
1738
0
                else
1739
0
                {
1740
0
                    GDALColorEntry color;
1741
0
                    if (bQuadraticMean)
1742
0
                    {
1743
0
                        color.c1 =
1744
0
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1745
0
                        color.c2 =
1746
0
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1747
0
                        color.c3 =
1748
0
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1749
0
                    }
1750
0
                    else
1751
0
                    {
1752
0
                        color.c1 =
1753
0
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
1754
0
                        color.c2 =
1755
0
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
1756
0
                        color.c3 =
1757
0
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
1758
0
                    }
1759
0
                    pDstScanline[iDstPixel] =
1760
0
                        static_cast<T>(BestColorEntry(colorEntries, color));
1761
0
                }
1762
0
            }
1763
0
        }
1764
0
    }
1765
1766
0
    CPLFree(pasSrcX);
1767
1768
0
    return CE_None;
1769
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, double, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, unsigned int, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void**)
1770
1771
// Dispatch average / RMS resampling of a chunk to the template instantiation
// matching args.eWrkDataType.
//
// args                 Resampling arguments; eWrkDataType selects the pixel
//                      type pChunk is reinterpreted as, and pszResampling is
//                      compared (case-insensitively, via EQUAL) with "RMS"
//                      for the UInt16 case.
// pChunk               Source chunk, cast to GByte / GUInt16 / float / double
//                      according to args.eWrkDataType.
// ppDstBuffer          Forwarded unchanged to the typed implementation.
// peDstBufferDataType  Always set to args.eWrkDataType before dispatching.
//
// Returns the value of the typed implementation, or CE_Failure (after a
// CPLAssert(false)) when the working data type is not one of Byte, UInt16,
// Float32 or Float64.
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    *peDstBufferDataType = args.eWrkDataType;

    if (args.eWrkDataType == GDT_Byte)
    {
        const auto pabyChunk = static_cast<const GByte *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
            args, pabyChunk, ppDstBuffer);
    }

    if (args.eWrkDataType == GDT_UInt16)
    {
        const auto panChunk = static_cast<const GUInt16 *>(pChunk);
        if (!EQUAL(args.pszResampling, "RMS"))
        {
            // Plain average: a 32-bit unsigned accumulator is used.
            return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                    GDT_UInt16>(
                args, panChunk, ppDstBuffer);
        }

        // RMS sums squared samples: accumulate in double, because UInt32
        // could overflow.
        return GDALResampleChunk_AverageOrRMS_T<GUInt16, double, GDT_UInt16>(
            args, panChunk, ppDstBuffer);
    }

    if (args.eWrkDataType == GDT_Float32)
    {
        const auto pafChunk = static_cast<const float *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
            args, pafChunk, ppDstBuffer);
    }

    if (args.eWrkDataType == GDT_Float64)
    {
        const auto padfChunk = static_cast<const double *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
            args, padfChunk, ppDstBuffer);
    }

    // Any other working data type is a programming error in the caller.
    CPLAssert(false);
    return CE_Failure;
}
1822
1823
/************************************************************************/
1824
/*                     GDALResampleChunk_Gauss()                        */
1825
/************************************************************************/
1826
1827
// Resample a chunk of Float64 source samples with a Gaussian-weighted kernel.
//
// args                 Resampling arguments (ratios, chunk and destination
//                      windows, nodata information, optional color table).
// pChunk               Source chunk; it is read as an array of double with a
//                      row stride of args.nChunkXSize.
// ppDstBuffer          Receives a buffer allocated here with
//                      VSI_MALLOC3_VERBOSE holding
//                      (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff)
//                      doubles. This function does not free it.
// peDstBufferDataType  Set to GDT_Float64 once the allocation succeeded.
//
// The kernel is 3x3, 5x5 or 7x7 depending on the rounded Y ratio
// (<= 2, <= 4, otherwise). Without a color table the output is the weighted
// mean of the samples whose nodata-mask byte is non-zero (or of all samples
// when there is no mask). With a color table, the R/G/B components of the
// entries with a non-zero c4 are averaged and the result is mapped back to an
// index with BestColorEntry(). A destination pixel with no contributing sample
// receives dfNoDataValue.
//
// Returns CE_None on success, CE_Failure if the destination buffer could not
// be allocated.
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
                                      const void *pChunk, void **ppDstBuffer,
                                      GDALDataType *peDstBufferDataType)

{
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    const int nChunkXOff = args.nChunkXOff;
    const int nChunkXSize = args.nChunkXSize;
    const int nChunkYOff = args.nChunkYOff;
    const int nChunkYSize = args.nChunkYSize;
    const int nDstXOff = args.nDstXOff;
    const int nDstXOff2 = args.nDstXOff2;
    const int nDstYOff = args.nDstYOff;
    const int nDstYOff2 = args.nDstYOff2;
    const bool bHasNoData = args.bHasNoData;
    double dfNoDataValue = args.dfNoDataValue;
    const GDALColorTable *poColorTable = args.poColorTable;

    const double *const padfChunk = static_cast<const double *>(pChunk);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
                            GDALGetDataTypeSizeBytes(GDT_Float64));
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = GDT_Float64;
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);

    /* -------------------------------------------------------------------- */
    /*      Create the filter kernel and allocate scanline buffer.          */
    /* -------------------------------------------------------------------- */
    int nGaussMatrixDim = 3;
    const int *panGaussMatrix;
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
                                        16, 4, 1,  4,  6,  4, 1};
    constexpr int anGaussMatrix7x7[] = {
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};

    const int nOXSize = args.nOvrXSize;
    const int nOYSize = args.nOvrYSize;
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);

    // matrix for gauss filter
    if (nResYFactor <= 2)
    {
        panGaussMatrix = anGaussMatrix3x3;
        nGaussMatrixDim = 3;
    }
    else if (nResYFactor <= 4)
    {
        panGaussMatrix = anGaussMatrix5x5;
        nGaussMatrixDim = 5;
    }
    else
    {
        panGaussMatrix = anGaussMatrix7x7;
        nGaussMatrixDim = 7;
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    // Work on a heap copy of the kernel so that memory checkers can detect
    // reads past its end.
    int *panGaussMatrixDup = static_cast<int *>(
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    memcpy(panGaussMatrixDup, panGaussMatrix,
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    panGaussMatrix = panGaussMatrixDup;
#endif

    if (!bHasNoData)
        dfNoDataValue = 0.0;

    std::vector<GDALColorEntry> colorEntries;
    int nTransparentIdx = -1;
    if (poColorTable)
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);

    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    // it as nodata value.
    if (bHasNoData && dfNoDataValue >= 0.0 &&
        dfNoDataValue < colorEntries.size())
        colorEntries[static_cast<size_t>(dfNoDataValue)].c4 = 0;

    // Or if we have no explicit nodata, but a color table entry that is
    // transparent, consider it as the nodata value.
    else if (!bHasNoData && nTransparentIdx >= 0)
    {
        dfNoDataValue = nTransparentIdx;
    }

    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    const int nDstXWidth = nDstXOff2 - nDstXOff;

    /* ==================================================================== */
    /*      Loop over destination scanlines.                                */
    /* ==================================================================== */
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    {
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
        int nSrcYOff2 =
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;

        if (nSrcYOff < nChunkYOff)
        {
            nSrcYOff = nChunkYOff;
            nSrcYOff2++;
        }

        // Center a window of nGaussMatrixDim rows on the source footprint.
        const int iSizeY = nSrcYOff2 - nSrcYOff;
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;

        if (nSrcYOff2 > nChunkBottomYOff ||
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
        {
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
        }

        // If the window starts above the chunk, skip the corresponding
        // leading rows of the kernel.
        int nYShiftGaussMatrix = 0;
        if (nSrcYOff < nChunkYOff)
        {
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
            nSrcYOff = nChunkYOff;
        }

        // nSrcYOff >= nChunkYOff at this point. Compute the row offset in a
        // pointer-sized type to avoid int overflow on large chunks.
        const GPtrDiff_t nSrcLineOffset =
            static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
        const double *const padfSrcScanline = padfChunk + nSrcLineOffset;
        const GByte *pabySrcScanlineNodataMask = nullptr;
        if (pabyChunkNodataMask != nullptr)
            pabySrcScanlineNodataMask = pabyChunkNodataMask + nSrcLineOffset;

        /* ---------------------------------------------------------------- */
        /*      Loop over destination pixels                                */
        /* ---------------------------------------------------------------- */
        double *const padfDstScanline =
            padfDstBuffer +
            static_cast<GPtrDiff_t>(iDstLine - nDstYOff) * nDstXWidth;
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
        {
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
            int nSrcXOff2 =
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;

            if (nSrcXOff < nChunkXOff)
            {
                nSrcXOff = nChunkXOff;
                nSrcXOff2++;
            }

            // Same centering / clamping logic as for rows, applied to columns.
            const int iSizeX = nSrcXOff2 - nSrcXOff;
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;

            if (nSrcXOff2 > nChunkRightXOff ||
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
            {
                nSrcXOff2 =
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
            }

            int nXShiftGaussMatrix = 0;
            if (nSrcXOff < nChunkXOff)
            {
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
                nSrcXOff = nChunkXOff;
            }

            // First kernel weight that applies to (nSrcXOff, nSrcYOff).
            const int *panLineWeight = panGaussMatrix +
                                       nYShiftGaussMatrix * nGaussMatrixDim +
                                       nXShiftGaussMatrix;

            if (poColorTable == nullptr)
            {
                double dfTotal = 0.0;
                GInt64 nCount = 0;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    const GPtrDiff_t nRowOffset =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val =
                            padfSrcScanline[iX - nChunkXOff + nRowOffset];
                        if (pabySrcScanlineNodataMask == nullptr ||
                            pabySrcScanlineNodataMask[iX - nChunkXOff +
                                                      nRowOffset])
                        {
                            const int nWeight = panLineWeight[i];
                            dfTotal += val * nWeight;
                            nCount += nWeight;
                        }
                    }
                }

                if (nCount == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
                }
            }
            else
            {
                GInt64 nTotalR = 0;
                GInt64 nTotalG = 0;
                GInt64 nTotalB = 0;
                GInt64 nTotalWeight = 0;

                for (int iY = nSrcYOff; iY < nSrcYOff2;
                     ++iY, panLineWeight += nGaussMatrixDim)
                {
                    const GPtrDiff_t nRowOffset =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize;
                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val =
                            padfSrcScanline[iX - nChunkXOff + nRowOffset];
                        // Written as a positive range test so that NaN is
                        // rejected too: converting NaN to size_t would be
                        // undefined behavior.
                        if (!(val >= 0 && val < colorEntries.size()))
                            continue;

                        const size_t idx = static_cast<size_t>(val);
                        if (colorEntries[idx].c4)
                        {
                            const int nWeight = panLineWeight[i];
                            nTotalR +=
                                static_cast<GInt64>(colorEntries[idx].c1) *
                                nWeight;
                            nTotalG +=
                                static_cast<GInt64>(colorEntries[idx].c2) *
                                nWeight;
                            nTotalB +=
                                static_cast<GInt64>(colorEntries[idx].c3) *
                                nWeight;
                            nTotalWeight += nWeight;
                        }
                    }
                }

                if (nTotalWeight == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    GDALColorEntry color;

                    // Rounded weighted mean of each color component.
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
                                                  nTotalWeight);
                    padfDstScanline[iDstPixel - nDstXOff] =
                        BestColorEntry(colorEntries, color);
                }
            }
        }
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    CPLFree(panGaussMatrixDup);
#endif

    return CE_None;
}
2112
2113
/************************************************************************/
2114
/*                      GDALResampleChunk_Mode()                        */
2115
/************************************************************************/
2116
2117
// Equality predicate used by the mode resampler to decide whether two pixel
// values belong to the same histogram bucket. The generic version relies
// solely on operator==; floating point and complex types are specialized.
template <class T> static inline bool IsSame(T a, T b)
{
    const bool bIdentical = (a == b);
    return bIdentical;
}
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned char>(unsigned char, unsigned char)
Unexecuted instantiation: overview.cpp:bool IsSame<signed char>(signed char, signed char)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned short>(unsigned short, unsigned short)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned int>(unsigned int, unsigned int)
Unexecuted instantiation: overview.cpp:bool IsSame<unsigned long>(unsigned long, unsigned long)
2121
2122
// float flavour: same as operator==, except that two NaN values are also
// considered to be the same value.
template <> bool IsSame<float>(float a, float b)
{
    if (std::isnan(a))
    {
        // a NaN never compares equal with ==; it only matches another NaN.
        return std::isnan(b);
    }
    return a == b;
}
2126
2127
// double flavour: same as operator==, except that two NaN values are also
// considered to be the same value.
template <> bool IsSame<double>(double a, double b)
{
    if (std::isnan(a))
    {
        // a NaN never compares equal with ==; it only matches another NaN.
        return std::isnan(b);
    }
    return a == b;
}
2131
2132
template <>
2133
bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
2134
0
{
2135
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2136
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2137
0
}
2138
2139
template <>
2140
bool IsSame<std::complex<double>>(std::complex<double> a,
2141
                                  std::complex<double> b)
2142
0
{
2143
0
    return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
2144
0
                      std::isnan(b.real()) && std::isnan(b.imag()));
2145
0
}
2146
2147
template <class T>
2148
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2149
                                      const T *pChunk, T *const pDstBuffer)
2150
2151
0
{
2152
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2153
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2154
0
    const double dfSrcXDelta = args.dfSrcXDelta;
2155
0
    const double dfSrcYDelta = args.dfSrcYDelta;
2156
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2157
0
    const int nChunkXOff = args.nChunkXOff;
2158
0
    const int nChunkXSize = args.nChunkXSize;
2159
0
    const int nChunkYOff = args.nChunkYOff;
2160
0
    const int nChunkYSize = args.nChunkYSize;
2161
0
    const int nDstXOff = args.nDstXOff;
2162
0
    const int nDstXOff2 = args.nDstXOff2;
2163
0
    const int nDstYOff = args.nDstYOff;
2164
0
    const int nDstYOff2 = args.nDstYOff2;
2165
0
    const bool bHasNoData = args.bHasNoData;
2166
0
    const GDALColorTable *poColorTable = args.poColorTable;
2167
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
2168
2169
0
    T tNoDataValue;
2170
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2171
                  std::is_same<T, std::complex<double>>::value)
2172
0
    {
2173
0
        using BaseT = typename T::value_type;
2174
0
        tNoDataValue =
2175
0
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2176
0
                                std::numeric_limits<BaseT>::quiet_NaN());
2177
    }
2178
0
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2179
0
        tNoDataValue = 0;
2180
0
    else
2181
0
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
2182
2183
0
    size_t nMaxNumPx = 0;
2184
0
    T *paVals = nullptr;
2185
0
    int *panSums = nullptr;
2186
2187
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2188
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2189
0
    std::vector<int> anVals(256, 0);
2190
2191
    /* ==================================================================== */
2192
    /*      Loop over destination scanlines.                                */
2193
    /* ==================================================================== */
2194
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2195
0
    {
2196
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2197
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2198
#ifdef only_pixels_with_more_than_10_pct_participation
2199
        // When oversampling, don't take into account pixels that have a tiny
2200
        // participation in the resulting pixel
2201
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2202
            nSrcYOff < nChunkBottomYOff)
2203
            nSrcYOff++;
2204
#endif
2205
0
        if (nSrcYOff < nChunkYOff)
2206
0
            nSrcYOff = nChunkYOff;
2207
2208
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2209
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2210
#ifdef only_pixels_with_more_than_10_pct_participation
2211
        // When oversampling, don't take into account pixels that have a tiny
2212
        // participation in the resulting pixel
2213
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2214
            nSrcYOff2 > nChunkYOff)
2215
            nSrcYOff2--;
2216
#endif
2217
0
        if (nSrcYOff2 == nSrcYOff)
2218
0
            ++nSrcYOff2;
2219
0
        if (nSrcYOff2 > nChunkBottomYOff)
2220
0
            nSrcYOff2 = nChunkBottomYOff;
2221
2222
0
        const T *const paSrcScanline =
2223
0
            pChunk +
2224
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2225
0
        const GByte *pabySrcScanlineNodataMask = nullptr;
2226
0
        if (pabyChunkNodataMask != nullptr)
2227
0
            pabySrcScanlineNodataMask =
2228
0
                pabyChunkNodataMask +
2229
0
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2230
2231
0
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2232
        /* --------------------------------------------------------------------
2233
         */
2234
        /*      Loop over destination pixels */
2235
        /* --------------------------------------------------------------------
2236
         */
2237
0
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2238
0
        {
2239
0
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2240
            // Apply some epsilon to avoid numerical precision issues
2241
0
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2242
#ifdef only_pixels_with_more_than_10_pct_participation
2243
            // When oversampling, don't take into account pixels that have a
2244
            // tiny participation in the resulting pixel
2245
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2246
                nSrcXOff < nChunkRightXOff)
2247
                nSrcXOff++;
2248
#endif
2249
0
            if (nSrcXOff < nChunkXOff)
2250
0
                nSrcXOff = nChunkXOff;
2251
2252
0
            double dfSrcXOff2 =
2253
0
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2254
0
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2255
#ifdef only_pixels_with_more_than_10_pct_participation
2256
            // When oversampling, don't take into account pixels that have a
2257
            // tiny participation in the resulting pixel
2258
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2259
                nSrcXOff2 > nChunkXOff)
2260
                nSrcXOff2--;
2261
#endif
2262
0
            if (nSrcXOff2 == nSrcXOff)
2263
0
                nSrcXOff2++;
2264
0
            if (nSrcXOff2 > nChunkRightXOff)
2265
0
                nSrcXOff2 = nChunkRightXOff;
2266
2267
0
            bool bRegularProcessing = false;
2268
            if constexpr (!std::is_same<T, GByte>::value)
2269
0
                bRegularProcessing = true;
2270
0
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2271
0
                bRegularProcessing = true;
2272
2273
0
            if (bRegularProcessing)
2274
0
            {
2275
                // Not sure how much sense it makes to run a majority
2276
                // filter on floating point data, but here it is for the sake
2277
                // of compatibility. It won't look right on RGB images by the
2278
                // nature of the filter.
2279
2280
0
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2281
0
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2282
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2283
0
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2284
0
                        std::numeric_limits<size_t>::max() / sizeof(float))
2285
0
                {
2286
0
                    CPLError(CE_Failure, CPLE_NotSupported,
2287
0
                             "Too big downsampling factor");
2288
0
                    CPLFree(paVals);
2289
0
                    CPLFree(panSums);
2290
0
                    return CE_Failure;
2291
0
                }
2292
0
                const size_t nNumPx =
2293
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2294
0
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2295
0
                size_t iMaxInd = 0;
2296
0
                size_t iMaxVal = 0;
2297
0
                bool biMaxValdValid = false;
2298
2299
0
                if (paVals == nullptr || nNumPx > nMaxNumPx)
2300
0
                {
2301
0
                    T *paValsNew = static_cast<T *>(
2302
0
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2303
0
                    int *panSumsNew = static_cast<int *>(
2304
0
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2305
0
                    if (paValsNew != nullptr)
2306
0
                        paVals = paValsNew;
2307
0
                    if (panSumsNew != nullptr)
2308
0
                        panSums = panSumsNew;
2309
0
                    if (paValsNew == nullptr || panSumsNew == nullptr)
2310
0
                    {
2311
0
                        CPLFree(paVals);
2312
0
                        CPLFree(panSums);
2313
0
                        return CE_Failure;
2314
0
                    }
2315
0
                    nMaxNumPx = nNumPx;
2316
0
                }
2317
2318
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2319
0
                {
2320
0
                    const GPtrDiff_t iTotYOff =
2321
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2322
0
                        nChunkXOff;
2323
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2324
0
                    {
2325
0
                        if (pabySrcScanlineNodataMask == nullptr ||
2326
0
                            pabySrcScanlineNodataMask[iX + iTotYOff])
2327
0
                        {
2328
0
                            const T val = paSrcScanline[iX + iTotYOff];
2329
0
                            size_t i = 0;  // Used after for.
2330
2331
                            // Check array for existing entry.
2332
0
                            for (; i < iMaxInd; ++i)
2333
0
                                if (IsSame(paVals[i], val) &&
2334
0
                                    ++panSums[i] > panSums[iMaxVal])
2335
0
                                {
2336
0
                                    iMaxVal = i;
2337
0
                                    biMaxValdValid = true;
2338
0
                                    break;
2339
0
                                }
2340
2341
                            // Add to arr if entry not already there.
2342
0
                            if (i == iMaxInd)
2343
0
                            {
2344
0
                                paVals[iMaxInd] = val;
2345
0
                                panSums[iMaxInd] = 1;
2346
2347
0
                                if (!biMaxValdValid)
2348
0
                                {
2349
0
                                    iMaxVal = iMaxInd;
2350
0
                                    biMaxValdValid = true;
2351
0
                                }
2352
2353
0
                                ++iMaxInd;
2354
0
                            }
2355
0
                        }
2356
0
                    }
2357
0
                }
2358
2359
0
                if (!biMaxValdValid)
2360
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2361
0
                else
2362
0
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2363
0
            }
2364
            else if constexpr (std::is_same<T, GByte>::value)
2365
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2366
0
            {
2367
                // So we go here for a paletted or non-paletted byte band.
2368
                // The input values are then between 0 and 255.
2369
0
                int nMaxVal = 0;
2370
0
                int iMaxInd = -1;
2371
2372
                // The cost of this zeroing might be high. Perhaps we should
2373
                // just use the above generic case, and go to this one if the
2374
                // number of source pixels is large enough
2375
0
                std::fill(anVals.begin(), anVals.end(), 0);
2376
2377
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2378
0
                {
2379
0
                    const GPtrDiff_t iTotYOff =
2380
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2381
0
                        nChunkXOff;
2382
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2383
0
                    {
2384
0
                        const T val = paSrcScanline[iX + iTotYOff];
2385
0
                        if (!bHasNoData || val != tNoDataValue)
2386
0
                        {
2387
0
                            int nVal = static_cast<int>(val);
2388
0
                            if (++anVals[nVal] > nMaxVal)
2389
0
                            {
2390
                                // Sum the density.
2391
                                // Is it the most common value so far?
2392
0
                                iMaxInd = nVal;
2393
0
                                nMaxVal = anVals[nVal];
2394
0
                            }
2395
0
                        }
2396
0
                    }
2397
0
                }
2398
2399
0
                if (iMaxInd == -1)
2400
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2401
0
                else
2402
0
                    paDstScanline[iDstPixel - nDstXOff] =
2403
0
                        static_cast<T>(iMaxInd);
2404
0
            }
2405
0
        }
2406
0
    }
2407
2408
0
    CPLFree(paVals);
2409
0
    CPLFree(panSums);
2410
2411
0
    return CE_None;
2412
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<signed char>(GDALOverviewResampleArgs const&, signed char const*, signed char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<float>(GDALOverviewResampleArgs const&, float const*, float*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<double>(GDALOverviewResampleArgs const&, double const*, double*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<float> >(GDALOverviewResampleArgs const&, std::__1::complex<float> const*, std::__1::complex<float>*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>*)
2413
2414
// Entry point of mode resampling for one chunk.
//
// Allocates the destination buffer, of (nDstXOff2 - nDstXOff) x
// (nDstYOff2 - nDstYOff) pixels of type args.eWrkDataType, stores it in
// *ppDstBuffer, sets *peDstBufferDataType to args.eWrkDataType, and runs
// GDALResampleChunk_ModeT() with a C++ type matching the working data type.
//
// Returns CE_Failure if the buffer cannot be allocated (in which case
// *peDstBufferDataType is left untouched) or if the data type is not
// handled; otherwise returns the result of GDALResampleChunk_ModeT().
// The buffer stored in *ppDstBuffer is not freed here, whatever the outcome.
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    void *const pDstBuffer = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(args.eWrkDataType));
    *ppDstBuffer = pDstBuffer;
    if (pDstBuffer == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    *peDstBufferDataType = args.eWrkDataType;

    // Runs the typed implementation, viewing both buffers as arrays of the
    // type of the (otherwise unused) tag value.
    const auto ResampleAs = [&args, pChunk, pDstBuffer](auto tTypeTag)
    {
        using T = decltype(tTypeTag);
        return GDALResampleChunk_ModeT(args, static_cast<const T *>(pChunk),
                                       static_cast<T *>(pDstBuffer));
    };

    // For mode resampling, as no computation is done, only the
    // size of the data type matters... except for Byte where we have
    // special processing. And for floating point values
    switch (args.eWrkDataType)
    {
        case GDT_Byte:
            return ResampleAs(GByte{});

        case GDT_Int8:
            return ResampleAs(int8_t{});

        // 2-byte types are all processed as uint16_t
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return ResampleAs(uint16_t{});

        // 4-byte types other than Float32 are all processed as uint32_t
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return ResampleAs(uint32_t{});

        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return ResampleAs(float{});

        // 8-byte types other than Float64 and CFloat32 are all processed as
        // uint64_t
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return ResampleAs(uint64_t{});

        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return ResampleAs(double{});

        case GDT_CFloat32:
            return ResampleAs(std::complex<float>{});

        case GDT_CFloat64:
            return ResampleAs(std::complex<double>{});

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a valid working data type.
    CPLAssert(false);
    return CE_Failure;
}
2517
2518
/************************************************************************/
2519
/*                  GDALResampleConvolutionHorizontal()                 */
2520
/************************************************************************/
2521
2522
// Returns the weighted sum of the nSrcPixelCount first values of pChunk:
// sum(pChunk[k] * padfWeights[k]).
//
// Unless built with the Intel clang-based compiler, pixels are consumed by
// groups of 4 into two separate accumulators; remaining pixels go to the
// first accumulator.
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfAccA = 0.0;
    double dfAccB = 0.0;
    int iPixel = 0;
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    const int nUnrolledEnd = nSrcPixelCount - 3;
    while (iPixel < nUnrolledEnd)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfAccA += pSrc[0] * pdfW[0];
        dfAccA += pSrc[1] * pdfW[1];
        dfAccB += pSrc[2] * pdfW[2];
        dfAccB += pSrc[3] * pdfW[3];
        iPixel += 4;
    }
#endif
    // Pixels not handled by the unrolled loop
    while (iPixel < nSrcPixelCount)
    {
        dfAccA += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfAccA + dfAccB;
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<float>(float const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<double>(double const*, double const*, int)
2548
2549
template <class T>
2550
static inline void GDALResampleConvolutionHorizontalWithMask(
2551
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2552
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2553
0
{
2554
0
    dfVal = 0;
2555
0
    dfWeightSum = 0;
2556
0
    int i = 0;
2557
0
    for (; i < nSrcPixelCount - 3; i += 4)
2558
0
    {
2559
0
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
2560
0
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2561
0
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2562
0
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2563
0
        dfVal += pChunk[i] * dfWeight0;
2564
0
        dfVal += pChunk[i + 1] * dfWeight1;
2565
0
        dfVal += pChunk[i + 2] * dfWeight2;
2566
0
        dfVal += pChunk[i + 3] * dfWeight3;
2567
0
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2568
0
    }
2569
0
    for (; i < nSrcPixelCount; ++i)
2570
0
    {
2571
0
        const double dfWeight = padfWeights[i] * pabyMask[i];
2572
0
        dfVal += pChunk[i] * dfWeight;
2573
0
        dfWeightSum += dfWeight;
2574
0
    }
2575
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<float>(float const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<double>(double const*, unsigned char const*, double const*, int, double&, double&)
2576
2577
// Computes, for 3 rows sharing the same weights, the weighted sum of their
// nSrcPixelCount first values: dfResN = sum(pChunkRowN[k] * padfWeights[k]).
//
// Each row uses two accumulators: pixels are consumed by groups of 4 (2 per
// accumulator), and remaining pixels go to the first accumulator of the row.
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    double dfRow1A = 0.0;
    double dfRow1B = 0.0;
    double dfRow2A = 0.0;
    double dfRow2B = 0.0;
    double dfRow3A = 0.0;
    double dfRow3B = 0.0;

    int nRemaining = nSrcPixelCount;

    // Groups of 4 pixels. The (by-value) pointers are advanced as we go.
    while (nRemaining >= 4)
    {
        dfRow1A += pChunkRow1[0] * padfWeights[0];
        dfRow1A += pChunkRow1[1] * padfWeights[1];
        dfRow1B += pChunkRow1[2] * padfWeights[2];
        dfRow1B += pChunkRow1[3] * padfWeights[3];

        dfRow2A += pChunkRow2[0] * padfWeights[0];
        dfRow2A += pChunkRow2[1] * padfWeights[1];
        dfRow2B += pChunkRow2[2] * padfWeights[2];
        dfRow2B += pChunkRow2[3] * padfWeights[3];

        dfRow3A += pChunkRow3[0] * padfWeights[0];
        dfRow3A += pChunkRow3[1] * padfWeights[1];
        dfRow3B += pChunkRow3[2] * padfWeights[2];
        dfRow3B += pChunkRow3[3] * padfWeights[3];

        pChunkRow1 += 4;
        pChunkRow2 += 4;
        pChunkRow3 += 4;
        padfWeights += 4;
        nRemaining -= 4;
    }

    // Up to 3 trailing pixels
    while (nRemaining > 0)
    {
        const double dfWeight = *padfWeights;
        dfRow1A += *pChunkRow1 * dfWeight;
        dfRow2A += *pChunkRow2 * dfWeight;
        dfRow3A += *pChunkRow3 * dfWeight;
        ++pChunkRow1;
        ++pChunkRow2;
        ++pChunkRow3;
        ++padfWeights;
        --nRemaining;
    }

    dfRes1 = dfRow1A + dfRow1B;
    dfRes2 = dfRow2A + dfRow2B;
    dfRes3 = dfRow3A + dfRow3B;
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2615
2616
// Generic implementation: plain forwarder to
// GDALResampleConvolutionHorizontal_3rows(), with the same arguments.
// It does not check nSrcPixelCount itself.
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,    // source rows
        padfWeights, nSrcPixelCount,           // weights and their count
        dfRes1, dfRes2, dfRes3);               // one result per row
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2626
2627
// Generic implementation of the 3-row weighted sum for exactly 4 source
// pixels: forwards to GDALResampleConvolutionHorizontal_3rows() with a
// pixel count of 4.
template <class T>
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
{
    constexpr int nFixedPixelCount = 4;
    GDALResampleConvolutionHorizontal_3rows(
        pChunkRow1, pChunkRow2, pChunkRow3,    // source rows
        padfWeights, nFixedPixelCount,         // weights and their count
        dfRes1, dfRes2, dfRes3);               // one result per row
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<float>(float const*, float const*, float const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<double>(double const*, double const*, double const*, double const*, double&, double&, double&)
2636
2637
/************************************************************************/
2638
/*                  GDALResampleConvolutionVertical()                   */
2639
/************************************************************************/
2640
2641
// Returns the weighted sum of nSrcLineCount values taken every nStride
// elements from pChunk: sum(pChunk[k * nStride] * padfWeights[k]).
//
// Lines are consumed by groups of 4 into two separate accumulators;
// remaining lines go to the first accumulator.
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfAccA = 0.0;
    double dfAccB = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // always iLine * nStride

    const int nUnrolledEnd = nSrcLineCount - 3;
    while (iLine < nUnrolledEnd)
    {
        const double *const pdfW = padfWeights + iLine;
        dfAccA += pChunk[nOffset] * pdfW[0];
        dfAccA += pChunk[nOffset + nStride] * pdfW[1];
        dfAccB += pChunk[nOffset + 2 * nStride] * pdfW[2];
        dfAccB += pChunk[nOffset + 3 * nStride] * pdfW[3];
        iLine += 4;
        nOffset += 4 * nStride;
    }

    // Up to 3 trailing lines
    while (iLine < nSrcLineCount)
    {
        dfAccA += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }

    return dfAccA + dfAccB;
}
2663
2664
// Same as GDALResampleConvolutionVertical(), for 2 adjacent columns at once:
//   dfRes1 = sum(pChunk[k * nStride]     * padfWeights[k])
//   dfRes2 = sum(pChunk[k * nStride + 1] * padfWeights[k])
//
// Each column uses two accumulators: lines are consumed by groups of 4
// (2 per accumulator), and remaining lines go to the first accumulator of
// the column.
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, size_t nStride, const double *padfWeights,
    int nSrcLineCount, double &dfRes1, double &dfRes2)
{
    double dfCol0A = 0.0;
    double dfCol0B = 0.0;
    double dfCol1A = 0.0;
    double dfCol1B = 0.0;
    int iLine = 0;
    size_t nOffset = 0;  // always iLine * nStride

    const int nUnrolledEnd = nSrcLineCount - 3;
    while (iLine < nUnrolledEnd)
    {
        const double dfW0 = padfWeights[iLine + 0];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0A += pChunk[nOffset + 0 + 0 * nStride] * dfW0;
        dfCol1A += pChunk[nOffset + 1 + 0 * nStride] * dfW0;
        dfCol0A += pChunk[nOffset + 0 + 1 * nStride] * dfW1;
        dfCol1A += pChunk[nOffset + 1 + 1 * nStride] * dfW1;
        dfCol0B += pChunk[nOffset + 0 + 2 * nStride] * dfW2;
        dfCol1B += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol0B += pChunk[nOffset + 0 + 3 * nStride] * dfW3;
        dfCol1B += pChunk[nOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nOffset += 4 * nStride;
    }

    // Up to 3 trailing lines
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0A += pChunk[nOffset + 0] * dfW;
        dfCol1A += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0A + dfCol0B;
    dfRes2 = dfCol1A + dfCol1B;
}
2694
2695
#ifdef USE_SSE2
2696
2697
#ifdef __AVX__
2698
/************************************************************************/
2699
/*             GDALResampleConvolutionVertical_16cols<T>                */
2700
/************************************************************************/
2701
2702
template <class T>
2703
static inline void
2704
GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
2705
                                       const double *padfWeights,
2706
                                       int nSrcLineCount, float *afDest)
2707
{
2708
    int i = 0;
2709
    size_t j = 0;
2710
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2711
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2712
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2713
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2714
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2715
    {
2716
        XMMReg4Double w0 =
2717
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2718
        XMMReg4Double w1 =
2719
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2720
        XMMReg4Double w2 =
2721
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2722
        XMMReg4Double w3 =
2723
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2724
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2725
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2726
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2727
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2728
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2729
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2730
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2731
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2732
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2733
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2734
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2735
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2736
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2737
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2738
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2739
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2740
    }
2741
    for (; i < nSrcLineCount; ++i, j += nStride)
2742
    {
2743
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2744
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2745
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2746
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2747
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2748
    }
2749
    v_acc0.Store4Val(afDest);
2750
    v_acc1.Store4Val(afDest + 4);
2751
    v_acc2.Store4Val(afDest + 8);
2752
    v_acc3.Store4Val(afDest + 12);
2753
}
2754
2755
template <class T>
2756
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2757
                                                          const double *, int,
2758
                                                          double *)
2759
{
2760
    // Cannot be reached
2761
    CPLAssert(false);
2762
}
2763
2764
#else
2765
2766
/************************************************************************/
2767
/*              GDALResampleConvolutionVertical_8cols<T>                */
2768
/************************************************************************/
2769
2770
template <class T>
2771
static inline void
2772
GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
2773
                                      const double *padfWeights,
2774
                                      int nSrcLineCount, float *afDest)
2775
0
{
2776
0
    int i = 0;
2777
0
    size_t j = 0;
2778
0
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2779
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2780
0
    for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
2781
0
    {
2782
0
        XMMReg4Double w0 =
2783
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2784
0
        XMMReg4Double w1 =
2785
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2786
0
        XMMReg4Double w2 =
2787
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2788
0
        XMMReg4Double w3 =
2789
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2790
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2791
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2792
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2793
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2794
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2795
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2796
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2797
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2798
0
    }
2799
0
    for (; i < nSrcLineCount; ++i, j += nStride)
2800
0
    {
2801
0
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2802
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2803
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2804
0
    }
2805
0
    v_acc0.Store4Val(afDest);
2806
0
    v_acc1.Store4Val(afDest + 4);
2807
0
}
2808
2809
template <class T>
2810
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2811
                                                         const double *, int,
2812
                                                         double *)
2813
{
2814
    // Cannot be reached
2815
    CPLAssert(false);
2816
}
2817
2818
#endif  // __AVX__
2819
2820
/************************************************************************/
2821
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2822
/************************************************************************/
2823
2824
template <class T>
2825
static inline double GDALResampleConvolutionHorizontalSSE2(
2826
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2827
0
{
2828
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2829
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2830
0
    int i = 0;  // Used after for.
2831
0
    for (; i < nSrcPixelCount - 7; i += 8)
2832
0
    {
2833
        // Retrieve the pixel & accumulate
2834
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2835
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2836
0
        const XMMReg4Double v_weight1 =
2837
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2838
0
        const XMMReg4Double v_weight2 =
2839
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2840
2841
0
        v_acc1 += v_pixels1 * v_weight1;
2842
0
        v_acc2 += v_pixels2 * v_weight2;
2843
0
    }
2844
2845
0
    v_acc1 += v_acc2;
2846
2847
0
    double dfVal = v_acc1.GetHorizSum();
2848
0
    for (; i < nSrcPixelCount; ++i)
2849
0
    {
2850
0
        dfVal += pChunk[i] * padfWeightsAligned[i];
2851
0
    }
2852
0
    return dfVal;
2853
0
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned char>(unsigned char const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned short>(unsigned short const*, double const*, int)
2854
2855
/************************************************************************/
2856
/*              GDALResampleConvolutionHorizontal<GByte>                */
2857
/************************************************************************/
2858
2859
template <>
2860
inline double GDALResampleConvolutionHorizontal<GByte>(
2861
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2862
0
{
2863
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2864
0
                                                 nSrcPixelCount);
2865
0
}
2866
2867
template <>
2868
inline double GDALResampleConvolutionHorizontal<GUInt16>(
2869
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2870
0
{
2871
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2872
0
                                                 nSrcPixelCount);
2873
0
}
2874
2875
/************************************************************************/
2876
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2877
/************************************************************************/
2878
2879
template <class T>
2880
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2881
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2882
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2883
0
{
2884
0
    int i = 0;  // Used after for.
2885
0
    XMMReg4Double v_acc = XMMReg4Double::Zero();
2886
0
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2887
0
    for (; i < nSrcPixelCount - 3; i += 4)
2888
0
    {
2889
0
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2890
0
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2891
0
        XMMReg4Double v_weight =
2892
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2893
0
        v_weight *= v_mask;
2894
0
        v_acc += v_pixels * v_weight;
2895
0
        v_acc_weight += v_weight;
2896
0
    }
2897
2898
0
    dfVal = v_acc.GetHorizSum();
2899
0
    dfWeightSum = v_acc_weight.GetHorizSum();
2900
0
    for (; i < nSrcPixelCount; ++i)
2901
0
    {
2902
0
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2903
0
        dfVal += pChunk[i] * dfWeight;
2904
0
        dfWeightSum += dfWeight;
2905
0
    }
2906
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned char>(unsigned char const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned short>(unsigned short const*, unsigned char const*, double const*, int, double&, double&)
2907
2908
/************************************************************************/
2909
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2910
/************************************************************************/
2911
2912
template <>
2913
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2914
    const GByte *pChunk, const GByte *pabyMask,
2915
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2916
    double &dfWeightSum)
2917
0
{
2918
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2919
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2920
0
        dfWeightSum);
2921
0
}
2922
2923
template <>
2924
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2925
    const GUInt16 *pChunk, const GByte *pabyMask,
2926
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2927
    double &dfWeightSum)
2928
0
{
2929
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2930
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2931
0
        dfWeightSum);
2932
0
}
2933
2934
/************************************************************************/
2935
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2936
/************************************************************************/
2937
2938
template <class T>
2939
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2940
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2941
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2942
    double &dfRes2, double &dfRes3)
2943
0
{
2944
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2945
0
                  v_acc2 = XMMReg4Double::Zero(),
2946
0
                  v_acc3 = XMMReg4Double::Zero();
2947
0
    int i = 0;
2948
0
    for (; i < nSrcPixelCount - 7; i += 8)
2949
0
    {
2950
        // Retrieve the pixel & accumulate.
2951
0
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2952
0
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2953
0
        const XMMReg4Double v_weight1 =
2954
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2955
0
        const XMMReg4Double v_weight2 =
2956
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2957
2958
0
        v_acc1 += v_pixels1 * v_weight1;
2959
0
        v_acc1 += v_pixels2 * v_weight2;
2960
2961
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2962
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2963
0
        v_acc2 += v_pixels1 * v_weight1;
2964
0
        v_acc2 += v_pixels2 * v_weight2;
2965
2966
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2967
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2968
0
        v_acc3 += v_pixels1 * v_weight1;
2969
0
        v_acc3 += v_pixels2 * v_weight2;
2970
0
    }
2971
2972
0
    dfRes1 = v_acc1.GetHorizSum();
2973
0
    dfRes2 = v_acc2.GetHorizSum();
2974
0
    dfRes3 = v_acc3.GetHorizSum();
2975
0
    for (; i < nSrcPixelCount; ++i)
2976
0
    {
2977
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2978
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2979
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2980
0
    }
2981
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
2982
2983
/************************************************************************/
2984
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2985
/************************************************************************/
2986
2987
template <>
2988
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2989
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2990
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2991
    double &dfRes2, double &dfRes3)
2992
0
{
2993
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
2994
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2995
0
        dfRes1, dfRes2, dfRes3);
2996
0
}
2997
2998
template <>
2999
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
3000
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3001
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3002
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3003
0
{
3004
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3005
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3006
0
        dfRes1, dfRes2, dfRes3);
3007
0
}
3008
3009
/************************************************************************/
3010
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
3011
/************************************************************************/
3012
3013
template <class T>
3014
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3015
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3016
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3017
    double &dfRes2, double &dfRes3)
3018
0
{
3019
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3020
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3021
0
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3022
0
    int i = 0;  // Use after for.
3023
0
    for (; i < nSrcPixelCount - 3; i += 4)
3024
0
    {
3025
        // Retrieve the pixel & accumulate.
3026
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3027
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3028
0
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3029
0
        const XMMReg4Double v_weight =
3030
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3031
3032
0
        v_acc1 += v_pixels1 * v_weight;
3033
0
        v_acc2 += v_pixels2 * v_weight;
3034
0
        v_acc3 += v_pixels3 * v_weight;
3035
0
    }
3036
3037
0
    dfRes1 = v_acc1.GetHorizSum();
3038
0
    dfRes2 = v_acc2.GetHorizSum();
3039
0
    dfRes3 = v_acc3.GetHorizSum();
3040
3041
0
    for (; i < nSrcPixelCount; ++i)
3042
0
    {
3043
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3044
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3045
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3046
0
    }
3047
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3048
3049
/************************************************************************/
3050
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3051
/************************************************************************/
3052
3053
template <>
3054
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3055
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3056
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3057
    double &dfRes2, double &dfRes3)
3058
0
{
3059
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3060
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3061
0
        dfRes1, dfRes2, dfRes3);
3062
0
}
3063
3064
template <>
3065
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3066
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3067
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3068
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3069
0
{
3070
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3071
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3072
0
        dfRes1, dfRes2, dfRes3);
3073
0
}
3074
3075
/************************************************************************/
3076
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3077
/************************************************************************/
3078
3079
template <class T>
3080
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3081
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3082
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3083
    double &dfRes3)
3084
0
{
3085
0
    const XMMReg4Double v_weight =
3086
0
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3087
3088
    // Retrieve the pixel & accumulate.
3089
0
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3090
0
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3091
0
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3092
3093
0
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3094
0
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3095
0
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3096
3097
0
    dfRes1 = v_acc1.GetHorizSum();
3098
0
    dfRes2 = v_acc2.GetHorizSum();
3099
0
    dfRes3 = v_acc3.GetHorizSum();
3100
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, double&, double&, double&)
3101
3102
/************************************************************************/
3103
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3104
/************************************************************************/
3105
3106
template <>
3107
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3108
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3109
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3110
    double &dfRes3)
3111
0
{
3112
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3113
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3114
0
        dfRes3);
3115
0
}
3116
3117
template <>
3118
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3119
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3120
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3121
    double &dfRes2, double &dfRes3)
3122
0
{
3123
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3124
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3125
0
        dfRes3);
3126
0
}
3127
3128
#endif  // USE_SSE2
3129
3130
/************************************************************************/
3131
/*                    GDALResampleChunk_Convolution()                   */
3132
/************************************************************************/
3133
3134
template <class T, class Twork, GDALDataType eWrkDataType>
3135
static CPLErr GDALResampleChunk_ConvolutionT(
3136
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3137
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3138
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3139
3140
0
{
3141
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3142
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3143
0
    const double dfSrcXDelta = args.dfSrcXDelta;
3144
0
    const double dfSrcYDelta = args.dfSrcYDelta;
3145
0
    constexpr int nBands = 1;
3146
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3147
0
    const int nChunkXOff = args.nChunkXOff;
3148
0
    const int nChunkXSize = args.nChunkXSize;
3149
0
    const int nChunkYOff = args.nChunkYOff;
3150
0
    const int nChunkYSize = args.nChunkYSize;
3151
0
    const int nDstXOff = args.nDstXOff;
3152
0
    const int nDstXOff2 = args.nDstXOff2;
3153
0
    const int nDstYOff = args.nDstYOff;
3154
0
    const int nDstYOff2 = args.nDstYOff2;
3155
0
    const bool bHasNoData = args.bHasNoData;
3156
0
    double dfNoDataValue = args.dfNoDataValue;
3157
3158
0
    if (!bHasNoData)
3159
0
        dfNoDataValue = 0.0;
3160
0
    const auto dstDataType = args.eOvrDataType;
3161
0
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3162
0
    const double dfReplacementVal =
3163
0
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3164
0
                   : dfNoDataValue;
3165
    // cppcheck-suppress unreadVariable
3166
0
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3167
0
    const bool bNoDataValueInt64Valid =
3168
0
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3169
0
    const auto nNodataValueInt64 =
3170
0
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3171
0
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3172
3173
    // TODO: we should have some generic function to do this.
3174
0
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3175
0
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3176
0
    if (dstDataType == GDT_Byte)
3177
0
    {
3178
0
        fDstMin = std::numeric_limits<GByte>::min();
3179
0
        fDstMax = std::numeric_limits<GByte>::max();
3180
0
    }
3181
0
    else if (dstDataType == GDT_Int8)
3182
0
    {
3183
0
        fDstMin = std::numeric_limits<GInt8>::min();
3184
0
        fDstMax = std::numeric_limits<GInt8>::max();
3185
0
    }
3186
0
    else if (dstDataType == GDT_UInt16)
3187
0
    {
3188
0
        fDstMin = std::numeric_limits<GUInt16>::min();
3189
0
        fDstMax = std::numeric_limits<GUInt16>::max();
3190
0
    }
3191
0
    else if (dstDataType == GDT_Int16)
3192
0
    {
3193
0
        fDstMin = std::numeric_limits<GInt16>::min();
3194
0
        fDstMax = std::numeric_limits<GInt16>::max();
3195
0
    }
3196
0
    else if (dstDataType == GDT_UInt32)
3197
0
    {
3198
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3199
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3200
0
    }
3201
0
    else if (dstDataType == GDT_Int32)
3202
0
    {
3203
        // cppcheck-suppress unreadVariable
3204
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3205
        // cppcheck-suppress unreadVariable
3206
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3207
0
    }
3208
0
    else if (dstDataType == GDT_UInt64)
3209
0
    {
3210
        // cppcheck-suppress unreadVariable
3211
0
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3212
        // cppcheck-suppress unreadVariable
3213
        // (1 << 64) - 2048: largest uint64 value a double can hold
3214
0
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
3215
0
    }
3216
0
    else if (dstDataType == GDT_Int64)
3217
0
    {
3218
        // cppcheck-suppress unreadVariable
3219
0
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3220
        // cppcheck-suppress unreadVariable
3221
        // (1 << 63) - 1024: largest int64 that a double can hold
3222
0
        fDstMax = static_cast<Twork>(9223372036854774784LL);
3223
0
    }
3224
3225
0
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3226
0
                               bNoDataValueInt64Valid, nNodataValueInt64,
3227
0
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3228
0
    {
3229
0
        if (!bHasNoData)
3230
0
            return fVal;
3231
3232
        // Clamp value before comparing to nodata: this is only needed for
3233
        // kernels with negative weights (Lanczos)
3234
0
        Twork fClamped = fVal;
3235
0
        if (fClamped < fDstMin)
3236
0
            fClamped = fDstMin;
3237
0
        else if (fClamped > fDstMax)
3238
0
            fClamped = fDstMax;
3239
0
        if (isIntegerDT)
3240
0
        {
3241
0
            if (bNoDataValueInt64Valid)
3242
0
            {
3243
0
                const double fClampedRounded = std::round(fClamped);
3244
0
                if (fClampedRounded >=
3245
0
                        static_cast<Twork>(
3246
0
                            std::numeric_limits<int64_t>::min()) &&
3247
0
                    fClampedRounded <=
3248
0
                        static_cast<Twork>(9223372036854774784LL) &&
3249
0
                    nNodataValueInt64 ==
3250
0
                        static_cast<GInt64>(std::round(fClamped)))
3251
0
                {
3252
                    // Do not use the nodata value
3253
0
                    return static_cast<Twork>(dfReplacementVal);
3254
0
                }
3255
0
            }
3256
0
        }
3257
0
        else if (dfNoDataValue == fClamped)
3258
0
        {
3259
            // Do not use the nodata value
3260
0
            return static_cast<Twork>(dfReplacementVal);
3261
0
        }
3262
0
        return fClamped;
3263
0
    };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(double)#1}::operator()(double) const
3264
3265
    /* -------------------------------------------------------------------- */
3266
    /*      Allocate work buffers.                                          */
3267
    /* -------------------------------------------------------------------- */
3268
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
3269
0
    Twork *pafWrkScanline = nullptr;
3270
0
    if (dstDataType != eWrkDataType)
3271
0
    {
3272
0
        pafWrkScanline =
3273
0
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3274
0
        if (pafWrkScanline == nullptr)
3275
0
            return CE_Failure;
3276
0
    }
3277
3278
0
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3279
0
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3280
0
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3281
0
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3282
0
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3283
0
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3284
3285
    // Temporary array to store result of horizontal filter.
3286
0
    double *padfHorizontalFiltered = static_cast<double *>(
3287
0
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3288
3289
    // To store convolution coefficients.
3290
0
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3291
0
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3292
0
                         0.5) *
3293
0
        sizeof(double)));
3294
3295
0
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3296
0
    if (pabyChunkNodataMask)
3297
0
        pabyChunkNodataMaskHorizontalFiltered =
3298
0
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3299
0
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3300
0
        (pabyChunkNodataMask != nullptr &&
3301
0
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3302
0
    {
3303
0
        VSIFree(pafWrkScanline);
3304
0
        VSIFree(padfHorizontalFiltered);
3305
0
        VSIFreeAligned(padfWeights);
3306
0
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3307
0
        return CE_Failure;
3308
0
    }
3309
3310
    /* ==================================================================== */
3311
    /*      First pass: horizontal filter                                   */
3312
    /* ==================================================================== */
3313
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3314
0
#ifdef USE_SSE2
3315
0
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3316
0
#endif
3317
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3318
0
    {
3319
0
        const double dfSrcPixel =
3320
0
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3321
0
        int nSrcPixelStart =
3322
0
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3323
0
        if (nSrcPixelStart < nChunkXOff)
3324
0
            nSrcPixelStart = nChunkXOff;
3325
0
        int nSrcPixelStop =
3326
0
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3327
0
        if (nSrcPixelStop > nChunkRightXOff)
3328
0
            nSrcPixelStop = nChunkRightXOff;
3329
#if 0
3330
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3331
        {
3332
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3333
        }
3334
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3335
        {
3336
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3337
        }
3338
#endif
3339
0
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3340
0
        double dfWeightSum = 0.0;
3341
3342
        // Compute convolution coefficients.
3343
0
        int nSrcPixel = nSrcPixelStart;
3344
0
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3345
0
        for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
3346
0
        {
3347
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3348
0
            dfX += dfXScaleWeight;
3349
0
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3350
0
            dfX += dfXScaleWeight;
3351
0
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3352
0
            dfX += dfXScaleWeight;
3353
0
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3354
0
            dfX += dfXScaleWeight;
3355
0
            dfWeightSum +=
3356
0
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3357
0
        }
3358
0
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3359
0
        {
3360
0
            const double dfWeight = pfnFilterFunc(dfX);
3361
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3362
0
            dfWeightSum += dfWeight;
3363
0
        }
3364
3365
0
        const int nHeight = nChunkYSize * nBands;
3366
0
        if (pabyChunkNodataMask == nullptr)
3367
0
        {
3368
0
            if (dfWeightSum != 0)
3369
0
            {
3370
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3371
0
                for (int i = 0; i < nSrcPixelCount; ++i)
3372
0
                    padfWeights[i] *= dfInvWeightSum;
3373
0
            }
3374
0
            int iSrcLineOff = 0;
3375
0
#ifdef USE_SSE2
3376
0
            if (nSrcPixelCount == 4)
3377
0
            {
3378
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3379
0
                {
3380
0
                    const size_t j =
3381
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3382
0
                        (nSrcPixelStart - nChunkXOff);
3383
0
                    double dfVal1 = 0.0;
3384
0
                    double dfVal2 = 0.0;
3385
0
                    double dfVal3 = 0.0;
3386
0
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
3387
0
                        pChunk + j, pChunk + j + nChunkXSize,
3388
0
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3389
0
                        dfVal2, dfVal3);
3390
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3391
0
                                               nDstXSize +
3392
0
                                           iDstPixel - nDstXOff] = dfVal1;
3393
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3394
0
                                            1) *
3395
0
                                               nDstXSize +
3396
0
                                           iDstPixel - nDstXOff] = dfVal2;
3397
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3398
0
                                            2) *
3399
0
                                               nDstXSize +
3400
0
                                           iDstPixel - nDstXOff] = dfVal3;
3401
0
                }
3402
0
            }
3403
0
            else if (bSrcPixelCountLess8)
3404
0
            {
3405
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3406
0
                {
3407
0
                    const size_t j =
3408
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3409
0
                        (nSrcPixelStart - nChunkXOff);
3410
0
                    double dfVal1 = 0.0;
3411
0
                    double dfVal2 = 0.0;
3412
0
                    double dfVal3 = 0.0;
3413
0
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3414
0
                        pChunk + j, pChunk + j + nChunkXSize,
3415
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3416
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3417
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3418
0
                                               nDstXSize +
3419
0
                                           iDstPixel - nDstXOff] = dfVal1;
3420
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3421
0
                                            1) *
3422
0
                                               nDstXSize +
3423
0
                                           iDstPixel - nDstXOff] = dfVal2;
3424
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3425
0
                                            2) *
3426
0
                                               nDstXSize +
3427
0
                                           iDstPixel - nDstXOff] = dfVal3;
3428
0
                }
3429
0
            }
3430
0
            else
3431
0
#endif
3432
0
            {
3433
0
                for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
3434
0
                {
3435
0
                    const size_t j =
3436
0
                        static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3437
0
                        (nSrcPixelStart - nChunkXOff);
3438
0
                    double dfVal1 = 0.0;
3439
0
                    double dfVal2 = 0.0;
3440
0
                    double dfVal3 = 0.0;
3441
0
                    GDALResampleConvolutionHorizontal_3rows(
3442
0
                        pChunk + j, pChunk + j + nChunkXSize,
3443
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3444
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3445
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3446
0
                                               nDstXSize +
3447
0
                                           iDstPixel - nDstXOff] = dfVal1;
3448
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3449
0
                                            1) *
3450
0
                                               nDstXSize +
3451
0
                                           iDstPixel - nDstXOff] = dfVal2;
3452
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3453
0
                                            2) *
3454
0
                                               nDstXSize +
3455
0
                                           iDstPixel - nDstXOff] = dfVal3;
3456
0
                }
3457
0
            }
3458
0
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3459
0
            {
3460
0
                const size_t j =
3461
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3462
0
                    (nSrcPixelStart - nChunkXOff);
3463
0
                const double dfVal = GDALResampleConvolutionHorizontal(
3464
0
                    pChunk + j, padfWeights, nSrcPixelCount);
3465
0
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3466
0
                                           nDstXSize +
3467
0
                                       iDstPixel - nDstXOff] = dfVal;
3468
0
            }
3469
0
        }
3470
0
        else
3471
0
        {
3472
0
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3473
0
            {
3474
0
                const size_t j =
3475
0
                    static_cast<size_t>(iSrcLineOff) * nChunkXSize +
3476
0
                    (nSrcPixelStart - nChunkXOff);
3477
3478
0
                if (bKernelWithNegativeWeights)
3479
0
                {
3480
0
                    int nConsecutiveValid = 0;
3481
0
                    int nMaxConsecutiveValid = 0;
3482
0
                    for (int k = 0; k < nSrcPixelCount; k++)
3483
0
                    {
3484
0
                        if (pabyChunkNodataMask[j + k])
3485
0
                            nConsecutiveValid++;
3486
0
                        else if (nConsecutiveValid)
3487
0
                        {
3488
0
                            nMaxConsecutiveValid = std::max(
3489
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3490
0
                            nConsecutiveValid = 0;
3491
0
                        }
3492
0
                    }
3493
0
                    nMaxConsecutiveValid =
3494
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3495
0
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3496
0
                    {
3497
0
                        const size_t nTempOffset =
3498
0
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
3499
0
                            iDstPixel - nDstXOff;
3500
0
                        padfHorizontalFiltered[nTempOffset] = 0.0;
3501
0
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3502
0
                        continue;
3503
0
                    }
3504
0
                }
3505
3506
0
                double dfVal = 0.0;
3507
0
                GDALResampleConvolutionHorizontalWithMask(
3508
0
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
3509
0
                    nSrcPixelCount, dfVal, dfWeightSum);
3510
0
                const size_t nTempOffset =
3511
0
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3512
0
                    nDstXOff;
3513
0
                if (dfWeightSum > 0.0)
3514
0
                {
3515
0
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3516
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3517
0
                }
3518
0
                else
3519
0
                {
3520
0
                    padfHorizontalFiltered[nTempOffset] = 0.0;
3521
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3522
0
                }
3523
0
            }
3524
0
        }
3525
0
    }
3526
3527
    /* ==================================================================== */
3528
    /*      Second pass: vertical filter                                    */
3529
    /* ==================================================================== */
3530
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3531
3532
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3533
0
    {
3534
0
        Twork *const pafDstScanline =
3535
0
            pafWrkScanline
3536
0
                ? pafWrkScanline
3537
0
                : static_cast<Twork *>(pDstBuffer) +
3538
0
                      static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
3539
3540
0
        const double dfSrcLine =
3541
0
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3542
0
        int nSrcLineStart =
3543
0
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3544
0
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3545
0
        if (nSrcLineStart < nChunkYOff)
3546
0
            nSrcLineStart = nChunkYOff;
3547
0
        if (nSrcLineStop > nChunkBottomYOff)
3548
0
            nSrcLineStop = nChunkBottomYOff;
3549
#if 0
3550
        if( nSrcLineStart < nChunkYOff &&
3551
            nChunkYOff > 0 )
3552
        {
3553
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3554
        }
3555
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3556
        {
3557
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3558
        }
3559
#endif
3560
0
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3561
0
        double dfWeightSum = 0.0;
3562
3563
        // Compute convolution coefficients.
3564
0
        int nSrcLine = nSrcLineStart;  // Used after for.
3565
0
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3566
0
        for (; nSrcLine < nSrcLineStop - 3;
3567
0
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3568
0
        {
3569
0
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
3570
0
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3571
0
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
3572
0
                dfY + 2 * dfYScaleWeight;
3573
0
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
3574
0
                dfY + 3 * dfYScaleWeight;
3575
0
            dfWeightSum +=
3576
0
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3577
0
        }
3578
0
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3579
0
        {
3580
0
            const double dfWeight = pfnFilterFunc(dfY);
3581
0
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3582
0
            dfWeightSum += dfWeight;
3583
0
        }
3584
3585
0
        if (pabyChunkNodataMask == nullptr)
3586
0
        {
3587
0
            if (dfWeightSum != 0)
3588
0
            {
3589
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3590
0
                for (int i = 0; i < nSrcLineCount; ++i)
3591
0
                    padfWeights[i] *= dfInvWeightSum;
3592
0
            }
3593
0
        }
3594
3595
0
        if (pabyChunkNodataMask == nullptr)
3596
0
        {
3597
0
            int iFilteredPixelOff = 0;  // Used after for.
3598
            // j used after for.
3599
0
            size_t j =
3600
0
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3601
0
#ifdef USE_SSE2
3602
            if constexpr (eWrkDataType == GDT_Float32)
3603
0
            {
3604
#ifdef __AVX__
3605
                for (; iFilteredPixelOff < nDstXSize - 15;
3606
                     iFilteredPixelOff += 16, j += 16)
3607
                {
3608
                    GDALResampleConvolutionVertical_16cols(
3609
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3610
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3611
                    if (bHasNoData)
3612
                    {
3613
                        for (int k = 0; k < 16; k++)
3614
                        {
3615
                            pafDstScanline[iFilteredPixelOff + k] =
3616
                                replaceValIfNodata(
3617
                                    pafDstScanline[iFilteredPixelOff + k]);
3618
                        }
3619
                    }
3620
                }
3621
#else
3622
0
                for (; iFilteredPixelOff < nDstXSize - 7;
3623
0
                     iFilteredPixelOff += 8, j += 8)
3624
0
                {
3625
0
                    GDALResampleConvolutionVertical_8cols(
3626
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3627
0
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3628
0
                    if (bHasNoData)
3629
0
                    {
3630
0
                        for (int k = 0; k < 8; k++)
3631
0
                        {
3632
0
                            pafDstScanline[iFilteredPixelOff + k] =
3633
0
                                replaceValIfNodata(
3634
0
                                    pafDstScanline[iFilteredPixelOff + k]);
3635
0
                        }
3636
0
                    }
3637
0
                }
3638
0
#endif
3639
3640
0
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3641
0
                {
3642
0
                    const Twork fVal =
3643
0
                        static_cast<Twork>(GDALResampleConvolutionVertical(
3644
0
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
3645
0
                            nSrcLineCount));
3646
0
                    pafDstScanline[iFilteredPixelOff] =
3647
0
                        replaceValIfNodata(fVal);
3648
0
                }
3649
            }
3650
            else
3651
#endif
3652
0
            {
3653
0
                for (; iFilteredPixelOff < nDstXSize - 1;
3654
0
                     iFilteredPixelOff += 2, j += 2)
3655
0
                {
3656
0
                    double dfVal1 = 0.0;
3657
0
                    double dfVal2 = 0.0;
3658
0
                    GDALResampleConvolutionVertical_2cols(
3659
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3660
0
                        nSrcLineCount, dfVal1, dfVal2);
3661
0
                    pafDstScanline[iFilteredPixelOff] =
3662
0
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
3663
0
                    pafDstScanline[iFilteredPixelOff + 1] =
3664
0
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
3665
0
                }
3666
0
                if (iFilteredPixelOff < nDstXSize)
3667
0
                {
3668
0
                    const double dfVal = GDALResampleConvolutionVertical(
3669
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3670
0
                        nSrcLineCount);
3671
0
                    pafDstScanline[iFilteredPixelOff] =
3672
0
                        replaceValIfNodata(static_cast<Twork>(dfVal));
3673
0
                }
3674
0
            }
3675
0
        }
3676
0
        else
3677
0
        {
3678
0
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3679
0
                 ++iFilteredPixelOff)
3680
0
            {
3681
0
                double dfVal = 0.0;
3682
0
                dfWeightSum = 0.0;
3683
0
                size_t j = (nSrcLineStart - nChunkYOff) *
3684
0
                               static_cast<size_t>(nDstXSize) +
3685
0
                           iFilteredPixelOff;
3686
0
                if (bKernelWithNegativeWeights)
3687
0
                {
3688
0
                    int nConsecutiveValid = 0;
3689
0
                    int nMaxConsecutiveValid = 0;
3690
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3691
0
                    {
3692
0
                        const double dfWeight =
3693
0
                            padfWeights[i] *
3694
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3695
0
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
3696
0
                        {
3697
0
                            nConsecutiveValid++;
3698
0
                        }
3699
0
                        else if (nConsecutiveValid)
3700
0
                        {
3701
0
                            nMaxConsecutiveValid = std::max(
3702
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3703
0
                            nConsecutiveValid = 0;
3704
0
                        }
3705
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3706
0
                        dfWeightSum += dfWeight;
3707
0
                    }
3708
0
                    nMaxConsecutiveValid =
3709
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3710
0
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
3711
0
                    {
3712
0
                        pafDstScanline[iFilteredPixelOff] =
3713
0
                            static_cast<Twork>(dfNoDataValue);
3714
0
                        continue;
3715
0
                    }
3716
0
                }
3717
0
                else
3718
0
                {
3719
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3720
0
                    {
3721
0
                        const double dfWeight =
3722
0
                            padfWeights[i] *
3723
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3724
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3725
0
                        dfWeightSum += dfWeight;
3726
0
                    }
3727
0
                }
3728
0
                if (dfWeightSum > 0.0)
3729
0
                {
3730
0
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3731
0
                        static_cast<Twork>(dfVal / dfWeightSum));
3732
0
                }
3733
0
                else
3734
0
                {
3735
0
                    pafDstScanline[iFilteredPixelOff] =
3736
0
                        static_cast<Twork>(dfNoDataValue);
3737
0
                }
3738
0
            }
3739
0
        }
3740
3741
0
        if (fMaxVal != 0.0f)
3742
0
        {
3743
0
            for (int i = 0; i < nDstXSize; ++i)
3744
0
            {
3745
0
                if (pafDstScanline[i] > fMaxVal)
3746
0
                    pafDstScanline[i] = fMaxVal;
3747
0
            }
3748
0
        }
3749
3750
0
        if (pafWrkScanline)
3751
0
        {
3752
0
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3753
0
                            static_cast<GByte *>(pDstBuffer) +
3754
0
                                static_cast<size_t>(iDstLine - nDstYOff) *
3755
0
                                    nDstXSize * nDstDataTypeSize,
3756
0
                            dstDataType, nDstDataTypeSize, nDstXSize);
3757
0
        }
3758
0
    }
3759
3760
0
    VSIFree(pafWrkScanline);
3761
0
    VSIFreeAligned(padfWeights);
3762
0
    VSIFree(padfHorizontalFiltered);
3763
0
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3764
3765
0
    return CE_None;
3766
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)
3767
3768
// Dispatcher for the convolution-based overview resamplers.
//
// args.pszResampling is matched with EQUAL() against "BILINEAR", "CUBIC",
// "CUBICSPLINE" and "LANCZOS"; any other name asserts and yields CE_Failure
// before anything is allocated.
//
// On the remaining paths *ppDstBuffer receives a VSI_MALLOC3_VERBOSE() block
// of (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) elements of
// args.eOvrDataType (CE_Failure if that allocation fails) and
// *peDstBufferDataType is set to args.eOvrDataType. The work is then handed
// to the GDALResampleChunk_ConvolutionT<> instantiation matching
// args.eWrkDataType (Byte, UInt16, Float32 or Float64), whose result is
// returned as is. Any other working type asserts and returns CE_Failure with
// the buffer still stored in *ppDstBuffer -- presumably released by the
// caller, TODO confirm.
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    // Translate the method name into a warper kernel identifier. Only the
    // cubic and Lanczos kernels are flagged as having negative weights.
    const char *const pszMethod = args.pszResampling;
    GDALResampleAlg eKernel = GRA_Bilinear;
    bool bNegativeLobes = false;
    if (EQUAL(pszMethod, "BILINEAR"))
    {
        // The initial values above already describe this kernel.
    }
    else if (EQUAL(pszMethod, "CUBIC"))
    {
        eKernel = GRA_Cubic;
        bNegativeLobes = true;
    }
    else if (EQUAL(pszMethod, "CUBICSPLINE"))
    {
        eKernel = GRA_CubicSpline;
    }
    else if (EQUAL(pszMethod, "LANCZOS"))
    {
        eKernel = GRA_Lanczos;
        bNegativeLobes = true;
    }
    else
    {
        CPLAssert(false);
        return CE_Failure;
    }

    const int nRadius = GWKGetFilterRadius(eKernel);
    const FilterFuncType pfnKernel = GWKGetFilterFunc(eKernel);
    const FilterFunc4ValuesType pfnKernel4 = GWKGetFilterFunc4Values(eKernel);

    // Non-bilinear kernels can overshoot, so when NBITS is set to less than
    // the full width of an unsigned integer output type, results are clamped
    // to the largest value that fits. A clamp of 0 means "no clamping".
    float fClampMax = 0.f;
    const bool bUnsignedIntOutput = args.eOvrDataType == GDT_Byte ||
                                    args.eOvrDataType == GDT_UInt16 ||
                                    args.eOvrDataType == GDT_UInt32;
    if (eKernel != GRA_Bilinear && args.nOvrNBITS > 0 && bUnsignedIntOutput)
    {
        const int nBits = args.nOvrNBITS;
        const bool bFullWidth =
            nBits == GDALGetDataTypeSizeBits(args.eOvrDataType);
        if (!bFullWidth && nBits < 32)
            fClampMax = static_cast<float>((1U << nBits) - 1);
    }

    // Output buffer, expressed in the overview data type.
    void *const pOut = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(args.eOvrDataType));
    *ppDstBuffer = pOut;
    if (pOut == nullptr)
        return CE_Failure;
    *peDstBufferDataType = args.eOvrDataType;

    // Pick the template instantiation from the working data type.
    const GDALDataType eWorkType = args.eWrkDataType;
    if (eWorkType == GDT_Byte)
    {
        return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
            args, static_cast<const GByte *>(pChunk), pOut, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }
    if (eWorkType == GDT_UInt16)
    {
        return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
            args, static_cast<const GUInt16 *>(pChunk), pOut, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }
    if (eWorkType == GDT_Float32)
    {
        return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
            args, static_cast<const float *>(pChunk), pOut, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }
    if (eWorkType == GDT_Float64)
    {
        return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
            args, static_cast<const double *>(pChunk), pOut, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }

    // Working data type not handled by this code path.
    CPLAssert(false);
    return CE_Failure;
}
3863
3864
/************************************************************************/
3865
/*                       GDALResampleChunkC32R()                        */
3866
/************************************************************************/
3867
3868
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3869
                                    const float *pafChunk, const int nChunkYOff,
3870
                                    const int nChunkYSize, const int nDstYOff,
3871
                                    const int nDstYOff2, const int nOvrXSize,
3872
                                    const int nOvrYSize, void **ppDstBuffer,
3873
                                    GDALDataType *peDstBufferDataType,
3874
                                    const char *pszResampling)
3875
3876
0
{
3877
0
    enum Method
3878
0
    {
3879
0
        NEAR,
3880
0
        AVERAGE,
3881
0
        AVERAGE_MAGPHASE,
3882
0
        RMS,
3883
0
    };
3884
3885
0
    Method eMethod = NEAR;
3886
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
3887
0
    {
3888
0
        eMethod = NEAR;
3889
0
    }
3890
0
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3891
0
    {
3892
0
        eMethod = AVERAGE_MAGPHASE;
3893
0
    }
3894
0
    else if (EQUAL(pszResampling, "RMS"))
3895
0
    {
3896
0
        eMethod = RMS;
3897
0
    }
3898
0
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
3899
0
    {
3900
0
        eMethod = AVERAGE;
3901
0
    }
3902
0
    else
3903
0
    {
3904
0
        CPLError(
3905
0
            CE_Failure, CPLE_NotSupported,
3906
0
            "Resampling method %s is not supported for complex data types. "
3907
0
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3908
0
            pszResampling);
3909
0
        return CE_Failure;
3910
0
    }
3911
3912
0
    const int nOXSize = nOvrXSize;
3913
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3914
0
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3915
0
    if (*ppDstBuffer == nullptr)
3916
0
    {
3917
0
        return CE_Failure;
3918
0
    }
3919
0
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3920
0
    *peDstBufferDataType = GDT_CFloat32;
3921
3922
0
    const int nOYSize = nOvrYSize;
3923
0
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3924
0
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3925
3926
    /* ==================================================================== */
3927
    /*      Loop over destination scanlines.                                */
3928
    /* ==================================================================== */
3929
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3930
0
    {
3931
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3932
0
        if (nSrcYOff < nChunkYOff)
3933
0
            nSrcYOff = nChunkYOff;
3934
3935
0
        int nSrcYOff2 =
3936
0
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3937
0
        if (nSrcYOff2 == nSrcYOff)
3938
0
            nSrcYOff2++;
3939
3940
0
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3941
0
        {
3942
0
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3943
0
                nSrcYOff = nSrcHeight - 1;
3944
0
            nSrcYOff2 = nSrcHeight;
3945
0
        }
3946
0
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3947
0
            nSrcYOff2 = nChunkYOff + nChunkYSize;
3948
3949
0
        const float *const pafSrcScanline =
3950
0
            pafChunk +
3951
0
            (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3952
0
        float *const pafDstScanline =
3953
0
            pafDstBuffer +
3954
0
            static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
3955
3956
        /* --------------------------------------------------------------------
3957
         */
3958
        /*      Loop over destination pixels */
3959
        /* --------------------------------------------------------------------
3960
         */
3961
0
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3962
0
        {
3963
0
            const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
3964
0
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3965
0
            int nSrcXOff2 =
3966
0
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3967
0
            if (nSrcXOff2 == nSrcXOff)
3968
0
                nSrcXOff2++;
3969
0
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3970
0
            {
3971
0
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3972
0
                    nSrcXOff = nSrcWidth - 1;
3973
0
                nSrcXOff2 = nSrcWidth;
3974
0
            }
3975
0
            const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
3976
3977
0
            if (eMethod == NEAR)
3978
0
            {
3979
0
                pafDstScanline[iDstPixelSZ * 2] =
3980
0
                    pafSrcScanline[nSrcXOffSZ * 2];
3981
0
                pafDstScanline[iDstPixelSZ * 2 + 1] =
3982
0
                    pafSrcScanline[nSrcXOffSZ * 2 + 1];
3983
0
            }
3984
0
            else if (eMethod == AVERAGE_MAGPHASE)
3985
0
            {
3986
0
                double dfTotalR = 0.0;
3987
0
                double dfTotalI = 0.0;
3988
0
                double dfTotalM = 0.0;
3989
0
                size_t nCount = 0;
3990
3991
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3992
0
                {
3993
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3994
0
                    {
3995
0
                        const double dfR =
3996
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
3997
0
                                           static_cast<size_t>(iY - nSrcYOff) *
3998
0
                                               nSrcWidth * 2];
3999
0
                        const double dfI =
4000
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4001
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4002
0
                                               nSrcWidth * 2 +
4003
0
                                           1];
4004
0
                        dfTotalR += dfR;
4005
0
                        dfTotalI += dfI;
4006
0
                        dfTotalM += std::hypot(dfR, dfI);
4007
0
                        ++nCount;
4008
0
                    }
4009
0
                }
4010
4011
0
                CPLAssert(nCount > 0);
4012
0
                if (nCount == 0)
4013
0
                {
4014
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4015
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4016
0
                }
4017
0
                else
4018
0
                {
4019
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4020
0
                        dfTotalR / static_cast<double>(nCount));
4021
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4022
0
                        dfTotalI / static_cast<double>(nCount));
4023
0
                    const double dfM =
4024
0
                        std::hypot(pafDstScanline[iDstPixelSZ * 2],
4025
0
                                   pafDstScanline[iDstPixelSZ * 2 + 1]);
4026
0
                    const double dfDesiredM =
4027
0
                        dfTotalM / static_cast<double>(nCount);
4028
0
                    double dfRatio = 1.0;
4029
0
                    if (dfM != 0.0)
4030
0
                        dfRatio = dfDesiredM / dfM;
4031
4032
0
                    pafDstScanline[iDstPixelSZ * 2] *=
4033
0
                        static_cast<float>(dfRatio);
4034
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] *=
4035
0
                        static_cast<float>(dfRatio);
4036
0
                }
4037
0
            }
4038
0
            else if (eMethod == RMS)
4039
0
            {
4040
0
                double dfTotalR = 0.0;
4041
0
                double dfTotalI = 0.0;
4042
0
                size_t nCount = 0;
4043
4044
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4045
0
                {
4046
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4047
0
                    {
4048
0
                        const double dfR =
4049
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4050
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4051
0
                                               nSrcWidth * 2];
4052
0
                        const double dfI =
4053
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4054
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4055
0
                                               nSrcWidth * 2 +
4056
0
                                           1];
4057
4058
0
                        dfTotalR += SQUARE(dfR);
4059
0
                        dfTotalI += SQUARE(dfI);
4060
4061
0
                        ++nCount;
4062
0
                    }
4063
0
                }
4064
4065
0
                CPLAssert(nCount > 0);
4066
0
                if (nCount == 0)
4067
0
                {
4068
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4069
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4070
0
                }
4071
0
                else
4072
0
                {
4073
                    /* compute RMS */
4074
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4075
0
                        sqrt(dfTotalR / static_cast<double>(nCount)));
4076
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4077
0
                        sqrt(dfTotalI / static_cast<double>(nCount)));
4078
0
                }
4079
0
            }
4080
0
            else if (eMethod == AVERAGE)
4081
0
            {
4082
0
                double dfTotalR = 0.0;
4083
0
                double dfTotalI = 0.0;
4084
0
                size_t nCount = 0;
4085
4086
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4087
0
                {
4088
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4089
0
                    {
4090
                        // TODO(schwehr): Maybe use std::complex?
4091
0
                        dfTotalR +=
4092
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4093
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4094
0
                                               nSrcWidth * 2];
4095
0
                        dfTotalI +=
4096
0
                            pafSrcScanline[static_cast<size_t>(iX) * 2 +
4097
0
                                           static_cast<size_t>(iY - nSrcYOff) *
4098
0
                                               nSrcWidth * 2 +
4099
0
                                           1];
4100
0
                        ++nCount;
4101
0
                    }
4102
0
                }
4103
4104
0
                CPLAssert(nCount > 0);
4105
0
                if (nCount == 0)
4106
0
                {
4107
0
                    pafDstScanline[iDstPixelSZ * 2] = 0.0;
4108
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
4109
0
                }
4110
0
                else
4111
0
                {
4112
0
                    pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
4113
0
                        dfTotalR / static_cast<double>(nCount));
4114
0
                    pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
4115
0
                        dfTotalI / static_cast<double>(nCount));
4116
0
                }
4117
0
            }
4118
0
        }
4119
0
    }
4120
4121
0
    return CE_None;
4122
0
}
4123
4124
/************************************************************************/
4125
/*                  GDALRegenerateCascadingOverviews()                  */
4126
/*                                                                      */
4127
/*      Generate a list of overviews in order from largest to           */
4128
/*      smallest, computing each from the next larger.                  */
4129
/************************************************************************/
4130
4131
/**
 * Regenerate a set of overviews level by level.
 *
 * The overview bands are first ordered from the largest to the smallest
 * pixel count (papoOvrBands is reordered in place). The largest overview is
 * then computed from poSrcBand, and every following one from the overview
 * that precedes it in that order.
 *
 * @param poSrcBand full resolution band used as source of the first level.
 * @param nOverviews number of entries in papoOvrBands.
 * @param papoOvrBands overview bands to regenerate; reordered in place.
 * @param pszResampling resampling method name. When it starts with
 * "AVERAGE_BIT2G", it is only applied to the first level; the following
 * levels use "AVERAGE".
 * @param pfnProgress progress callback, driven through a scaled progress
 * whose range is proportional to the pixel count of each overview.
 * @param pProgressData user data for pfnProgress.
 * @param papszOptions options forwarded to GDALRegenerateOverviewsEx().
 * @return CE_None on success, or the first error returned by
 * GDALRegenerateOverviewsEx().
 */
static CPLErr GDALRegenerateCascadingOverviews(
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)

{
    /* -------------------------------------------------------------------- */
    /*      First, we must put the overviews in order from largest to       */
    /*      smallest.                                                       */
    /* -------------------------------------------------------------------- */

    // Pixel counts are compared as exact 64-bit integers: a float product
    // cannot tell apart areas that differ by less than its 24-bit mantissa
    // resolution, which could leave large overviews in the wrong order.
    const auto GetPixelCount = [](GDALRasterBand *poBand)
    {
        return static_cast<int64_t>(poBand->GetXSize()) * poBand->GetYSize();
    };

    // A stable sort keeps bands of equal size in their incoming order.
    // The guard also avoids building an invalid range when nOverviews <= 0.
    if (nOverviews > 1)
    {
        std::stable_sort(
            papoOvrBands, papoOvrBands + nOverviews,
            [&GetPixelCount](GDALRasterBand *poLhs, GDALRasterBand *poRhs)
            { return GetPixelCount(poLhs) > GetPixelCount(poRhs); });
    }

    /* -------------------------------------------------------------------- */
    /*      Count total pixels so we can prepare appropriate scaled         */
    /*      progress functions.                                             */
    /* -------------------------------------------------------------------- */
    double dfTotalPixels = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        dfTotalPixels += papoOvrBands[i]->GetXSize() *
                         static_cast<double>(papoOvrBands[i]->GetYSize());
    }

    /* -------------------------------------------------------------------- */
    /*      Generate all the bands.                                         */
    /* -------------------------------------------------------------------- */
    double dfPixelsProcessed = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        // The first level is computed from the source band, each following
        // level from the overview generated just before it.
        GDALRasterBand *poBaseBand = poSrcBand;
        if (i != 0)
            poBaseBand = papoOvrBands[i - 1];

        double dfPixels = papoOvrBands[i]->GetXSize() *
                          static_cast<double>(papoOvrBands[i]->GetYSize());

        void *pScaledProgressData = GDALCreateScaledProgress(
            dfPixelsProcessed / dfTotalPixels,
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
            pProgressData);

        const CPLErr eErr = GDALRegenerateOverviewsEx(
            poBaseBand, 1,
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
            pszResampling, GDALScaledProgress, pScaledProgressData,
            papszOptions);
        GDALDestroyScaledProgress(pScaledProgressData);

        if (eErr != CE_None)
            return eErr;

        dfPixelsProcessed += dfPixels;

        // Only do the bit2grayscale promotion on the base band.
        if (STARTS_WITH_CI(pszResampling,
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
            pszResampling = "AVERAGE";
    }

    return CE_None;
}
4208
4209
/************************************************************************/
4210
/*                    GDALGetResampleFunction()                         */
4211
/************************************************************************/
4212
4213
/**
 * Return the chunk resampling function matching a resampling method name.
 *
 * @param pszResampling resampling method name, matched case insensitively.
 * Names starting with "NEAR" or "AVER" are matched on that prefix, the
 * other ones must match exactly.
 * @param pnRadius if not NULL, receives the kernel radius associated with
 * the method (0 for the methods that do not report one).
 * @return the resampling function, or nullptr (with a CE_Failure error
 * emitted) when the method is not supported.
 */
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
                                             int *pnRadius)
{
    // Stores a radius in the optional output parameter.
    const auto ReportRadius = [pnRadius](int nRadius)
    {
        if (pnRadius)
            *pnRadius = nRadius;
    };

    // Default reported radius, overridden below by some methods.
    ReportRadius(0);

    if (STARTS_WITH_CI(pszResampling, "NEAR"))
        return GDALResampleChunk_Near;

    if (STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS"))
        return GDALResampleChunk_AverageOrRMS;

    if (EQUAL(pszResampling, "GAUSS"))
    {
        ReportRadius(1);
        return GDALResampleChunk_Gauss;
    }

    if (EQUAL(pszResampling, "MODE"))
        return GDALResampleChunk_Mode;

    // Methods sharing the convolution implementation. Their radius is the
    // one of the corresponding warper kernel.
    struct ConvolutionKernel
    {
        const char *pszName;
        GDALResampleAlg eAlg;
    };

    static constexpr ConvolutionKernel asConvolutionKernels[] = {
        {"CUBIC", GRA_Cubic},
        {"CUBICSPLINE", GRA_CubicSpline},
        {"LANCZOS", GRA_Lanczos},
        {"BILINEAR", GRA_Bilinear},
    };

    for (const auto &sKernel : asConvolutionKernels)
    {
        if (EQUAL(pszResampling, sKernel.pszName))
        {
            // The radius is only looked up when the caller asked for it.
            if (pnRadius)
                *pnRadius = GWKGetFilterRadius(sKernel.eAlg);
            return GDALResampleChunk_Convolution;
        }
    }

    CPLError(CE_Failure, CPLE_AppDefined,
             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
             pszResampling);
    return nullptr;
}
4264
4265
/************************************************************************/
4266
/*                      GDALGetOvrWorkDataType()                        */
4267
/************************************************************************/
4268
4269
/**
 * Return the working data type used to compute overviews.
 *
 * @param pszResampling resampling method name, matched case insensitively.
 * @param eSrcDataType data type of the source band.
 * @return the source data type for "NEAR*" and "MODE"; GDT_Byte or
 * GDT_UInt16 when the source has that type and the method is "AVER*", RMS,
 * CUBIC, CUBICSPLINE, LANCZOS or BILINEAR; GDT_Float64 for GAUSS; otherwise
 * GDT_Float32 for Byte, Int8, UInt16, Int16 and Float32 sources, and
 * GDT_Float64 for any other source type.
 */
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
                                    GDALDataType eSrcDataType)
{
    // These methods work directly in the source data type.
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
        return eSrcDataType;

    // Byte and UInt16 sources keep their own type for the methods below.
    // MODE is not tested again here: it has already returned above.
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_UInt16)
    {
        const bool bKeepSrcType = STARTS_WITH_CI(pszResampling, "AVER") ||
                                  EQUAL(pszResampling, "RMS") ||
                                  EQUAL(pszResampling, "CUBIC") ||
                                  EQUAL(pszResampling, "CUBICSPLINE") ||
                                  EQUAL(pszResampling, "LANCZOS") ||
                                  EQUAL(pszResampling, "BILINEAR");
        if (bKeepSrcType)
            return eSrcDataType;
    }

    if (EQUAL(pszResampling, "GAUSS"))
        return GDT_Float64;

    // Remaining cases: Float32 for the listed source types, Float64 for
    // every other one.
    if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
        eSrcDataType == GDT_Float32)
    {
        return GDT_Float32;
    }
    return GDT_Float64;
}
4305
4306
namespace
4307
{
4308
// Structure to hold a pointer to free with CPLFree()
4309
struct PointerHolder
4310
{
4311
    void *ptr = nullptr;
4312
4313
0
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4314
0
    {
4315
0
    }
4316
4317
    ~PointerHolder()
4318
0
    {
4319
0
        CPLFree(ptr);
4320
0
    }
4321
4322
    PointerHolder(const PointerHolder &) = delete;
4323
    PointerHolder &operator=(const PointerHolder &) = delete;
4324
};
4325
}  // namespace
4326
4327
/************************************************************************/
4328
/*                      GDALRegenerateOverviews()                       */
4329
/************************************************************************/
4330
4331
/**
4332
 * \brief Generate downsampled overviews.
4333
 *
4334
 * This function will generate one or more overview images from a base image
4335
 * using the requested downsampling algorithm.  Its primary use is for
4336
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4337
 * used to generate downsampled images in one file from another outside the
4338
 * overview architecture.
4339
 *
4340
 * The output bands need to exist in advance.
4341
 *
4342
 * The full set of resampling algorithms is documented in
4343
 * GDALDataset::BuildOverviews().
4344
 *
4345
 * This function will honour properly NODATA_VALUES tuples (special dataset
4346
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4347
 * considered as the nodata value and not each value of the triplet
4348
 * independently per band.
4349
 *
4350
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4351
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4352
 * overview computation.
4353
 *
4354
 * @param hSrcBand the source (base level) band.
4355
 * @param nOverviewCount the number of downsampled bands being generated.
4356
 * @param pahOvrBands the list of downsampled bands to be generated.
4357
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4358
 * @param pfnProgress progress report function.
4359
 * @param pProgressData progress function callback data.
4360
 * @return CE_None on success or CE_Failure on failure.
4361
 */
4362
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
                               GDALRasterBandH *pahOvrBands,
                               const char *pszResampling,
                               GDALProgressFunc pfnProgress,
                               void *pProgressData)

{
    // Forward to the extended entry point, passing no options.
    const CSLConstList papszNoOptions = nullptr;
    const CPLErr eErr = GDALRegenerateOverviewsEx(
        hSrcBand, nOverviewCount, pahOvrBands, pszResampling, pfnProgress,
        pProgressData, papszNoOptions);
    return eErr;
}
4373
4374
/************************************************************************/
4375
/*                     GDALRegenerateOverviewsEx()                      */
4376
/************************************************************************/
4377
4378
// Factor turning a radius into a diameter. Used below when sizing the
// height of the source chunks (multiplied by the kernel radius and/or the
// maximum overview factor).
constexpr int RADIUS_TO_DIAMETER = 2;
4379
4380
/**
4381
 * \brief Generate downsampled overviews.
4382
 *
4383
 * This function will generate one or more overview images from a base image
4384
 * using the requested downsampling algorithm.  Its primary use is for
4385
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4386
 * used to generate downsampled images in one file from another outside the
4387
 * overview architecture.
4388
 *
4389
 * The output bands need to exist in advance.
4390
 *
4391
 * The full set of resampling algorithms is documented in
4392
 * GDALDataset::BuildOverviews().
4393
 *
4394
 * This function will honour properly NODATA_VALUES tuples (special dataset
4395
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4396
 * considered as the nodata value and not each value of the triplet
4397
 * independently per band.
4398
 *
4399
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4400
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4401
 * overview computation.
4402
 *
4403
 * @param hSrcBand the source (base level) band.
4404
 * @param nOverviewCount the number of downsampled bands being generated.
4405
 * @param pahOvrBands the list of downsampled bands to be generated.
4406
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4407
 * @param pfnProgress progress report function.
4408
 * @param pProgressData progress function callback data.
4409
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4410
 * NULL
4411
 * @return CE_None on success or CE_Failure on failure.
4412
 * @since GDAL 3.6
4413
 */
4414
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4415
                                 GDALRasterBandH *pahOvrBands,
4416
                                 const char *pszResampling,
4417
                                 GDALProgressFunc pfnProgress,
4418
                                 void *pProgressData, CSLConstList papszOptions)
4419
4420
0
{
4421
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4422
0
    GDALRasterBand **papoOvrBands =
4423
0
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4424
4425
0
    if (pfnProgress == nullptr)
4426
0
        pfnProgress = GDALDummyProgress;
4427
4428
0
    if (EQUAL(pszResampling, "NONE"))
4429
0
        return CE_None;
4430
4431
0
    int nKernelRadius = 0;
4432
0
    GDALResampleFunction pfnResampleFn =
4433
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
4434
4435
0
    if (pfnResampleFn == nullptr)
4436
0
        return CE_Failure;
4437
4438
    /* -------------------------------------------------------------------- */
4439
    /*      Check color tables...                                           */
4440
    /* -------------------------------------------------------------------- */
4441
0
    GDALColorTable *poColorTable = nullptr;
4442
4443
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4444
0
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4445
0
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4446
0
    {
4447
0
        poColorTable = poSrcBand->GetColorTable();
4448
0
        if (poColorTable != nullptr)
4449
0
        {
4450
0
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4451
0
            {
4452
0
                CPLError(CE_Warning, CPLE_AppDefined,
4453
0
                         "Computing overviews on palette index raster bands "
4454
0
                         "with a palette whose color interpretation is not RGB "
4455
0
                         "will probably lead to unexpected results.");
4456
0
                poColorTable = nullptr;
4457
0
            }
4458
0
            else if (poColorTable->IsIdentity())
4459
0
            {
4460
0
                poColorTable = nullptr;
4461
0
            }
4462
0
        }
4463
0
        else
4464
0
        {
4465
0
            CPLError(CE_Warning, CPLE_AppDefined,
4466
0
                     "Computing overviews on palette index raster bands "
4467
0
                     "without a palette will probably lead to unexpected "
4468
0
                     "results.");
4469
0
        }
4470
0
    }
4471
    // Not ready yet
4472
0
    else if ((EQUAL(pszResampling, "CUBIC") ||
4473
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4474
0
              EQUAL(pszResampling, "LANCZOS") ||
4475
0
              EQUAL(pszResampling, "BILINEAR")) &&
4476
0
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4477
0
    {
4478
0
        CPLError(CE_Warning, CPLE_AppDefined,
4479
0
                 "Computing %s overviews on palette index raster bands "
4480
0
                 "will probably lead to unexpected results.",
4481
0
                 pszResampling);
4482
0
    }
4483
4484
    // If we have a nodata mask and we are doing something more complicated
4485
    // than nearest neighbouring, we have to fetch the nodata mask.
4486
4487
0
    GDALRasterBand *poMaskBand = nullptr;
4488
0
    bool bUseNoDataMask = false;
4489
0
    bool bCanUseCascaded = true;
4490
4491
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4492
0
    {
4493
        // Special case if we are an alpha/mask band. We want it to be
4494
        // considered as the mask band to avoid alpha=0 to be taken into account
4495
        // in average computation.
4496
0
        if (poSrcBand->IsMaskBand())
4497
0
        {
4498
0
            poMaskBand = poSrcBand;
4499
0
            bUseNoDataMask = true;
4500
0
        }
4501
0
        else
4502
0
        {
4503
0
            poMaskBand = poSrcBand->GetMaskBand();
4504
0
            const int nMaskFlags = poSrcBand->GetMaskFlags();
4505
0
            bCanUseCascaded =
4506
0
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4507
0
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4508
0
        }
4509
0
    }
4510
4511
    /* -------------------------------------------------------------------- */
4512
    /*      If we are operating on multiple overviews, and using            */
4513
    /*      averaging, lets do them in cascading order to reduce the        */
4514
    /*      amount of computation.                                          */
4515
    /* -------------------------------------------------------------------- */
4516
4517
    // In case the mask may be computed from another band of the dataset,
4518
    // we can't use cascaded generation, as the computation of the overviews
4519
    // of the band used for the mask band may not have yet occurred (#3033).
4520
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4521
0
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4522
0
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4523
0
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4524
0
         EQUAL(pszResampling, "MODE")) &&
4525
0
        nOverviewCount > 1 && bCanUseCascaded)
4526
0
        return GDALRegenerateCascadingOverviews(
4527
0
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4528
0
            pProgressData, papszOptions);
4529
4530
    /* -------------------------------------------------------------------- */
4531
    /*      Setup one horizontal swath to read from the raw buffer.         */
4532
    /* -------------------------------------------------------------------- */
4533
0
    int nFRXBlockSize = 0;
4534
0
    int nFRYBlockSize = 0;
4535
0
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4536
4537
0
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4538
0
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4539
0
                                       EQUAL(pszResampling, "MODE") ||
4540
0
                                       !GDALDataTypeIsComplex(eSrcDataType);
4541
0
    const GDALDataType eWrkDataType =
4542
0
        bUseGenericResampleFn
4543
0
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4544
0
            : GDT_CFloat32;
4545
4546
0
    const int nWidth = poSrcBand->GetXSize();
4547
0
    const int nHeight = poSrcBand->GetYSize();
4548
4549
0
    int nMaxOvrFactor = 1;
4550
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4551
0
    {
4552
0
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4553
0
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4554
0
        nMaxOvrFactor = std::max(
4555
0
            nMaxOvrFactor,
4556
0
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4557
0
        nMaxOvrFactor = std::max(
4558
0
            nMaxOvrFactor,
4559
0
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4560
0
    }
4561
4562
0
    int nFullResYChunk = nFRYBlockSize;
4563
0
    int nMaxChunkYSizeQueried = 0;
4564
4565
0
    const auto UpdateChunkHeightAndGetChunkSize =
4566
0
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4567
0
         eWrkDataType, nWidth]()
4568
0
    {
4569
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4570
        // + nFullResYChunk) / nMaxOvrFactor)
4571
0
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4572
0
        {
4573
0
            return GINTBIG_MAX;
4574
0
        }
4575
0
        nFullResYChunk =
4576
0
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4577
0
        if ((nKernelRadius > 0 &&
4578
0
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4579
0
            nFullResYChunk >
4580
0
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4581
0
        {
4582
0
            return GINTBIG_MAX;
4583
0
        }
4584
0
        nMaxChunkYSizeQueried =
4585
0
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4586
0
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4587
0
            std::numeric_limits<int64_t>::max() /
4588
0
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4589
0
        {
4590
0
            return GINTBIG_MAX;
4591
0
        }
4592
0
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4593
0
               nMaxChunkYSizeQueried * nWidth;
4594
0
    };
4595
4596
0
    const char *pszChunkYSize =
4597
0
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4598
0
#ifndef __COVERITY__
4599
    // Only configurable for debug / testing
4600
0
    if (pszChunkYSize)
4601
0
    {
4602
0
        nFullResYChunk = atoi(pszChunkYSize);
4603
0
    }
4604
0
#endif
4605
4606
    // Only configurable for debug / testing
4607
0
    const int nChunkMaxSize =
4608
0
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4609
4610
0
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4611
0
    if (nChunkSize > nChunkMaxSize)
4612
0
    {
4613
0
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4614
0
            !GDALDataTypeIsComplex(eSrcDataType) &&
4615
0
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
4616
0
             EQUAL(pszResampling, "AVERAGE")))
4617
0
        {
4618
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4619
            // which use a block based strategy, which is much less memory
4620
            // hungry.
4621
0
            return GDALRegenerateOverviewsMultiBand(
4622
0
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4623
0
                pfnProgress, pProgressData, papszOptions);
4624
0
        }
4625
0
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4626
0
        {
4627
0
            return GDALRegenerateCascadingOverviews(
4628
0
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4629
0
                pfnProgress, pProgressData, papszOptions);
4630
0
        }
4631
0
    }
4632
0
    else if (pszChunkYSize == nullptr)
4633
0
    {
4634
        // Try to get as close as possible to nChunkMaxSize
4635
0
        while (nChunkSize < nChunkMaxSize / 2)
4636
0
        {
4637
0
            nFullResYChunk *= 2;
4638
0
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
4639
0
        }
4640
0
    }
4641
4642
0
    int nHasNoData = 0;
4643
0
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4644
0
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4645
0
    const bool bPropagateNoData =
4646
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4647
4648
    // Structure describing a resampling job
4649
0
    struct OvrJob
4650
0
    {
4651
        // Buffers to free when job is finished
4652
0
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4653
0
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4654
0
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4655
4656
0
        GDALRasterBand *poDstBand = nullptr;
4657
4658
        // Input parameters of pfnResampleFn
4659
0
        GDALResampleFunction pfnResampleFn = nullptr;
4660
0
        int nSrcWidth = 0;
4661
0
        int nSrcHeight = 0;
4662
0
        int nDstWidth = 0;
4663
0
        GDALOverviewResampleArgs args{};
4664
0
        const void *pChunk = nullptr;
4665
0
        bool bUseGenericResampleFn = false;
4666
4667
        // Output values of resampling function
4668
0
        CPLErr eErr = CE_Failure;
4669
0
        void *pDstBuffer = nullptr;
4670
0
        GDALDataType eDstBufferDataType = GDT_Unknown;
4671
4672
0
        void SetSrcMaskBufferHolder(
4673
0
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4674
0
        {
4675
0
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4676
0
        }
4677
4678
0
        void SetSrcBufferHolder(
4679
0
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4680
0
        {
4681
0
            oSrcBufferHolder = oSrcBufferHolderIn;
4682
0
        }
4683
4684
0
        void NotifyFinished()
4685
0
        {
4686
0
            std::lock_guard guard(mutex);
4687
0
            bFinished = true;
4688
0
            cv.notify_one();
4689
0
        }
4690
4691
0
        bool IsFinished()
4692
0
        {
4693
0
            std::lock_guard guard(mutex);
4694
0
            return bFinished;
4695
0
        }
4696
4697
0
        void WaitFinished()
4698
0
        {
4699
0
            std::unique_lock oGuard(mutex);
4700
0
            while (!bFinished)
4701
0
            {
4702
0
                cv.wait(oGuard);
4703
0
            }
4704
0
        }
4705
4706
0
      private:
4707
        // Synchronization
4708
0
        bool bFinished = false;
4709
0
        std::mutex mutex{};
4710
0
        std::condition_variable cv{};
4711
0
    };
4712
4713
    // Thread function to resample
4714
0
    const auto JobResampleFunc = [](void *pData)
4715
0
    {
4716
0
        OvrJob *poJob = static_cast<OvrJob *>(pData);
4717
4718
0
        if (poJob->bUseGenericResampleFn)
4719
0
        {
4720
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4721
0
                                               &(poJob->pDstBuffer),
4722
0
                                               &(poJob->eDstBufferDataType));
4723
0
        }
4724
0
        else
4725
0
        {
4726
0
            poJob->eErr = GDALResampleChunkC32R(
4727
0
                poJob->nSrcWidth, poJob->nSrcHeight,
4728
0
                static_cast<const float *>(poJob->pChunk),
4729
0
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4730
0
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4731
0
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4732
0
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4733
0
                poJob->args.pszResampling);
4734
0
        }
4735
4736
0
        poJob->oDstBufferHolder =
4737
0
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
4738
4739
0
        poJob->NotifyFinished();
4740
0
    };
4741
4742
    // Function to write resample data to target band
4743
0
    const auto WriteJobData = [](const OvrJob *poJob)
4744
0
    {
4745
0
        return poJob->poDstBand->RasterIO(
4746
0
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4747
0
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4748
0
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4749
0
            poJob->eDstBufferDataType, 0, 0, nullptr);
4750
0
    };
4751
4752
    // Wait for completion of oldest job and serialize it
4753
0
    const auto WaitAndFinalizeOldestJob =
4754
0
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4755
0
    {
4756
0
        auto poOldestJob = jobList.front().get();
4757
0
        poOldestJob->WaitFinished();
4758
0
        CPLErr l_eErr = poOldestJob->eErr;
4759
0
        if (l_eErr == CE_None)
4760
0
        {
4761
0
            l_eErr = WriteJobData(poOldestJob);
4762
0
        }
4763
4764
0
        jobList.pop_front();
4765
0
        return l_eErr;
4766
0
    };
4767
4768
    // Queue of jobs
4769
0
    std::list<std::unique_ptr<OvrJob>> jobList;
4770
4771
0
    GByte *pabyChunkNodataMask = nullptr;
4772
0
    void *pChunk = nullptr;
4773
4774
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4775
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4776
0
                                                       ? CPLGetNumCPUs()
4777
0
                                                       : atoi(pszThreads)));
4778
0
    auto poThreadPool =
4779
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4780
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4781
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
4782
4783
    /* -------------------------------------------------------------------- */
4784
    /*      Loop over image operating on chunks.                            */
4785
    /* -------------------------------------------------------------------- */
4786
0
    int nChunkYOff = 0;
4787
0
    CPLErr eErr = CE_None;
4788
4789
0
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4790
0
         nChunkYOff += nFullResYChunk)
4791
0
    {
4792
0
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4793
0
                         pProgressData))
4794
0
        {
4795
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4796
0
            eErr = CE_Failure;
4797
0
        }
4798
4799
0
        if (nFullResYChunk + nChunkYOff > nHeight)
4800
0
            nFullResYChunk = nHeight - nChunkYOff;
4801
4802
0
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4803
0
        int nChunkYSizeQueried =
4804
0
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4805
0
        if (nChunkYOffQueried < 0)
4806
0
        {
4807
0
            nChunkYSizeQueried += nChunkYOffQueried;
4808
0
            nChunkYOffQueried = 0;
4809
0
        }
4810
0
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4811
0
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4812
4813
        // Avoid accumulating too many tasks and exhausting RAM
4814
        // Try to complete already finished jobs
4815
0
        while (eErr == CE_None && !jobList.empty())
4816
0
        {
4817
0
            auto poOldestJob = jobList.front().get();
4818
0
            if (!poOldestJob->IsFinished())
4819
0
                break;
4820
0
            eErr = poOldestJob->eErr;
4821
0
            if (eErr == CE_None)
4822
0
            {
4823
0
                eErr = WriteJobData(poOldestJob);
4824
0
            }
4825
4826
0
            jobList.pop_front();
4827
0
        }
4828
4829
        // And in case we have saturated the number of threads,
4830
        // wait for completion of tasks to go below the threshold.
4831
0
        while (eErr == CE_None &&
4832
0
               jobList.size() >= static_cast<size_t>(nThreads))
4833
0
        {
4834
0
            eErr = WaitAndFinalizeOldestJob(jobList);
4835
0
        }
4836
4837
        // (Re)allocate buffers if needed
4838
0
        if (pChunk == nullptr)
4839
0
        {
4840
0
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4841
0
                                         nMaxChunkYSizeQueried, nWidth);
4842
0
        }
4843
0
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4844
0
        {
4845
0
            pabyChunkNodataMask = static_cast<GByte *>(
4846
0
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4847
0
        }
4848
4849
0
        if (pChunk == nullptr ||
4850
0
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4851
0
        {
4852
0
            CPLFree(pChunk);
4853
0
            CPLFree(pabyChunkNodataMask);
4854
0
            return CE_Failure;
4855
0
        }
4856
4857
        // Read chunk.
4858
0
        if (eErr == CE_None)
4859
0
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4860
0
                                       nChunkYSizeQueried, pChunk, nWidth,
4861
0
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4862
0
                                       nullptr);
4863
0
        if (eErr == CE_None && bUseNoDataMask)
4864
0
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4865
0
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4866
0
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4867
0
                                        0, nullptr);
4868
4869
        // Special case to promote 1bit data to 8bit 0/255 values.
4870
0
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4871
0
        {
4872
0
            if (eWrkDataType == GDT_Float32)
4873
0
            {
4874
0
                float *pafChunk = static_cast<float *>(pChunk);
4875
0
                for (size_t i = 0;
4876
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4877
0
                {
4878
0
                    if (pafChunk[i] == 1.0)
4879
0
                        pafChunk[i] = 255.0;
4880
0
                }
4881
0
            }
4882
0
            else if (eWrkDataType == GDT_Byte)
4883
0
            {
4884
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4885
0
                for (size_t i = 0;
4886
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4887
0
                {
4888
0
                    if (pabyChunk[i] == 1)
4889
0
                        pabyChunk[i] = 255;
4890
0
                }
4891
0
            }
4892
0
            else if (eWrkDataType == GDT_UInt16)
4893
0
            {
4894
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4895
0
                for (size_t i = 0;
4896
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4897
0
                {
4898
0
                    if (pasChunk[i] == 1)
4899
0
                        pasChunk[i] = 255;
4900
0
                }
4901
0
            }
4902
0
            else if (eWrkDataType == GDT_Float64)
4903
0
            {
4904
0
                double *padfChunk = static_cast<double *>(pChunk);
4905
0
                for (size_t i = 0;
4906
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4907
0
                {
4908
0
                    if (padfChunk[i] == 1.0)
4909
0
                        padfChunk[i] = 255.0;
4910
0
                }
4911
0
            }
4912
0
            else
4913
0
            {
4914
0
                CPLAssert(false);
4915
0
            }
4916
0
        }
4917
0
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4918
0
        {
4919
0
            if (eWrkDataType == GDT_Float32)
4920
0
            {
4921
0
                float *pafChunk = static_cast<float *>(pChunk);
4922
0
                for (size_t i = 0;
4923
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4924
0
                {
4925
0
                    if (pafChunk[i] == 1.0)
4926
0
                        pafChunk[i] = 0.0;
4927
0
                    else if (pafChunk[i] == 0.0)
4928
0
                        pafChunk[i] = 255.0;
4929
0
                }
4930
0
            }
4931
0
            else if (eWrkDataType == GDT_Byte)
4932
0
            {
4933
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4934
0
                for (size_t i = 0;
4935
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4936
0
                {
4937
0
                    if (pabyChunk[i] == 1)
4938
0
                        pabyChunk[i] = 0;
4939
0
                    else if (pabyChunk[i] == 0)
4940
0
                        pabyChunk[i] = 255;
4941
0
                }
4942
0
            }
4943
0
            else if (eWrkDataType == GDT_UInt16)
4944
0
            {
4945
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4946
0
                for (size_t i = 0;
4947
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4948
0
                {
4949
0
                    if (pasChunk[i] == 1)
4950
0
                        pasChunk[i] = 0;
4951
0
                    else if (pasChunk[i] == 0)
4952
0
                        pasChunk[i] = 255;
4953
0
                }
4954
0
            }
4955
0
            else if (eWrkDataType == GDT_Float64)
4956
0
            {
4957
0
                double *padfChunk = static_cast<double *>(pChunk);
4958
0
                for (size_t i = 0;
4959
0
                     i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
4960
0
                {
4961
0
                    if (padfChunk[i] == 1.0)
4962
0
                        padfChunk[i] = 0.0;
4963
0
                    else if (padfChunk[i] == 0.0)
4964
0
                        padfChunk[i] = 255.0;
4965
0
                }
4966
0
            }
4967
0
            else
4968
0
            {
4969
0
                CPLAssert(false);
4970
0
            }
4971
0
        }
4972
4973
0
        auto oSrcBufferHolder =
4974
0
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4975
0
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4976
0
            poJobQueue ? pabyChunkNodataMask : nullptr);
4977
4978
0
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4979
0
             ++iOverview)
4980
0
        {
4981
0
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4982
0
            const int nDstWidth = poDstBand->GetXSize();
4983
0
            const int nDstHeight = poDstBand->GetYSize();
4984
4985
0
            const double dfXRatioDstToSrc =
4986
0
                static_cast<double>(nWidth) / nDstWidth;
4987
0
            const double dfYRatioDstToSrc =
4988
0
                static_cast<double>(nHeight) / nDstHeight;
4989
4990
            /* --------------------------------------------------------------------
4991
             */
4992
            /*      Figure out the line to start writing to, and the first line
4993
             */
4994
            /*      to not write to.  In theory this approach should ensure that
4995
             */
4996
            /*      every output line will be written if all input chunks are */
4997
            /*      processed. */
4998
            /* --------------------------------------------------------------------
4999
             */
5000
0
            int nDstYOff =
5001
0
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5002
0
            if (nDstYOff == nDstHeight)
5003
0
                continue;
5004
0
            int nDstYOff2 = static_cast<int>(
5005
0
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5006
5007
0
            if (nChunkYOff + nFullResYChunk == nHeight)
5008
0
                nDstYOff2 = nDstHeight;
5009
#if DEBUG_VERBOSE
5010
            CPLDebug("GDAL",
5011
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5012
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5013
                     nDstWidth, nDstYOff2 - nDstYOff);
5014
#endif
5015
5016
0
            auto poJob = std::make_unique<OvrJob>();
5017
0
            poJob->pfnResampleFn = pfnResampleFn;
5018
0
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5019
0
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5020
0
            poJob->args.nOvrXSize = poDstBand->GetXSize();
5021
0
            poJob->args.nOvrYSize = poDstBand->GetYSize();
5022
0
            const char *pszNBITS =
5023
0
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5024
0
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5025
0
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5026
0
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5027
0
            poJob->args.eWrkDataType = eWrkDataType;
5028
0
            poJob->pChunk = pChunk;
5029
0
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5030
0
            poJob->nSrcWidth = nWidth;
5031
0
            poJob->nSrcHeight = nHeight;
5032
0
            poJob->args.nChunkXOff = 0;
5033
0
            poJob->args.nChunkXSize = nWidth;
5034
0
            poJob->args.nChunkYOff = nChunkYOffQueried;
5035
0
            poJob->args.nChunkYSize = nChunkYSizeQueried;
5036
0
            poJob->nDstWidth = nDstWidth;
5037
0
            poJob->args.nDstXOff = 0;
5038
0
            poJob->args.nDstXOff2 = nDstWidth;
5039
0
            poJob->args.nDstYOff = nDstYOff;
5040
0
            poJob->args.nDstYOff2 = nDstYOff2;
5041
0
            poJob->poDstBand = poDstBand;
5042
0
            poJob->args.pszResampling = pszResampling;
5043
0
            poJob->args.bHasNoData = bHasNoData;
5044
0
            poJob->args.dfNoDataValue = dfNoDataValue;
5045
0
            poJob->args.poColorTable = poColorTable;
5046
0
            poJob->args.eSrcDataType = eSrcDataType;
5047
0
            poJob->args.bPropagateNoData = bPropagateNoData;
5048
5049
0
            if (poJobQueue)
5050
0
            {
5051
0
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5052
0
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
5053
0
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5054
0
                jobList.emplace_back(std::move(poJob));
5055
0
            }
5056
0
            else
5057
0
            {
5058
0
                JobResampleFunc(poJob.get());
5059
0
                eErr = poJob->eErr;
5060
0
                if (eErr == CE_None)
5061
0
                {
5062
0
                    eErr = WriteJobData(poJob.get());
5063
0
                }
5064
0
            }
5065
0
        }
5066
5067
0
        if (poJobQueue)
5068
0
        {
5069
0
            pChunk = nullptr;
5070
0
            pabyChunkNodataMask = nullptr;
5071
0
        }
5072
0
    }
5073
5074
0
    VSIFree(pChunk);
5075
0
    VSIFree(pabyChunkNodataMask);
5076
5077
    // Wait for all pending jobs to complete
5078
0
    while (!jobList.empty())
5079
0
    {
5080
0
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5081
0
        if (l_eErr != CE_None && eErr == CE_None)
5082
0
            eErr = l_eErr;
5083
0
    }
5084
5085
    /* -------------------------------------------------------------------- */
5086
    /*      Renormalized overview mean / stddev if needed.                  */
5087
    /* -------------------------------------------------------------------- */
5088
0
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5089
0
    {
5090
0
        GDALOverviewMagnitudeCorrection(
5091
0
            poSrcBand, nOverviewCount,
5092
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5093
0
            GDALDummyProgress, nullptr);
5094
0
    }
5095
5096
    /* -------------------------------------------------------------------- */
5097
    /*      It can be important to flush out data to overviews.             */
5098
    /* -------------------------------------------------------------------- */
5099
0
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5100
0
         ++iOverview)
5101
0
    {
5102
0
        eErr = papoOvrBands[iOverview]->FlushCache(false);
5103
0
    }
5104
5105
0
    if (eErr == CE_None)
5106
0
        pfnProgress(1.0, nullptr, pProgressData);
5107
5108
0
    return eErr;
5109
0
}
5110
5111
/************************************************************************/
5112
/*            GDALRegenerateOverviewsMultiBand()                        */
5113
/************************************************************************/
5114
5115
/**
5116
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5117
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5118
 *
5119
 * This function will generate one or more overview images from a base
5120
 * image using the requested downsampling algorithm.  Its primary use
5121
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5122
 * can also be used to generate downsampled images in one file from another
5123
 * outside the overview architecture.
5124
 *
5125
 * The output bands need to exist in advance and share the same characteristics
5126
 * (type, dimensions)
5127
 *
5128
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5129
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5130
 *
5131
 * It does not support color tables or complex data types.
5132
 *
5133
 * The pseudo-algorithm used by the function is :
5134
 *    for each overview
5135
 *       iterate on lines of the source by a step of deltay
5136
 *           iterate on columns of the source  by a step of deltax
5137
 *               read the source data of size deltax * deltay for all the bands
5138
 *               generate the corresponding overview block for all the bands
5139
 *
5140
 * This function will honour properly NODATA_VALUES tuples (special dataset
5141
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5142
 * considered as the nodata value and not each value of the triplet
5143
 * independently per band.
5144
 *
5145
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5146
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5147
 * overview computation.
5148
 *
5149
 * @param nBands the number of bands, size of papoSrcBands and size of
5150
 *               first dimension of papapoOverviewBands
5151
 * @param papoSrcBands the list of source bands to downsample
5152
 * @param nOverviews the number of downsampled overview levels being generated.
5153
 * @param papapoOverviewBands two-dimensional array of bands. First dimension is
5154
 *                            indexed by nBands. Second dimension is indexed by
5155
 *                            nOverviews.
5156
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5157
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5158
 * @param pfnProgress progress report function.
5159
 * @param pProgressData progress function callback data.
5160
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5161
 *                     key=value pairs, or NULL
5162
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5163
 *                     options can be specified to express that overviews should
5164
 *                     be regenerated only in the specified subset of the source
5165
 *                     dataset.
5166
 * @return CE_None on success or CE_Failure on failure.
5167
 */
5168
5169
CPLErr GDALRegenerateOverviewsMultiBand(
5170
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5171
    GDALRasterBand *const *const *papapoOverviewBands,
5172
    const char *pszResampling, GDALProgressFunc pfnProgress,
5173
    void *pProgressData, CSLConstList papszOptions)
5174
0
{
5175
0
    CPL_IGNORE_RET_VAL(papszOptions);
5176
5177
0
    if (pfnProgress == nullptr)
5178
0
        pfnProgress = GDALDummyProgress;
5179
5180
0
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5181
0
        return CE_None;
5182
5183
    // Sanity checks.
5184
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5185
0
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5186
0
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5187
0
        !EQUAL(pszResampling, "CUBICSPLINE") &&
5188
0
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5189
0
        !EQUAL(pszResampling, "MODE"))
5190
0
    {
5191
0
        CPLError(CE_Failure, CPLE_NotSupported,
5192
0
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5193
0
                 "not supported",
5194
0
                 pszResampling);
5195
0
        return CE_Failure;
5196
0
    }
5197
5198
0
    int nKernelRadius = 0;
5199
0
    GDALResampleFunction pfnResampleFn =
5200
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
5201
0
    if (pfnResampleFn == nullptr)
5202
0
        return CE_Failure;
5203
5204
0
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5205
0
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5206
0
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5207
0
        return CE_None;
5208
0
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5209
0
    for (int iBand = 1; iBand < nBands; ++iBand)
5210
0
    {
5211
0
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5212
0
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5213
0
        {
5214
0
            CPLError(
5215
0
                CE_Failure, CPLE_NotSupported,
5216
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5217
0
                "have the same dimensions");
5218
0
            return CE_Failure;
5219
0
        }
5220
0
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5221
0
        {
5222
0
            CPLError(
5223
0
                CE_Failure, CPLE_NotSupported,
5224
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5225
0
                "have the same data type");
5226
0
            return CE_Failure;
5227
0
        }
5228
0
    }
5229
5230
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5231
0
    {
5232
0
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5233
0
        const int nDstWidth = poOvrFirstBand->GetXSize();
5234
0
        const int nDstHeight = poOvrFirstBand->GetYSize();
5235
0
        for (int iBand = 1; iBand < nBands; ++iBand)
5236
0
        {
5237
0
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5238
0
            if (poOvrBand->GetXSize() != nDstWidth ||
5239
0
                poOvrBand->GetYSize() != nDstHeight)
5240
0
            {
5241
0
                CPLError(
5242
0
                    CE_Failure, CPLE_NotSupported,
5243
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5244
0
                    "of the same level must have the same dimensions");
5245
0
                return CE_Failure;
5246
0
            }
5247
0
            if (poOvrBand->GetRasterDataType() != eDataType)
5248
0
            {
5249
0
                CPLError(
5250
0
                    CE_Failure, CPLE_NotSupported,
5251
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5252
0
                    "must have the same data type as the source bands");
5253
0
                return CE_Failure;
5254
0
            }
5255
0
        }
5256
0
    }
5257
5258
    // First pass to compute the total number of pixels to write.
5259
0
    double dfTotalPixelCount = 0;
5260
0
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5261
0
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5262
0
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
5263
0
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5264
0
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
5265
0
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5266
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5267
0
    {
5268
0
        dfTotalPixelCount +=
5269
0
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5270
0
            papapoOverviewBands[0][iOverview]->GetXSize() *
5271
0
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5272
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5273
0
    }
5274
5275
0
    const GDALDataType eWrkDataType =
5276
0
        GDALGetOvrWorkDataType(pszResampling, eDataType);
5277
0
    const int nWrkDataTypeSize =
5278
0
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5279
5280
0
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5281
5282
    // If we have a nodata mask and we are doing something more complicated
5283
    // than nearest neighbouring, we have to fetch the nodata mask.
5284
0
    const bool bUseNoDataMask =
5285
0
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
5286
0
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5287
5288
0
    std::vector<bool> abHasNoData(nBands);
5289
0
    std::vector<double> adfNoDataValue(nBands);
5290
5291
0
    for (int iBand = 0; iBand < nBands; ++iBand)
5292
0
    {
5293
0
        int nHasNoData = 0;
5294
0
        adfNoDataValue[iBand] =
5295
0
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5296
0
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5297
0
    }
5298
0
    const bool bPropagateNoData =
5299
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5300
5301
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5302
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5303
0
                                                       ? CPLGetNumCPUs()
5304
0
                                                       : atoi(pszThreads)));
5305
0
    auto poThreadPool =
5306
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5307
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5308
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5309
5310
    // Only configurable for debug / testing
5311
0
    const GIntBig nChunkMaxSize = []() -> GIntBig
5312
0
    {
5313
0
        const char *pszVal =
5314
0
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5315
0
        if (pszVal)
5316
0
        {
5317
0
            GIntBig nRet = 0;
5318
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5319
0
            return std::max<GIntBig>(100, nRet);
5320
0
        }
5321
0
        return 10 * 1024 * 1024;
5322
0
    }();
5323
5324
    // Only configurable for debug / testing
5325
0
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5326
0
    {
5327
0
        const char *pszVal = CPLGetConfigOption(
5328
0
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5329
0
        if (pszVal)
5330
0
        {
5331
0
            GIntBig nRet = 0;
5332
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5333
0
            return std::max<GIntBig>(100, nRet);
5334
0
        }
5335
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5336
0
        if (nUsableRAM > 0)
5337
0
            return nUsableRAM / 10;
5338
        // Select a value to be able to at least downsample by 2 for a RGB
5339
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5340
0
        return 100 * 1024 * 1024;
5341
0
    }();
5342
5343
    // Second pass to do the real job.
5344
0
    double dfCurPixelCount = 0;
5345
0
    CPLErr eErr = CE_None;
5346
0
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5347
0
         ++iOverview)
5348
0
    {
5349
0
        int iSrcOverview = -1;  // -1 means the source bands.
5350
5351
0
        const int nDstTotalWidth =
5352
0
            papapoOverviewBands[0][iOverview]->GetXSize();
5353
0
        const int nDstTotalHeight =
5354
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5355
5356
        // Compute the coordinates of the target region to refresh
5357
0
        constexpr double EPS = 1e-8;
5358
0
        const int nDstXOffStart = static_cast<int>(
5359
0
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5360
0
            EPS);
5361
0
        const int nDstXOffEnd =
5362
0
            std::min(static_cast<int>(
5363
0
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5364
0
                                       nToplevelSrcWidth * nDstTotalWidth -
5365
0
                                   EPS)),
5366
0
                     nDstTotalWidth);
5367
0
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5368
0
        const int nDstYOffStart =
5369
0
            static_cast<int>(static_cast<double>(nSrcYOff) /
5370
0
                                 nToplevelSrcHeight * nDstTotalHeight +
5371
0
                             EPS);
5372
0
        const int nDstYOffEnd =
5373
0
            std::min(static_cast<int>(
5374
0
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5375
0
                                       nToplevelSrcHeight * nDstTotalHeight -
5376
0
                                   EPS)),
5377
0
                     nDstTotalHeight);
5378
0
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5379
5380
        // Try to use previous level of overview as the source to compute
5381
        // the next level.
5382
0
        int nSrcWidth = nToplevelSrcWidth;
5383
0
        int nSrcHeight = nToplevelSrcHeight;
5384
0
        if (iOverview > 0 &&
5385
0
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5386
0
        {
5387
0
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5388
0
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5389
0
            iSrcOverview = iOverview - 1;
5390
0
        }
5391
5392
0
        const double dfXRatioDstToSrc =
5393
0
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
5394
0
        const double dfYRatioDstToSrc =
5395
0
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
5396
5397
0
        const int nOvrFactor =
5398
0
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5399
0
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
5400
5401
0
        int nDstChunkXSize = 0;
5402
0
        int nDstChunkYSize = 0;
5403
0
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5404
0
                                                        &nDstChunkYSize);
5405
5406
0
        constexpr int PIXEL_MARGIN = 2;
5407
        // Try to extend the chunk size so that the memory needed to acquire
5408
        // source pixels goes up to 10 MB.
5409
        // This can help for drivers that support multi-threaded reading
5410
0
        const int nFullResYChunk = static_cast<int>(std::min<double>(
5411
0
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5412
0
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5413
0
            nSrcHeight,
5414
0
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5415
0
                                 nKernelRadius * nOvrFactor));
5416
0
        while (nDstChunkXSize < nDstWidth)
5417
0
        {
5418
0
            constexpr int INCREASE_FACTOR = 2;
5419
5420
0
            const int nFullResXChunk = static_cast<int>(std::min<double>(
5421
0
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5422
0
                                              dfXRatioDstToSrc));
5423
5424
0
            const int nFullResXChunkQueried =
5425
0
                static_cast<int>(std::min<int64_t>(
5426
0
                    nSrcWidth,
5427
0
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5428
0
                                         nKernelRadius * nOvrFactor));
5429
5430
0
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5431
0
                             nFullResYChunkQueried / nWrkDataTypeSize)
5432
0
            {
5433
0
                break;
5434
0
            }
5435
5436
0
            nDstChunkXSize *= INCREASE_FACTOR;
5437
0
        }
5438
0
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5439
5440
0
        const int nFullResXChunk = static_cast<int>(std::min<double>(
5441
0
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5442
0
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5443
0
            nSrcWidth,
5444
0
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5445
0
                                 nKernelRadius * nOvrFactor));
5446
5447
        // Make sure that the RAM requirements to acquire the source data does
5448
        // not exceed nChunkMaxSizeForTempFile
5449
        // If so, reduce the destination chunk size, generate overviews in a
5450
        // temporary dataset, and copy that temporary dataset over the target
5451
        // overview bands (to avoid issues with lossy compression)
5452
0
        const bool bOverflowFullResXChunkYChunkQueried =
5453
0
            nBands > std::numeric_limits<int64_t>::max() /
5454
0
                         nFullResXChunkQueried / nFullResYChunkQueried /
5455
0
                         nWrkDataTypeSize;
5456
5457
0
        const auto nMemRequirement =
5458
0
            bOverflowFullResXChunkYChunkQueried
5459
0
                ? 0
5460
0
                : static_cast<GIntBig>(nFullResXChunkQueried) *
5461
0
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5462
        // Use a temporary dataset with a smaller destination chunk size
5463
0
        const auto nOverShootFactor =
5464
0
            nMemRequirement / nChunkMaxSizeForTempFile;
5465
5466
0
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
5467
0
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5468
0
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5469
0
                                      static_cast<double>(nOverShootFactor)))));
5470
0
        constexpr int DEFAULT_CHUNK_SIZE = 256;
5471
0
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5472
0
        const int nReducedDstChunkXSize =
5473
0
            bOverflowFullResXChunkYChunkQueried
5474
0
                ? DEFAULT_CHUNK_SIZE
5475
0
                : std::max(1, static_cast<int>(nDstChunkXSize /
5476
0
                                               nSqrtOverShootFactor) &
5477
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5478
0
        const int nReducedDstChunkYSize =
5479
0
            bOverflowFullResXChunkYChunkQueried
5480
0
                ? DEFAULT_CHUNK_SIZE
5481
0
                : std::max(1, static_cast<int>(nDstChunkYSize /
5482
0
                                               nSqrtOverShootFactor) &
5483
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5484
5485
0
        if (bOverflowFullResXChunkYChunkQueried ||
5486
0
            nMemRequirement > nChunkMaxSizeForTempFile)
5487
0
        {
5488
0
            const auto nDTSize =
5489
0
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5490
0
            const bool bTmpDSMemRequirementOverflow =
5491
0
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5492
0
                             nDstHeight / nDTSize;
5493
0
            const auto nTmpDSMemRequirement =
5494
0
                bTmpDSMemRequirementOverflow
5495
0
                    ? 0
5496
0
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5497
0
                          nDTSize;
5498
5499
            // make sure that one band buffer doesn't overflow size_t
5500
0
            const bool bChunkSizeOverflow =
5501
0
                static_cast<size_t>(nDTSize) >
5502
0
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5503
0
            const size_t nChunkSize =
5504
0
                bChunkSizeOverflow
5505
0
                    ? 0
5506
0
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5507
5508
0
            // Helper building an in-memory VRT dataset of size
            // nDstTotalWidth x nDstTotalHeight that exposes, for each of the
            // nBands bands, a resampled view of the source band (or of the
            // overview level iSrcOverview when it is not -1).
            // nVRTBlockXSize / nVRTBlockYSize are forwarded to the VRTDataset
            // constructor (presumably its block size -- confirm against
            // VRTDataset). Returns the std::unique_ptr<VRTDataset> that owns
            // the result.
            const auto CreateVRT =
5509
0
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5510
0
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5511
0
                 iSrcOverview, &abHasNoData,
5512
0
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5513
0
            {
5514
0
                auto poVRTDS = std::make_unique<VRTDataset>(
5515
0
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5516
0
                    nVRTBlockYSize);
5517
                // One VRT band of type eWrkDataType per input band.
5518
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5519
0
                {
5520
0
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5521
0
                    poVRTSrc->SetResampling(pszResampling);
5522
0
                    poVRTDS->AddBand(eWrkDataType);
5523
0
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5524
0
                        poVRTDS->GetRasterBand(iBand + 1));
5525
                    // Read from the previous overview level when one was
                    // selected, otherwise from the full resolution band.
5526
0
                    auto poSrcBand = papoSrcBands[iBand];
5527
0
                    if (iSrcOverview != -1)
5528
0
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5529
0
                    poVRTBand->ConfigureSource(
5530
0
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5531
0
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5532
                    // Add the source to the band. The raw pointer is released
                    // from the unique_ptr here (presumably the band takes
                    // ownership of it -- confirm against AddSource).
5533
0
                    poVRTBand->AddSource(poVRTSrc.release());
5534
0
                    if (abHasNoData[iBand])
5535
0
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5536
0
                }
5537
                // When the first source band reports a per-dataset mask and a
                // mask band can be created on the VRT, map the source mask
                // band with the same source/destination windows.
                // NOTE(review): the mask flags are tested on papoSrcBands[0]
                // even when the mask is then taken from the overview band.
5538
0
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5539
0
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5540
0
                {
5541
0
                    VRTSourcedRasterBand *poMaskVRTBand =
5542
0
                        cpl::down_cast<VRTSourcedRasterBand *>(
5543
0
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
5544
0
                    auto poSrcBand = papoSrcBands[0];
5545
0
                    if (iSrcOverview != -1)
5546
0
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
5547
0
                    poMaskVRTBand->AddMaskBandSource(
5548
0
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5549
0
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5550
0
                }
5551
                // Ownership of the dataset is handed to the caller.
5552
0
                return poVRTDS;
5553
0
            };
5554
5555
            // If the overview accommodates chunking, do so and recurse
5556
            // to avoid generating full size temporary files
5557
0
            if (!bOverflowFullResXChunkYChunkQueried &&
5558
0
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5559
0
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5560
0
            {
5561
                // Create a VRT with the smaller chunk to do the scaling
5562
0
                auto poVRTDS =
5563
0
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5564
5565
0
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
5566
0
                std::vector<GDALRasterBand *> apoDstBand(nBands);
5567
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5568
0
                {
5569
0
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5570
0
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5571
0
                }
5572
5573
                // Use a flag to avoid reading from the overview being built
5574
0
                GDALRasterIOExtraArg sExtraArg;
5575
0
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5576
0
                if (iSrcOverview == -1)
5577
0
                    sExtraArg.bUseOnlyThisScale = true;
5578
5579
                // A single band buffer for data transfer to the overview
5580
0
                std::vector<GByte> abyChunk;
5581
0
                try
5582
0
                {
5583
0
                    abyChunk.resize(nChunkSize);
5584
0
                }
5585
0
                catch (const std::exception &)
5586
0
                {
5587
0
                    CPLError(CE_Failure, CPLE_OutOfMemory,
5588
0
                             "Out of memory allocating temporary buffer");
5589
0
                    return CE_Failure;
5590
0
                }
5591
5592
                // Loop over output height, in chunks
5593
0
                for (int nDstYOff = nDstYOffStart;
5594
0
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
5595
0
                     /* */)
5596
0
                {
5597
0
                    const int nDstYCount =
5598
0
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5599
                    // Loop over output width, in output chunks
5600
0
                    for (int nDstXOff = nDstXOffStart;
5601
0
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
5602
0
                         /* */)
5603
0
                    {
5604
0
                        const int nDstXCount =
5605
0
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5606
                        // Read and transfer the chunk to the overview
5607
0
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
5608
0
                             ++iBand)
5609
0
                        {
5610
0
                            eErr = apoVRTBand[iBand]->RasterIO(
5611
0
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
5612
0
                                nDstYCount, abyChunk.data(), nDstXCount,
5613
0
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
5614
0
                            if (eErr == CE_None)
5615
0
                            {
5616
0
                                eErr = apoDstBand[iBand]->RasterIO(
5617
0
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
5618
0
                                    nDstYCount, abyChunk.data(), nDstXCount,
5619
0
                                    nDstYCount, eDataType, 0, 0, nullptr);
5620
0
                            }
5621
0
                        }
5622
5623
0
                        dfCurPixelCount +=
5624
0
                            static_cast<double>(nDstXCount) * nDstYCount;
5625
5626
0
                        nDstXOff += nDstXCount;
5627
0
                    }  // width
5628
5629
0
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
5630
0
                                     nullptr, pProgressData))
5631
0
                    {
5632
0
                        CPLError(CE_Failure, CPLE_UserInterrupt,
5633
0
                                 "User terminated");
5634
0
                        eErr = CE_Failure;
5635
0
                    }
5636
5637
0
                    nDstYOff += nDstYCount;
5638
0
                }  // height
5639
5640
0
                if (CE_None != eErr)
5641
0
                {
5642
0
                    CPLError(CE_Failure, CPLE_AppDefined,
5643
0
                             "Error while writing overview");
5644
0
                    return CE_Failure;
5645
0
                }
5646
5647
0
                pfnProgress(1.0, nullptr, pProgressData);
5648
                // Flush the overviews we just generated
5649
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5650
0
                    apoDstBand[iBand]->FlushCache(false);
5651
5652
0
                continue;  // Next overview
5653
0
            }              // chunking via temporary dataset
5654
5655
0
            std::unique_ptr<GDALDataset> poTmpDS;
5656
            // Config option mostly/only for autotest purposes
5657
0
            const char *pszGDAL_OVR_TEMP_DRIVER =
5658
0
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5659
0
            if ((!bTmpDSMemRequirementOverflow &&
5660
0
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
5661
0
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5662
0
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5663
0
            {
5664
0
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
5665
0
                if (!poTmpDrv)
5666
0
                {
5667
0
                    eErr = CE_Failure;
5668
0
                    break;
5669
0
                }
5670
0
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5671
0
                                               nDstTotalHeight, nBands,
5672
0
                                               eDataType, nullptr));
5673
0
            }
5674
0
            else
5675
0
            {
5676
                // Create a temporary file for the overview
5677
0
                auto poTmpDrv =
5678
0
                    GetGDALDriverManager()->GetDriverByName("GTiff");
5679
0
                if (!poTmpDrv)
5680
0
                {
5681
0
                    eErr = CE_Failure;
5682
0
                    break;
5683
0
                }
5684
0
                std::string osTmpFilename;
5685
0
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5686
0
                if (poDstDS)
5687
0
                {
5688
0
                    osTmpFilename = poDstDS->GetDescription();
5689
0
                    VSIStatBufL sStatBuf;
5690
0
                    if (!osTmpFilename.empty() &&
5691
0
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5692
0
                        osTmpFilename += "_tmp_ovr.tif";
5693
0
                }
5694
0
                if (osTmpFilename.empty())
5695
0
                {
5696
0
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5697
0
                    osTmpFilename += ".tif";
5698
0
                }
5699
0
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
5700
0
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5701
0
                CPLStringList aosCO;
5702
0
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
5703
0
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
5704
0
                {
5705
0
                    aosCO.SetNameValue("TILED", "YES");
5706
0
                    aosCO.SetNameValue("BLOCKXSIZE",
5707
0
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
5708
0
                    aosCO.SetNameValue("BLOCKYSIZE",
5709
0
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
5710
0
                }
5711
0
                if (const char *pszCOList =
5712
0
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
5713
0
                {
5714
0
                    aosCO.SetNameValue(
5715
0
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
5716
0
                }
5717
0
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
5718
0
                                               nDstHeight, nBands, eDataType,
5719
0
                                               aosCO.List()));
5720
0
                if (poTmpDS)
5721
0
                {
5722
0
                    poTmpDS->MarkSuppressOnClose();
5723
0
                    VSIUnlink(osTmpFilename.c_str());
5724
0
                }
5725
0
            }
5726
0
            if (!poTmpDS)
5727
0
            {
5728
0
                eErr = CE_Failure;
5729
0
                break;
5730
0
            }
5731
5732
            // Create a full size VRT to do the resampling without edge effects
5733
0
            auto poVRTDS =
5734
0
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5735
5736
            // Allocate a band buffer with the overview chunk size
5737
0
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5738
0
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5739
0
                                    nDstChunkYSize));
5740
0
            if (pDstBuffer == nullptr)
5741
0
            {
5742
0
                eErr = CE_Failure;
5743
0
                break;
5744
0
            }
5745
5746
            // Use a flag to avoid reading the overview being built
5747
0
            GDALRasterIOExtraArg sExtraArg;
5748
0
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5749
0
            if (iSrcOverview == -1)
5750
0
                sExtraArg.bUseOnlyThisScale = true;
5751
5752
            // Scale and copy data from the VRT to the temp file
5753
0
            for (int nDstYOff = nDstYOffStart;
5754
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5755
0
                 /* */)
5756
0
            {
5757
0
                const int nDstYCount =
5758
0
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
5759
0
                for (int nDstXOff = nDstXOffStart;
5760
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5761
0
                     /* */)
5762
0
                {
5763
0
                    const int nDstXCount =
5764
0
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
5765
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5766
0
                         ++iBand)
5767
0
                    {
5768
0
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
5769
0
                        eErr = poSrcBand->RasterIO(
5770
0
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5771
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5772
0
                            eWrkDataType, 0, 0, &sExtraArg);
5773
0
                        if (eErr == CE_None)
5774
0
                        {
5775
                            // Write to the temporary dataset, shifted
5776
0
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
5777
0
                            eErr = poOvrBand->RasterIO(
5778
0
                                GF_Write, nDstXOff - nDstXOffStart,
5779
0
                                nDstYOff - nDstYOffStart, nDstXCount,
5780
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5781
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5782
0
                        }
5783
0
                    }
5784
0
                    nDstXOff += nDstXCount;
5785
0
                }
5786
0
                nDstYOff += nDstYCount;
5787
0
            }
5788
5789
            // Copy from the temporary to the overview
5790
0
            for (int nDstYOff = nDstYOffStart;
5791
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5792
0
                 /* */)
5793
0
            {
5794
0
                const int nDstYCount =
5795
0
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5796
0
                for (int nDstXOff = nDstXOffStart;
5797
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5798
0
                     /* */)
5799
0
                {
5800
0
                    const int nDstXCount =
5801
0
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5802
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5803
0
                         ++iBand)
5804
0
                    {
5805
0
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
5806
0
                        eErr = poSrcBand->RasterIO(
5807
0
                            GF_Read, nDstXOff - nDstXOffStart,
5808
0
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5809
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5810
0
                            eWrkDataType, 0, 0, nullptr);
5811
0
                        if (eErr == CE_None)
5812
0
                        {
5813
                            // Write to the destination overview bands
5814
0
                            auto poOvrBand =
5815
0
                                papapoOverviewBands[iBand][iOverview];
5816
0
                            eErr = poOvrBand->RasterIO(
5817
0
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
5818
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5819
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5820
0
                        }
5821
0
                    }
5822
0
                    nDstXOff += nDstXCount;
5823
0
                }
5824
0
                nDstYOff += nDstYCount;
5825
0
            }
5826
5827
0
            if (eErr != CE_None)
5828
0
            {
5829
0
                CPLError(CE_Failure, CPLE_AppDefined,
5830
0
                         "Failed to write overview %d", iOverview);
5831
0
                return eErr;
5832
0
            }
5833
5834
            // Flush the data to overviews.
5835
0
            for (int iBand = 0; iBand < nBands; ++iBand)
5836
0
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5837
5838
0
            continue;
5839
0
        }
5840
5841
        // Structure describing a resampling job: the inputs handed to the
        // resampling function, the outputs it produces, the band the result
        // is written to, and a finished flag with its own mutex and
        // condition variable.
5842
0
        struct OvrJob
5843
0
        {
5844
            // Buffers to free when job is finished
5845
0
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5846
0
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5847
0
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5848
            // Band that receives the resampled data (see WriteJobData)
5849
0
            GDALRasterBand *poDstBand = nullptr;
5850
5851
            // Input parameters of pfnResampleFn
5852
0
            GDALResampleFunction pfnResampleFn = nullptr;
5853
0
            GDALOverviewResampleArgs args{};
5854
0
            const void *pChunk = nullptr;
5855
5856
            // Output values of resampling function. eErr stays CE_Failure
            // until the resampling function has returned.
5857
0
            CPLErr eErr = CE_Failure;
5858
0
            void *pDstBuffer = nullptr;
5859
0
            GDALDataType eDstBufferDataType = GDT_Unknown;
5860
            // Mark the job as finished and wake up one thread blocked in
            // WaitFinished().
5861
0
            void NotifyFinished()
5862
0
            {
5863
0
                std::lock_guard guard(mutex);
5864
0
                bFinished = true;
5865
0
                cv.notify_one();
5866
0
            }
5867
            // Non-blocking query of the finished flag (read under the mutex).
5868
0
            bool IsFinished()
5869
0
            {
5870
0
                std::lock_guard guard(mutex);
5871
0
                return bFinished;
5872
0
            }
5873
            // Block until NotifyFinished() has been called. The loop re-checks
            // the flag so that a spurious wake-up does not end the wait early.
5874
0
            void WaitFinished()
5875
0
            {
5876
0
                std::unique_lock oGuard(mutex);
5877
0
                while (!bFinished)
5878
0
                {
5879
0
                    cv.wait(oGuard);
5880
0
                }
5881
0
            }
5882
5883
0
          private:
5884
            // Synchronization: bFinished is only accessed with mutex held,
            // and cv is signalled when it becomes true.
5885
0
            bool bFinished = false;
5886
0
            std::mutex mutex{};
5887
0
            std::condition_variable cv{};
5888
0
        };
5889
5890
        // Thread function to resample. pData must point to an OvrJob: the
        // job's resampling function is run, its outputs are stored back into
        // the job, and the job is then flagged as finished.
5891
0
        const auto JobResampleFunc = [](void *pData)
5892
0
        {
5893
0
            OvrJob *poJob = static_cast<OvrJob *>(pData);
5894
            // Fills eErr, pDstBuffer and eDstBufferDataType of the job.
5895
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5896
0
                                               &(poJob->pDstBuffer),
5897
0
                                               &(poJob->eDstBufferDataType));
5898
            // Tie the output buffer to the job through a PointerHolder
            // (presumably so it is freed with the job -- confirm against
            // PointerHolder).
5899
0
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5900
            // Must come last: waiters may consume the job once notified.
5901
0
            poJob->NotifyFinished();
5902
0
        };
5903
5904
        // Function to write resample data to target band. Writes the job's
        // pDstBuffer to poDstBand over the window starting at
        // (nDstXOff, nDstYOff) and extending to (nDstXOff2, nDstYOff2); the
        // buffer dimensions passed are the same as the window dimensions and
        // its type is eDstBufferDataType. Returns the result of RasterIO().
5905
0
        const auto WriteJobData = [](const OvrJob *poJob)
5906
0
        {
5907
0
            return poJob->poDstBand->RasterIO(
5908
0
                GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
5909
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5910
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
5911
0
                poJob->args.nDstXOff2 - poJob->args.nDstXOff,
5912
0
                poJob->args.nDstYOff2 - poJob->args.nDstYOff,
5913
0
                poJob->eDstBufferDataType, 0, 0, nullptr);
5914
0
        };
5915
5916
        // Wait for completion of oldest job and serialize it: block until the
        // job at the front of jobList is finished, write its output with
        // WriteJobData() if resampling succeeded, then remove it from the
        // list. Returns the resampling error if any, otherwise the write
        // status. jobList must not be empty (front() is used unchecked).
5917
0
        const auto WaitAndFinalizeOldestJob =
5918
0
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5919
0
        {
5920
0
            auto poOldestJob = jobList.front().get();
5921
0
            poOldestJob->WaitFinished();
5922
0
            CPLErr l_eErr = poOldestJob->eErr;
5923
0
            if (l_eErr == CE_None)
5924
0
            {
5925
0
                l_eErr = WriteJobData(poOldestJob);
5926
0
            }
5927
            // Popping destroys the job (the list holds unique_ptr), and with
            // it the job's PointerHolder members.
5928
0
            jobList.pop_front();
5929
0
            return l_eErr;
5930
0
        };
5931
5932
        // Queue of jobs
5933
0
        std::list<std::unique_ptr<OvrJob>> jobList;
5934
5935
0
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
5936
0
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5937
0
            apabyChunkNoDataMask(nBands);
5938
5939
        // Iterate on destination overview, block by block.
5940
0
        for (int nDstYOff = nDstYOffStart;
5941
0
             nDstYOff < nDstYOffEnd && eErr == CE_None;
5942
0
             nDstYOff += nDstChunkYSize)
5943
0
        {
5944
0
            int nDstYCount;
5945
0
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5946
0
                nDstYCount = nDstChunkYSize;
5947
0
            else
5948
0
                nDstYCount = nDstYOffEnd - nDstYOff;
5949
5950
0
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5951
0
            int nChunkYOff2 = static_cast<int>(
5952
0
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5953
0
            if (nChunkYOff2 > nSrcHeight ||
5954
0
                nDstYOff + nDstYCount == nDstTotalHeight)
5955
0
                nChunkYOff2 = nSrcHeight;
5956
0
            int nYCount = nChunkYOff2 - nChunkYOff;
5957
0
            CPLAssert(nYCount <= nFullResYChunk);
5958
5959
0
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5960
0
            int nChunkYSizeQueried =
5961
0
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5962
0
            if (nChunkYOffQueried < 0)
5963
0
            {
5964
0
                nChunkYSizeQueried += nChunkYOffQueried;
5965
0
                nChunkYOffQueried = 0;
5966
0
            }
5967
0
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5968
0
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5969
0
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5970
5971
0
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
5972
0
                             nullptr, pProgressData))
5973
0
            {
5974
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5975
0
                eErr = CE_Failure;
5976
0
            }
5977
5978
            // Iterate on destination overview, block by block.
5979
0
            for (int nDstXOff = nDstXOffStart;
5980
0
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
5981
0
                 nDstXOff += nDstChunkXSize)
5982
0
            {
5983
0
                int nDstXCount = 0;
5984
0
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5985
0
                    nDstXCount = nDstChunkXSize;
5986
0
                else
5987
0
                    nDstXCount = nDstXOffEnd - nDstXOff;
5988
5989
0
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5990
5991
0
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5992
0
                int nChunkXOff2 = static_cast<int>(
5993
0
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5994
0
                if (nChunkXOff2 > nSrcWidth ||
5995
0
                    nDstXOff + nDstXCount == nDstTotalWidth)
5996
0
                    nChunkXOff2 = nSrcWidth;
5997
0
                const int nXCount = nChunkXOff2 - nChunkXOff;
5998
0
                CPLAssert(nXCount <= nFullResXChunk);
5999
6000
0
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
6001
0
                int nChunkXSizeQueried =
6002
0
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6003
0
                if (nChunkXOffQueried < 0)
6004
0
                {
6005
0
                    nChunkXSizeQueried += nChunkXOffQueried;
6006
0
                    nChunkXOffQueried = 0;
6007
0
                }
6008
0
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6009
0
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6010
0
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6011
#if DEBUG_VERBOSE
6012
                CPLDebug("GDAL",
6013
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6014
                         nChunkXOffQueried, nChunkYOffQueried,
6015
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6016
                         nDstYOff, nDstXCount, nDstYCount);
6017
#endif
6018
6019
                // Avoid accumulating too many tasks and exhaust RAM
6020
6021
                // Try to complete already finished jobs
6022
0
                while (eErr == CE_None && !jobList.empty())
6023
0
                {
6024
0
                    auto poOldestJob = jobList.front().get();
6025
0
                    if (!poOldestJob->IsFinished())
6026
0
                        break;
6027
0
                    eErr = poOldestJob->eErr;
6028
0
                    if (eErr == CE_None)
6029
0
                    {
6030
0
                        eErr = WriteJobData(poOldestJob);
6031
0
                    }
6032
6033
0
                    jobList.pop_front();
6034
0
                }
6035
6036
                // And in case we have saturated the number of threads,
6037
                // wait for completion of tasks to go below the threshold.
6038
0
                while (eErr == CE_None &&
6039
0
                       jobList.size() >= static_cast<size_t>(nThreads))
6040
0
                {
6041
0
                    eErr = WaitAndFinalizeOldestJob(jobList);
6042
0
                }
6043
6044
                // Read the source buffers for all the bands.
6045
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6046
0
                {
6047
                    // (Re)allocate buffers if needed
6048
0
                    if (apaChunk[iBand] == nullptr)
6049
0
                    {
6050
0
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6051
0
                            nFullResXChunkQueried, nFullResYChunkQueried,
6052
0
                            nWrkDataTypeSize));
6053
0
                        if (apaChunk[iBand] == nullptr)
6054
0
                        {
6055
0
                            eErr = CE_Failure;
6056
0
                        }
6057
0
                    }
6058
0
                    if (bUseNoDataMask &&
6059
0
                        apabyChunkNoDataMask[iBand] == nullptr)
6060
0
                    {
6061
0
                        apabyChunkNoDataMask[iBand].reset(
6062
0
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6063
0
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6064
0
                        if (apabyChunkNoDataMask[iBand] == nullptr)
6065
0
                        {
6066
0
                            eErr = CE_Failure;
6067
0
                        }
6068
0
                    }
6069
6070
0
                    if (eErr == CE_None)
6071
0
                    {
6072
0
                        GDALRasterBand *poSrcBand = nullptr;
6073
0
                        if (iSrcOverview == -1)
6074
0
                            poSrcBand = papoSrcBands[iBand];
6075
0
                        else
6076
0
                            poSrcBand =
6077
0
                                papapoOverviewBands[iBand][iSrcOverview];
6078
0
                        eErr = poSrcBand->RasterIO(
6079
0
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6080
0
                            nChunkXSizeQueried, nChunkYSizeQueried,
6081
0
                            apaChunk[iBand].get(), nChunkXSizeQueried,
6082
0
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6083
6084
0
                        if (bUseNoDataMask && eErr == CE_None)
6085
0
                        {
6086
0
                            auto poMaskBand = poSrcBand->IsMaskBand()
6087
0
                                                  ? poSrcBand
6088
0
                                                  : poSrcBand->GetMaskBand();
6089
0
                            eErr = poMaskBand->RasterIO(
6090
0
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6091
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6092
0
                                apabyChunkNoDataMask[iBand].get(),
6093
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6094
0
                                GDT_Byte, 0, 0, nullptr);
6095
0
                        }
6096
0
                    }
6097
0
                }
6098
6099
                // Compute the resulting overview block.
6100
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6101
0
                {
6102
0
                    auto poJob = std::make_unique<OvrJob>();
6103
0
                    poJob->pfnResampleFn = pfnResampleFn;
6104
0
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6105
0
                    poJob->args.eOvrDataType =
6106
0
                        poJob->poDstBand->GetRasterDataType();
6107
0
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6108
0
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6109
0
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6110
0
                        "NBITS", "IMAGE_STRUCTURE");
6111
0
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6112
0
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6113
0
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6114
0
                    poJob->args.eWrkDataType = eWrkDataType;
6115
0
                    poJob->pChunk = apaChunk[iBand].get();
6116
0
                    poJob->args.pabyChunkNodataMask =
6117
0
                        apabyChunkNoDataMask[iBand].get();
6118
0
                    poJob->args.nChunkXOff = nChunkXOffQueried;
6119
0
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
6120
0
                    poJob->args.nChunkYOff = nChunkYOffQueried;
6121
0
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
6122
0
                    poJob->args.nDstXOff = nDstXOff;
6123
0
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6124
0
                    poJob->args.nDstYOff = nDstYOff;
6125
0
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6126
0
                    poJob->args.pszResampling = pszResampling;
6127
0
                    poJob->args.bHasNoData = abHasNoData[iBand];
6128
0
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6129
0
                    poJob->args.eSrcDataType = eDataType;
6130
0
                    poJob->args.bPropagateNoData = bPropagateNoData;
6131
6132
0
                    if (poJobQueue)
6133
0
                    {
6134
0
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6135
0
                            apabyChunkNoDataMask[iBand].release()));
6136
6137
0
                        poJob->oSrcBufferHolder.reset(
6138
0
                            new PointerHolder(apaChunk[iBand].release()));
6139
6140
0
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6141
0
                        jobList.emplace_back(std::move(poJob));
6142
0
                    }
6143
0
                    else
6144
0
                    {
6145
0
                        JobResampleFunc(poJob.get());
6146
0
                        eErr = poJob->eErr;
6147
0
                        if (eErr == CE_None)
6148
0
                        {
6149
0
                            eErr = WriteJobData(poJob.get());
6150
0
                        }
6151
0
                    }
6152
0
                }
6153
0
            }
6154
0
        }
6155
6156
        // Wait for all pending jobs to complete
6157
0
        while (!jobList.empty())
6158
0
        {
6159
0
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6160
0
            if (l_eErr != CE_None && eErr == CE_None)
6161
0
                eErr = l_eErr;
6162
0
        }
6163
6164
        // Flush the data to overviews.
6165
0
        for (int iBand = 0; iBand < nBands; ++iBand)
6166
0
        {
6167
0
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6168
0
                CE_None)
6169
0
                eErr = CE_Failure;
6170
0
        }
6171
0
    }
6172
6173
0
    if (eErr == CE_None)
6174
0
        pfnProgress(1.0, nullptr, pProgressData);
6175
6176
0
    return eErr;
6177
0
}
6178
6179
/************************************************************************/
6180
/*            GDALRegenerateOverviewsMultiBand()                        */
6181
/************************************************************************/
6182
6183
/**
6184
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6185
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6186
 *
6187
 * This function will generate one or more overview images from a base
6188
 * image using the requested downsampling algorithm.  Its primary use
6189
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6190
 * can also be used to generate downsampled images in one file from another
6191
 * outside the overview architecture.
6192
 *
6193
 * The output bands need to exist in advance and share the same characteristics
6194
 * (type, dimensions)
6195
 *
6196
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6197
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6198
 *
6199
 * It does not support color tables or complex data types.
6200
 *
6201
 * The pseudo-algorithm used by the function is :
6202
 *    for each overview
6203
 *       iterate on lines of the source by a step of deltay
6204
 *           iterate on columns of the source  by a step of deltax
6205
 *               read the source data of size deltax * deltay for all the bands
6206
 *               generate the corresponding overview block for all the bands
6207
 *
6208
 * This function will honour properly NODATA_VALUES tuples (special dataset
6209
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6210
 * considered as the nodata value and not each value of the triplet
6211
 * independently per band.
6212
 *
6213
 * The GDAL_NUM_THREADS configuration option can be set
6214
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6215
 * overview computation.
6216
 *
6217
 * @param apoSrcBands the list of source bands to downsample
6218
 * @param aapoOverviewBands bidimensional array of bands. First dimension is
6219
 *                          indexed by bands. Second dimension is indexed by
6220
 *                          overview levels. All aapoOverviewBands[i] arrays
6221
 *                          must have the same size (i.e. same number of
6222
 *                          overviews)
6223
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6224
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6225
 * @param pfnProgress progress report function.
6226
 * @param pProgressData progress function callback data.
6227
 * @param papszOptions NULL terminated list of options as
6228
 *                     key=value pairs, or NULL
6229
 *                     The XOFF, YOFF, XSIZE and YSIZE
6230
 *                     options can be specified to express that overviews should
6231
 *                     be regenerated only in the specified subset of the source
6232
 *                     dataset.
6233
 * @return CE_None on success or CE_Failure on failure.
6234
 * @since 3.10
6235
 */
6236
6237
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // Adapter around the pointer-array overload: the per-band vectors of
    // overview bands are copied into temporary C arrays, the pointer-based
    // overload is invoked, and the temporary arrays are released.
    const size_t nBandCount = aapoOverviewBands.size();

    // Debug-only sanity checks: one overview list per source band, and all
    // lists of the same length.
    CPLAssert(apoSrcBands.size() == nBandCount);
    for (size_t iBand = 1; iBand < nBandCount; ++iBand)
    {
        CPLAssert(aapoOverviewBands[iBand].size() ==
                  aapoOverviewBands.front().size());
    }

    // No band at all: nothing to regenerate.
    if (nBandCount == 0)
        return CE_None;

    // Build one C array of overview band pointers per source band.
    std::vector<GDALRasterBand **> apapoOverviewBands;
    apapoOverviewBands.reserve(nBandCount);
    for (const auto &apoBandOverviews : aapoOverviewBands)
    {
        GDALRasterBand **papoLevels = static_cast<GDALRasterBand **>(
            CPLMalloc(apoBandOverviews.size() * sizeof(GDALRasterBand *)));
        size_t iLevel = 0;
        for (GDALRasterBand *poLevelBand : apoBandOverviews)
        {
            papoLevels[iLevel] = poLevelBand;
            ++iLevel;
        }
        apapoOverviewBands.push_back(papoLevels);
    }

    // The number of overview levels is taken from the first band.
    const int nSrcBandCount = static_cast<int>(apoSrcBands.size());
    const int nOverviewLevels =
        static_cast<int>(aapoOverviewBands.front().size());
    const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
        nSrcBandCount, apoSrcBands.data(), nOverviewLevels,
        apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
        papszOptions);

    // Release the temporary pointer arrays.
    while (!apapoOverviewBands.empty())
    {
        CPLFree(apapoOverviewBands.back());
        apapoOverviewBands.pop_back();
    }

    return eErr;
}
6272
6273
/************************************************************************/
6274
/*                        GDALComputeBandStats()                        */
6275
/************************************************************************/
6276
6277
/** Undocumented
6278
 * @param hSrcBand undocumented.
6279
 * @param nSampleStep Step between scanlines used to compute statistics.
6280
 *                    When nSampleStep is equal to 1, all scanlines will
6281
 *                    be processed.
6282
 * @param pdfMean undocumented.
6283
 * @param pdfStdDev undocumented.
6284
 * @param pfnProgress undocumented.
6285
 * @param pProgressData undocumented.
6286
 * @return undocumented
6287
 */
6288
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6289
                                        int nSampleStep, double *pdfMean,
6290
                                        double *pdfStdDev,
6291
                                        GDALProgressFunc pfnProgress,
6292
                                        void *pProgressData)
6293
6294
0
{
6295
0
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6296
6297
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6298
6299
0
    if (pfnProgress == nullptr)
6300
0
        pfnProgress = GDALDummyProgress;
6301
6302
0
    const int nWidth = poSrcBand->GetXSize();
6303
0
    const int nHeight = poSrcBand->GetYSize();
6304
6305
0
    if (nSampleStep >= nHeight || nSampleStep < 1)
6306
0
        nSampleStep = 1;
6307
6308
0
    GDALDataType eWrkType = GDT_Unknown;
6309
0
    float *pafData = nullptr;
6310
0
    GDALDataType eType = poSrcBand->GetRasterDataType();
6311
0
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6312
0
    if (bComplex)
6313
0
    {
6314
0
        pafData = static_cast<float *>(
6315
0
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6316
0
        eWrkType = GDT_CFloat32;
6317
0
    }
6318
0
    else
6319
0
    {
6320
0
        pafData =
6321
0
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6322
0
        eWrkType = GDT_Float32;
6323
0
    }
6324
6325
0
    if (nWidth == 0 || pafData == nullptr)
6326
0
    {
6327
0
        VSIFree(pafData);
6328
0
        return CE_Failure;
6329
0
    }
6330
6331
    /* -------------------------------------------------------------------- */
6332
    /*      Loop over all sample lines.                                     */
6333
    /* -------------------------------------------------------------------- */
6334
0
    double dfSum = 0.0;
6335
0
    double dfSum2 = 0.0;
6336
0
    int iLine = 0;
6337
0
    GIntBig nSamples = 0;
6338
6339
0
    do
6340
0
    {
6341
0
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6342
0
                         pProgressData))
6343
0
        {
6344
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6345
0
            CPLFree(pafData);
6346
0
            return CE_Failure;
6347
0
        }
6348
6349
0
        const CPLErr eErr =
6350
0
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6351
0
                                1, eWrkType, 0, 0, nullptr);
6352
0
        if (eErr != CE_None)
6353
0
        {
6354
0
            CPLFree(pafData);
6355
0
            return eErr;
6356
0
        }
6357
6358
0
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6359
0
        {
6360
0
            float fValue = 0.0f;
6361
6362
0
            if (bComplex)
6363
0
            {
6364
                // Compute the magnitude of the complex value.
6365
0
                fValue =
6366
0
                    std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
6367
0
                               pafData[static_cast<size_t>(iPixel) * 2 + 1]);
6368
0
            }
6369
0
            else
6370
0
            {
6371
0
                fValue = pafData[iPixel];
6372
0
            }
6373
6374
0
            dfSum += fValue;
6375
0
            dfSum2 += static_cast<double>(fValue) * fValue;
6376
0
        }
6377
6378
0
        nSamples += nWidth;
6379
0
        iLine += nSampleStep;
6380
0
    } while (iLine < nHeight);
6381
6382
0
    if (!pfnProgress(1.0, nullptr, pProgressData))
6383
0
    {
6384
0
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6385
0
        CPLFree(pafData);
6386
0
        return CE_Failure;
6387
0
    }
6388
6389
    /* -------------------------------------------------------------------- */
6390
    /*      Produce the result values.                                      */
6391
    /* -------------------------------------------------------------------- */
6392
0
    if (pdfMean != nullptr)
6393
0
        *pdfMean = dfSum / nSamples;
6394
6395
0
    if (pdfStdDev != nullptr)
6396
0
    {
6397
0
        const double dfMean = dfSum / nSamples;
6398
6399
0
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6400
0
    }
6401
6402
0
    CPLFree(pafData);
6403
6404
0
    return CE_None;
6405
0
}
6406
6407
/************************************************************************/
6408
/*                  GDALOverviewMagnitudeCorrection()                   */
6409
/*                                                                      */
6410
/*      Correct the mean and standard deviation of the overviews of     */
6411
/*      the given band to match the base layer approximately.           */
6412
/************************************************************************/
6413
6414
/** Undocumented
6415
 * @param hBaseBand undocumented.
6416
 * @param nOverviewCount undocumented.
6417
 * @param pahOverviews undocumented.
6418
 * @param pfnProgress undocumented.
6419
 * @param pProgressData undocumented.
6420
 * @return undocumented
6421
 */
6422
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6423
                                       int nOverviewCount,
6424
                                       GDALRasterBandH *pahOverviews,
6425
                                       GDALProgressFunc pfnProgress,
6426
                                       void *pProgressData)
6427
6428
0
{
6429
0
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6430
6431
    /* -------------------------------------------------------------------- */
6432
    /*      Compute mean/stddev for source raster.                          */
6433
    /* -------------------------------------------------------------------- */
6434
0
    double dfOrigMean = 0.0;
6435
0
    double dfOrigStdDev = 0.0;
6436
0
    {
6437
0
        const CPLErr eErr =
6438
0
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6439
0
                                 pfnProgress, pProgressData);
6440
6441
0
        if (eErr != CE_None)
6442
0
            return eErr;
6443
0
    }
6444
6445
    /* -------------------------------------------------------------------- */
6446
    /*      Loop on overview bands.                                         */
6447
    /* -------------------------------------------------------------------- */
6448
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6449
0
    {
6450
0
        GDALRasterBand *poOverview =
6451
0
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6452
0
        double dfOverviewMean, dfOverviewStdDev;
6453
6454
0
        const CPLErr eErr =
6455
0
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6456
0
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6457
6458
0
        if (eErr != CE_None)
6459
0
            return eErr;
6460
6461
0
        double dfGain = 1.0;
6462
0
        if (dfOrigStdDev >= 0.0001)
6463
0
            dfGain = dfOrigStdDev / dfOverviewStdDev;
6464
6465
        /* --------------------------------------------------------------------
6466
         */
6467
        /*      Apply gain and offset. */
6468
        /* --------------------------------------------------------------------
6469
         */
6470
0
        const int nWidth = poOverview->GetXSize();
6471
0
        const int nHeight = poOverview->GetYSize();
6472
6473
0
        GDALDataType eWrkType = GDT_Unknown;
6474
0
        float *pafData = nullptr;
6475
0
        const GDALDataType eType = poOverview->GetRasterDataType();
6476
0
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6477
0
        if (bComplex)
6478
0
        {
6479
0
            pafData = static_cast<float *>(
6480
0
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6481
0
            eWrkType = GDT_CFloat32;
6482
0
        }
6483
0
        else
6484
0
        {
6485
0
            pafData = static_cast<float *>(
6486
0
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6487
0
            eWrkType = GDT_Float32;
6488
0
        }
6489
6490
0
        if (pafData == nullptr)
6491
0
        {
6492
0
            return CE_Failure;
6493
0
        }
6494
6495
0
        for (int iLine = 0; iLine < nHeight; ++iLine)
6496
0
        {
6497
0
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6498
0
                             pProgressData))
6499
0
            {
6500
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6501
0
                CPLFree(pafData);
6502
0
                return CE_Failure;
6503
0
            }
6504
6505
0
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6506
0
                                     nWidth, 1, eWrkType, 0, 0,
6507
0
                                     nullptr) != CE_None)
6508
0
            {
6509
0
                CPLFree(pafData);
6510
0
                return CE_Failure;
6511
0
            }
6512
6513
0
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6514
0
            {
6515
0
                if (bComplex)
6516
0
                {
6517
0
                    pafData[static_cast<size_t>(iPixel) * 2] *=
6518
0
                        static_cast<float>(dfGain);
6519
0
                    pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
6520
0
                        static_cast<float>(dfGain);
6521
0
                }
6522
0
                else
6523
0
                {
6524
0
                    pafData[iPixel] = static_cast<float>(
6525
0
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
6526
0
                        dfOrigMean);
6527
0
                }
6528
0
            }
6529
6530
0
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6531
0
                                     nWidth, 1, eWrkType, 0, 0,
6532
0
                                     nullptr) != CE_None)
6533
0
            {
6534
0
                CPLFree(pafData);
6535
0
                return CE_Failure;
6536
0
            }
6537
0
        }
6538
6539
0
        if (!pfnProgress(1.0, nullptr, pProgressData))
6540
0
        {
6541
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6542
0
            CPLFree(pafData);
6543
0
            return CE_Failure;
6544
0
        }
6545
6546
0
        CPLFree(pafData);
6547
0
    }
6548
6549
0
    return CE_None;
6550
0
}