Coverage Report

Created: 2025-06-22 06:59

/src/gdal/gcore/overview.cpp
Line
Count
Source (jump to first uncovered line)
1
2
/******************************************************************************
3
 *
4
 * Project:  GDAL Core
5
 * Purpose:  Helper code to implement overview support in different drivers.
6
 * Author:   Frank Warmerdam, warmerdam@pobox.com
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2000, Frank Warmerdam
10
 * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
11
 *
12
 * SPDX-License-Identifier: MIT
13
 ****************************************************************************/
14
15
#include "cpl_port.h"
16
#include "gdal_priv.h"
17
18
#include <cmath>
19
#include <cstddef>
20
#include <cstdlib>
21
22
#include <algorithm>
23
#include <complex>
24
#include <condition_variable>
25
#include <limits>
26
#include <list>
27
#include <memory>
28
#include <mutex>
29
#include <vector>
30
31
#include "cpl_conv.h"
32
#include "cpl_error.h"
33
#include "cpl_float.h"
34
#include "cpl_progress.h"
35
#include "cpl_vsi.h"
36
#include "gdal.h"
37
#include "gdal_thread_pool.h"
38
#include "gdalwarper.h"
39
#include "gdal_vrt.h"
40
#include "vrtdataset.h"
41
42
#ifdef USE_NEON_OPTIMIZATIONS
43
#include "include_sse2neon.h"
44
#define USE_SSE2
45
46
#include "gdalsse_priv.h"
47
48
// Restrict to 64bit processors because they are guaranteed to have SSE2,
49
// or if __AVX2__ is defined.
50
#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
51
#define USE_SSE2
52
53
#include "gdalsse_priv.h"
54
55
#ifdef __SSE3__
56
#include <pmmintrin.h>
57
#endif
58
#ifdef __SSSE3__
59
#include <tmmintrin.h>
60
#endif
61
#ifdef __SSE4_1__
62
#include <smmintrin.h>
63
#endif
64
#ifdef __AVX2__
65
#include <immintrin.h>
66
#endif
67
68
#endif
69
70
// To be included after above USE_SSE2 and include gdalsse_priv.h
71
// to avoid build issue on Windows x86
72
#include "gdal_priv_templates.hpp"
73
74
/************************************************************************/
75
/*                      GDALResampleChunk_Near()                        */
76
/************************************************************************/
77
78
template <class T>
79
static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
80
                                      const T *pChunk, T **ppDstBuffer)
81
82
0
{
83
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
84
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
85
0
    const GDALDataType eWrkDataType = args.eWrkDataType;
86
0
    const int nChunkXOff = args.nChunkXOff;
87
0
    const int nChunkXSize = args.nChunkXSize;
88
0
    const int nChunkYOff = args.nChunkYOff;
89
0
    const int nDstXOff = args.nDstXOff;
90
0
    const int nDstXOff2 = args.nDstXOff2;
91
0
    const int nDstYOff = args.nDstYOff;
92
0
    const int nDstYOff2 = args.nDstYOff2;
93
0
    const int nDstXWidth = nDstXOff2 - nDstXOff;
94
95
    /* -------------------------------------------------------------------- */
96
    /*      Allocate buffers.                                               */
97
    /* -------------------------------------------------------------------- */
98
0
    *ppDstBuffer = static_cast<T *>(
99
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
100
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
101
0
    if (*ppDstBuffer == nullptr)
102
0
    {
103
0
        return CE_Failure;
104
0
    }
105
0
    T *const pDstBuffer = *ppDstBuffer;
106
107
0
    int *panSrcXOff =
108
0
        static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
109
110
0
    if (panSrcXOff == nullptr)
111
0
    {
112
0
        return CE_Failure;
113
0
    }
114
115
    /* ==================================================================== */
116
    /*      Precompute inner loop constants.                                */
117
    /* ==================================================================== */
118
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
119
0
    {
120
0
        int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
121
0
        if (nSrcXOff < nChunkXOff)
122
0
            nSrcXOff = nChunkXOff;
123
124
0
        panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
125
0
    }
126
127
    /* ==================================================================== */
128
    /*      Loop over destination scanlines.                                */
129
    /* ==================================================================== */
130
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
131
0
    {
132
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
133
0
        if (nSrcYOff < nChunkYOff)
134
0
            nSrcYOff = nChunkYOff;
135
136
0
        const T *const pSrcScanline =
137
0
            pChunk +
138
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
139
0
            nChunkXOff;
140
141
        /* --------------------------------------------------------------------
142
         */
143
        /*      Loop over destination pixels */
144
        /* --------------------------------------------------------------------
145
         */
146
0
        T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
147
0
        for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
148
0
        {
149
0
            pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
150
0
        }
151
0
    }
152
153
0
    CPLFree(panSrcXOff);
154
155
0
    return CE_None;
156
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_NearT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>**)
157
158
/**
 * Type-dispatching entry point for nearest-neighbour chunk resampling.
 *
 * Reports args.eWrkDataType through *peDstBufferDataType and forwards to the
 * GDALResampleChunk_NearT instantiation matching the element size. Nearest
 * resampling only copies values, so all data types of a given byte width
 * share the unsigned integer instantiation of that width.
 *
 * @return The result of the typed worker, or CE_Failure for GDT_Unknown /
 *         GDT_TypeCount.
 */
static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    switch (eType)
    {
        // 1-byte elements
        case GDT_Byte:
        case GDT_Int8:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 1);
            const auto *pSrc = static_cast<const uint8_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint8_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 2-byte elements
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 2);
            const auto *pSrc = static_cast<const uint16_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint16_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 4-byte elements
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
        case GDT_Float32:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 4);
            const auto *pSrc = static_cast<const uint32_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint32_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // 8-byte elements
        case GDT_CInt32:
        case GDT_CFloat32:
        case GDT_Int64:
        case GDT_UInt64:
        case GDT_Float64:
        {
            CPLAssert(GDALGetDataTypeSizeBytes(eType) == 8);
            const auto *pSrc = static_cast<const uint64_t *>(pChunk);
            auto **ppDst = reinterpret_cast<uint64_t **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // Complex double is copied as std::complex<double>
        case GDT_CFloat64:
        {
            const auto *pSrc =
                static_cast<const std::complex<double> *>(pChunk);
            auto **ppDst =
                reinterpret_cast<std::complex<double> **>(ppDstBuffer);
            return GDALResampleChunk_NearT(args, pSrc, ppDst);
        }

        // Not valid working types: fall through to the failure path below
        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    CPLAssert(false);
    return CE_Failure;
}
224
225
namespace
226
{
227
228
// Find in the color table the entry whose RGB value is the closest
229
// (using quadratic distance) to the test color, ignoring transparent entries.
230
int BestColorEntry(const std::vector<GDALColorEntry> &entries,
231
                   const GDALColorEntry &test)
232
0
{
233
0
    int nMinDist = std::numeric_limits<int>::max();
234
0
    size_t bestEntry = 0;
235
0
    for (size_t i = 0; i < entries.size(); ++i)
236
0
    {
237
0
        const GDALColorEntry &entry = entries[i];
238
        // Ignore transparent entries
239
0
        if (entry.c4 == 0)
240
0
            continue;
241
242
0
        int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
243
0
                    ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
244
0
                    ((test.c3 - entry.c3) * (test.c3 - entry.c3));
245
0
        if (nDist < nMinDist)
246
0
        {
247
0
            nMinDist = nDist;
248
0
            bestEntry = i;
249
0
        }
250
0
    }
251
0
    return static_cast<int>(bestEntry);
252
0
}
253
254
std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
255
                                           int &transparentIdx)
256
0
{
257
0
    std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
258
259
0
    transparentIdx = -1;
260
0
    int i = 0;
261
0
    for (auto &entry : entries)
262
0
    {
263
0
        table.GetColorEntryAsRGB(i, &entry);
264
0
        if (transparentIdx < 0 && entry.c4 == 0)
265
0
            transparentIdx = i;
266
0
        ++i;
267
0
    }
268
0
    return entries;
269
0
}
270
271
}  // unnamed  namespace
272
273
/************************************************************************/
274
/*                             SQUARE()                                 */
275
/************************************************************************/
276
277
// Return val * val, with the first operand converted to Tsquare beforehand so
// that the product can be evaluated in a wider type than T.
template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
{
    const Tsquare widened = static_cast<Tsquare>(val);
    return widened * val;
}
Unexecuted instantiation: int SQUARE<int, int>(int)
Unexecuted instantiation: double SQUARE<double, double>(double)
Unexecuted instantiation: unsigned int SQUARE<unsigned int, unsigned int>(unsigned int)
281
282
/************************************************************************/
283
/*                          ComputeIntegerRMS()                         */
284
/************************************************************************/
285
// Return the integer rms approximating sqrt(sumSquares / weight), chosen so
// that abs(rms**2 - sumSquares / weight) is minimal.
//
// T is the integer result type; Twork is the integer type in which the
// rounding test is evaluated.
template <class T, class Twork>
inline T ComputeIntegerRMS(double sumSquares, double weight)
{
    const double meanSquare = sumSquares / weight;

    // Start from the truncated square root...
    T rms = static_cast<T>(std::sqrt(meanSquare));

    // ... then decide whether rms**2 or (rms+1)**2 is closest to meanSquare.
    // Naive version:
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
    // which reduces to 2 * rms * (rms + 1) + 1 < 2 * meanSquare
    const auto roundUpThreshold = static_cast<Twork>(2) * rms * (rms + 1) + 1;
    const bool roundUp =
        static_cast<double>(roundUpThreshold) < 2 * meanSquare;
    if (roundUp)
        ++rms;
    return rms;
}
Unexecuted instantiation: unsigned char ComputeIntegerRMS<unsigned char, int>(double, double)
Unexecuted instantiation: unsigned short ComputeIntegerRMS<unsigned short, unsigned long>(double, double)
301
302
template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
303
0
{
304
0
    CPLAssert(false);
305
0
    return 0;
306
0
}
307
308
template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
309
0
{
310
    // It has been verified that given the correction on rms below, using
311
    // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
312
    // is equivalent, so use the former as it is used twice.
313
0
    const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
314
0
    const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
315
0
    GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
316
317
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
318
    // Naive version:
319
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
320
    // Optimized version for integer case and weight == 4
321
0
    if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
322
0
        rms += 1;
323
0
    return rms;
324
0
}
325
326
template <>
327
inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
328
0
{
329
0
    const double sumDivWeight = sumSquares * 0.25;
330
0
    GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
331
332
    // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
333
    // Naive version:
334
    // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
335
    // Optimized version for integer case and weight == 4
336
0
    if (static_cast<GUInt32>(rms) * (rms + 1) <
337
0
        static_cast<GUInt32>(sumDivWeight + 0.25))
338
0
        rms += 1;
339
0
    return rms;
340
0
}
341
342
#ifdef USE_SSE2
343
344
/************************************************************************/
345
/*                   QuadraticMeanByteSSE2OrAVX2()                      */
346
/************************************************************************/
347
348
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
// The native intrinsic is available: use it directly.
#define sse2_packus_epi32 _mm_packus_epi32
#else
// Plain-SSE2 replacement for _mm_packus_epi32(): pack the 32-bit lanes of a
// (low half of the result) and b (high half) into eight 16-bit lanes.
// The lanes are biased by -32768 so that the signed saturating pack can be
// used, and the bias is removed again on the 16-bit result.
inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
{
    const __m128i bias32 = _mm_set1_epi32(-32768);
    const __m128i bias16 = _mm_set1_epi16(-32768);

    const __m128i biasedA = _mm_add_epi32(a, bias32);
    const __m128i biasedB = _mm_add_epi32(b, bias32);
    const __m128i packedSigned = _mm_packs_epi32(biasedA, biasedB);

    return _mm_sub_epi16(packedSigned, bias16);
}
#endif
362
363
#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
// The native intrinsic is available: use it directly.
#define sse2_hadd_epi16 _mm_hadd_epi16
#else
// Plain-SSE2 replacement for _mm_hadd_epi16(): add adjacent pairs of 16-bit
// lanes of a (low half of the result) and b (high half).
// NOTE(review): each lane is zero-extended to 32 bits and the sums are
// re-packed with the signed saturating pack, so this only matches the native
// instruction while the pair sums stay small -- confirm for any new caller.
inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
{
    const __m128i low16Mask = _mm_set1_epi32(0xFFFF);

    // even lane + odd lane of each 32-bit group
    const __m128i evenA = _mm_and_si128(a, low16Mask);
    const __m128i oddA = _mm_srli_epi32(a, 16);
    const __m128i pairSumsA = _mm_add_epi32(evenA, oddA);

    const __m128i evenB = _mm_and_si128(b, low16Mask);
    const __m128i oddB = _mm_srli_epi32(b, 16);
    const __m128i pairSumsB = _mm_add_epi32(evenB, oddB);

    // Recombine low and high parts
    return _mm_packs_epi32(pairSumsA, pairSumsB);
}
#endif
379
380
// Width-neutral aliases for the integer/float intrinsics used by the Byte
// kernels below, mapped to 256-bit AVX2 or 128-bit SSE2 operations.
// DEST_ELTS is the number of 16-bit lanes in one vector register.
#ifdef __AVX2__

#define DEST_ELTS 16

// Constants
#define setzero _mm256_setzero_si256
#define set1_epi16 _mm256_set1_epi16
#define set1_epi32 _mm256_set1_epi32
#define set1_ps _mm256_set1_ps

// Loads
#define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))

// 8-bit to 16-bit interleaving
#define unpacklo_epi8 _mm256_unpacklo_epi8
#define unpackhi_epi8 _mm256_unpackhi_epi8

// 16-bit lane arithmetic
#define add_epi16 _mm256_add_epi16
#define sub_epi16 _mm256_sub_epi16
#define mullo_epi16 _mm256_mullo_epi16
#define madd_epi16 _mm256_madd_epi16
#define hadd_epi16 _mm256_hadd_epi16
#define srli_epi16 _mm256_srli_epi16
#define cmpgt_epi16 _mm256_cmpgt_epi16

// 32-bit lane arithmetic
#define add_epi32 _mm256_add_epi32
#define srli_epi32 _mm256_srli_epi32

// Single precision
#define cvtepi32_ps _mm256_cvtepi32_ps
#define cvttps_epi32 _mm256_cvttps_epi32
#define mul_ps _mm256_mul_ps
#define sqrt_ps _mm256_sqrt_ps

// Narrowing
#define packs_epi32 _mm256_packs_epi32
#define packus_epi32 _mm256_packus_epi32
#define packus_epi16 _mm256_packus_epi16

/* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
/* to get the lower 128-bit bits of what would be a true 256-bit vector
 * register: the two middle 64-bit quarters are swapped. */
inline __m256i FIXUP_LANES(__m256i x)
{
    const __m256i reordered =
        _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Stores (after lane fix-up)
#define store_lo(x, y)                                                         \
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
                     _mm256_extracti128_si256(FIXUP_LANES(y), 0))
#define storeu_int(x, y)                                                       \
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))

#else

#define DEST_ELTS 8

// Constants
#define setzero _mm_setzero_si128
#define set1_epi16 _mm_set1_epi16
#define set1_epi32 _mm_set1_epi32
#define set1_ps _mm_set1_ps

// Loads
#define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))

// 8-bit to 16-bit interleaving
#define unpacklo_epi8 _mm_unpacklo_epi8
#define unpackhi_epi8 _mm_unpackhi_epi8

// 16-bit lane arithmetic
#define add_epi16 _mm_add_epi16
#define sub_epi16 _mm_sub_epi16
#define mullo_epi16 _mm_mullo_epi16
#define madd_epi16 _mm_madd_epi16
#define hadd_epi16 sse2_hadd_epi16
#define srli_epi16 _mm_srli_epi16
#define cmpgt_epi16 _mm_cmpgt_epi16

// 32-bit lane arithmetic
#define add_epi32 _mm_add_epi32
#define srli_epi32 _mm_srli_epi32

// Single precision
#define cvtepi32_ps _mm_cvtepi32_ps
#define cvttps_epi32 _mm_cvttps_epi32
#define mul_ps _mm_mul_ps
#define sqrt_ps _mm_sqrt_ps

// Narrowing
#define packs_epi32 _mm_packs_epi32
#define packus_epi32 sse2_packus_epi32
#define packus_epi16 _mm_packus_epi16

// Stores
#define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
#define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))

#endif
449
450
template <class T>
451
static int
452
#if defined(__GNUC__)
453
    __attribute__((noinline))
454
#endif
455
    QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
456
                                const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
457
                                T *CPL_RESTRICT pDstScanline)
458
0
{
459
    // Optimized implementation for RMS on Byte by
460
    // processing by group of 8 output pixels, so as to use
461
    // a single _mm_sqrt_ps() call for 4 output pixels
462
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
463
464
0
    int iDstPixel = 0;
465
0
    const auto one16 = set1_epi16(1);
466
0
    const auto one32 = set1_epi32(1);
467
0
    const auto zero = setzero();
468
0
    const auto minus32768 = set1_epi16(-32768);
469
470
0
    for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
471
0
    {
472
        // Load 2 * DEST_ELTS bytes from each line
473
0
        auto firstLine = loadu_int(pSrcScanlineShifted);
474
0
        auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
475
        // Extend those Bytes as UInt16s
476
0
        auto firstLineLo = unpacklo_epi8(firstLine, zero);
477
0
        auto firstLineHi = unpackhi_epi8(firstLine, zero);
478
0
        auto secondLineLo = unpacklo_epi8(secondLine, zero);
479
0
        auto secondLineHi = unpackhi_epi8(secondLine, zero);
480
481
        // Multiplication of 16 bit values and horizontal
482
        // addition of 32 bit results
483
        // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
484
0
        firstLineLo = madd_epi16(firstLineLo, firstLineLo);
485
0
        firstLineHi = madd_epi16(firstLineHi, firstLineHi);
486
0
        secondLineLo = madd_epi16(secondLineLo, secondLineLo);
487
0
        secondLineHi = madd_epi16(secondLineHi, secondLineHi);
488
489
        // Vertical addition
490
0
        const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
491
0
        const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
492
493
0
        const auto sumSquaresPlusOneDiv4Lo =
494
0
            srli_epi32(add_epi32(sumSquaresLo, one32), 2);
495
0
        const auto sumSquaresPlusOneDiv4Hi =
496
0
            srli_epi32(add_epi32(sumSquaresHi, one32), 2);
497
498
        // Take square root and truncate/floor to int32
499
0
        const auto rmsLo =
500
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
501
0
        const auto rmsHi =
502
0
            cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
503
504
        // Merge back low and high registers with each RMS value
505
        // as a 16 bit value.
506
0
        auto rms = packs_epi32(rmsLo, rmsHi);
507
508
        // Round to upper value if it minimizes the
509
        // error |rms^2 - sumSquares/4|
510
        // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
511
        //    rms += 1;
512
        // which is equivalent to:
513
        // if( rms * (rms + 1) < (sumSquares+1) / 4 )
514
        //    rms += 1;
515
        // And both left and right parts fit on 16 (unsigned) bits
516
0
        const auto sumSquaresPlusOneDiv4 =
517
0
            packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
518
        // cmpgt_epi16 operates on signed int16, but here
519
        // we have unsigned values, so shift them by -32768 before
520
0
        auto mask = cmpgt_epi16(
521
0
            add_epi16(sumSquaresPlusOneDiv4, minus32768),
522
0
            add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
523
        // The value of the mask will be -1 when the correction needs to be
524
        // applied
525
0
        rms = sub_epi16(rms, mask);
526
527
        // Pack each 16 bit RMS value to 8 bits
528
0
        rms = packus_epi16(rms, rms /* could be anything */);
529
0
        store_lo(&pDstScanline[iDstPixel], rms);
530
0
        pSrcScanlineShifted += 2 * DEST_ELTS;
531
0
    }
532
533
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
534
0
    return iDstPixel;
535
0
}
536
537
/************************************************************************/
538
/*                      AverageByteSSE2OrAVX2()                         */
539
/************************************************************************/
540
541
template <class T>
542
static int
543
AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
544
                      const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
545
                      T *CPL_RESTRICT pDstScanline)
546
0
{
547
    // Optimized implementation for average on Byte by
548
    // processing by group of 16 output pixels for SSE2, or 32 for AVX2
549
550
0
    const auto zero = setzero();
551
0
    const auto two16 = set1_epi16(2);
552
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
553
554
0
    int iDstPixel = 0;
555
0
    for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
556
0
         iDstPixel += 2 * DEST_ELTS)
557
0
    {
558
0
        decltype(setzero()) average0;
559
0
        {
560
            // Load 2 * DEST_ELTS bytes from each line
561
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
562
0
            const auto secondLine =
563
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
564
            // Extend those Bytes as UInt16s
565
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
566
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
567
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
568
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
569
570
            // Vertical addition
571
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
572
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
573
574
            // Horizontal addition of adjacent pairs, and recombine low and high
575
            // parts
576
0
            const auto sum = hadd_epi16(sumLo, sumHi);
577
578
            // average = (sum + 2) / 4
579
0
            average0 = srli_epi16(add_epi16(sum, two16), 2);
580
581
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
582
0
        }
583
584
0
        decltype(setzero()) average1;
585
0
        {
586
            // Load 2 * DEST_ELTS bytes from each line
587
0
            const auto firstLine = loadu_int(pSrcScanlineShifted);
588
0
            const auto secondLine =
589
0
                loadu_int(pSrcScanlineShifted + nChunkXSize);
590
            // Extend those Bytes as UInt16s
591
0
            const auto firstLineLo = unpacklo_epi8(firstLine, zero);
592
0
            const auto firstLineHi = unpackhi_epi8(firstLine, zero);
593
0
            const auto secondLineLo = unpacklo_epi8(secondLine, zero);
594
0
            const auto secondLineHi = unpackhi_epi8(secondLine, zero);
595
596
            // Vertical addition
597
0
            const auto sumLo = add_epi16(firstLineLo, secondLineLo);
598
0
            const auto sumHi = add_epi16(firstLineHi, secondLineHi);
599
600
            // Horizontal addition of adjacent pairs, and recombine low and high
601
            // parts
602
0
            const auto sum = hadd_epi16(sumLo, sumHi);
603
604
            // average = (sum + 2) / 4
605
0
            average1 = srli_epi16(add_epi16(sum, two16), 2);
606
607
0
            pSrcScanlineShifted += 2 * DEST_ELTS;
608
0
        }
609
610
        // Pack each 16 bit average value to 8 bits
611
0
        const auto average = packus_epi16(average0, average1);
612
0
        storeu_int(&pDstScanline[iDstPixel], average);
613
0
    }
614
615
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
616
0
    return iDstPixel;
617
0
}
618
619
/************************************************************************/
620
/*                     QuadraticMeanUInt16SSE2()                        */
621
/************************************************************************/
622
623
#ifdef __SSE3__
// The native intrinsic is available: use it directly.
#define sse2_hadd_pd _mm_hadd_pd
#else
// Plain-SSE2 replacement for _mm_hadd_pd(): returns (aLo + aHi, bLo + bHi).
inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
{
    // The 64-bit halves are moved around with the single-precision shuffles,
    // which only reinterpret the bits.
    const __m128 aBits = _mm_castpd_ps(a);
    const __m128 bBits = _mm_castpd_ps(b);

    const __m128d lowHalves =
        _mm_castps_pd(_mm_movelh_ps(aBits, bBits));  // (aLo, bLo)
    const __m128d highHalves =
        _mm_castps_pd(_mm_movehl_ps(bBits, aBits));  // (aHi, bHi)

    return _mm_add_pd(lowHalves, highHalves);
}
#endif
635
636
// Square each of the two double precision lanes of x.
inline __m128d SQUARE_PD(__m128d x)
{
    const __m128d squared = _mm_mul_pd(x, x);
    return squared;
}
640
641
#ifdef __AVX2__

// Square each of the four double precision lanes of x.
inline __m256d SQUARE_PD(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}

// Swap the two middle 64-bit lanes of x: (x0, x1, x2, x3) -> (x0, x2, x1, x3).
inline __m256d FIXUP_LANES(__m256d x)
{
    const __m256d reordered =
        _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
    return reordered;
}

// Same reordering for a single precision register, handled as four 64-bit
// groups through the double precision overload.
inline __m256 FIXUP_LANES(__m256 x)
{
    const __m256d asDouble = _mm256_castps_pd(x);
    const __m256d reordered = FIXUP_LANES(asDouble);
    return _mm256_castpd_ps(reordered);
}

#endif
659
660
template <class T>
661
static int
662
QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
663
                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
664
                        T *CPL_RESTRICT pDstScanline)
665
0
{
666
    // Optimized implementation for RMS on UInt16 by
667
    // processing by group of 4 output pixels.
668
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
669
670
0
    int iDstPixel = 0;
671
0
    const auto zero = _mm_setzero_si128();
672
673
#ifdef __AVX2__
674
    const auto zeroDot25 = _mm256_set1_pd(0.25);
675
    const auto zeroDot5 = _mm256_set1_pd(0.5);
676
677
    // The first four 0's could be anything, as we only take the bottom
678
    // 128 bits.
679
    const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
680
#else
681
0
    const auto zeroDot25 = _mm_set1_pd(0.25);
682
0
    const auto zeroDot5 = _mm_set1_pd(0.5);
683
0
#endif
684
685
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
686
0
    {
687
        // Load 8 UInt16 from each line
688
0
        const auto firstLine = _mm_loadu_si128(
689
0
            reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
690
0
        const auto secondLine =
691
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(
692
0
                pSrcScanlineShifted + nChunkXSize));
693
694
        // Detect if all of the source values fit in 14 bits.
695
        // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
696
        // and we can do a much faster implementation.
697
0
        const auto maskTmp =
698
0
            _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
699
#if defined(__i386__) || defined(_M_IX86)
700
        uint64_t nMaskFitsIn14Bits = 0;
701
        _mm_storel_epi64(
702
            reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
703
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
704
#else
705
0
        const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
706
0
            _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
707
0
#endif
708
0
        if (nMaskFitsIn14Bits == 0)
709
0
        {
710
            // Multiplication of 16 bit values and horizontal
711
            // addition of 32 bit results
712
0
            const auto firstLineHSumSquare =
713
0
                _mm_madd_epi16(firstLine, firstLine);
714
0
            const auto secondLineHSumSquare =
715
0
                _mm_madd_epi16(secondLine, secondLine);
716
            // Vertical addition
717
0
            const auto sumSquares =
718
0
                _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
719
            // In theory we should take sqrt(sumSquares * 0.25f)
720
            // but given the rounding we do, this is equivalent to
721
            // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
722
            // sumSquares <= 4 * 16383^2
723
0
            const auto one32 = _mm_set1_epi32(1);
724
0
            const auto sumSquaresPlusOneDiv4 =
725
0
                _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
726
            // Take square root and truncate/floor to int32
727
0
            auto rms = _mm_cvttps_epi32(
728
0
                _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
729
730
            // Round to upper value if it minimizes the
731
            // error |rms^2 - sumSquares/4|
732
            // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
733
            //    rms += 1;
734
            // which is equivalent to:
735
            // if( rms * rms + rms < (sumSquares+1) / 4 )
736
            //    rms += 1;
737
0
            auto mask =
738
0
                _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
739
0
                                _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
740
0
            rms = _mm_sub_epi32(rms, mask);
741
            // Pack each 32 bit RMS value to 16 bits
742
0
            rms = _mm_packs_epi32(rms, rms /* could be anything */);
743
0
            _mm_storel_epi64(
744
0
                reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
745
0
            pSrcScanlineShifted += 8;
746
0
            continue;
747
0
        }
748
749
        // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
750
        // to 32 bit would result in 4 multiplications instead of 8, but
751
        // mullo/mulhi have a worse throughput than mul_pd.
752
753
        // Extend those UInt16s as UInt32s
754
0
        const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
755
0
        const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
756
0
        const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
757
0
        const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
758
759
#ifdef __AVX2__
760
        // Multiplication of 32 bit values previously converted to 64 bit double
761
        const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
762
        const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
763
        const auto secondLineLoDbl =
764
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
765
        const auto secondLineHiDbl =
766
            SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
767
768
        // Vertical addition of squares
769
        const auto sumSquaresLo =
770
            _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
771
        const auto sumSquaresHi =
772
            _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
773
774
        // Horizontal addition of squares
775
        const auto sumSquares =
776
            FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
777
778
        const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
779
780
        // Take square root and truncate/floor to int32
781
        auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
782
        const auto rmsDouble = _mm256_cvtepi32_pd(rms);
783
        const auto right = _mm256_sub_pd(
784
            sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
785
786
        auto mask =
787
            _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
788
        // Extract 32-bit from each of the 4 64-bit masks
789
        // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
790
        // _MM_SHUFFLE(2,0,2,0)));
791
        mask = _mm256_permutevar8x32_ps(mask, permutation);
792
        const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
793
794
        // Apply the correction
795
        rms = _mm_sub_epi32(rms, maskI);
796
797
        // Pack each 32 bit RMS value to 16 bits
798
        rms = _mm_packus_epi32(rms, rms /* could be anything */);
799
#else
800
        // Multiplication of 32 bit values previously converted to 64 bit double
801
0
        const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
802
0
        const auto firstLineLoHi =
803
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
804
0
        const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
805
0
        const auto firstLineHiHi =
806
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
807
808
0
        const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
809
0
        const auto secondLineLoHi =
810
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
811
0
        const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
812
0
        const auto secondLineHiHi =
813
0
            SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
814
815
        // Vertical addition of squares
816
0
        const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
817
0
        const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
818
0
        const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
819
0
        const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
820
821
        // Horizontal addition of squares
822
0
        const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
823
0
        const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
824
825
0
        const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
826
0
        const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
827
        // Take square root and truncate/floor to int32
828
0
        const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
829
0
        const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
830
831
        // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
832
        // if( 0.5 < sumDivWeight - (rms * rms + rms) )
833
        //     rms += 1;
834
0
        const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
835
0
        const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
836
0
        const auto rightLo = _mm_sub_pd(
837
0
            sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
838
0
        const auto rightHi = _mm_sub_pd(
839
0
            sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
840
841
0
        const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
842
0
        const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
843
        // The value of the mask will be -1 when the correction needs to be
844
        // applied
845
0
        const auto mask = _mm_castps_si128(_mm_shuffle_ps(
846
0
            maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
847
848
0
        auto rms = _mm_castps_si128(
849
0
            _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
850
        // Apply the correction
851
0
        rms = _mm_sub_epi32(rms, mask);
852
853
        // Pack each 32 bit RMS value to 16 bits
854
0
        rms = sse2_packus_epi32(rms, rms /* could be anything */);
855
0
#endif
856
857
0
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
858
0
                         rms);
859
0
        pSrcScanlineShifted += 8;
860
0
    }
861
862
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
863
0
    return iDstPixel;
864
0
}
865
866
/************************************************************************/
867
/*                         AverageUInt16SSE2()                          */
868
/************************************************************************/
869
870
template <class T>
871
static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
872
                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
873
                             T *CPL_RESTRICT pDstScanline)
874
0
{
875
    // Optimized implementation for average on UInt16 by
876
    // processing by group of 8 output pixels.
877
878
0
    const auto mask = _mm_set1_epi32(0xFFFF);
879
0
    const auto two = _mm_set1_epi32(2);
880
0
    const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
881
882
0
    int iDstPixel = 0;
883
0
    for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
884
0
    {
885
0
        __m128i averageLow;
886
        // Load 8 UInt16 from each line
887
0
        {
888
0
            const auto firstLine = _mm_loadu_si128(
889
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
890
0
            const auto secondLine =
891
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
892
0
                    pSrcScanlineShifted + nChunkXSize));
893
894
            // Horizontal addition and extension to 32 bit
895
0
            const auto horizAddFirstLine = _mm_add_epi32(
896
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
897
0
            const auto horizAddSecondLine =
898
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
899
0
                              _mm_srli_epi32(secondLine, 16));
900
901
            // Vertical addition and average computation
902
            // average = (sum + 2) >> 2
903
0
            const auto sum = _mm_add_epi32(
904
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
905
0
            averageLow = _mm_srli_epi32(sum, 2);
906
0
        }
907
        // Load 8 UInt16 from each line
908
0
        __m128i averageHigh;
909
0
        {
910
0
            const auto firstLine = _mm_loadu_si128(
911
0
                reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
912
0
            const auto secondLine =
913
0
                _mm_loadu_si128(reinterpret_cast<__m128i const *>(
914
0
                    pSrcScanlineShifted + 8 + nChunkXSize));
915
916
            // Horizontal addition and extension to 32 bit
917
0
            const auto horizAddFirstLine = _mm_add_epi32(
918
0
                _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
919
0
            const auto horizAddSecondLine =
920
0
                _mm_add_epi32(_mm_and_si128(secondLine, mask),
921
0
                              _mm_srli_epi32(secondLine, 16));
922
923
            // Vertical addition and average computation
924
            // average = (sum + 2) >> 2
925
0
            const auto sum = _mm_add_epi32(
926
0
                _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
927
0
            averageHigh = _mm_srli_epi32(sum, 2);
928
0
        }
929
930
        // Pack each 32 bit average value to 16 bits
931
0
        auto average = sse2_packus_epi32(averageLow, averageHigh);
932
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
933
0
                         average);
934
0
        pSrcScanlineShifted += 16;
935
0
    }
936
937
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
938
0
    return iDstPixel;
939
0
}
940
941
/************************************************************************/
942
/*                      QuadraticMeanFloatSSE2()                        */
943
/************************************************************************/
944
945
#ifdef __SSE3__
946
#define sse2_hadd_ps _mm_hadd_ps
947
#else
948
// SSE2-only stand-in for _mm_hadd_ps: the result is
// (a0 + a1, a2 + a3, b0 + b1, b2 + b3).
inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
{
    // Gather lanes 0 and 2 of a then of b, and lanes 1 and 3 likewise,
    // so that a single vertical addition performs the pairwise sums.
    const __m128 evenLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 oddLanes = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm_add_ps(evenLanes, oddLanes);
}
954
#endif
955
956
#ifdef __AVX2__
957
#define RMS_FLOAT_ELTS 8
958
#define set1_ps _mm256_set1_ps
959
#define loadu_ps _mm256_loadu_ps
960
#define andnot_ps _mm256_andnot_ps
961
#define and_ps _mm256_and_ps
962
#define max_ps _mm256_max_ps
963
#define shuffle_ps _mm256_shuffle_ps
964
#define div_ps _mm256_div_ps
965
#define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
966
#define mul_ps _mm256_mul_ps
967
#define add_ps _mm256_add_ps
968
#define hadd_ps _mm256_hadd_ps
969
#define sqrt_ps _mm256_sqrt_ps
970
#define or_ps _mm256_or_ps
971
#define unpacklo_ps _mm256_unpacklo_ps
972
#define unpackhi_ps _mm256_unpackhi_ps
973
#define storeu_ps _mm256_storeu_ps
974
975
inline __m256 SQUARE_PS(__m256 x)
976
{
977
    return _mm256_mul_ps(x, x);
978
}
979
980
#else
981
982
0
#define RMS_FLOAT_ELTS 4
983
0
#define set1_ps _mm_set1_ps
984
0
#define loadu_ps _mm_loadu_ps
985
0
#define andnot_ps _mm_andnot_ps
986
0
#define and_ps _mm_and_ps
987
0
#define max_ps _mm_max_ps
988
0
#define shuffle_ps _mm_shuffle_ps
989
0
#define div_ps _mm_div_ps
990
0
#define cmpeq_ps _mm_cmpeq_ps
991
0
#define mul_ps _mm_mul_ps
992
0
#define add_ps _mm_add_ps
993
#define hadd_ps sse2_hadd_ps
994
0
#define sqrt_ps _mm_sqrt_ps
995
0
#define or_ps _mm_or_ps
996
#define unpacklo_ps _mm_unpacklo_ps
997
#define unpackhi_ps _mm_unpackhi_ps
998
0
#define storeu_ps _mm_storeu_ps
999
1000
inline __m128 SQUARE_PS(__m128 x)
1001
0
{
1002
0
    return _mm_mul_ps(x, x);
1003
0
}
1004
1005
inline __m128 FIXUP_LANES(__m128 x)
1006
0
{
1007
0
    return x;
1008
0
}
1009
1010
#endif
1011
1012
static int
1013
#if defined(__GNUC__)
1014
    __attribute__((noinline))
1015
#endif
1016
    QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
1017
                           const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1018
                           float *CPL_RESTRICT pDstScanline)
1019
0
{
1020
    // Optimized implementation for RMS on Float32 by
1021
    // processing by group of RMS_FLOAT_ELTS output pixels.
1022
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1023
1024
0
    int iDstPixel = 0;
1025
0
    const auto minus_zero = set1_ps(-0.0f);
1026
0
    const auto zeroDot25 = set1_ps(0.25f);
1027
0
    const auto one = set1_ps(1.0f);
1028
0
    const auto infv = set1_ps(std::numeric_limits<float>::infinity());
1029
1030
0
    for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
1031
0
         iDstPixel += RMS_FLOAT_ELTS)
1032
0
    {
1033
        // Load 2*RMS_FLOAT_ELTS Float32 from each line
1034
0
        auto firstLineLo = loadu_ps(pSrcScanlineShifted);
1035
0
        auto firstLineHi = loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS);
1036
0
        auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
1037
0
        auto secondLineHi =
1038
0
            loadu_ps(pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize);
1039
1040
        // Take the absolute value
1041
0
        firstLineLo = andnot_ps(minus_zero, firstLineLo);
1042
0
        firstLineHi = andnot_ps(minus_zero, firstLineHi);
1043
0
        secondLineLo = andnot_ps(minus_zero, secondLineLo);
1044
0
        secondLineHi = andnot_ps(minus_zero, secondLineHi);
1045
1046
0
        auto firstLineEven =
1047
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1048
0
        auto firstLineOdd =
1049
0
            shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1050
0
        auto secondLineEven =
1051
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
1052
0
        auto secondLineOdd =
1053
0
            shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
1054
1055
        // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
1056
0
        const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
1057
0
                                 max_ps(secondLineEven, secondLineEven));
1058
1059
        // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
1060
        // This step is important to avoid that the square evaluates to infinity
1061
        // for sufficiently big input.
1062
0
        auto invMax = div_ps(one, maxV);
1063
        // Deal with 0 being the maximum to correct division by zero
1064
        // note: comparing to -0 leads to identical results as to comparing with
1065
        // 0
1066
0
        invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
1067
1068
0
        firstLineEven = mul_ps(firstLineEven, invMax);
1069
0
        firstLineOdd = mul_ps(firstLineOdd, invMax);
1070
0
        secondLineEven = mul_ps(secondLineEven, invMax);
1071
0
        secondLineOdd = mul_ps(secondLineOdd, invMax);
1072
1073
        // Compute squares
1074
0
        firstLineEven = SQUARE_PS(firstLineEven);
1075
0
        firstLineOdd = SQUARE_PS(firstLineOdd);
1076
0
        secondLineEven = SQUARE_PS(secondLineEven);
1077
0
        secondLineOdd = SQUARE_PS(secondLineOdd);
1078
1079
0
        const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
1080
0
                                       add_ps(secondLineEven, secondLineOdd));
1081
1082
0
        auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
1083
1084
        // Deal with infinity being the maximum
1085
0
        const auto maskIsInf = cmpeq_ps(maxV, infv);
1086
0
        rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
1087
1088
0
        rms = FIXUP_LANES(rms);
1089
1090
0
        storeu_ps(&pDstScanline[iDstPixel], rms);
1091
0
        pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
1092
0
    }
1093
1094
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1095
0
    return iDstPixel;
1096
0
}
1097
1098
/************************************************************************/
1099
/*                        AverageFloatSSE2()                            */
1100
/************************************************************************/
1101
1102
static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
1103
                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
1104
                            float *CPL_RESTRICT pDstScanline)
1105
0
{
1106
    // Optimized implementation for average on Float32 by
1107
    // processing by group of 4 output pixels.
1108
0
    const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
1109
1110
0
    int iDstPixel = 0;
1111
0
    const auto zeroDot25 = _mm_set1_ps(0.25f);
1112
1113
0
    for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
1114
0
    {
1115
        // Load 8 Float32 from each line
1116
0
        const auto firstLineLo = _mm_loadu_ps(pSrcScanlineShifted);
1117
0
        const auto firstLineHi = _mm_loadu_ps(pSrcScanlineShifted + 4);
1118
0
        const auto secondLineLo =
1119
0
            _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize);
1120
0
        const auto secondLineHi =
1121
0
            _mm_loadu_ps(pSrcScanlineShifted + 4 + nChunkXSize);
1122
1123
        // Vertical addition
1124
0
        const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
1125
0
        const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
1126
1127
        // Horizontal addition
1128
0
        const auto sum = sse2_hadd_ps(sumLo, sumHi);
1129
1130
0
        const auto average = _mm_mul_ps(sum, zeroDot25);
1131
1132
0
        _mm_storeu_ps(&pDstScanline[iDstPixel], average);
1133
0
        pSrcScanlineShifted += 8;
1134
0
    }
1135
1136
0
    pSrcScanlineShiftedInOut = pSrcScanlineShifted;
1137
0
    return iDstPixel;
1138
0
}
1139
1140
#endif
1141
1142
/************************************************************************/
1143
/*                    GDALResampleChunk_AverageOrRMS()                  */
1144
/************************************************************************/
1145
1146
template <class T, class Tsum, GDALDataType eWrkDataType>
1147
static CPLErr
1148
GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
1149
                                 const T *pChunk, void **ppDstBuffer)
1150
0
{
1151
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
1152
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
1153
0
    const double dfSrcXDelta = args.dfSrcXDelta;
1154
0
    const double dfSrcYDelta = args.dfSrcYDelta;
1155
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
1156
0
    const int nChunkXOff = args.nChunkXOff;
1157
0
    const int nChunkYOff = args.nChunkYOff;
1158
0
    const int nChunkXSize = args.nChunkXSize;
1159
0
    const int nChunkYSize = args.nChunkYSize;
1160
0
    const int nDstXOff = args.nDstXOff;
1161
0
    const int nDstXOff2 = args.nDstXOff2;
1162
0
    const int nDstYOff = args.nDstYOff;
1163
0
    const int nDstYOff2 = args.nDstYOff2;
1164
0
    const char *pszResampling = args.pszResampling;
1165
0
    bool bHasNoData = args.bHasNoData;
1166
0
    const double dfNoDataValue = args.dfNoDataValue;
1167
0
    const GDALColorTable *poColorTable = args.poColorTable;
1168
0
    const bool bPropagateNoData = args.bPropagateNoData;
1169
1170
    // AVERAGE_BIT2GRAYSCALE
1171
0
    const bool bBit2Grayscale =
1172
0
        CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
1173
0
    const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
1174
0
    if (bBit2Grayscale)
1175
0
        poColorTable = nullptr;
1176
1177
0
    T tNoDataValue;
1178
0
    if (!bHasNoData)
1179
0
        tNoDataValue = 0;
1180
0
    else
1181
0
        tNoDataValue = static_cast<T>(dfNoDataValue);
1182
0
    const T tReplacementVal =
1183
0
        bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
1184
0
                         args.eOvrDataType, dfNoDataValue))
1185
0
                   : 0;
1186
1187
0
    int nChunkRightXOff = nChunkXOff + nChunkXSize;
1188
0
    int nChunkBottomYOff = nChunkYOff + nChunkYSize;
1189
0
    int nDstXWidth = nDstXOff2 - nDstXOff;
1190
1191
    /* -------------------------------------------------------------------- */
1192
    /*      Allocate buffers.                                               */
1193
    /* -------------------------------------------------------------------- */
1194
0
    *ppDstBuffer = static_cast<T *>(
1195
0
        VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
1196
0
                            GDALGetDataTypeSizeBytes(eWrkDataType)));
1197
0
    if (*ppDstBuffer == nullptr)
1198
0
    {
1199
0
        return CE_Failure;
1200
0
    }
1201
0
    T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
1202
1203
0
    struct PrecomputedXValue
1204
0
    {
1205
0
        int nLeftXOffShifted;
1206
0
        int nRightXOffShifted;
1207
0
        double dfLeftWeight;
1208
0
        double dfRightWeight;
1209
0
        double dfTotalWeightFullLine;
1210
0
    };
1211
1212
0
    PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
1213
0
        VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
1214
1215
0
    if (pasSrcX == nullptr)
1216
0
    {
1217
0
        return CE_Failure;
1218
0
    }
1219
1220
0
    int nTransparentIdx = -1;
1221
0
    std::vector<GDALColorEntry> colorEntries;
1222
0
    if (poColorTable)
1223
0
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
1224
1225
    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
1226
    // it as nodata value
1227
0
    if (bHasNoData && dfNoDataValue >= 0.0f &&
1228
0
        tNoDataValue < colorEntries.size())
1229
0
        colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
1230
1231
    // Or if we have no explicit nodata, but a color table entry that is
1232
    // transparent, consider it as the nodata value
1233
0
    else if (!bHasNoData && nTransparentIdx >= 0)
1234
0
    {
1235
0
        bHasNoData = true;
1236
0
        tNoDataValue = static_cast<T>(nTransparentIdx);
1237
0
    }
1238
1239
    /* ==================================================================== */
1240
    /*      Precompute inner loop constants.                                */
1241
    /* ==================================================================== */
1242
0
    bool bSrcXSpacingIsTwo = true;
1243
0
    int nLastSrcXOff2 = -1;
1244
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
1245
0
    {
1246
0
        double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
1247
        // Apply some epsilon to avoid numerical precision issues
1248
0
        int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
1249
0
        double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
1250
0
        int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
1251
1252
0
        if (nSrcXOff < nChunkXOff)
1253
0
            nSrcXOff = nChunkXOff;
1254
0
        if (nSrcXOff2 == nSrcXOff)
1255
0
            nSrcXOff2++;
1256
0
        if (nSrcXOff2 > nChunkRightXOff)
1257
0
            nSrcXOff2 = nChunkRightXOff;
1258
1259
0
        pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
1260
0
        pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
1261
0
            nSrcXOff2 - nChunkXOff;
1262
0
        pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
1263
0
            (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
1264
0
        pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
1265
0
            1 - (nSrcXOff2 - dfSrcXOff2);
1266
0
        pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
1267
0
            pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
1268
0
        if (nSrcXOff + 1 < nSrcXOff2)
1269
0
        {
1270
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1271
0
                nSrcXOff2 - nSrcXOff - 2;
1272
0
            pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
1273
0
                pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
1274
0
        }
1275
1276
0
        if (nSrcXOff2 - nSrcXOff != 2 ||
1277
0
            (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
1278
0
        {
1279
0
            bSrcXSpacingIsTwo = false;
1280
0
        }
1281
0
        nLastSrcXOff2 = nSrcXOff2;
1282
0
    }
1283
1284
    /* ==================================================================== */
1285
    /*      Loop over destination scanlines.                                */
1286
    /* ==================================================================== */
1287
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
1288
0
    {
1289
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
1290
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
1291
0
        if (nSrcYOff < nChunkYOff)
1292
0
            nSrcYOff = nChunkYOff;
1293
1294
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
1295
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
1296
0
        if (nSrcYOff2 == nSrcYOff)
1297
0
            ++nSrcYOff2;
1298
0
        if (nSrcYOff2 > nChunkBottomYOff)
1299
0
            nSrcYOff2 = nChunkBottomYOff;
1300
1301
0
        T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
1302
1303
        /* --------------------------------------------------------------------
1304
         */
1305
        /*      Loop over destination pixels */
1306
        /* --------------------------------------------------------------------
1307
         */
1308
0
        if (poColorTable == nullptr)
1309
0
        {
1310
0
            if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
1311
0
                pabyChunkNodataMask == nullptr)
1312
0
            {
1313
                if constexpr (eWrkDataType == GDT_Byte ||
1314
                              eWrkDataType == GDT_UInt16)
1315
0
                {
1316
                    // Optimized case : no nodata, overview by a factor of 2 and
1317
                    // regular x and y src spacing.
1318
0
                    const T *pSrcScanlineShifted =
1319
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1320
0
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1321
0
                            nChunkXSize;
1322
0
                    int iDstPixel = 0;
1323
0
#ifdef USE_SSE2
1324
                    if constexpr (eWrkDataType == GDT_Byte)
1325
0
                    {
1326
0
                        if (bQuadraticMean)
1327
0
                        {
1328
0
                            iDstPixel = QuadraticMeanByteSSE2OrAVX2(
1329
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1330
0
                                pDstScanline);
1331
0
                        }
1332
0
                        else
1333
0
                        {
1334
0
                            iDstPixel = AverageByteSSE2OrAVX2(
1335
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1336
0
                                pDstScanline);
1337
0
                        }
1338
                    }
1339
                    else
1340
0
                    {
1341
0
                        static_assert(eWrkDataType == GDT_UInt16);
1342
0
                        if (bQuadraticMean)
1343
0
                        {
1344
0
                            iDstPixel = QuadraticMeanUInt16SSE2(
1345
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1346
0
                                pDstScanline);
1347
0
                        }
1348
0
                        else
1349
0
                        {
1350
0
                            iDstPixel = AverageUInt16SSE2(
1351
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1352
0
                                pDstScanline);
1353
0
                        }
1354
0
                    }
1355
0
#endif
1356
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1357
0
                    {
1358
0
                        Tsum nTotal = 0;
1359
0
                        T nVal;
1360
0
                        if (bQuadraticMean)
1361
0
                            nTotal =
1362
0
                                SQUARE<Tsum>(pSrcScanlineShifted[0]) +
1363
0
                                SQUARE<Tsum>(pSrcScanlineShifted[1]) +
1364
0
                                SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
1365
0
                                SQUARE<Tsum>(
1366
0
                                    pSrcScanlineShifted[1 + nChunkXSize]);
1367
0
                        else
1368
0
                            nTotal = pSrcScanlineShifted[0] +
1369
0
                                     pSrcScanlineShifted[1] +
1370
0
                                     pSrcScanlineShifted[nChunkXSize] +
1371
0
                                     pSrcScanlineShifted[1 + nChunkXSize];
1372
1373
0
                        constexpr int nTotalWeight = 4;
1374
0
                        if (bQuadraticMean)
1375
0
                            nVal = ComputeIntegerRMS_4values<T>(nTotal);
1376
0
                        else
1377
0
                            nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
1378
0
                                                  nTotalWeight);
1379
1380
                        // No need to compare nVal against tNoDataValue as we
1381
                        // are in a case where pabyChunkNodataMask == nullptr
1382
                        // implies the absence of nodata value.
1383
0
                        pDstScanline[iDstPixel] = nVal;
1384
0
                        pSrcScanlineShifted += 2;
1385
0
                    }
1386
                }
1387
                else
1388
0
                {
1389
0
                    static_assert(eWrkDataType == GDT_Float32 ||
1390
0
                                  eWrkDataType == GDT_Float64);
1391
0
                    const T *pSrcScanlineShifted =
1392
0
                        pChunk + pasSrcX[0].nLeftXOffShifted +
1393
0
                        static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
1394
0
                            nChunkXSize;
1395
0
                    int iDstPixel = 0;
1396
0
#ifdef USE_SSE2
1397
                    if constexpr (eWrkDataType == GDT_Float32)
1398
0
                    {
1399
0
                        static_assert(std::is_same_v<T, float>);
1400
0
                        if (bQuadraticMean)
1401
0
                        {
1402
0
                            iDstPixel = QuadraticMeanFloatSSE2(
1403
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1404
0
                                pDstScanline);
1405
0
                        }
1406
0
                        else
1407
0
                        {
1408
0
                            iDstPixel = AverageFloatSSE2(
1409
0
                                nDstXWidth, nChunkXSize, pSrcScanlineShifted,
1410
0
                                pDstScanline);
1411
0
                        }
1412
0
                    }
1413
0
#endif
1414
1415
0
                    for (; iDstPixel < nDstXWidth; ++iDstPixel)
1416
0
                    {
1417
0
                        T nVal;
1418
0
                        if (bQuadraticMean)
1419
0
                        {
1420
                            // Cast to double to avoid overflows
1421
                            // (using std::hypot() is much slower)
1422
0
                            nVal = static_cast<T>(std::sqrt(
1423
0
                                0.25 *
1424
0
                                (SQUARE<double>(pSrcScanlineShifted[0]) +
1425
0
                                 SQUARE<double>(pSrcScanlineShifted[1]) +
1426
0
                                 SQUARE<double>(
1427
0
                                     pSrcScanlineShifted[nChunkXSize]) +
1428
0
                                 SQUARE<double>(
1429
0
                                     pSrcScanlineShifted[1 + nChunkXSize]))));
1430
0
                        }
1431
0
                        else
1432
0
                        {
1433
0
                            nVal = static_cast<T>(
1434
0
                                0.25f * (pSrcScanlineShifted[0] +
1435
0
                                         pSrcScanlineShifted[1] +
1436
0
                                         pSrcScanlineShifted[nChunkXSize] +
1437
0
                                         pSrcScanlineShifted[1 + nChunkXSize]));
1438
0
                        }
1439
1440
                        // No need to compare nVal against tNoDataValue as we
1441
                        // are in a case where pabyChunkNodataMask == nullptr
1442
                        // implies the absence of nodata value.
1443
0
                        pDstScanline[iDstPixel] = nVal;
1444
0
                        pSrcScanlineShifted += 2;
1445
0
                    }
1446
0
                }
1447
0
            }
1448
0
            else
1449
0
            {
1450
0
                const double dfBottomWeight =
1451
0
                    (nSrcYOff + 1 == nSrcYOff2) ? 1.0
1452
0
                                                : 1.0 - (dfSrcYOff - nSrcYOff);
1453
0
                const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
1454
0
                nSrcYOff -= nChunkYOff;
1455
0
                nSrcYOff2 -= nChunkYOff;
1456
1457
0
                double dfTotalWeightFullColumn = dfBottomWeight;
1458
0
                if (nSrcYOff + 1 < nSrcYOff2)
1459
0
                {
1460
0
                    dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
1461
0
                    dfTotalWeightFullColumn += dfTopWeight;
1462
0
                }
1463
1464
0
                for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1465
0
                {
1466
0
                    const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1467
0
                    const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1468
1469
0
                    double dfTotal = 0;
1470
0
                    double dfTotalWeight = 0;
1471
0
                    if (pabyChunkNodataMask == nullptr)
1472
0
                    {
1473
0
                        auto pChunkShifted =
1474
0
                            pChunk +
1475
0
                            static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
1476
0
                        int nCounterY = nSrcYOff2 - nSrcYOff - 1;
1477
0
                        double dfWeightY = dfBottomWeight;
1478
0
                        while (true)
1479
0
                        {
1480
0
                            double dfTotalLine;
1481
0
                            if (bQuadraticMean)
1482
0
                            {
1483
                                // Left pixel
1484
0
                                {
1485
0
                                    const T val = pChunkShifted[nSrcXOff];
1486
0
                                    dfTotalLine =
1487
0
                                        SQUARE<double>(val) *
1488
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1489
0
                                }
1490
1491
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1492
0
                                {
1493
                                    // Middle pixels
1494
0
                                    for (int iX = nSrcXOff + 1;
1495
0
                                         iX + 1 < nSrcXOff2; ++iX)
1496
0
                                    {
1497
0
                                        const T val = pChunkShifted[iX];
1498
0
                                        dfTotalLine += SQUARE<double>(val);
1499
0
                                    }
1500
1501
                                    // Right pixel
1502
0
                                    {
1503
0
                                        const T val =
1504
0
                                            pChunkShifted[nSrcXOff2 - 1];
1505
0
                                        dfTotalLine +=
1506
0
                                            SQUARE<double>(val) *
1507
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1508
0
                                    }
1509
0
                                }
1510
0
                            }
1511
0
                            else
1512
0
                            {
1513
                                // Left pixel
1514
0
                                {
1515
0
                                    const T val = pChunkShifted[nSrcXOff];
1516
0
                                    dfTotalLine =
1517
0
                                        val * pasSrcX[iDstPixel].dfLeftWeight;
1518
0
                                }
1519
1520
0
                                if (nSrcXOff + 1 < nSrcXOff2)
1521
0
                                {
1522
                                    // Middle pixels
1523
0
                                    for (int iX = nSrcXOff + 1;
1524
0
                                         iX + 1 < nSrcXOff2; ++iX)
1525
0
                                    {
1526
0
                                        const T val = pChunkShifted[iX];
1527
0
                                        dfTotalLine += val;
1528
0
                                    }
1529
1530
                                    // Right pixel
1531
0
                                    {
1532
0
                                        const T val =
1533
0
                                            pChunkShifted[nSrcXOff2 - 1];
1534
0
                                        dfTotalLine +=
1535
0
                                            val *
1536
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1537
0
                                    }
1538
0
                                }
1539
0
                            }
1540
1541
0
                            dfTotal += dfTotalLine * dfWeightY;
1542
0
                            --nCounterY;
1543
0
                            if (nCounterY < 0)
1544
0
                                break;
1545
0
                            pChunkShifted += nChunkXSize;
1546
0
                            dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
1547
0
                        }
1548
1549
0
                        dfTotalWeight =
1550
0
                            pasSrcX[iDstPixel].dfTotalWeightFullLine *
1551
0
                            dfTotalWeightFullColumn;
1552
0
                    }
1553
0
                    else
1554
0
                    {
1555
0
                        GPtrDiff_t nCount = 0;
1556
0
                        for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1557
0
                        {
1558
0
                            const auto pChunkShifted =
1559
0
                                pChunk +
1560
0
                                static_cast<GPtrDiff_t>(iY) * nChunkXSize;
1561
1562
0
                            double dfTotalLine = 0;
1563
0
                            double dfTotalWeightLine = 0;
1564
                            // Left pixel
1565
0
                            {
1566
0
                                const int iX = nSrcXOff;
1567
0
                                const T val = pChunkShifted[iX];
1568
0
                                if (pabyChunkNodataMask[iX + iY * nChunkXSize])
1569
0
                                {
1570
0
                                    nCount++;
1571
0
                                    const double dfWeightX =
1572
0
                                        pasSrcX[iDstPixel].dfLeftWeight;
1573
0
                                    dfTotalWeightLine = dfWeightX;
1574
0
                                    if (bQuadraticMean)
1575
0
                                        dfTotalLine =
1576
0
                                            SQUARE<double>(val) * dfWeightX;
1577
0
                                    else
1578
0
                                        dfTotalLine = val * dfWeightX;
1579
0
                                }
1580
0
                            }
1581
1582
0
                            if (nSrcXOff + 1 < nSrcXOff2)
1583
0
                            {
1584
                                // Middle pixels
1585
0
                                for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
1586
0
                                     ++iX)
1587
0
                                {
1588
0
                                    const T val = pChunkShifted[iX];
1589
0
                                    if (pabyChunkNodataMask[iX +
1590
0
                                                            iY * nChunkXSize])
1591
0
                                    {
1592
0
                                        nCount++;
1593
0
                                        dfTotalWeightLine += 1;
1594
0
                                        if (bQuadraticMean)
1595
0
                                            dfTotalLine += SQUARE<double>(val);
1596
0
                                        else
1597
0
                                            dfTotalLine += val;
1598
0
                                    }
1599
0
                                }
1600
1601
                                // Right pixel
1602
0
                                {
1603
0
                                    const int iX = nSrcXOff2 - 1;
1604
0
                                    const T val = pChunkShifted[iX];
1605
0
                                    if (pabyChunkNodataMask[iX +
1606
0
                                                            iY * nChunkXSize])
1607
0
                                    {
1608
0
                                        nCount++;
1609
0
                                        const double dfWeightX =
1610
0
                                            pasSrcX[iDstPixel].dfRightWeight;
1611
0
                                        dfTotalWeightLine += dfWeightX;
1612
0
                                        if (bQuadraticMean)
1613
0
                                            dfTotalLine +=
1614
0
                                                SQUARE<double>(val) * dfWeightX;
1615
0
                                        else
1616
0
                                            dfTotalLine += val * dfWeightX;
1617
0
                                    }
1618
0
                                }
1619
0
                            }
1620
1621
0
                            const double dfWeightY =
1622
0
                                (iY == nSrcYOff)        ? dfBottomWeight
1623
0
                                : (iY + 1 == nSrcYOff2) ? dfTopWeight
1624
0
                                                        : 1.0;
1625
0
                            dfTotal += dfTotalLine * dfWeightY;
1626
0
                            dfTotalWeight += dfTotalWeightLine * dfWeightY;
1627
0
                        }
1628
1629
0
                        if (nCount == 0 ||
1630
0
                            (bPropagateNoData &&
1631
0
                             nCount <
1632
0
                                 static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1633
0
                                     (nSrcXOff2 - nSrcXOff)))
1634
0
                        {
1635
0
                            pDstScanline[iDstPixel] = tNoDataValue;
1636
0
                            continue;
1637
0
                        }
1638
0
                    }
1639
                    if constexpr (eWrkDataType == GDT_Byte)
1640
0
                    {
1641
0
                        T nVal;
1642
0
                        if (bQuadraticMean)
1643
0
                            nVal = ComputeIntegerRMS<T, int>(dfTotal,
1644
0
                                                             dfTotalWeight);
1645
0
                        else
1646
0
                            nVal =
1647
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1648
0
                        if (bHasNoData && nVal == tNoDataValue)
1649
0
                            nVal = tReplacementVal;
1650
0
                        pDstScanline[iDstPixel] = nVal;
1651
                    }
1652
                    else if constexpr (eWrkDataType == GDT_UInt16)
1653
0
                    {
1654
0
                        T nVal;
1655
0
                        if (bQuadraticMean)
1656
0
                            nVal = ComputeIntegerRMS<T, uint64_t>(
1657
0
                                dfTotal, dfTotalWeight);
1658
0
                        else
1659
0
                            nVal =
1660
0
                                static_cast<T>(dfTotal / dfTotalWeight + 0.5);
1661
0
                        if (bHasNoData && nVal == tNoDataValue)
1662
0
                            nVal = tReplacementVal;
1663
0
                        pDstScanline[iDstPixel] = nVal;
1664
                    }
1665
                    else
1666
0
                    {
1667
0
                        T nVal;
1668
0
                        if (bQuadraticMean)
1669
0
                            nVal =
1670
0
                                static_cast<T>(sqrt(dfTotal / dfTotalWeight));
1671
0
                        else
1672
0
                            nVal = static_cast<T>(dfTotal / dfTotalWeight);
1673
0
                        if (bHasNoData && nVal == tNoDataValue)
1674
0
                            nVal = tReplacementVal;
1675
0
                        pDstScanline[iDstPixel] = nVal;
1676
0
                    }
1677
0
                }
1678
0
            }
1679
0
        }
1680
0
        else
1681
0
        {
1682
0
            nSrcYOff -= nChunkYOff;
1683
0
            nSrcYOff2 -= nChunkYOff;
1684
1685
0
            for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
1686
0
            {
1687
0
                const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
1688
0
                const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
1689
1690
0
                GPtrDiff_t nTotalR = 0;
1691
0
                GPtrDiff_t nTotalG = 0;
1692
0
                GPtrDiff_t nTotalB = 0;
1693
0
                GPtrDiff_t nCount = 0;
1694
1695
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
1696
0
                {
1697
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
1698
0
                    {
1699
0
                        const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
1700
0
                                                      nChunkXSize];
1701
                        // cppcheck-suppress unsignedLessThanZero
1702
0
                        if (val < 0 || val >= colorEntries.size())
1703
0
                            continue;
1704
0
                        size_t idx = static_cast<size_t>(val);
1705
0
                        const auto &entry = colorEntries[idx];
1706
0
                        if (entry.c4)
1707
0
                        {
1708
0
                            if (bQuadraticMean)
1709
0
                            {
1710
0
                                nTotalR += SQUARE<int>(entry.c1);
1711
0
                                nTotalG += SQUARE<int>(entry.c2);
1712
0
                                nTotalB += SQUARE<int>(entry.c3);
1713
0
                                ++nCount;
1714
0
                            }
1715
0
                            else
1716
0
                            {
1717
0
                                nTotalR += entry.c1;
1718
0
                                nTotalG += entry.c2;
1719
0
                                nTotalB += entry.c3;
1720
0
                                ++nCount;
1721
0
                            }
1722
0
                        }
1723
0
                    }
1724
0
                }
1725
1726
0
                if (nCount == 0 ||
1727
0
                    (bPropagateNoData &&
1728
0
                     nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
1729
0
                                  (nSrcXOff2 - nSrcXOff)))
1730
0
                {
1731
0
                    pDstScanline[iDstPixel] = tNoDataValue;
1732
0
                }
1733
0
                else
1734
0
                {
1735
0
                    GDALColorEntry color;
1736
0
                    if (bQuadraticMean)
1737
0
                    {
1738
0
                        color.c1 =
1739
0
                            static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
1740
0
                        color.c2 =
1741
0
                            static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
1742
0
                        color.c3 =
1743
0
                            static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
1744
0
                    }
1745
0
                    else
1746
0
                    {
1747
0
                        color.c1 =
1748
0
                            static_cast<short>((nTotalR + nCount / 2) / nCount);
1749
0
                        color.c2 =
1750
0
                            static_cast<short>((nTotalG + nCount / 2) / nCount);
1751
0
                        color.c3 =
1752
0
                            static_cast<short>((nTotalB + nCount / 2) / nCount);
1753
0
                    }
1754
0
                    pDstScanline[iDstPixel] =
1755
0
                        static_cast<T>(BestColorEntry(colorEntries, color));
1756
0
                }
1757
0
            }
1758
0
        }
1759
0
    }
1760
1761
0
    CPLFree(pasSrcX);
1762
1763
0
    return CE_None;
1764
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned char, int, (GDALDataType)1>(GDALOverviewResampleArgs const&, unsigned char const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, double, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<unsigned short, unsigned int, (GDALDataType)2>(GDALOverviewResampleArgs const&, unsigned short const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<float, double, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void**)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_AverageOrRMS_T<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void**)
1765
1766
// Entry point for AVERAGE / RMS chunk resampling: forwards to the
// GDALResampleChunk_AverageOrRMS_T<> instantiation matching the working data
// type of the request.
//
// args                 resampling request; eWrkDataType selects the
//                      instantiation and pszResampling is compared against
//                      "RMS" for the UInt16 case.
// pChunk               source chunk, reinterpreted as an array of the working
//                      data type.
// ppDstBuffer          forwarded untouched to the typed implementation.
// peDstBufferDataType  set to args.eWrkDataType before dispatching.
//
// Returns whatever the typed implementation returns, or CE_Failure (after
// tripping CPLAssert) when the working data type is none of Byte, UInt16,
// Float32 or Float64.
static CPLErr
GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
                               const void *pChunk, void **ppDstBuffer,
                               GDALDataType *peDstBufferDataType)
{
    const GDALDataType eType = args.eWrkDataType;
    *peDstBufferDataType = eType;

    if (eType == GDT_Byte)
    {
        const auto pabyChunk = static_cast<const GByte *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
            args, pabyChunk, ppDstBuffer);
    }

    if (eType == GDT_UInt16)
    {
        const auto panChunk = static_cast<const GUInt16 *>(pChunk);
        if (!EQUAL(args.pszResampling, "RMS"))
        {
            return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
                                                    GDT_UInt16>(
                args, panChunk, ppDstBuffer);
        }

        // RMS sums squared samples: accumulate in double, because a UInt32
        // accumulator could overflow.
        return GDALResampleChunk_AverageOrRMS_T<GUInt16, double, GDT_UInt16>(
            args, panChunk, ppDstBuffer);
    }

    if (eType == GDT_Float32)
    {
        const auto pafChunk = static_cast<const float *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
            args, pafChunk, ppDstBuffer);
    }

    if (eType == GDT_Float64)
    {
        const auto padfChunk = static_cast<const double *>(pChunk);
        return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
            args, padfChunk, ppDstBuffer);
    }

    // No instantiation exists for any other working data type.
    CPLAssert(false);
    return CE_Failure;
}
1817
1818
/************************************************************************/
1819
/*                     GDALResampleChunk_Gauss()                        */
1820
/************************************************************************/
1821
1822
// Resample one chunk of source pixels with a fixed integer Gaussian kernel.
//
// The kernel is 3x3, 5x5 or 7x7 depending on the rounded vertical
// downsampling ratio (<= 2, <= 4, larger). For every destination pixel the
// kernel is centred on the corresponding source window and clipped to the
// chunk extent; the part of the kernel that falls outside the chunk is
// skipped through the nXShiftGaussMatrix / nYShiftGaussMatrix offsets.
//
// Without a color table the output is the weighted mean of the source
// samples whose nodata-mask byte is non-zero (all samples when there is no
// mask). With a color table the R, G and B components of the non-transparent
// (c4 != 0) entries are averaged and the index returned by BestColorEntry()
// is written instead.
//
// args                 resampling request (ratios, chunk and destination
//                      windows, nodata, optional color table).
// pChunk               source chunk, read as an array of double.
// ppDstBuffer          receives a newly allocated Float64 buffer of
//                      (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff)
//                      samples; presumably released by the caller.
// peDstBufferDataType  set to GDT_Float64 on success.
//
// Returns CE_None, or CE_Failure when the output buffer cannot be allocated.
static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
                                      const void *pChunk, void **ppDstBuffer,
                                      GDALDataType *peDstBufferDataType)

{
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    const int nChunkXOff = args.nChunkXOff;
    const int nChunkXSize = args.nChunkXSize;
    const int nChunkYOff = args.nChunkYOff;
    const int nChunkYSize = args.nChunkYSize;
    const int nDstXOff = args.nDstXOff;
    const int nDstXOff2 = args.nDstXOff2;
    const int nDstYOff = args.nDstYOff;
    const int nDstYOff2 = args.nDstYOff2;
    const bool bHasNoData = args.bHasNoData;
    double dfNoDataValue = args.dfNoDataValue;
    const GDALColorTable *poColorTable = args.poColorTable;

    const double *const padfChunk = static_cast<const double *>(pChunk);

    *ppDstBuffer =
        VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
                            GDALGetDataTypeSizeBytes(GDT_Float64));
    if (*ppDstBuffer == nullptr)
    {
        return CE_Failure;
    }
    *peDstBufferDataType = GDT_Float64;
    double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);

    /* -------------------------------------------------------------------- */
    /*      Create the filter kernel and allocate scanline buffer.          */
    /* -------------------------------------------------------------------- */
    int nGaussMatrixDim = 3;
    const int *panGaussMatrix;
    constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
                                        4,  6, 24, 36, 24, 6, 4,  16, 24,
                                        16, 4, 1,  4,  6,  4, 1};
    constexpr int anGaussMatrix7x7[] = {
        1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
        6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
        120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
        90,  36, 6,  1,   6,   15,  20,  15, 6,  1};

    const int nOXSize = args.nOvrXSize;
    const int nOYSize = args.nOvrYSize;
    const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);

    // matrix for gauss filter
    if (nResYFactor <= 2)
    {
        panGaussMatrix = anGaussMatrix3x3;
        nGaussMatrixDim = 3;
    }
    else if (nResYFactor <= 4)
    {
        panGaussMatrix = anGaussMatrix5x5;
        nGaussMatrixDim = 5;
    }
    else
    {
        panGaussMatrix = anGaussMatrix7x7;
        nGaussMatrixDim = 7;
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    // Work on a heap copy of the kernel so that memory checkers can flag
    // reads outside of it.
    int *panGaussMatrixDup = static_cast<int *>(
        CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    memcpy(panGaussMatrixDup, panGaussMatrix,
           sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    panGaussMatrix = panGaussMatrixDup;
#endif

    if (!bHasNoData)
        dfNoDataValue = 0.0;

    std::vector<GDALColorEntry> colorEntries;
    int nTransparentIdx = -1;
    if (poColorTable)
        colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);

    // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    // it as nodata value.
    if (bHasNoData && dfNoDataValue >= 0.0f &&
        dfNoDataValue < colorEntries.size())
        colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;

    // Or if we have no explicit nodata, but a color table entry that is
    // transparent, consider it as the nodata value.
    else if (!bHasNoData && nTransparentIdx >= 0)
    {
        dfNoDataValue = nTransparentIdx;
    }

    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    const int nDstXWidth = nDstXOff2 - nDstXOff;

    /* ==================================================================== */
    /*      Loop over destination scanlines.                                */
    /* ==================================================================== */
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    {
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
        int nSrcYOff2 =
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;

        if (nSrcYOff < nChunkYOff)
        {
            nSrcYOff = nChunkYOff;
            nSrcYOff2++;
        }

        // Re-centre a window of nGaussMatrixDim lines on the middle of the
        // nominal source window.
        const int iSizeY = nSrcYOff2 - nSrcYOff;
        nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
        nSrcYOff2 = nSrcYOff + nGaussMatrixDim;

        if (nSrcYOff2 > nChunkBottomYOff ||
            (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
        {
            nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
        }

        // Kernel rows that fall above the chunk are skipped.
        int nYShiftGaussMatrix = 0;
        if (nSrcYOff < nChunkYOff)
        {
            nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
            nSrcYOff = nChunkYOff;
        }

        // Widen before multiplying: the element offset of a scanline may not
        // fit in an int for large chunks.
        const GPtrDiff_t nSrcScanlineOffset =
            static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
        const double *const padfSrcScanline = padfChunk + nSrcScanlineOffset;
        const GByte *pabySrcScanlineNodataMask = nullptr;
        if (pabyChunkNodataMask != nullptr)
            pabySrcScanlineNodataMask =
                pabyChunkNodataMask + nSrcScanlineOffset;

        /* ---------------------------------------------------------------- */
        /*      Loop over destination pixels                                */
        /* ---------------------------------------------------------------- */
        double *const padfDstScanline =
            padfDstBuffer +
            static_cast<GPtrDiff_t>(iDstLine - nDstYOff) * nDstXWidth;
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
        {
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
            int nSrcXOff2 =
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;

            if (nSrcXOff < nChunkXOff)
            {
                nSrcXOff = nChunkXOff;
                nSrcXOff2++;
            }

            // Same re-centring as for the lines, on the column axis.
            const int iSizeX = nSrcXOff2 - nSrcXOff;
            nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
            nSrcXOff2 = nSrcXOff + nGaussMatrixDim;

            if (nSrcXOff2 > nChunkRightXOff ||
                (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
            {
                nSrcXOff2 =
                    std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
            }

            // Kernel columns that fall left of the chunk are skipped.
            int nXShiftGaussMatrix = 0;
            if (nSrcXOff < nChunkXOff)
            {
                nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
                nSrcXOff = nChunkXOff;
            }

            if (poColorTable == nullptr)
            {
                double dfTotal = 0.0;
                GInt64 nCount = 0;

                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2; ++iY, ++j)
                {
                    // Index of the first kernel weight used on this row.
                    // Indexing (rather than advancing a row pointer) avoids
                    // forming a pointer beyond the end of the kernel.
                    const int nWeightRowStart =
                        (nYShiftGaussMatrix + j) * nGaussMatrixDim +
                        nXShiftGaussMatrix;
                    const GPtrDiff_t nRowOffset =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize;

                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const GPtrDiff_t nSrcIdx =
                            iX - nChunkXOff + nRowOffset;
                        const double val = padfSrcScanline[nSrcIdx];
                        // A non-zero mask byte marks a valid sample.
                        if (pabySrcScanlineNodataMask == nullptr ||
                            pabySrcScanlineNodataMask[nSrcIdx])
                        {
                            const int nWeight =
                                panGaussMatrix[nWeightRowStart + i];
                            dfTotal += val * nWeight;
                            nCount += nWeight;
                        }
                    }
                }

                if (nCount == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
                }
            }
            else
            {
                GInt64 nTotalR = 0;
                GInt64 nTotalG = 0;
                GInt64 nTotalB = 0;
                GInt64 nTotalWeight = 0;

                for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2; ++iY, ++j)
                {
                    const int nWeightRowStart =
                        (nYShiftGaussMatrix + j) * nGaussMatrixDim +
                        nXShiftGaussMatrix;
                    const GPtrDiff_t nRowOffset =
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize;

                    for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
                    {
                        const double val =
                            padfSrcScanline[iX - nChunkXOff + nRowOffset];
                        // Written as a negated "in range" test so that NaN is
                        // rejected too: converting NaN to size_t is undefined
                        // behaviour.
                        if (!(val >= 0 && val < colorEntries.size()))
                            continue;

                        size_t idx = static_cast<size_t>(val);
                        if (colorEntries[idx].c4)
                        {
                            const int nWeight =
                                panGaussMatrix[nWeightRowStart + i];
                            nTotalR +=
                                static_cast<GInt64>(colorEntries[idx].c1) *
                                nWeight;
                            nTotalG +=
                                static_cast<GInt64>(colorEntries[idx].c2) *
                                nWeight;
                            nTotalB +=
                                static_cast<GInt64>(colorEntries[idx].c3) *
                                nWeight;
                            nTotalWeight += nWeight;
                        }
                    }
                }

                if (nTotalWeight == 0)
                {
                    padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
                }
                else
                {
                    GDALColorEntry color;

                    // Rounded weighted mean of each component.
                    color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
                                                  nTotalWeight);
                    color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
                                                  nTotalWeight);
                    padfDstScanline[iDstPixel - nDstXOff] =
                        BestColorEntry(colorEntries, color);
                }
            }
        }
    }

#ifdef DEBUG_OUT_OF_BOUND_ACCESS
    CPLFree(panGaussMatrixDup);
#endif

    return CE_None;
}
2107
2108
/************************************************************************/
2109
/*                      GDALResampleChunk_Mode()                        */
2110
/************************************************************************/
2111
2112
/** Equality predicate used to decide whether two samples are the same value.
 *
 * - Any type: values comparing equal with operator== are the same.
 * - float / double: two NaN values are additionally considered the same.
 * - std::complex<float> / std::complex<double>: two values whose real and
 *   imaginary parts are all NaN are additionally considered the same.
 *
 * @param a first value.
 * @param b second value.
 * @return true if both values are to be treated as identical.
 */
template <class T> static inline bool IsSame(T a, T b)
{
    if (a == b)
        return true;

    if constexpr (std::is_same<T, float>::value ||
                  std::is_same<T, double>::value)
    {
        // NaN never compares equal to itself: handle it explicitly.
        return std::isnan(a) && std::isnan(b);
    }
    else if constexpr (std::is_same<T, std::complex<float>>::value ||
                       std::is_same<T, std::complex<double>>::value)
    {
        // Only a "fully NaN" complex number matches another one.
        return std::isnan(a.real()) && std::isnan(a.imag()) &&
               std::isnan(b.real()) && std::isnan(b.imag());
    }
    else
    {
        return false;
    }
}
2141
2142
template <class T>
2143
static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
2144
                                      const T *pChunk, T *const pDstBuffer)
2145
2146
0
{
2147
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
2148
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
2149
0
    const double dfSrcXDelta = args.dfSrcXDelta;
2150
0
    const double dfSrcYDelta = args.dfSrcYDelta;
2151
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
2152
0
    const int nChunkXOff = args.nChunkXOff;
2153
0
    const int nChunkXSize = args.nChunkXSize;
2154
0
    const int nChunkYOff = args.nChunkYOff;
2155
0
    const int nChunkYSize = args.nChunkYSize;
2156
0
    const int nDstXOff = args.nDstXOff;
2157
0
    const int nDstXOff2 = args.nDstXOff2;
2158
0
    const int nDstYOff = args.nDstYOff;
2159
0
    const int nDstYOff2 = args.nDstYOff2;
2160
0
    const bool bHasNoData = args.bHasNoData;
2161
0
    const GDALColorTable *poColorTable = args.poColorTable;
2162
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
2163
2164
0
    T tNoDataValue;
2165
    if constexpr (std::is_same<T, std::complex<float>>::value ||
2166
                  std::is_same<T, std::complex<double>>::value)
2167
0
    {
2168
0
        using BaseT = typename T::value_type;
2169
0
        tNoDataValue =
2170
0
            std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
2171
0
                                std::numeric_limits<BaseT>::quiet_NaN());
2172
    }
2173
0
    else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
2174
0
        tNoDataValue = 0;
2175
0
    else
2176
0
        tNoDataValue = static_cast<T>(args.dfNoDataValue);
2177
2178
0
    size_t nMaxNumPx = 0;
2179
0
    T *paVals = nullptr;
2180
0
    int *panSums = nullptr;
2181
2182
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
2183
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
2184
0
    std::vector<int> anVals(256, 0);
2185
2186
    /* ==================================================================== */
2187
    /*      Loop over destination scanlines.                                */
2188
    /* ==================================================================== */
2189
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
2190
0
    {
2191
0
        double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
2192
0
        int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
2193
#ifdef only_pixels_with_more_than_10_pct_participation
2194
        // When oversampling, don't take into account pixels that have a tiny
2195
        // participation in the resulting pixel
2196
        if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
2197
            nSrcYOff < nChunkBottomYOff)
2198
            nSrcYOff++;
2199
#endif
2200
0
        if (nSrcYOff < nChunkYOff)
2201
0
            nSrcYOff = nChunkYOff;
2202
2203
0
        double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
2204
0
        int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
2205
#ifdef only_pixels_with_more_than_10_pct_participation
2206
        // When oversampling, don't take into account pixels that have a tiny
2207
        // participation in the resulting pixel
2208
        if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
2209
            nSrcYOff2 > nChunkYOff)
2210
            nSrcYOff2--;
2211
#endif
2212
0
        if (nSrcYOff2 == nSrcYOff)
2213
0
            ++nSrcYOff2;
2214
0
        if (nSrcYOff2 > nChunkBottomYOff)
2215
0
            nSrcYOff2 = nChunkBottomYOff;
2216
2217
0
        const T *const paSrcScanline =
2218
0
            pChunk +
2219
0
            (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
2220
0
        const GByte *pabySrcScanlineNodataMask = nullptr;
2221
0
        if (pabyChunkNodataMask != nullptr)
2222
0
            pabySrcScanlineNodataMask =
2223
0
                pabyChunkNodataMask +
2224
0
                static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
2225
2226
0
        T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
2227
        /* --------------------------------------------------------------------
2228
         */
2229
        /*      Loop over destination pixels */
2230
        /* --------------------------------------------------------------------
2231
         */
2232
0
        for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
2233
0
        {
2234
0
            double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
2235
            // Apply some epsilon to avoid numerical precision issues
2236
0
            int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
2237
#ifdef only_pixels_with_more_than_10_pct_participation
2238
            // When oversampling, don't take into account pixels that have a
2239
            // tiny participation in the resulting pixel
2240
            if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
2241
                nSrcXOff < nChunkRightXOff)
2242
                nSrcXOff++;
2243
#endif
2244
0
            if (nSrcXOff < nChunkXOff)
2245
0
                nSrcXOff = nChunkXOff;
2246
2247
0
            double dfSrcXOff2 =
2248
0
                dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
2249
0
            int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
2250
#ifdef only_pixels_with_more_than_10_pct_participation
2251
            // When oversampling, don't take into account pixels that have a
2252
            // tiny participation in the resulting pixel
2253
            if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
2254
                nSrcXOff2 > nChunkXOff)
2255
                nSrcXOff2--;
2256
#endif
2257
0
            if (nSrcXOff2 == nSrcXOff)
2258
0
                nSrcXOff2++;
2259
0
            if (nSrcXOff2 > nChunkRightXOff)
2260
0
                nSrcXOff2 = nChunkRightXOff;
2261
2262
0
            bool bRegularProcessing = false;
2263
            if constexpr (!std::is_same<T, GByte>::value)
2264
0
                bRegularProcessing = true;
2265
0
            else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
2266
0
                bRegularProcessing = true;
2267
2268
0
            if (bRegularProcessing)
2269
0
            {
2270
                // Not sure how much sense it makes to run a majority
2271
                // filter on floating point data, but here it is for the sake
2272
                // of compatibility. It won't look right on RGB images by the
2273
                // nature of the filter.
2274
2275
0
                if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
2276
0
                    nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
2277
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2278
0
                            static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
2279
0
                        std::numeric_limits<size_t>::max() / sizeof(float))
2280
0
                {
2281
0
                    CPLError(CE_Failure, CPLE_NotSupported,
2282
0
                             "Too big downsampling factor");
2283
0
                    CPLFree(paVals);
2284
0
                    CPLFree(panSums);
2285
0
                    return CE_Failure;
2286
0
                }
2287
0
                const size_t nNumPx =
2288
0
                    static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
2289
0
                    static_cast<size_t>(nSrcXOff2 - nSrcXOff);
2290
0
                size_t iMaxInd = 0;
2291
0
                size_t iMaxVal = 0;
2292
0
                bool biMaxValdValid = false;
2293
2294
0
                if (paVals == nullptr || nNumPx > nMaxNumPx)
2295
0
                {
2296
0
                    T *paValsNew = static_cast<T *>(
2297
0
                        VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
2298
0
                    int *panSumsNew = static_cast<int *>(
2299
0
                        VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
2300
0
                    if (paValsNew != nullptr)
2301
0
                        paVals = paValsNew;
2302
0
                    if (panSumsNew != nullptr)
2303
0
                        panSums = panSumsNew;
2304
0
                    if (paValsNew == nullptr || panSumsNew == nullptr)
2305
0
                    {
2306
0
                        CPLFree(paVals);
2307
0
                        CPLFree(panSums);
2308
0
                        return CE_Failure;
2309
0
                    }
2310
0
                    nMaxNumPx = nNumPx;
2311
0
                }
2312
2313
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2314
0
                {
2315
0
                    const GPtrDiff_t iTotYOff =
2316
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2317
0
                        nChunkXOff;
2318
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2319
0
                    {
2320
0
                        if (pabySrcScanlineNodataMask == nullptr ||
2321
0
                            pabySrcScanlineNodataMask[iX + iTotYOff])
2322
0
                        {
2323
0
                            const T val = paSrcScanline[iX + iTotYOff];
2324
0
                            size_t i = 0;  // Used after for.
2325
2326
                            // Check array for existing entry.
2327
0
                            for (; i < iMaxInd; ++i)
2328
0
                                if (IsSame(paVals[i], val) &&
2329
0
                                    ++panSums[i] > panSums[iMaxVal])
2330
0
                                {
2331
0
                                    iMaxVal = i;
2332
0
                                    biMaxValdValid = true;
2333
0
                                    break;
2334
0
                                }
2335
2336
                            // Add to arr if entry not already there.
2337
0
                            if (i == iMaxInd)
2338
0
                            {
2339
0
                                paVals[iMaxInd] = val;
2340
0
                                panSums[iMaxInd] = 1;
2341
2342
0
                                if (!biMaxValdValid)
2343
0
                                {
2344
0
                                    iMaxVal = iMaxInd;
2345
0
                                    biMaxValdValid = true;
2346
0
                                }
2347
2348
0
                                ++iMaxInd;
2349
0
                            }
2350
0
                        }
2351
0
                    }
2352
0
                }
2353
2354
0
                if (!biMaxValdValid)
2355
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2356
0
                else
2357
0
                    paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
2358
0
            }
2359
            else if constexpr (std::is_same<T, GByte>::value)
2360
            // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
2361
0
            {
2362
                // So we go here for a paletted or non-paletted byte band.
2363
                // The input values are then between 0 and 255.
2364
0
                int nMaxVal = 0;
2365
0
                int iMaxInd = -1;
2366
2367
                // The cost of this zeroing might be high. Perhaps we should
2368
                // just use the above generic case, and go to this one if the
2369
                // number of source pixels is large enough
2370
0
                std::fill(anVals.begin(), anVals.end(), 0);
2371
2372
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
2373
0
                {
2374
0
                    const GPtrDiff_t iTotYOff =
2375
0
                        static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
2376
0
                        nChunkXOff;
2377
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
2378
0
                    {
2379
0
                        const T val = paSrcScanline[iX + iTotYOff];
2380
0
                        if (!bHasNoData || val != tNoDataValue)
2381
0
                        {
2382
0
                            int nVal = static_cast<int>(val);
2383
0
                            if (++anVals[nVal] > nMaxVal)
2384
0
                            {
2385
                                // Sum the density.
2386
                                // Is it the most common value so far?
2387
0
                                iMaxInd = nVal;
2388
0
                                nMaxVal = anVals[nVal];
2389
0
                            }
2390
0
                        }
2391
0
                    }
2392
0
                }
2393
2394
0
                if (iMaxInd == -1)
2395
0
                    paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
2396
0
                else
2397
0
                    paDstScanline[iDstPixel - nDstXOff] =
2398
0
                        static_cast<T>(iMaxInd);
2399
0
            }
2400
0
        }
2401
0
    }
2402
2403
0
    CPLFree(paVals);
2404
0
    CPLFree(panSums);
2405
2406
0
    return CE_None;
2407
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned char>(GDALOverviewResampleArgs const&, unsigned char const*, unsigned char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<signed char>(GDALOverviewResampleArgs const&, signed char const*, signed char*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned short>(GDALOverviewResampleArgs const&, unsigned short const*, unsigned short*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned int>(GDALOverviewResampleArgs const&, unsigned int const*, unsigned int*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<float>(GDALOverviewResampleArgs const&, float const*, float*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<unsigned long>(GDALOverviewResampleArgs const&, unsigned long const*, unsigned long*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<double>(GDALOverviewResampleArgs const&, double const*, double*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<float> >(GDALOverviewResampleArgs const&, std::__1::complex<float> const*, std::__1::complex<float>*)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ModeT<std::__1::complex<double> >(GDALOverviewResampleArgs const&, std::__1::complex<double> const*, std::__1::complex<double>*)
2408
2409
/** Type-dispatching entry point for mode resampling of a chunk.
 *
 * Allocates the destination buffer (destination window size times the size
 * of args.eWrkDataType), stores it in *ppDstBuffer, reports
 * args.eWrkDataType in *peDstBufferDataType and forwards to the
 * GDALResampleChunk_ModeT() instantiation matching the working data type.
 *
 * @param args                resampling parameters.
 * @param pChunk              source chunk, of type args.eWrkDataType.
 * @param[out] ppDstBuffer    receives the allocated destination buffer
 *                            (nullptr if the allocation failed).
 * @param[out] peDstBufferDataType receives the data type of the destination
 *                            buffer. Left untouched if the allocation failed.
 * @return CE_None on success, CE_Failure otherwise.
 */
static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
                                     const void *pChunk, void **ppDstBuffer,
                                     GDALDataType *peDstBufferDataType)
{
    void *const pDstRaw = VSI_MALLOC3_VERBOSE(
        args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
        GDALGetDataTypeSizeBytes(args.eWrkDataType));
    *ppDstBuffer = pDstRaw;
    if (pDstRaw == nullptr)
        return CE_Failure;

    CPLAssert(args.eSrcDataType == args.eWrkDataType);

    *peDstBufferDataType = args.eWrkDataType;

    // Runs the templated implementation for the pixel type designated by the
    // type of the (already cast) destination pointer.
    const auto RunTyped = [&args, pChunk](auto *pTypedDst) -> CPLErr
    {
        using PixelT = std::remove_pointer_t<decltype(pTypedDst)>;
        return GDALResampleChunk_ModeT(
            args, static_cast<const PixelT *>(pChunk), pTypedDst);
    };

    // For mode resampling, as no computation is done, only the
    // size of the data type matters... except for Byte where we have
    // special processing. And for floating point values
    switch (args.eWrkDataType)
    {
        case GDT_Byte:
            return RunTyped(static_cast<GByte *>(pDstRaw));

        case GDT_Int8:
            return RunTyped(static_cast<int8_t *>(pDstRaw));

        // 2-byte types are processed as opaque 16-bit words.
        case GDT_Int16:
        case GDT_UInt16:
        case GDT_Float16:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
            return RunTyped(static_cast<uint16_t *>(pDstRaw));

        // 4-byte non-Float32 types are processed as opaque 32-bit words.
        case GDT_CInt16:
        case GDT_CFloat16:
        case GDT_Int32:
        case GDT_UInt32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return RunTyped(static_cast<uint32_t *>(pDstRaw));

        case GDT_Float32:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
            return RunTyped(static_cast<float *>(pDstRaw));

        // 8-byte non-Float64 types are processed as opaque 64-bit words.
        case GDT_CInt32:
        case GDT_Int64:
        case GDT_UInt64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return RunTyped(static_cast<uint64_t *>(pDstRaw));

        case GDT_Float64:
            CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
            return RunTyped(static_cast<double *>(pDstRaw));

        case GDT_CFloat32:
            return RunTyped(static_cast<std::complex<float> *>(pDstRaw));

        case GDT_CFloat64:
            return RunTyped(static_cast<std::complex<double> *>(pDstRaw));

        case GDT_Unknown:
        case GDT_TypeCount:
            break;
    }

    // Not a valid working data type.
    CPLAssert(false);
    return CE_Failure;
}
2512
2513
/************************************************************************/
2514
/*                  GDALResampleConvolutionHorizontal()                 */
2515
/************************************************************************/
2516
2517
/** Weighted sum of nSrcPixelCount consecutive samples of a row.
 *
 * @param pChunk         first sample of the row.
 * @param padfWeights    one weight per sample.
 * @param nSrcPixelCount number of samples / weights.
 * @return sum over i of pChunk[i] * padfWeights[i], accumulated in two
 *         partial sums that are added at the end.
 */
template <class T>
static inline double
GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                  int nSrcPixelCount)
{
    double dfSumLow = 0.0;   // Samples 0 and 1 of each group of 4 + leftovers
    double dfSumHigh = 0.0;  // Samples 2 and 3 of each group of 4
    int iPixel = 0;          // Shared by both loops.
    // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    // manually (untypical) unrolled loop in -O2 and -O3:
    // https://github.com/OSGeo/gdal/issues/9508
#if !defined(__INTEL_CLANG_COMPILER)
    while (iPixel + 3 < nSrcPixelCount)
    {
        const T *const pSrc = pChunk + iPixel;
        const double *const pdfW = padfWeights + iPixel;
        dfSumLow += pSrc[0] * pdfW[0];
        dfSumLow += pSrc[1] * pdfW[1];
        dfSumHigh += pSrc[2] * pdfW[2];
        dfSumHigh += pSrc[3] * pdfW[3];
        iPixel += 4;
    }
#endif
    // Remaining samples (all of them when the unrolled loop is disabled).
    while (iPixel < nSrcPixelCount)
    {
        dfSumLow += pChunk[iPixel] * padfWeights[iPixel];
        ++iPixel;
    }
    return dfSumLow + dfSumHigh;
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<float>(float const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontal<double>(double const*, double const*, int)
2543
2544
template <class T>
2545
static inline void GDALResampleConvolutionHorizontalWithMask(
2546
    const T *pChunk, const GByte *pabyMask, const double *padfWeights,
2547
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2548
0
{
2549
0
    dfVal = 0;
2550
0
    dfWeightSum = 0;
2551
0
    int i = 0;
2552
0
    for (; i + 3 < nSrcPixelCount; i += 4)
2553
0
    {
2554
0
        const double dfWeight0 = padfWeights[i] * pabyMask[i];
2555
0
        const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
2556
0
        const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
2557
0
        const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
2558
0
        dfVal += pChunk[i] * dfWeight0;
2559
0
        dfVal += pChunk[i + 1] * dfWeight1;
2560
0
        dfVal += pChunk[i + 2] * dfWeight2;
2561
0
        dfVal += pChunk[i + 3] * dfWeight3;
2562
0
        dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
2563
0
    }
2564
0
    for (; i < nSrcPixelCount; ++i)
2565
0
    {
2566
0
        const double dfWeight = padfWeights[i] * pabyMask[i];
2567
0
        dfVal += pChunk[i] * dfWeight;
2568
0
        dfWeightSum += dfWeight;
2569
0
    }
2570
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<float>(float const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMask<double>(double const*, unsigned char const*, double const*, int, double&, double&)
2571
2572
/** Weighted sum of nSrcPixelCount consecutive samples, for 3 rows at once.
 *
 * The same weights are applied to the three rows.
 *
 * @param pChunkRow1     first sample of the first row.
 * @param pChunkRow2     first sample of the second row.
 * @param pChunkRow3     first sample of the third row.
 * @param padfWeights    one weight per sample.
 * @param nSrcPixelCount number of samples per row.
 * @param[out] dfRes1    weighted sum for the first row.
 * @param[out] dfRes2    weighted sum for the second row.
 * @param[out] dfRes3    weighted sum for the third row.
 */
template <class T>
static inline void GDALResampleConvolutionHorizontal_3rows(
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    double &dfRes2, double &dfRes3)
{
    // For each row: "Low" accumulates samples 0 and 1 of each group of 4
    // (plus the leftovers), "High" accumulates samples 2 and 3.
    double dfRow1Low = 0.0;
    double dfRow1High = 0.0;
    double dfRow2Low = 0.0;
    double dfRow2High = 0.0;
    double dfRow3Low = 0.0;
    double dfRow3High = 0.0;

    int iPixel = 0;  // Shared by both loops.
    while (iPixel + 3 < nSrcPixelCount)
    {
        const double dfW0 = padfWeights[iPixel];
        const double dfW1 = padfWeights[iPixel + 1];
        const double dfW2 = padfWeights[iPixel + 2];
        const double dfW3 = padfWeights[iPixel + 3];

        dfRow1Low += pChunkRow1[iPixel] * dfW0;
        dfRow1Low += pChunkRow1[iPixel + 1] * dfW1;
        dfRow1High += pChunkRow1[iPixel + 2] * dfW2;
        dfRow1High += pChunkRow1[iPixel + 3] * dfW3;

        dfRow2Low += pChunkRow2[iPixel] * dfW0;
        dfRow2Low += pChunkRow2[iPixel + 1] * dfW1;
        dfRow2High += pChunkRow2[iPixel + 2] * dfW2;
        dfRow2High += pChunkRow2[iPixel + 3] * dfW3;

        dfRow3Low += pChunkRow3[iPixel] * dfW0;
        dfRow3Low += pChunkRow3[iPixel + 1] * dfW1;
        dfRow3High += pChunkRow3[iPixel + 2] * dfW2;
        dfRow3High += pChunkRow3[iPixel + 3] * dfW3;

        iPixel += 4;
    }
    // Leftover samples.
    while (iPixel < nSrcPixelCount)
    {
        const double dfW = padfWeights[iPixel];
        dfRow1Low += pChunkRow1[iPixel] * dfW;
        dfRow2Low += pChunkRow2[iPixel] * dfW;
        dfRow3Low += pChunkRow3[iPixel] * dfW;
        ++iPixel;
    }

    dfRes1 = dfRow1Low + dfRow1High;
    dfRes2 = dfRow2Low + dfRow2High;
    dfRes3 = dfRow3Low + dfRow3High;
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2610
2611
template <class T>
2612
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
2613
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2614
    const double *padfWeights, int nSrcPixelCount, double &dfRes1,
2615
    double &dfRes2, double &dfRes3)
2616
0
{
2617
0
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2618
0
                                            padfWeights, nSrcPixelCount, dfRes1,
2619
0
                                            dfRes2, dfRes3);
2620
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<float>(float const*, float const*, float const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<double>(double const*, double const*, double const*, double const*, int, double&, double&, double&)
2621
2622
template <class T>
2623
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
2624
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2625
    const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
2626
0
{
2627
0
    GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
2628
0
                                            padfWeights, 4, dfRes1, dfRes2,
2629
0
                                            dfRes3);
2630
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<float>(float const*, float const*, float const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows<double>(double const*, double const*, double const*, double const*, double&, double&, double&)
2631
2632
/************************************************************************/
2633
/*                  GDALResampleConvolutionVertical()                   */
2634
/************************************************************************/
2635
2636
/** Weighted sum of nSrcLineCount samples of a column.
 *
 * @param pChunk        first sample of the column.
 * @param nStride       number of elements between two consecutive lines.
 * @param padfWeights   one weight per line.
 * @param nSrcLineCount number of lines / weights.
 * @return sum over i of pChunk[i * nStride] * padfWeights[i], accumulated in
 *         two partial sums that are added at the end.
 */
template <class T>
static inline double
GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                const double *padfWeights, int nSrcLineCount)
{
    double dfSumLow = 0.0;   // Lines 0 and 1 of each group of 4 + leftovers
    double dfSumHigh = 0.0;  // Lines 2 and 3 of each group of 4
    int iLine = 0;           // Index in padfWeights
    int nOffset = 0;         // Matching index in pChunk (iLine * nStride)

    while (iLine + 3 < nSrcLineCount)
    {
        dfSumLow += pChunk[nOffset] * padfWeights[iLine];
        dfSumLow += pChunk[nOffset + nStride] * padfWeights[iLine + 1];
        dfSumHigh += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
        dfSumHigh += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];
        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Leftover lines.
    while (iLine < nSrcLineCount)
    {
        dfSumLow += pChunk[nOffset] * padfWeights[iLine];
        ++iLine;
        nOffset += nStride;
    }
    return dfSumLow + dfSumHigh;
}
2658
2659
/** Weighted sum of nSrcLineCount samples, for 2 adjacent columns at once.
 *
 * @param pChunk        first sample of the first column. The second column
 *                      starts at pChunk + 1.
 * @param nStride       number of elements between two consecutive lines.
 * @param padfWeights   one weight per line, shared by both columns.
 * @param nSrcLineCount number of lines / weights.
 * @param[out] dfRes1   weighted sum for the first column.
 * @param[out] dfRes2   weighted sum for the second column.
 */
template <class T>
static inline void GDALResampleConvolutionVertical_2cols(
    const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    double &dfRes1, double &dfRes2)
{
    // For each column: "Low" accumulates lines 0 and 1 of each group of 4
    // (plus the leftovers), "High" accumulates lines 2 and 3.
    double dfCol0Low = 0.0;
    double dfCol0High = 0.0;
    double dfCol1Low = 0.0;
    double dfCol1High = 0.0;
    int iLine = 0;    // Index in padfWeights
    int nOffset = 0;  // Matching index in pChunk (iLine * nStride)

    while (iLine + 3 < nSrcLineCount)
    {
        const double dfW0 = padfWeights[iLine];
        const double dfW1 = padfWeights[iLine + 1];
        const double dfW2 = padfWeights[iLine + 2];
        const double dfW3 = padfWeights[iLine + 3];

        dfCol0Low += pChunk[nOffset] * dfW0;
        dfCol1Low += pChunk[nOffset + 1] * dfW0;
        dfCol0Low += pChunk[nOffset + nStride] * dfW1;
        dfCol1Low += pChunk[nOffset + 1 + nStride] * dfW1;
        dfCol0High += pChunk[nOffset + 2 * nStride] * dfW2;
        dfCol1High += pChunk[nOffset + 1 + 2 * nStride] * dfW2;
        dfCol0High += pChunk[nOffset + 3 * nStride] * dfW3;
        dfCol1High += pChunk[nOffset + 1 + 3 * nStride] * dfW3;

        iLine += 4;
        nOffset += 4 * nStride;
    }
    // Leftover lines.
    while (iLine < nSrcLineCount)
    {
        const double dfW = padfWeights[iLine];
        dfCol0Low += pChunk[nOffset] * dfW;
        dfCol1Low += pChunk[nOffset + 1] * dfW;
        ++iLine;
        nOffset += nStride;
    }

    dfRes1 = dfCol0Low + dfCol0High;
    dfRes2 = dfCol1Low + dfCol1High;
}
2689
2690
#ifdef USE_SSE2
2691
2692
#ifdef __AVX__
2693
/************************************************************************/
2694
/*             GDALResampleConvolutionVertical_16cols<T>                */
2695
/************************************************************************/
2696
2697
template <class T>
2698
static inline void
2699
GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
2700
                                       const double *padfWeights,
2701
                                       int nSrcLineCount, float *afDest)
2702
{
2703
    int i = 0;
2704
    int j = 0;
2705
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2706
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2707
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2708
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
2709
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2710
    {
2711
        XMMReg4Double w0 =
2712
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2713
        XMMReg4Double w1 =
2714
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2715
        XMMReg4Double w2 =
2716
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2717
        XMMReg4Double w3 =
2718
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2719
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2720
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2721
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
2722
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
2723
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2724
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2725
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
2726
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
2727
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2728
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2729
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
2730
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
2731
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2732
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2733
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
2734
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
2735
    }
2736
    for (; i < nSrcLineCount; ++i, j += nStride)
2737
    {
2738
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2739
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2740
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2741
        v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
2742
        v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
2743
    }
2744
    v_acc0.Store4Val(afDest);
2745
    v_acc1.Store4Val(afDest + 4);
2746
    v_acc2.Store4Val(afDest + 8);
2747
    v_acc3.Store4Val(afDest + 12);
2748
}
2749
2750
template <class T>
2751
static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
2752
                                                          const double *, int,
2753
                                                          double *)
2754
{
2755
    // Cannot be reached
2756
    CPLAssert(false);
2757
}
2758
2759
#else
2760
2761
/************************************************************************/
2762
/*              GDALResampleConvolutionVertical_8cols<T>                */
2763
/************************************************************************/
2764
2765
template <class T>
2766
static inline void
2767
GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
2768
                                      const double *padfWeights,
2769
                                      int nSrcLineCount, float *afDest)
2770
0
{
2771
0
    int i = 0;
2772
0
    int j = 0;
2773
0
    XMMReg4Double v_acc0 = XMMReg4Double::Zero();
2774
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2775
0
    for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
2776
0
    {
2777
0
        XMMReg4Double w0 =
2778
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
2779
0
        XMMReg4Double w1 =
2780
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
2781
0
        XMMReg4Double w2 =
2782
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
2783
0
        XMMReg4Double w3 =
2784
0
            XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
2785
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
2786
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
2787
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
2788
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
2789
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
2790
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
2791
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
2792
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
2793
0
    }
2794
0
    for (; i < nSrcLineCount; ++i, j += nStride)
2795
0
    {
2796
0
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
2797
0
        v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
2798
0
        v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
2799
0
    }
2800
0
    v_acc0.Store4Val(afDest);
2801
0
    v_acc1.Store4Val(afDest + 4);
2802
0
}
2803
2804
template <class T>
2805
static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
2806
                                                         const double *, int,
2807
                                                         double *)
2808
{
2809
    // Cannot be reached
2810
    CPLAssert(false);
2811
}
2812
2813
#endif  // __AVX__
2814
2815
/************************************************************************/
2816
/*              GDALResampleConvolutionHorizontalSSE2<T>                */
2817
/************************************************************************/
2818
2819
template <class T>
2820
static inline double GDALResampleConvolutionHorizontalSSE2(
2821
    const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2822
0
{
2823
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
2824
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
2825
0
    int i = 0;  // Used after for.
2826
0
    for (; i + 7 < nSrcPixelCount; i += 8)
2827
0
    {
2828
        // Retrieve the pixel & accumulate
2829
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
2830
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
2831
0
        const XMMReg4Double v_weight1 =
2832
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2833
0
        const XMMReg4Double v_weight2 =
2834
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2835
2836
0
        v_acc1 += v_pixels1 * v_weight1;
2837
0
        v_acc2 += v_pixels2 * v_weight2;
2838
0
    }
2839
2840
0
    v_acc1 += v_acc2;
2841
2842
0
    double dfVal = v_acc1.GetHorizSum();
2843
0
    for (; i < nSrcPixelCount; ++i)
2844
0
    {
2845
0
        dfVal += pChunk[i] * padfWeightsAligned[i];
2846
0
    }
2847
0
    return dfVal;
2848
0
}
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned char>(unsigned char const*, double const*, int)
Unexecuted instantiation: overview.cpp:double GDALResampleConvolutionHorizontalSSE2<unsigned short>(unsigned short const*, double const*, int)
2849
2850
/************************************************************************/
2851
/*              GDALResampleConvolutionHorizontal<GByte>                */
2852
/************************************************************************/
2853
2854
template <>
2855
inline double GDALResampleConvolutionHorizontal<GByte>(
2856
    const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2857
0
{
2858
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2859
0
                                                 nSrcPixelCount);
2860
0
}
2861
2862
template <>
2863
inline double GDALResampleConvolutionHorizontal<GUInt16>(
2864
    const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
2865
0
{
2866
0
    return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
2867
0
                                                 nSrcPixelCount);
2868
0
}
2869
2870
/************************************************************************/
2871
/*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
2872
/************************************************************************/
2873
2874
template <class T>
2875
static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
2876
    const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
2877
    int nSrcPixelCount, double &dfVal, double &dfWeightSum)
2878
0
{
2879
0
    int i = 0;  // Used after for.
2880
0
    XMMReg4Double v_acc = XMMReg4Double::Zero();
2881
0
    XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
2882
0
    for (; i + 3 < nSrcPixelCount; i += 4)
2883
0
    {
2884
0
        const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
2885
0
        const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
2886
0
        XMMReg4Double v_weight =
2887
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2888
0
        v_weight *= v_mask;
2889
0
        v_acc += v_pixels * v_weight;
2890
0
        v_acc_weight += v_weight;
2891
0
    }
2892
2893
0
    dfVal = v_acc.GetHorizSum();
2894
0
    dfWeightSum = v_acc_weight.GetHorizSum();
2895
0
    for (; i < nSrcPixelCount; ++i)
2896
0
    {
2897
0
        const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
2898
0
        dfVal += pChunk[i] * dfWeight;
2899
0
        dfWeightSum += dfWeight;
2900
0
    }
2901
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned char>(unsigned char const*, unsigned char const*, double const*, int, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalWithMaskSSE2<unsigned short>(unsigned short const*, unsigned char const*, double const*, int, double&, double&)
2902
2903
/************************************************************************/
2904
/*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
2905
/************************************************************************/
2906
2907
template <>
2908
inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
2909
    const GByte *pChunk, const GByte *pabyMask,
2910
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2911
    double &dfWeightSum)
2912
0
{
2913
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2914
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2915
0
        dfWeightSum);
2916
0
}
2917
2918
template <>
2919
inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
2920
    const GUInt16 *pChunk, const GByte *pabyMask,
2921
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
2922
    double &dfWeightSum)
2923
0
{
2924
0
    GDALResampleConvolutionHorizontalWithMaskSSE2(
2925
0
        pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
2926
0
        dfWeightSum);
2927
0
}
2928
2929
/************************************************************************/
2930
/*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
2931
/************************************************************************/
2932
2933
template <class T>
2934
static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
2935
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
2936
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2937
    double &dfRes2, double &dfRes3)
2938
0
{
2939
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
2940
0
                  v_acc2 = XMMReg4Double::Zero(),
2941
0
                  v_acc3 = XMMReg4Double::Zero();
2942
0
    int i = 0;
2943
0
    for (; i + 7 < nSrcPixelCount; i += 8)
2944
0
    {
2945
        // Retrieve the pixel & accumulate.
2946
0
        XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
2947
0
        XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
2948
0
        const XMMReg4Double v_weight1 =
2949
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
2950
0
        const XMMReg4Double v_weight2 =
2951
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
2952
2953
0
        v_acc1 += v_pixels1 * v_weight1;
2954
0
        v_acc1 += v_pixels2 * v_weight2;
2955
2956
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
2957
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
2958
0
        v_acc2 += v_pixels1 * v_weight1;
2959
0
        v_acc2 += v_pixels2 * v_weight2;
2960
2961
0
        v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
2962
0
        v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
2963
0
        v_acc3 += v_pixels1 * v_weight1;
2964
0
        v_acc3 += v_pixels2 * v_weight2;
2965
0
    }
2966
2967
0
    dfRes1 = v_acc1.GetHorizSum();
2968
0
    dfRes2 = v_acc2.GetHorizSum();
2969
0
    dfRes3 = v_acc3.GetHorizSum();
2970
0
    for (; i < nSrcPixelCount; ++i)
2971
0
    {
2972
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
2973
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
2974
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
2975
0
    }
2976
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontal_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
2977
2978
/************************************************************************/
2979
/*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
2980
/************************************************************************/
2981
2982
template <>
2983
inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
2984
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
2985
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
2986
    double &dfRes2, double &dfRes3)
2987
0
{
2988
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
2989
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
2990
0
        dfRes1, dfRes2, dfRes3);
2991
0
}
2992
2993
template <>
2994
inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
2995
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
2996
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
2997
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
2998
0
{
2999
0
    GDALResampleConvolutionHorizontal_3rows_SSE2(
3000
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3001
0
        dfRes1, dfRes2, dfRes3);
3002
0
}
3003
3004
/************************************************************************/
3005
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
3006
/************************************************************************/
3007
3008
template <class T>
3009
static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3010
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3011
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3012
    double &dfRes2, double &dfRes3)
3013
0
{
3014
0
    XMMReg4Double v_acc1 = XMMReg4Double::Zero();
3015
0
    XMMReg4Double v_acc2 = XMMReg4Double::Zero();
3016
0
    XMMReg4Double v_acc3 = XMMReg4Double::Zero();
3017
0
    int i = 0;  // Use after for.
3018
0
    for (; i + 3 < nSrcPixelCount; i += 4)
3019
0
    {
3020
        // Retrieve the pixel & accumulate.
3021
0
        const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
3022
0
        const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
3023
0
        const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
3024
0
        const XMMReg4Double v_weight =
3025
0
            XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
3026
3027
0
        v_acc1 += v_pixels1 * v_weight;
3028
0
        v_acc2 += v_pixels2 * v_weight;
3029
0
        v_acc3 += v_pixels3 * v_weight;
3030
0
    }
3031
3032
0
    dfRes1 = v_acc1.GetHorizSum();
3033
0
    dfRes2 = v_acc2.GetHorizSum();
3034
0
    dfRes3 = v_acc3.GetHorizSum();
3035
3036
0
    for (; i < nSrcPixelCount; ++i)
3037
0
    {
3038
0
        dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
3039
0
        dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
3040
0
        dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
3041
0
    }
3042
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, int, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, int, double&, double&, double&)
3043
3044
/************************************************************************/
3045
/*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
3046
/************************************************************************/
3047
3048
template <>
3049
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
3050
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3051
    const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
3052
    double &dfRes2, double &dfRes3)
3053
0
{
3054
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3055
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3056
0
        dfRes1, dfRes2, dfRes3);
3057
0
}
3058
3059
template <>
3060
inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
3061
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3062
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
3063
    int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
3064
0
{
3065
0
    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
3066
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
3067
0
        dfRes1, dfRes2, dfRes3);
3068
0
}
3069
3070
/************************************************************************/
3071
/*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
3072
/************************************************************************/
3073
3074
template <class T>
3075
static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3076
    const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
3077
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3078
    double &dfRes3)
3079
0
{
3080
0
    const XMMReg4Double v_weight =
3081
0
        XMMReg4Double::Load4ValAligned(padfWeightsAligned);
3082
3083
    // Retrieve the pixel & accumulate.
3084
0
    const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
3085
0
    const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
3086
0
    const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
3087
3088
0
    XMMReg4Double v_acc1 = v_pixels1 * v_weight;
3089
0
    XMMReg4Double v_acc2 = v_pixels2 * v_weight;
3090
0
    XMMReg4Double v_acc3 = v_pixels3 * v_weight;
3091
3092
0
    dfRes1 = v_acc1.GetHorizSum();
3093
0
    dfRes2 = v_acc2.GetHorizSum();
3094
0
    dfRes3 = v_acc3.GetHorizSum();
3095
0
}
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned char>(unsigned char const*, unsigned char const*, unsigned char const*, double const*, double&, double&, double&)
Unexecuted instantiation: overview.cpp:void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<unsigned short>(unsigned short const*, unsigned short const*, unsigned short const*, double const*, double&, double&, double&)
3096
3097
/************************************************************************/
3098
/*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
3099
/************************************************************************/
3100
3101
template <>
3102
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
3103
    const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
3104
    const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
3105
    double &dfRes3)
3106
0
{
3107
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3108
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3109
0
        dfRes3);
3110
0
}
3111
3112
template <>
3113
inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
3114
    const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
3115
    const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
3116
    double &dfRes2, double &dfRes3)
3117
0
{
3118
0
    GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
3119
0
        pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
3120
0
        dfRes3);
3121
0
}
3122
3123
#endif  // USE_SSE2
3124
3125
/************************************************************************/
3126
/*                    GDALResampleChunk_Convolution()                   */
3127
/************************************************************************/
3128
3129
template <class T, class Twork, GDALDataType eWrkDataType>
3130
static CPLErr GDALResampleChunk_ConvolutionT(
3131
    const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
3132
    FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
3133
    int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
3134
3135
0
{
3136
0
    const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
3137
0
    const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
3138
0
    const double dfSrcXDelta = args.dfSrcXDelta;
3139
0
    const double dfSrcYDelta = args.dfSrcYDelta;
3140
0
    constexpr int nBands = 1;
3141
0
    const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
3142
0
    const int nChunkXOff = args.nChunkXOff;
3143
0
    const int nChunkXSize = args.nChunkXSize;
3144
0
    const int nChunkYOff = args.nChunkYOff;
3145
0
    const int nChunkYSize = args.nChunkYSize;
3146
0
    const int nDstXOff = args.nDstXOff;
3147
0
    const int nDstXOff2 = args.nDstXOff2;
3148
0
    const int nDstYOff = args.nDstYOff;
3149
0
    const int nDstYOff2 = args.nDstYOff2;
3150
0
    const bool bHasNoData = args.bHasNoData;
3151
0
    double dfNoDataValue = args.dfNoDataValue;
3152
3153
0
    if (!bHasNoData)
3154
0
        dfNoDataValue = 0.0;
3155
0
    const auto dstDataType = args.eOvrDataType;
3156
0
    const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
3157
0
    const double dfReplacementVal =
3158
0
        bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
3159
0
                   : dfNoDataValue;
3160
    // cppcheck-suppress unreadVariable
3161
0
    const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
3162
0
    const bool bNoDataValueInt64Valid =
3163
0
        isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
3164
0
    const auto nNodataValueInt64 =
3165
0
        bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
3166
0
    constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
3167
3168
    // TODO: we should have some generic function to do this.
3169
0
    Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
3170
0
    Twork fDstMax = cpl::NumericLimits<Twork>::max();
3171
0
    if (dstDataType == GDT_Byte)
3172
0
    {
3173
0
        fDstMin = std::numeric_limits<GByte>::min();
3174
0
        fDstMax = std::numeric_limits<GByte>::max();
3175
0
    }
3176
0
    else if (dstDataType == GDT_Int8)
3177
0
    {
3178
0
        fDstMin = std::numeric_limits<GInt8>::min();
3179
0
        fDstMax = std::numeric_limits<GInt8>::max();
3180
0
    }
3181
0
    else if (dstDataType == GDT_UInt16)
3182
0
    {
3183
0
        fDstMin = std::numeric_limits<GUInt16>::min();
3184
0
        fDstMax = std::numeric_limits<GUInt16>::max();
3185
0
    }
3186
0
    else if (dstDataType == GDT_Int16)
3187
0
    {
3188
0
        fDstMin = std::numeric_limits<GInt16>::min();
3189
0
        fDstMax = std::numeric_limits<GInt16>::max();
3190
0
    }
3191
0
    else if (dstDataType == GDT_UInt32)
3192
0
    {
3193
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
3194
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
3195
0
    }
3196
0
    else if (dstDataType == GDT_Int32)
3197
0
    {
3198
        // cppcheck-suppress unreadVariable
3199
0
        fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
3200
        // cppcheck-suppress unreadVariable
3201
0
        fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
3202
0
    }
3203
0
    else if (dstDataType == GDT_UInt64)
3204
0
    {
3205
        // cppcheck-suppress unreadVariable
3206
0
        fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
3207
        // cppcheck-suppress unreadVariable
3208
        // (1 << 64) - 2048: largest uint64 value a double can hold
3209
0
        fDstMax = static_cast<Twork>(18446744073709549568ULL);
3210
0
    }
3211
0
    else if (dstDataType == GDT_Int64)
3212
0
    {
3213
        // cppcheck-suppress unreadVariable
3214
0
        fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
3215
        // cppcheck-suppress unreadVariable
3216
        // (1 << 63) - 1024: largest int64 that a double can hold
3217
0
        fDstMax = static_cast<Twork>(9223372036854774784LL);
3218
0
    }
3219
3220
0
    auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
3221
0
                               bNoDataValueInt64Valid, nNodataValueInt64,
3222
0
                               dfNoDataValue, dfReplacementVal](Twork fVal)
3223
0
    {
3224
0
        if (!bHasNoData)
3225
0
            return fVal;
3226
3227
        // Clamp value before comparing to nodata: this is only needed for
3228
        // kernels with negative weights (Lanczos)
3229
0
        Twork fClamped = fVal;
3230
0
        if (fClamped < fDstMin)
3231
0
            fClamped = fDstMin;
3232
0
        else if (fClamped > fDstMax)
3233
0
            fClamped = fDstMax;
3234
0
        if (isIntegerDT)
3235
0
        {
3236
0
            if (bNoDataValueInt64Valid)
3237
0
            {
3238
0
                const double fClampedRounded = std::round(fClamped);
3239
0
                if (fClampedRounded >=
3240
0
                        static_cast<Twork>(
3241
0
                            std::numeric_limits<int64_t>::min()) &&
3242
0
                    fClampedRounded <=
3243
0
                        static_cast<Twork>(9223372036854774784LL) &&
3244
0
                    nNodataValueInt64 ==
3245
0
                        static_cast<GInt64>(std::round(fClamped)))
3246
0
                {
3247
                    // Do not use the nodata value
3248
0
                    return static_cast<Twork>(dfReplacementVal);
3249
0
                }
3250
0
            }
3251
0
        }
3252
0
        else if (dfNoDataValue == fClamped)
3253
0
        {
3254
            // Do not use the nodata value
3255
0
            return static_cast<Twork>(dfReplacementVal);
3256
0
        }
3257
0
        return fClamped;
3258
0
    };
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(float)#1}::operator()(float) const
Unexecuted instantiation: overview.cpp:GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)::{lambda(double)#1}::operator()(double) const
3259
3260
    /* -------------------------------------------------------------------- */
3261
    /*      Allocate work buffers.                                          */
3262
    /* -------------------------------------------------------------------- */
3263
0
    const int nDstXSize = nDstXOff2 - nDstXOff;
3264
0
    Twork *pafWrkScanline = nullptr;
3265
0
    if (dstDataType != eWrkDataType)
3266
0
    {
3267
0
        pafWrkScanline =
3268
0
            static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
3269
0
        if (pafWrkScanline == nullptr)
3270
0
            return CE_Failure;
3271
0
    }
3272
3273
0
    const double dfXScale = 1.0 / dfXRatioDstToSrc;
3274
0
    const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
3275
0
    const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
3276
0
    const double dfYScale = 1.0 / dfYRatioDstToSrc;
3277
0
    const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
3278
0
    const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
3279
3280
    // Temporary array to store result of horizontal filter.
3281
0
    double *padfHorizontalFiltered = static_cast<double *>(
3282
0
        VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
3283
3284
    // To store convolution coefficients.
3285
0
    double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
3286
0
        static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
3287
0
                         0.5) *
3288
0
        sizeof(double)));
3289
3290
0
    GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
3291
0
    if (pabyChunkNodataMask)
3292
0
        pabyChunkNodataMaskHorizontalFiltered =
3293
0
            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
3294
0
    if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
3295
0
        (pabyChunkNodataMask != nullptr &&
3296
0
         pabyChunkNodataMaskHorizontalFiltered == nullptr))
3297
0
    {
3298
0
        VSIFree(pafWrkScanline);
3299
0
        VSIFree(padfHorizontalFiltered);
3300
0
        VSIFreeAligned(padfWeights);
3301
0
        VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3302
0
        return CE_Failure;
3303
0
    }
3304
3305
    /* ==================================================================== */
3306
    /*      First pass: horizontal filter                                   */
3307
    /* ==================================================================== */
3308
0
    const int nChunkRightXOff = nChunkXOff + nChunkXSize;
3309
0
#ifdef USE_SSE2
3310
0
    bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
3311
0
#endif
3312
0
    for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
3313
0
    {
3314
0
        const double dfSrcPixel =
3315
0
            (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
3316
0
        int nSrcPixelStart =
3317
0
            static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
3318
0
        if (nSrcPixelStart < nChunkXOff)
3319
0
            nSrcPixelStart = nChunkXOff;
3320
0
        int nSrcPixelStop =
3321
0
            static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
3322
0
        if (nSrcPixelStop > nChunkRightXOff)
3323
0
            nSrcPixelStop = nChunkRightXOff;
3324
#if 0
3325
        if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
3326
        {
3327
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3328
        }
3329
        if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
3330
        {
3331
            printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
3332
        }
3333
#endif
3334
0
        const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
3335
0
        double dfWeightSum = 0.0;
3336
3337
        // Compute convolution coefficients.
3338
0
        int nSrcPixel = nSrcPixelStart;
3339
0
        double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
3340
0
        for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
3341
0
        {
3342
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
3343
0
            dfX += dfXScaleWeight;
3344
0
            padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
3345
0
            dfX += dfXScaleWeight;
3346
0
            padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
3347
0
            dfX += dfXScaleWeight;
3348
0
            padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
3349
0
            dfX += dfXScaleWeight;
3350
0
            dfWeightSum +=
3351
0
                pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
3352
0
        }
3353
0
        for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
3354
0
        {
3355
0
            const double dfWeight = pfnFilterFunc(dfX);
3356
0
            padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
3357
0
            dfWeightSum += dfWeight;
3358
0
        }
3359
3360
0
        const int nHeight = nChunkYSize * nBands;
3361
0
        if (pabyChunkNodataMask == nullptr)
3362
0
        {
3363
0
            if (dfWeightSum != 0)
3364
0
            {
3365
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3366
0
                for (int i = 0; i < nSrcPixelCount; ++i)
3367
0
                    padfWeights[i] *= dfInvWeightSum;
3368
0
            }
3369
0
            int iSrcLineOff = 0;
3370
0
#ifdef USE_SSE2
3371
0
            if (nSrcPixelCount == 4)
3372
0
            {
3373
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3374
0
                {
3375
0
                    const GPtrDiff_t j =
3376
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3377
0
                        (nSrcPixelStart - nChunkXOff);
3378
0
                    double dfVal1 = 0.0;
3379
0
                    double dfVal2 = 0.0;
3380
0
                    double dfVal3 = 0.0;
3381
0
                    GDALResampleConvolutionHorizontalPixelCount4_3rows(
3382
0
                        pChunk + j, pChunk + j + nChunkXSize,
3383
0
                        pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
3384
0
                        dfVal2, dfVal3);
3385
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3386
0
                                               nDstXSize +
3387
0
                                           iDstPixel - nDstXOff] = dfVal1;
3388
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3389
0
                                            1) *
3390
0
                                               nDstXSize +
3391
0
                                           iDstPixel - nDstXOff] = dfVal2;
3392
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3393
0
                                            2) *
3394
0
                                               nDstXSize +
3395
0
                                           iDstPixel - nDstXOff] = dfVal3;
3396
0
                }
3397
0
            }
3398
0
            else if (bSrcPixelCountLess8)
3399
0
            {
3400
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3401
0
                {
3402
0
                    const GPtrDiff_t j =
3403
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3404
0
                        (nSrcPixelStart - nChunkXOff);
3405
0
                    double dfVal1 = 0.0;
3406
0
                    double dfVal2 = 0.0;
3407
0
                    double dfVal3 = 0.0;
3408
0
                    GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
3409
0
                        pChunk + j, pChunk + j + nChunkXSize,
3410
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3411
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3412
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3413
0
                                               nDstXSize +
3414
0
                                           iDstPixel - nDstXOff] = dfVal1;
3415
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3416
0
                                            1) *
3417
0
                                               nDstXSize +
3418
0
                                           iDstPixel - nDstXOff] = dfVal2;
3419
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3420
0
                                            2) *
3421
0
                                               nDstXSize +
3422
0
                                           iDstPixel - nDstXOff] = dfVal3;
3423
0
                }
3424
0
            }
3425
0
            else
3426
0
#endif
3427
0
            {
3428
0
                for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
3429
0
                {
3430
0
                    const GPtrDiff_t j =
3431
0
                        static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3432
0
                        (nSrcPixelStart - nChunkXOff);
3433
0
                    double dfVal1 = 0.0;
3434
0
                    double dfVal2 = 0.0;
3435
0
                    double dfVal3 = 0.0;
3436
0
                    GDALResampleConvolutionHorizontal_3rows(
3437
0
                        pChunk + j, pChunk + j + nChunkXSize,
3438
0
                        pChunk + j + 2 * nChunkXSize, padfWeights,
3439
0
                        nSrcPixelCount, dfVal1, dfVal2, dfVal3);
3440
0
                    padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3441
0
                                               nDstXSize +
3442
0
                                           iDstPixel - nDstXOff] = dfVal1;
3443
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3444
0
                                            1) *
3445
0
                                               nDstXSize +
3446
0
                                           iDstPixel - nDstXOff] = dfVal2;
3447
0
                    padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
3448
0
                                            2) *
3449
0
                                               nDstXSize +
3450
0
                                           iDstPixel - nDstXOff] = dfVal3;
3451
0
                }
3452
0
            }
3453
0
            for (; iSrcLineOff < nHeight; ++iSrcLineOff)
3454
0
            {
3455
0
                const GPtrDiff_t j =
3456
0
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3457
0
                    (nSrcPixelStart - nChunkXOff);
3458
0
                const double dfVal = GDALResampleConvolutionHorizontal(
3459
0
                    pChunk + j, padfWeights, nSrcPixelCount);
3460
0
                padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
3461
0
                                           nDstXSize +
3462
0
                                       iDstPixel - nDstXOff] = dfVal;
3463
0
            }
3464
0
        }
3465
0
        else
3466
0
        {
3467
0
            for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
3468
0
            {
3469
0
                const GPtrDiff_t j =
3470
0
                    static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
3471
0
                    (nSrcPixelStart - nChunkXOff);
3472
3473
0
                if (bKernelWithNegativeWeights)
3474
0
                {
3475
0
                    int nConsecutiveValid = 0;
3476
0
                    int nMaxConsecutiveValid = 0;
3477
0
                    for (int k = 0; k < nSrcPixelCount; k++)
3478
0
                    {
3479
0
                        if (pabyChunkNodataMask[j + k])
3480
0
                            nConsecutiveValid++;
3481
0
                        else if (nConsecutiveValid)
3482
0
                        {
3483
0
                            nMaxConsecutiveValid = std::max(
3484
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3485
0
                            nConsecutiveValid = 0;
3486
0
                        }
3487
0
                    }
3488
0
                    nMaxConsecutiveValid =
3489
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3490
0
                    if (nMaxConsecutiveValid < nSrcPixelCount / 2)
3491
0
                    {
3492
0
                        const size_t nTempOffset =
3493
0
                            static_cast<size_t>(iSrcLineOff) * nDstXSize +
3494
0
                            iDstPixel - nDstXOff;
3495
0
                        padfHorizontalFiltered[nTempOffset] = 0.0;
3496
0
                        pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3497
0
                        continue;
3498
0
                    }
3499
0
                }
3500
3501
0
                double dfVal = 0.0;
3502
0
                GDALResampleConvolutionHorizontalWithMask(
3503
0
                    pChunk + j, pabyChunkNodataMask + j, padfWeights,
3504
0
                    nSrcPixelCount, dfVal, dfWeightSum);
3505
0
                const size_t nTempOffset =
3506
0
                    static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
3507
0
                    nDstXOff;
3508
0
                if (dfWeightSum > 0.0)
3509
0
                {
3510
0
                    padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
3511
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
3512
0
                }
3513
0
                else
3514
0
                {
3515
0
                    padfHorizontalFiltered[nTempOffset] = 0.0;
3516
0
                    pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
3517
0
                }
3518
0
            }
3519
0
        }
3520
0
    }
3521
3522
    /* ==================================================================== */
3523
    /*      Second pass: vertical filter                                    */
3524
    /* ==================================================================== */
3525
0
    const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
3526
3527
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3528
0
    {
3529
0
        Twork *const pafDstScanline =
3530
0
            pafWrkScanline ? pafWrkScanline
3531
0
                           : static_cast<Twork *>(pDstBuffer) +
3532
0
                                 (iDstLine - nDstYOff) * nDstXSize;
3533
3534
0
        const double dfSrcLine =
3535
0
            (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
3536
0
        int nSrcLineStart =
3537
0
            static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
3538
0
        int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
3539
0
        if (nSrcLineStart < nChunkYOff)
3540
0
            nSrcLineStart = nChunkYOff;
3541
0
        if (nSrcLineStop > nChunkBottomYOff)
3542
0
            nSrcLineStop = nChunkBottomYOff;
3543
#if 0
3544
        if( nSrcLineStart < nChunkYOff &&
3545
            nChunkYOff > 0 )
3546
        {
3547
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3548
        }
3549
        if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
3550
        {
3551
            printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
3552
        }
3553
#endif
3554
0
        const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
3555
0
        double dfWeightSum = 0.0;
3556
3557
        // Compute convolution coefficients.
3558
0
        int nSrcLine = nSrcLineStart;  // Used after for.
3559
0
        double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
3560
0
        for (; nSrcLine + 3 < nSrcLineStop;
3561
0
             nSrcLine += 4, dfY += 4 * dfYScaleWeight)
3562
0
        {
3563
0
            padfWeights[nSrcLine - nSrcLineStart] = dfY;
3564
0
            padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
3565
0
            padfWeights[nSrcLine + 2 - nSrcLineStart] =
3566
0
                dfY + 2 * dfYScaleWeight;
3567
0
            padfWeights[nSrcLine + 3 - nSrcLineStart] =
3568
0
                dfY + 3 * dfYScaleWeight;
3569
0
            dfWeightSum +=
3570
0
                pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
3571
0
        }
3572
0
        for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
3573
0
        {
3574
0
            const double dfWeight = pfnFilterFunc(dfY);
3575
0
            padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
3576
0
            dfWeightSum += dfWeight;
3577
0
        }
3578
3579
0
        if (pabyChunkNodataMask == nullptr)
3580
0
        {
3581
0
            if (dfWeightSum != 0)
3582
0
            {
3583
0
                const double dfInvWeightSum = 1.0 / dfWeightSum;
3584
0
                for (int i = 0; i < nSrcLineCount; ++i)
3585
0
                    padfWeights[i] *= dfInvWeightSum;
3586
0
            }
3587
0
        }
3588
3589
0
        if (pabyChunkNodataMask == nullptr)
3590
0
        {
3591
0
            int iFilteredPixelOff = 0;  // Used after for.
3592
            // j used after for.
3593
0
            size_t j =
3594
0
                (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
3595
0
#ifdef USE_SSE2
3596
            if constexpr (eWrkDataType == GDT_Float32)
3597
0
            {
3598
#ifdef __AVX__
3599
                for (; iFilteredPixelOff + 15 < nDstXSize;
3600
                     iFilteredPixelOff += 16, j += 16)
3601
                {
3602
                    GDALResampleConvolutionVertical_16cols(
3603
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3604
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3605
                    if (bHasNoData)
3606
                    {
3607
                        for (int k = 0; k < 16; k++)
3608
                        {
3609
                            pafDstScanline[iFilteredPixelOff + k] =
3610
                                replaceValIfNodata(
3611
                                    pafDstScanline[iFilteredPixelOff + k]);
3612
                        }
3613
                    }
3614
                }
3615
#else
3616
0
                for (; iFilteredPixelOff + 7 < nDstXSize;
3617
0
                     iFilteredPixelOff += 8, j += 8)
3618
0
                {
3619
0
                    GDALResampleConvolutionVertical_8cols(
3620
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3621
0
                        nSrcLineCount, pafDstScanline + iFilteredPixelOff);
3622
0
                    if (bHasNoData)
3623
0
                    {
3624
0
                        for (int k = 0; k < 8; k++)
3625
0
                        {
3626
0
                            pafDstScanline[iFilteredPixelOff + k] =
3627
0
                                replaceValIfNodata(
3628
0
                                    pafDstScanline[iFilteredPixelOff + k]);
3629
0
                        }
3630
0
                    }
3631
0
                }
3632
0
#endif
3633
3634
0
                for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
3635
0
                {
3636
0
                    const Twork fVal =
3637
0
                        static_cast<Twork>(GDALResampleConvolutionVertical(
3638
0
                            padfHorizontalFiltered + j, nDstXSize, padfWeights,
3639
0
                            nSrcLineCount));
3640
0
                    pafDstScanline[iFilteredPixelOff] =
3641
0
                        replaceValIfNodata(fVal);
3642
0
                }
3643
            }
3644
            else
3645
#endif
3646
0
            {
3647
0
                for (; iFilteredPixelOff + 1 < nDstXSize;
3648
0
                     iFilteredPixelOff += 2, j += 2)
3649
0
                {
3650
0
                    double dfVal1 = 0.0;
3651
0
                    double dfVal2 = 0.0;
3652
0
                    GDALResampleConvolutionVertical_2cols(
3653
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3654
0
                        nSrcLineCount, dfVal1, dfVal2);
3655
0
                    pafDstScanline[iFilteredPixelOff] =
3656
0
                        replaceValIfNodata(static_cast<Twork>(dfVal1));
3657
0
                    pafDstScanline[iFilteredPixelOff + 1] =
3658
0
                        replaceValIfNodata(static_cast<Twork>(dfVal2));
3659
0
                }
3660
0
                if (iFilteredPixelOff < nDstXSize)
3661
0
                {
3662
0
                    const double dfVal = GDALResampleConvolutionVertical(
3663
0
                        padfHorizontalFiltered + j, nDstXSize, padfWeights,
3664
0
                        nSrcLineCount);
3665
0
                    pafDstScanline[iFilteredPixelOff] =
3666
0
                        replaceValIfNodata(static_cast<Twork>(dfVal));
3667
0
                }
3668
0
            }
3669
0
        }
3670
0
        else
3671
0
        {
3672
0
            for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
3673
0
                 ++iFilteredPixelOff)
3674
0
            {
3675
0
                double dfVal = 0.0;
3676
0
                dfWeightSum = 0.0;
3677
0
                size_t j = (nSrcLineStart - nChunkYOff) *
3678
0
                               static_cast<size_t>(nDstXSize) +
3679
0
                           iFilteredPixelOff;
3680
0
                if (bKernelWithNegativeWeights)
3681
0
                {
3682
0
                    int nConsecutiveValid = 0;
3683
0
                    int nMaxConsecutiveValid = 0;
3684
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3685
0
                    {
3686
0
                        const double dfWeight =
3687
0
                            padfWeights[i] *
3688
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3689
0
                        if (pabyChunkNodataMaskHorizontalFiltered[j])
3690
0
                        {
3691
0
                            nConsecutiveValid++;
3692
0
                        }
3693
0
                        else if (nConsecutiveValid)
3694
0
                        {
3695
0
                            nMaxConsecutiveValid = std::max(
3696
0
                                nMaxConsecutiveValid, nConsecutiveValid);
3697
0
                            nConsecutiveValid = 0;
3698
0
                        }
3699
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3700
0
                        dfWeightSum += dfWeight;
3701
0
                    }
3702
0
                    nMaxConsecutiveValid =
3703
0
                        std::max(nMaxConsecutiveValid, nConsecutiveValid);
3704
0
                    if (nMaxConsecutiveValid < nSrcLineCount / 2)
3705
0
                    {
3706
0
                        pafDstScanline[iFilteredPixelOff] =
3707
0
                            static_cast<Twork>(dfNoDataValue);
3708
0
                        continue;
3709
0
                    }
3710
0
                }
3711
0
                else
3712
0
                {
3713
0
                    for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
3714
0
                    {
3715
0
                        const double dfWeight =
3716
0
                            padfWeights[i] *
3717
0
                            pabyChunkNodataMaskHorizontalFiltered[j];
3718
0
                        dfVal += padfHorizontalFiltered[j] * dfWeight;
3719
0
                        dfWeightSum += dfWeight;
3720
0
                    }
3721
0
                }
3722
0
                if (dfWeightSum > 0.0)
3723
0
                {
3724
0
                    pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
3725
0
                        static_cast<Twork>(dfVal / dfWeightSum));
3726
0
                }
3727
0
                else
3728
0
                {
3729
0
                    pafDstScanline[iFilteredPixelOff] =
3730
0
                        static_cast<Twork>(dfNoDataValue);
3731
0
                }
3732
0
            }
3733
0
        }
3734
3735
0
        if (fMaxVal != 0.0f)
3736
0
        {
3737
0
            for (int i = 0; i < nDstXSize; ++i)
3738
0
            {
3739
0
                if (pafDstScanline[i] > fMaxVal)
3740
0
                    pafDstScanline[i] = fMaxVal;
3741
0
            }
3742
0
        }
3743
3744
0
        if (pafWrkScanline)
3745
0
        {
3746
0
            GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
3747
0
                            static_cast<GByte *>(pDstBuffer) +
3748
0
                                static_cast<size_t>(iDstLine - nDstYOff) *
3749
0
                                    nDstXSize * nDstDataTypeSize,
3750
0
                            dstDataType, nDstDataTypeSize, nDstXSize);
3751
0
        }
3752
0
    }
3753
3754
0
    VSIFree(pafWrkScanline);
3755
0
    VSIFreeAligned(padfWeights);
3756
0
    VSIFree(padfHorizontalFiltered);
3757
0
    VSIFree(pabyChunkNodataMaskHorizontalFiltered);
3758
3759
0
    return CE_None;
3760
0
}
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned char, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned char const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<unsigned short, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, unsigned short const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<float, float, (GDALDataType)6>(GDALOverviewResampleArgs const&, float const*, void*, double (*)(double), double (*)(double*), int, bool, float)
Unexecuted instantiation: overview.cpp:CPLErr GDALResampleChunk_ConvolutionT<double, double, (GDALDataType)7>(GDALOverviewResampleArgs const&, double const*, void*, double (*)(double), double (*)(double*), int, bool, float)
3761
3762
/** Resample one chunk with a separable convolution kernel.
 *
 * Translates args.pszResampling (BILINEAR, CUBIC, CUBICSPLINE or LANCZOS,
 * compared case-insensitively) into the matching warper kernel, allocates
 * the destination buffer and forwards the work to the
 * GDALResampleChunk_ConvolutionT instantiation that matches
 * args.eWrkDataType.
 *
 * @param args Resampling parameters. This function reads pszResampling,
 *             nOvrNBITS, eOvrDataType, eWrkDataType and the nDstXOff /
 *             nDstXOff2 / nDstYOff / nDstYOff2 window; the whole structure
 *             is passed on to the templated worker.
 * @param pChunk Source samples, reinterpreted according to
 *               args.eWrkDataType (GByte, GUInt16, float or double).
 * @param ppDstBuffer Receives a newly allocated buffer sized
 *                    (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) elements
 *                    of args.eOvrDataType.
 * @param peDstBufferDataType Receives args.eOvrDataType.
 *
 * @return the worker's return value, or CE_Failure when the resampling name
 *         is not one of the four above, the allocation fails, or the working
 *         data type is unsupported.
 *
 * NOTE(review): once allocated, *ppDstBuffer is never released here, not even
 * on the failure paths -- presumably the caller owns and frees it; confirm
 * against the call sites.
 */
static CPLErr
GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
                              const void *pChunk, void **ppDstBuffer,
                              GDALDataType *peDstBufferDataType)
{
    // Select the kernel from the resampling name.
    GDALResampleAlg eKernel = GRA_Bilinear;
    if (EQUAL(args.pszResampling, "BILINEAR"))
    {
        eKernel = GRA_Bilinear;
    }
    else if (EQUAL(args.pszResampling, "CUBIC"))
    {
        eKernel = GRA_Cubic;
    }
    else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    {
        eKernel = GRA_CubicSpline;
    }
    else if (EQUAL(args.pszResampling, "LANCZOS"))
    {
        eKernel = GRA_Lanczos;
    }
    else
    {
        // Only the four convolution methods are expected to reach this point.
        CPLAssert(false);
        return CE_Failure;
    }

    // Cubic and Lanczos are the two kernels flagged as having negative
    // weights; the worker applies extra nodata handling for them.
    const bool bNegativeLobes =
        (eKernel == GRA_Cubic || eKernel == GRA_Lanczos);

    const int nRadius = GWKGetFilterRadius(eKernel);
    FilterFuncType pfnKernel = GWKGetFilterFunc(eKernel);
    const FilterFunc4ValuesType pfnKernel4 = GWKGetFilterFunc4Values(eKernel);

    // Non-bilinear kernels can overshoot. When NBITS is set on an unsigned
    // integer overview and is narrower than the data type itself, the worker
    // is given 2^NBITS - 1 as an upper bound. 0 means "no clamping".
    const bool bUnsignedIntOvr = args.eOvrDataType == GDT_Byte ||
                                 args.eOvrDataType == GDT_UInt16 ||
                                 args.eOvrDataType == GDT_UInt32;
    float fClampMax = 0.f;
    if (eKernel != GRA_Bilinear && args.nOvrNBITS > 0 && bUnsignedIntOvr)
    {
        const int nBits = args.nOvrNBITS;
        // nBits > 0 is already guaranteed by the enclosing condition.
        if (nBits != GDALGetDataTypeSizeBits(args.eOvrDataType) && nBits < 32)
        {
            fClampMax = static_cast<float>((1U << nBits) - 1);
        }
    }

    // Allocate the output in the overview data type.
    const auto nDstWidth = args.nDstXOff2 - args.nDstXOff;
    const auto nDstHeight = args.nDstYOff2 - args.nDstYOff;
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(
        nDstWidth, nDstHeight, GDALGetDataTypeSizeBytes(args.eOvrDataType));
    if (*ppDstBuffer == nullptr)
        return CE_Failure;
    *peDstBufferDataType = args.eOvrDataType;

    // Dispatch on the working data type. Integer sources are filtered in
    // single precision, double sources in double precision.
    const GDALDataType eWrkType = args.eWrkDataType;

    if (eWrkType == GDT_Byte)
    {
        return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
            args, static_cast<const GByte *>(pChunk), *ppDstBuffer, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }

    if (eWrkType == GDT_UInt16)
    {
        return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
            args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
            pfnKernel, pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }

    if (eWrkType == GDT_Float32)
    {
        return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
            args, static_cast<const float *>(pChunk), *ppDstBuffer, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }

    if (eWrkType == GDT_Float64)
    {
        return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
            args, static_cast<const double *>(pChunk), *ppDstBuffer, pfnKernel,
            pfnKernel4, nRadius, bNegativeLobes, fClampMax);
    }

    // Any other working type is unsupported by this code path.
    CPLAssert(false);
    return CE_Failure;
}
3857
3858
/************************************************************************/
3859
/*                       GDALResampleChunkC32R()                        */
3860
/************************************************************************/
3861
3862
static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
3863
                                    const float *pafChunk, const int nChunkYOff,
3864
                                    const int nChunkYSize, const int nDstYOff,
3865
                                    const int nDstYOff2, const int nOvrXSize,
3866
                                    const int nOvrYSize, void **ppDstBuffer,
3867
                                    GDALDataType *peDstBufferDataType,
3868
                                    const char *pszResampling)
3869
3870
0
{
3871
0
    enum Method
3872
0
    {
3873
0
        NEAR,
3874
0
        AVERAGE,
3875
0
        AVERAGE_MAGPHASE,
3876
0
        RMS,
3877
0
    };
3878
3879
0
    Method eMethod = NEAR;
3880
0
    if (STARTS_WITH_CI(pszResampling, "NEAR"))
3881
0
    {
3882
0
        eMethod = NEAR;
3883
0
    }
3884
0
    else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
3885
0
    {
3886
0
        eMethod = AVERAGE_MAGPHASE;
3887
0
    }
3888
0
    else if (EQUAL(pszResampling, "RMS"))
3889
0
    {
3890
0
        eMethod = RMS;
3891
0
    }
3892
0
    else if (STARTS_WITH_CI(pszResampling, "AVER"))
3893
0
    {
3894
0
        eMethod = AVERAGE;
3895
0
    }
3896
0
    else
3897
0
    {
3898
0
        CPLError(
3899
0
            CE_Failure, CPLE_NotSupported,
3900
0
            "Resampling method %s is not supported for complex data types. "
3901
0
            "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
3902
0
            pszResampling);
3903
0
        return CE_Failure;
3904
0
    }
3905
3906
0
    const int nOXSize = nOvrXSize;
3907
0
    *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
3908
0
                                       GDALGetDataTypeSizeBytes(GDT_CFloat32));
3909
0
    if (*ppDstBuffer == nullptr)
3910
0
    {
3911
0
        return CE_Failure;
3912
0
    }
3913
0
    float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
3914
0
    *peDstBufferDataType = GDT_CFloat32;
3915
3916
0
    const int nOYSize = nOvrYSize;
3917
0
    const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
3918
0
    const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
3919
3920
    /* ==================================================================== */
3921
    /*      Loop over destination scanlines.                                */
3922
    /* ==================================================================== */
3923
0
    for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
3924
0
    {
3925
0
        int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
3926
0
        if (nSrcYOff < nChunkYOff)
3927
0
            nSrcYOff = nChunkYOff;
3928
3929
0
        int nSrcYOff2 =
3930
0
            static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
3931
0
        if (nSrcYOff2 == nSrcYOff)
3932
0
            nSrcYOff2++;
3933
3934
0
        if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
3935
0
        {
3936
0
            if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
3937
0
                nSrcYOff = nSrcHeight - 1;
3938
0
            nSrcYOff2 = nSrcHeight;
3939
0
        }
3940
0
        if (nSrcYOff2 > nChunkYOff + nChunkYSize)
3941
0
            nSrcYOff2 = nChunkYOff + nChunkYSize;
3942
3943
0
        const float *const pafSrcScanline =
3944
0
            pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
3945
0
        float *const pafDstScanline =
3946
0
            pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
3947
3948
        /* --------------------------------------------------------------------
3949
         */
3950
        /*      Loop over destination pixels */
3951
        /* --------------------------------------------------------------------
3952
         */
3953
0
        for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
3954
0
        {
3955
0
            int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
3956
0
            int nSrcXOff2 =
3957
0
                static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
3958
0
            if (nSrcXOff2 == nSrcXOff)
3959
0
                nSrcXOff2++;
3960
0
            if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
3961
0
            {
3962
0
                if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
3963
0
                    nSrcXOff = nSrcWidth - 1;
3964
0
                nSrcXOff2 = nSrcWidth;
3965
0
            }
3966
3967
0
            if (eMethod == NEAR)
3968
0
            {
3969
0
                pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
3970
0
                pafDstScanline[iDstPixel * 2 + 1] =
3971
0
                    pafSrcScanline[nSrcXOff * 2 + 1];
3972
0
            }
3973
0
            else if (eMethod == AVERAGE_MAGPHASE)
3974
0
            {
3975
0
                double dfTotalR = 0.0;
3976
0
                double dfTotalI = 0.0;
3977
0
                double dfTotalM = 0.0;
3978
0
                int nCount = 0;
3979
3980
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
3981
0
                {
3982
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
3983
0
                    {
3984
0
                        const double dfR =
3985
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
3986
0
                                                        iY - nSrcYOff) *
3987
0
                                                        nSrcWidth * 2];
3988
0
                        const double dfI =
3989
0
                            pafSrcScanline[iX * 2 +
3990
0
                                           static_cast<GPtrDiff_t>(iY -
3991
0
                                                                   nSrcYOff) *
3992
0
                                               nSrcWidth * 2 +
3993
0
                                           1];
3994
0
                        dfTotalR += dfR;
3995
0
                        dfTotalI += dfI;
3996
0
                        dfTotalM += std::hypot(dfR, dfI);
3997
0
                        ++nCount;
3998
0
                    }
3999
0
                }
4000
4001
0
                CPLAssert(nCount > 0);
4002
0
                if (nCount == 0)
4003
0
                {
4004
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4005
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4006
0
                }
4007
0
                else
4008
0
                {
4009
0
                    pafDstScanline[iDstPixel * 2] =
4010
0
                        static_cast<float>(dfTotalR / nCount);
4011
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4012
0
                        static_cast<float>(dfTotalI / nCount);
4013
0
                    const double dfM =
4014
0
                        std::hypot(pafDstScanline[iDstPixel * 2],
4015
0
                                   pafDstScanline[iDstPixel * 2 + 1]);
4016
0
                    const double dfDesiredM = dfTotalM / nCount;
4017
0
                    double dfRatio = 1.0;
4018
0
                    if (dfM != 0.0)
4019
0
                        dfRatio = dfDesiredM / dfM;
4020
4021
0
                    pafDstScanline[iDstPixel * 2] *=
4022
0
                        static_cast<float>(dfRatio);
4023
0
                    pafDstScanline[iDstPixel * 2 + 1] *=
4024
0
                        static_cast<float>(dfRatio);
4025
0
                }
4026
0
            }
4027
0
            else if (eMethod == RMS)
4028
0
            {
4029
0
                double dfTotalR = 0.0;
4030
0
                double dfTotalI = 0.0;
4031
0
                int nCount = 0;
4032
4033
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4034
0
                {
4035
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4036
0
                    {
4037
0
                        const double dfR =
4038
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4039
0
                                                        iY - nSrcYOff) *
4040
0
                                                        nSrcWidth * 2];
4041
0
                        const double dfI =
4042
0
                            pafSrcScanline[iX * 2 +
4043
0
                                           static_cast<GPtrDiff_t>(iY -
4044
0
                                                                   nSrcYOff) *
4045
0
                                               nSrcWidth * 2 +
4046
0
                                           1];
4047
4048
0
                        dfTotalR += SQUARE(dfR);
4049
0
                        dfTotalI += SQUARE(dfI);
4050
4051
0
                        ++nCount;
4052
0
                    }
4053
0
                }
4054
4055
0
                CPLAssert(nCount > 0);
4056
0
                if (nCount == 0)
4057
0
                {
4058
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4059
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4060
0
                }
4061
0
                else
4062
0
                {
4063
                    /* compute RMS */
4064
0
                    pafDstScanline[iDstPixel * 2] =
4065
0
                        static_cast<float>(sqrt(dfTotalR / nCount));
4066
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4067
0
                        static_cast<float>(sqrt(dfTotalI / nCount));
4068
0
                }
4069
0
            }
4070
0
            else if (eMethod == AVERAGE)
4071
0
            {
4072
0
                double dfTotalR = 0.0;
4073
0
                double dfTotalI = 0.0;
4074
0
                int nCount = 0;
4075
4076
0
                for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
4077
0
                {
4078
0
                    for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
4079
0
                    {
4080
                        // TODO(schwehr): Maybe use std::complex?
4081
0
                        dfTotalR +=
4082
0
                            pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
4083
0
                                                        iY - nSrcYOff) *
4084
0
                                                        nSrcWidth * 2];
4085
0
                        dfTotalI += pafSrcScanline[iX * 2 +
4086
0
                                                   static_cast<GPtrDiff_t>(
4087
0
                                                       iY - nSrcYOff) *
4088
0
                                                       nSrcWidth * 2 +
4089
0
                                                   1];
4090
0
                        ++nCount;
4091
0
                    }
4092
0
                }
4093
4094
0
                CPLAssert(nCount > 0);
4095
0
                if (nCount == 0)
4096
0
                {
4097
0
                    pafDstScanline[iDstPixel * 2] = 0.0;
4098
0
                    pafDstScanline[iDstPixel * 2 + 1] = 0.0;
4099
0
                }
4100
0
                else
4101
0
                {
4102
0
                    pafDstScanline[iDstPixel * 2] =
4103
0
                        static_cast<float>(dfTotalR / nCount);
4104
0
                    pafDstScanline[iDstPixel * 2 + 1] =
4105
0
                        static_cast<float>(dfTotalI / nCount);
4106
0
                }
4107
0
            }
4108
0
        }
4109
0
    }
4110
4111
0
    return CE_None;
4112
0
}
4113
4114
/************************************************************************/
4115
/*                  GDALRegenerateCascadingOverviews()                  */
4116
/*                                                                      */
4117
/*      Generate a list of overviews in order from largest to           */
4118
/*      smallest, computing each from the next larger.                  */
4119
/************************************************************************/
4120
4121
/**
 * Regenerate a list of overviews in cascade: the largest overview is computed
 * from poSrcBand, and each following (smaller) one from the overview that
 * precedes it.
 *
 * Note: papoOvrBands is reordered in place, from largest to smallest pixel
 * count.
 *
 * @param poSrcBand full resolution band used to compute the first overview.
 * @param nOverviews number of entries in papoOvrBands.
 * @param papoOvrBands overview bands to regenerate (reordered in place).
 * @param pszResampling resampling method name. When it starts with
 * "AVERAGE_BIT2G", that method is only applied to the first overview and
 * "AVERAGE" is used for the following ones.
 * @param pfnProgress progress callback, invoked through a scaled progress
 * so that each overview contributes proportionally to its pixel count.
 * @param pProgressData user data of pfnProgress.
 * @param papszOptions options forwarded to GDALRegenerateOverviewsEx().
 * @return CE_None on success, or the first error returned by
 * GDALRegenerateOverviewsEx().
 */
static CPLErr GDALRegenerateCascadingOverviews(
    GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)

{
    /* -------------------------------------------------------------------- */
    /*      First, we must put the overviews in order from largest to       */
    /*      smallest.                                                       */
    /* -------------------------------------------------------------------- */

    // The pixel count is computed with 64-bit integers so that the product of
    // two int dimensions is exact. A float product only has 24 bits of
    // mantissa and may make overviews of distinct sizes compare as equal.
    const auto GetPixelCount = [](GDALRasterBand *poBand)
    {
        return static_cast<GIntBig>(poBand->GetXSize()) * poBand->GetYSize();
    };

    // Stable sort, so that overviews of identical size keep their relative
    // order. Nothing to order with fewer than two overviews.
    if (nOverviews > 1)
    {
        std::stable_sort(
            papoOvrBands, papoOvrBands + nOverviews,
            [&GetPixelCount](GDALRasterBand *poA, GDALRasterBand *poB)
            { return GetPixelCount(poA) > GetPixelCount(poB); });
    }

    /* -------------------------------------------------------------------- */
    /*      Count total pixels so we can prepare appropriate scaled         */
    /*      progress functions.                                             */
    /* -------------------------------------------------------------------- */
    double dfTotalPixels = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        dfTotalPixels += papoOvrBands[i]->GetXSize() *
                         static_cast<double>(papoOvrBands[i]->GetYSize());
    }

    /* -------------------------------------------------------------------- */
    /*      Generate all the bands.                                         */
    /* -------------------------------------------------------------------- */
    double dfPixelsProcessed = 0.0;

    for (int i = 0; i < nOverviews; ++i)
    {
        // The first overview is built from the source band, the next ones
        // from the overview generated just before.
        GDALRasterBand *poBaseBand = poSrcBand;
        if (i != 0)
            poBaseBand = papoOvrBands[i - 1];

        double dfPixels = papoOvrBands[i]->GetXSize() *
                          static_cast<double>(papoOvrBands[i]->GetYSize());

        // Map the progress of this overview to its share of the total work.
        void *pScaledProgressData = GDALCreateScaledProgress(
            dfPixelsProcessed / dfTotalPixels,
            (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
            pProgressData);

        const CPLErr eErr = GDALRegenerateOverviewsEx(
            poBaseBand, 1,
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
            pszResampling, GDALScaledProgress, pScaledProgressData,
            papszOptions);
        GDALDestroyScaledProgress(pScaledProgressData);

        if (eErr != CE_None)
            return eErr;

        dfPixelsProcessed += dfPixels;

        // Only do the bit2grayscale promotion on the base band.
        if (STARTS_WITH_CI(pszResampling,
                           "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
            pszResampling = "AVERAGE";
    }

    return CE_None;
}
4198
4199
/************************************************************************/
4200
/*                    GDALGetResampleFunction()                         */
4201
/************************************************************************/
4202
4203
/**
 * Return the chunk resampling function matching a resampling method name.
 *
 * @param pszResampling resampling method name (compared case insensitively;
 * "NEAR" and "AVER" are matched as prefixes, the other names exactly).
 * @param pnRadius if not NULL, receives the kernel radius associated with
 * the method (0 for methods that do not set one).
 * @return the resampling function, or nullptr (after emitting a CE_Failure
 * error) when the method name is not recognized.
 */
GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
                                             int *pnRadius)
{
    // Default radius, overridden below by the methods that define one.
    if (pnRadius)
        *pnRadius = 0;

    if (STARTS_WITH_CI(pszResampling, "NEAR"))
        return GDALResampleChunk_Near;

    if (STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS"))
        return GDALResampleChunk_AverageOrRMS;

    if (EQUAL(pszResampling, "GAUSS"))
    {
        if (pnRadius)
            *pnRadius = 1;
        return GDALResampleChunk_Gauss;
    }

    if (EQUAL(pszResampling, "MODE"))
        return GDALResampleChunk_Mode;

    // Methods sharing the convolution implementation: only the kernel, and
    // thus the radius reported to the caller, differs.
    struct ConvolutionKernel
    {
        const char *pszName;
        GDALResampleAlg eAlg;
    };

    constexpr ConvolutionKernel asKernels[] = {
        {"CUBIC", GRA_Cubic},
        {"CUBICSPLINE", GRA_CubicSpline},
        {"LANCZOS", GRA_Lanczos},
        {"BILINEAR", GRA_Bilinear},
    };

    for (const auto &sKernel : asKernels)
    {
        if (!EQUAL(pszResampling, sKernel.pszName))
            continue;

        if (pnRadius)
            *pnRadius = GWKGetFilterRadius(sKernel.eAlg);
        return GDALResampleChunk_Convolution;
    }

    CPLError(CE_Failure, CPLE_AppDefined,
             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
             pszResampling);
    return nullptr;
}
4254
4255
/************************************************************************/
4256
/*                      GDALGetOvrWorkDataType()                        */
4257
/************************************************************************/
4258
4259
/**
 * Return the working data type used when computing overviews with the given
 * resampling method on a source of the given data type.
 *
 * @param pszResampling resampling method name (case insensitive).
 * @param eSrcDataType data type of the source band.
 * @return the data type of the working buffer.
 */
GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
                                    GDALDataType eSrcDataType)
{
    // NEAR* and MODE work directly in the source data type.
    if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
        return eSrcDataType;

    // Methods for which Byte and UInt16 sources keep their own data type.
    const auto IsAveragingOrConvolution = [pszResampling]()
    {
        return STARTS_WITH_CI(pszResampling, "AVER") ||
               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
               EQUAL(pszResampling, "CUBICSPLINE") ||
               EQUAL(pszResampling, "LANCZOS") ||
               EQUAL(pszResampling, "BILINEAR");
    };

    const bool bIsByteOrUInt16 =
        eSrcDataType == GDT_Byte || eSrcDataType == GDT_UInt16;
    if (bIsByteOrUInt16 && IsAveragingOrConvolution())
        return eSrcDataType;

    if (EQUAL(pszResampling, "GAUSS"))
        return GDT_Float64;

    // Remaining cases: Float32 for the listed source types, Float64 for
    // everything else.
    const bool bFitsInFloat32 =
        eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
        eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
        eSrcDataType == GDT_Float32;
    return bFitsInFloat32 ? GDT_Float32 : GDT_Float64;
}
4295
4296
namespace
4297
{
4298
// Structure to hold a pointer to free with CPLFree()
4299
struct PointerHolder
4300
{
4301
    void *ptr = nullptr;
4302
4303
0
    explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
4304
0
    {
4305
0
    }
4306
4307
    ~PointerHolder()
4308
0
    {
4309
0
        CPLFree(ptr);
4310
0
    }
4311
4312
    PointerHolder(const PointerHolder &) = delete;
4313
    PointerHolder &operator=(const PointerHolder &) = delete;
4314
};
4315
}  // namespace
4316
4317
/************************************************************************/
4318
/*                      GDALRegenerateOverviews()                       */
4319
/************************************************************************/
4320
4321
/**
4322
 * \brief Generate downsampled overviews.
4323
 *
4324
 * This function will generate one or more overview images from a base image
4325
 * using the requested downsampling algorithm.  Its primary use is for
4326
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4327
 * used to generate downsampled images in one file from another outside the
4328
 * overview architecture.
4329
 *
4330
 * The output bands need to exist in advance.
4331
 *
4332
 * The full set of resampling algorithms is documented in
4333
 * GDALDataset::BuildOverviews().
4334
 *
4335
 * This function will honour properly NODATA_VALUES tuples (special dataset
4336
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4337
 * considered as the nodata value and not each value of the triplet
4338
 * independently per band.
4339
 *
4340
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4341
 * to "ALL_CPUS" or a integer value to specify the number of threads to use for
4342
 * overview computation.
4343
 *
4344
 * @param hSrcBand the source (base level) band.
4345
 * @param nOverviewCount the number of downsampled bands being generated.
4346
 * @param pahOvrBands the list of downsampled bands to be generated.
4347
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4348
 * @param pfnProgress progress report function.
4349
 * @param pProgressData progress function callback data.
4350
 * @return CE_None on success or CE_Failure on failure.
4351
 */
4352
CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
4353
                               GDALRasterBandH *pahOvrBands,
4354
                               const char *pszResampling,
4355
                               GDALProgressFunc pfnProgress,
4356
                               void *pProgressData)
4357
4358
0
{
4359
0
    return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
4360
0
                                     pszResampling, pfnProgress, pProgressData,
4361
0
                                     nullptr);
4362
0
}
4363
4364
/************************************************************************/
4365
/*                     GDALRegenerateOverviewsEx()                      */
4366
/************************************************************************/
4367
4368
// Multiplier turning a radius into a diameter; used below to size chunk
// heights from the kernel radius and the maximum overview factor.
constexpr int RADIUS_TO_DIAMETER = 2;
4369
4370
/**
4371
 * \brief Generate downsampled overviews.
4372
 *
4373
 * This function will generate one or more overview images from a base image
4374
 * using the requested downsampling algorithm.  Its primary use is for
4375
 * generating overviews via GDALDataset::BuildOverviews(), but it can also be
4376
 * used to generate downsampled images in one file from another outside the
4377
 * overview architecture.
4378
 *
4379
 * The output bands need to exist in advance.
4380
 *
4381
 * The full set of resampling algorithms is documented in
4382
 * GDALDataset::BuildOverviews().
4383
 *
4384
 * This function will honour properly NODATA_VALUES tuples (special dataset
4385
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
4386
 * considered as the nodata value and not each value of the triplet
4387
 * independently per band.
4388
 *
4389
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
4390
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
4391
 * overview computation.
4392
 *
4393
 * @param hSrcBand the source (base level) band.
4394
 * @param nOverviewCount the number of downsampled bands being generated.
4395
 * @param pahOvrBands the list of downsampled bands to be generated.
4396
 * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
4397
 * @param pfnProgress progress report function.
4398
 * @param pProgressData progress function callback data.
4399
 * @param papszOptions NULL terminated list of options as key=value pairs, or
4400
 * NULL
4401
 * @return CE_None on success or CE_Failure on failure.
4402
 * @since GDAL 3.6
4403
 */
4404
CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
4405
                                 GDALRasterBandH *pahOvrBands,
4406
                                 const char *pszResampling,
4407
                                 GDALProgressFunc pfnProgress,
4408
                                 void *pProgressData, CSLConstList papszOptions)
4409
4410
0
{
4411
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
4412
0
    GDALRasterBand **papoOvrBands =
4413
0
        reinterpret_cast<GDALRasterBand **>(pahOvrBands);
4414
4415
0
    if (pfnProgress == nullptr)
4416
0
        pfnProgress = GDALDummyProgress;
4417
4418
0
    if (EQUAL(pszResampling, "NONE"))
4419
0
        return CE_None;
4420
4421
0
    int nKernelRadius = 0;
4422
0
    GDALResampleFunction pfnResampleFn =
4423
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
4424
4425
0
    if (pfnResampleFn == nullptr)
4426
0
        return CE_Failure;
4427
4428
    /* -------------------------------------------------------------------- */
4429
    /*      Check color tables...                                           */
4430
    /* -------------------------------------------------------------------- */
4431
0
    GDALColorTable *poColorTable = nullptr;
4432
4433
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
4434
0
         EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
4435
0
        poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4436
0
    {
4437
0
        poColorTable = poSrcBand->GetColorTable();
4438
0
        if (poColorTable != nullptr)
4439
0
        {
4440
0
            if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
4441
0
            {
4442
0
                CPLError(CE_Warning, CPLE_AppDefined,
4443
0
                         "Computing overviews on palette index raster bands "
4444
0
                         "with a palette whose color interpretation is not RGB "
4445
0
                         "will probably lead to unexpected results.");
4446
0
                poColorTable = nullptr;
4447
0
            }
4448
0
            else if (poColorTable->IsIdentity())
4449
0
            {
4450
0
                poColorTable = nullptr;
4451
0
            }
4452
0
        }
4453
0
        else
4454
0
        {
4455
0
            CPLError(CE_Warning, CPLE_AppDefined,
4456
0
                     "Computing overviews on palette index raster bands "
4457
0
                     "without a palette will probably lead to unexpected "
4458
0
                     "results.");
4459
0
        }
4460
0
    }
4461
    // Not ready yet
4462
0
    else if ((EQUAL(pszResampling, "CUBIC") ||
4463
0
              EQUAL(pszResampling, "CUBICSPLINE") ||
4464
0
              EQUAL(pszResampling, "LANCZOS") ||
4465
0
              EQUAL(pszResampling, "BILINEAR")) &&
4466
0
             poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
4467
0
    {
4468
0
        CPLError(CE_Warning, CPLE_AppDefined,
4469
0
                 "Computing %s overviews on palette index raster bands "
4470
0
                 "will probably lead to unexpected results.",
4471
0
                 pszResampling);
4472
0
    }
4473
4474
    // If we have a nodata mask and we are doing something more complicated
4475
    // than nearest neighbouring, we have to fetch the nodata mask.
4476
4477
0
    GDALRasterBand *poMaskBand = nullptr;
4478
0
    bool bUseNoDataMask = false;
4479
0
    bool bCanUseCascaded = true;
4480
4481
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR"))
4482
0
    {
4483
        // Special case if we are an alpha/mask band. We want it to be
4484
        // considered as the mask band to avoid alpha=0 to be taken into account
4485
        // in average computation.
4486
0
        if (poSrcBand->IsMaskBand())
4487
0
        {
4488
0
            poMaskBand = poSrcBand;
4489
0
            bUseNoDataMask = true;
4490
0
        }
4491
0
        else
4492
0
        {
4493
0
            poMaskBand = poSrcBand->GetMaskBand();
4494
0
            const int nMaskFlags = poSrcBand->GetMaskFlags();
4495
0
            bCanUseCascaded =
4496
0
                (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
4497
0
            bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
4498
0
        }
4499
0
    }
4500
4501
    /* -------------------------------------------------------------------- */
4502
    /*      If we are operating on multiple overviews, and using            */
4503
    /*      averaging, lets do them in cascading order to reduce the        */
4504
    /*      amount of computation.                                          */
4505
    /* -------------------------------------------------------------------- */
4506
4507
    // In case the mask may be computed from another band of the dataset,
4508
    // we can't use cascaded generation, as the computation of the overviews
4509
    // of the band used for the mask band may not have yet occurred (#3033).
4510
0
    if ((STARTS_WITH_CI(pszResampling, "AVER") ||
4511
0
         EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
4512
0
         EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
4513
0
         EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
4514
0
         EQUAL(pszResampling, "MODE")) &&
4515
0
        nOverviewCount > 1 && bCanUseCascaded)
4516
0
        return GDALRegenerateCascadingOverviews(
4517
0
            poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
4518
0
            pProgressData, papszOptions);
4519
4520
    /* -------------------------------------------------------------------- */
4521
    /*      Setup one horizontal swath to read from the raw buffer.         */
4522
    /* -------------------------------------------------------------------- */
4523
0
    int nFRXBlockSize = 0;
4524
0
    int nFRYBlockSize = 0;
4525
0
    poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
4526
4527
0
    const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
4528
0
    const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
4529
0
                                       EQUAL(pszResampling, "MODE") ||
4530
0
                                       !GDALDataTypeIsComplex(eSrcDataType);
4531
0
    const GDALDataType eWrkDataType =
4532
0
        bUseGenericResampleFn
4533
0
            ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
4534
0
            : GDT_CFloat32;
4535
4536
0
    const int nWidth = poSrcBand->GetXSize();
4537
0
    const int nHeight = poSrcBand->GetYSize();
4538
4539
0
    int nMaxOvrFactor = 1;
4540
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
4541
0
    {
4542
0
        const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
4543
0
        const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
4544
0
        nMaxOvrFactor = std::max(
4545
0
            nMaxOvrFactor,
4546
0
            static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
4547
0
        nMaxOvrFactor = std::max(
4548
0
            nMaxOvrFactor,
4549
0
            static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
4550
0
    }
4551
4552
0
    int nFullResYChunk = nFRYBlockSize;
4553
0
    int nMaxChunkYSizeQueried = 0;
4554
4555
0
    const auto UpdateChunkHeightAndGetChunkSize =
4556
0
        [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
4557
0
         eWrkDataType, nWidth]()
4558
0
    {
4559
        // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
4560
        // + nFullResYChunk) / nMaxOvrFactor)
4561
0
        if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
4562
0
        {
4563
0
            return GINTBIG_MAX;
4564
0
        }
4565
0
        nFullResYChunk =
4566
0
            std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
4567
0
        if ((nKernelRadius > 0 &&
4568
0
             nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
4569
0
            nFullResYChunk >
4570
0
                INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
4571
0
        {
4572
0
            return GINTBIG_MAX;
4573
0
        }
4574
0
        nMaxChunkYSizeQueried =
4575
0
            nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
4576
0
        if (GDALGetDataTypeSizeBytes(eWrkDataType) >
4577
0
            std::numeric_limits<int64_t>::max() /
4578
0
                (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
4579
0
        {
4580
0
            return GINTBIG_MAX;
4581
0
        }
4582
0
        return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
4583
0
               nMaxChunkYSizeQueried * nWidth;
4584
0
    };
4585
4586
0
    const char *pszChunkYSize =
4587
0
        CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
4588
0
#ifndef __COVERITY__
4589
    // Only configurable for debug / testing
4590
0
    if (pszChunkYSize)
4591
0
    {
4592
0
        nFullResYChunk = atoi(pszChunkYSize);
4593
0
    }
4594
0
#endif
4595
4596
    // Only configurable for debug / testing
4597
0
    const int nChunkMaxSize =
4598
0
        atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
4599
4600
0
    auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
4601
0
    if (nChunkSize > nChunkMaxSize)
4602
0
    {
4603
0
        if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
4604
0
            !GDALDataTypeIsComplex(eSrcDataType) &&
4605
0
            (!STARTS_WITH_CI(pszResampling, "AVER") ||
4606
0
             EQUAL(pszResampling, "AVERAGE")))
4607
0
        {
4608
            // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
4609
            // which use a block based strategy, which is much less memory
4610
            // hungry.
4611
0
            return GDALRegenerateOverviewsMultiBand(
4612
0
                1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
4613
0
                pfnProgress, pProgressData, papszOptions);
4614
0
        }
4615
0
        else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
4616
0
        {
4617
0
            return GDALRegenerateCascadingOverviews(
4618
0
                poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
4619
0
                pfnProgress, pProgressData, papszOptions);
4620
0
        }
4621
0
    }
4622
0
    else if (pszChunkYSize == nullptr)
4623
0
    {
4624
        // Try to get as close as possible to nChunkMaxSize
4625
0
        while (nChunkSize < nChunkMaxSize / 2)
4626
0
        {
4627
0
            nFullResYChunk *= 2;
4628
0
            nChunkSize = UpdateChunkHeightAndGetChunkSize();
4629
0
        }
4630
0
    }
4631
4632
0
    int nHasNoData = 0;
4633
0
    const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
4634
0
    const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
4635
0
    const bool bPropagateNoData =
4636
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
4637
4638
    // Structure describing a resampling job
4639
0
    struct OvrJob
4640
0
    {
4641
        // Buffers to free when job is finished
4642
0
        std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
4643
0
        std::shared_ptr<PointerHolder> oSrcBufferHolder{};
4644
0
        std::unique_ptr<PointerHolder> oDstBufferHolder{};
4645
4646
0
        GDALRasterBand *poDstBand = nullptr;
4647
4648
        // Input parameters of pfnResampleFn
4649
0
        GDALResampleFunction pfnResampleFn = nullptr;
4650
0
        int nSrcWidth = 0;
4651
0
        int nSrcHeight = 0;
4652
0
        int nDstWidth = 0;
4653
0
        GDALOverviewResampleArgs args{};
4654
0
        const void *pChunk = nullptr;
4655
0
        bool bUseGenericResampleFn = false;
4656
4657
        // Output values of resampling function
4658
0
        CPLErr eErr = CE_Failure;
4659
0
        void *pDstBuffer = nullptr;
4660
0
        GDALDataType eDstBufferDataType = GDT_Unknown;
4661
4662
0
        void SetSrcMaskBufferHolder(
4663
0
            const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
4664
0
        {
4665
0
            oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
4666
0
        }
4667
4668
0
        void SetSrcBufferHolder(
4669
0
            const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
4670
0
        {
4671
0
            oSrcBufferHolder = oSrcBufferHolderIn;
4672
0
        }
4673
4674
0
        void NotifyFinished()
4675
0
        {
4676
0
            std::lock_guard guard(mutex);
4677
0
            bFinished = true;
4678
0
            cv.notify_one();
4679
0
        }
4680
4681
0
        bool IsFinished()
4682
0
        {
4683
0
            std::lock_guard guard(mutex);
4684
0
            return bFinished;
4685
0
        }
4686
4687
0
        void WaitFinished()
4688
0
        {
4689
0
            std::unique_lock oGuard(mutex);
4690
0
            while (!bFinished)
4691
0
            {
4692
0
                cv.wait(oGuard);
4693
0
            }
4694
0
        }
4695
4696
0
      private:
4697
        // Synchronization
4698
0
        bool bFinished = false;
4699
0
        std::mutex mutex{};
4700
0
        std::condition_variable cv{};
4701
0
    };
4702
4703
    // Thread function to resample
4704
0
    const auto JobResampleFunc = [](void *pData)
4705
0
    {
4706
0
        OvrJob *poJob = static_cast<OvrJob *>(pData);
4707
4708
0
        if (poJob->bUseGenericResampleFn)
4709
0
        {
4710
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
4711
0
                                               &(poJob->pDstBuffer),
4712
0
                                               &(poJob->eDstBufferDataType));
4713
0
        }
4714
0
        else
4715
0
        {
4716
0
            poJob->eErr = GDALResampleChunkC32R(
4717
0
                poJob->nSrcWidth, poJob->nSrcHeight,
4718
0
                static_cast<const float *>(poJob->pChunk),
4719
0
                poJob->args.nChunkYOff, poJob->args.nChunkYSize,
4720
0
                poJob->args.nDstYOff, poJob->args.nDstYOff2,
4721
0
                poJob->args.nOvrXSize, poJob->args.nOvrYSize,
4722
0
                &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
4723
0
                poJob->args.pszResampling);
4724
0
        }
4725
4726
0
        poJob->oDstBufferHolder =
4727
0
            std::make_unique<PointerHolder>(poJob->pDstBuffer);
4728
4729
0
        poJob->NotifyFinished();
4730
0
    };
4731
4732
    // Function to write resample data to target band
4733
0
    // Writes the resampled lines of a job into its destination overview band.
    //
    // The window written is full width (X offset 0, nDstWidth columns) and
    // covers lines [args.nDstYOff, args.nDstYOff2). The buffer dimensions
    // passed to RasterIO() are identical to the window dimensions, and the
    // buffer is described as being of type eDstBufferDataType.
    //
    // Returns the CPLErr reported by RasterIO().
    const auto WriteJobData = [](const OvrJob *poJob)
4734
0
    {
4735
0
        return poJob->poDstBand->RasterIO(
4736
0
            GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
4737
0
            poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
4738
0
            poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
4739
0
            poJob->eDstBufferDataType, 0, 0, nullptr);
4740
0
    };
4741
4742
    // Wait for completion of oldest job and serialize it
4743
0
    // Blocks until the job at the front of jobList has finished, writes its
    // result to the destination band if resampling succeeded, then removes
    // the job from the list.
    //
    // jobList must not be empty: front() is called unconditionally.
    //
    // Returns the job's own error if resampling failed, otherwise the status
    // of the write.
    const auto WaitAndFinalizeOldestJob =
4744
0
        [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
4745
0
    {
4746
0
        auto poOldestJob = jobList.front().get();
4747
0
        poOldestJob->WaitFinished();
4748
0
        CPLErr l_eErr = poOldestJob->eErr;
4749
0
        // Only write when resampling itself succeeded.
        if (l_eErr == CE_None)
4750
0
        {
4751
0
            l_eErr = WriteJobData(poOldestJob);
4752
0
        }
4753
4754
0
        // The job is dropped from the list in both the success and the
        // failure case.
        jobList.pop_front();
4755
0
        return l_eErr;
4756
0
    };
4757
4758
    // Queue of jobs
4759
0
    std::list<std::unique_ptr<OvrJob>> jobList;
4760
4761
0
    GByte *pabyChunkNodataMask = nullptr;
4762
0
    void *pChunk = nullptr;
4763
4764
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
4765
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
4766
0
                                                       ? CPLGetNumCPUs()
4767
0
                                                       : atoi(pszThreads)));
4768
0
    auto poThreadPool =
4769
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
4770
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
4771
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
4772
4773
    /* -------------------------------------------------------------------- */
4774
    /*      Loop over image operating on chunks.                            */
4775
    /* -------------------------------------------------------------------- */
4776
0
    int nChunkYOff = 0;
4777
0
    CPLErr eErr = CE_None;
4778
4779
0
    for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
4780
0
         nChunkYOff += nFullResYChunk)
4781
0
    {
4782
0
        if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
4783
0
                         pProgressData))
4784
0
        {
4785
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
4786
0
            eErr = CE_Failure;
4787
0
        }
4788
4789
0
        if (nFullResYChunk + nChunkYOff > nHeight)
4790
0
            nFullResYChunk = nHeight - nChunkYOff;
4791
4792
0
        int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
4793
0
        int nChunkYSizeQueried =
4794
0
            nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
4795
0
        if (nChunkYOffQueried < 0)
4796
0
        {
4797
0
            nChunkYSizeQueried += nChunkYOffQueried;
4798
0
            nChunkYOffQueried = 0;
4799
0
        }
4800
0
        if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
4801
0
            nChunkYSizeQueried = nHeight - nChunkYOffQueried;
4802
4803
        // Avoid accumulating too many tasks and exhausting RAM
4804
        // Try to complete already finished jobs
4805
0
        while (eErr == CE_None && !jobList.empty())
4806
0
        {
4807
0
            auto poOldestJob = jobList.front().get();
4808
0
            if (!poOldestJob->IsFinished())
4809
0
                break;
4810
0
            eErr = poOldestJob->eErr;
4811
0
            if (eErr == CE_None)
4812
0
            {
4813
0
                eErr = WriteJobData(poOldestJob);
4814
0
            }
4815
4816
0
            jobList.pop_front();
4817
0
        }
4818
4819
        // And in case we have saturated the number of threads,
4820
        // wait for completion of tasks to go below the threshold.
4821
0
        while (eErr == CE_None &&
4822
0
               jobList.size() >= static_cast<size_t>(nThreads))
4823
0
        {
4824
0
            eErr = WaitAndFinalizeOldestJob(jobList);
4825
0
        }
4826
4827
        // (Re)allocate buffers if needed
4828
0
        if (pChunk == nullptr)
4829
0
        {
4830
0
            pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
4831
0
                                         nMaxChunkYSizeQueried, nWidth);
4832
0
        }
4833
0
        if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
4834
0
        {
4835
0
            pabyChunkNodataMask = static_cast<GByte *>(
4836
0
                VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
4837
0
        }
4838
4839
0
        if (pChunk == nullptr ||
4840
0
            (bUseNoDataMask && pabyChunkNodataMask == nullptr))
4841
0
        {
4842
0
            CPLFree(pChunk);
4843
0
            CPLFree(pabyChunkNodataMask);
4844
0
            return CE_Failure;
4845
0
        }
4846
4847
        // Read chunk.
4848
0
        if (eErr == CE_None)
4849
0
            eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4850
0
                                       nChunkYSizeQueried, pChunk, nWidth,
4851
0
                                       nChunkYSizeQueried, eWrkDataType, 0, 0,
4852
0
                                       nullptr);
4853
0
        if (eErr == CE_None && bUseNoDataMask)
4854
0
            eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
4855
0
                                        nChunkYSizeQueried, pabyChunkNodataMask,
4856
0
                                        nWidth, nChunkYSizeQueried, GDT_Byte, 0,
4857
0
                                        0, nullptr);
4858
4859
        // Special case to promote 1bit data to 8bit 0/255 values.
4860
0
        if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
4861
0
        {
4862
0
            if (eWrkDataType == GDT_Float32)
4863
0
            {
4864
0
                float *pafChunk = static_cast<float *>(pChunk);
4865
0
                for (GPtrDiff_t i = 0;
4866
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4867
0
                     i++)
4868
0
                {
4869
0
                    if (pafChunk[i] == 1.0)
4870
0
                        pafChunk[i] = 255.0;
4871
0
                }
4872
0
            }
4873
0
            else if (eWrkDataType == GDT_Byte)
4874
0
            {
4875
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4876
0
                for (GPtrDiff_t i = 0;
4877
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4878
0
                     i++)
4879
0
                {
4880
0
                    if (pabyChunk[i] == 1)
4881
0
                        pabyChunk[i] = 255;
4882
0
                }
4883
0
            }
4884
0
            else if (eWrkDataType == GDT_UInt16)
4885
0
            {
4886
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4887
0
                for (GPtrDiff_t i = 0;
4888
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4889
0
                     i++)
4890
0
                {
4891
0
                    if (pasChunk[i] == 1)
4892
0
                        pasChunk[i] = 255;
4893
0
                }
4894
0
            }
4895
0
            else if (eWrkDataType == GDT_Float64)
4896
0
            {
4897
0
                double *padfChunk = static_cast<double *>(pChunk);
4898
0
                for (GPtrDiff_t i = 0;
4899
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4900
0
                     i++)
4901
0
                {
4902
0
                    if (padfChunk[i] == 1.0)
4903
0
                        padfChunk[i] = 255.0;
4904
0
                }
4905
0
            }
4906
0
            else
4907
0
            {
4908
0
                CPLAssert(false);
4909
0
            }
4910
0
        }
4911
0
        else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
4912
0
        {
4913
0
            if (eWrkDataType == GDT_Float32)
4914
0
            {
4915
0
                float *pafChunk = static_cast<float *>(pChunk);
4916
0
                for (GPtrDiff_t i = 0;
4917
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4918
0
                     i++)
4919
0
                {
4920
0
                    if (pafChunk[i] == 1.0)
4921
0
                        pafChunk[i] = 0.0;
4922
0
                    else if (pafChunk[i] == 0.0)
4923
0
                        pafChunk[i] = 255.0;
4924
0
                }
4925
0
            }
4926
0
            else if (eWrkDataType == GDT_Byte)
4927
0
            {
4928
0
                GByte *pabyChunk = static_cast<GByte *>(pChunk);
4929
0
                for (GPtrDiff_t i = 0;
4930
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4931
0
                     i++)
4932
0
                {
4933
0
                    if (pabyChunk[i] == 1)
4934
0
                        pabyChunk[i] = 0;
4935
0
                    else if (pabyChunk[i] == 0)
4936
0
                        pabyChunk[i] = 255;
4937
0
                }
4938
0
            }
4939
0
            else if (eWrkDataType == GDT_UInt16)
4940
0
            {
4941
0
                GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
4942
0
                for (GPtrDiff_t i = 0;
4943
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4944
0
                     i++)
4945
0
                {
4946
0
                    if (pasChunk[i] == 1)
4947
0
                        pasChunk[i] = 0;
4948
0
                    else if (pasChunk[i] == 0)
4949
0
                        pasChunk[i] = 255;
4950
0
                }
4951
0
            }
4952
0
            else if (eWrkDataType == GDT_Float64)
4953
0
            {
4954
0
                double *padfChunk = static_cast<double *>(pChunk);
4955
0
                for (GPtrDiff_t i = 0;
4956
0
                     i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
4957
0
                     i++)
4958
0
                {
4959
0
                    if (padfChunk[i] == 1.0)
4960
0
                        padfChunk[i] = 0.0;
4961
0
                    else if (padfChunk[i] == 0.0)
4962
0
                        padfChunk[i] = 255.0;
4963
0
                }
4964
0
            }
4965
0
            else
4966
0
            {
4967
0
                CPLAssert(false);
4968
0
            }
4969
0
        }
4970
4971
0
        auto oSrcBufferHolder =
4972
0
            std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
4973
0
        auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
4974
0
            poJobQueue ? pabyChunkNodataMask : nullptr);
4975
4976
0
        for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
4977
0
             ++iOverview)
4978
0
        {
4979
0
            GDALRasterBand *poDstBand = papoOvrBands[iOverview];
4980
0
            const int nDstWidth = poDstBand->GetXSize();
4981
0
            const int nDstHeight = poDstBand->GetYSize();
4982
4983
0
            const double dfXRatioDstToSrc =
4984
0
                static_cast<double>(nWidth) / nDstWidth;
4985
0
            const double dfYRatioDstToSrc =
4986
0
                static_cast<double>(nHeight) / nDstHeight;
4987
4988
            /* --------------------------------------------------------------------
4989
             */
4990
            /*      Figure out the line to start writing to, and the first line
4991
             */
4992
            /*      to not write to.  In theory this approach should ensure that
4993
             */
4994
            /*      every output line will be written if all input chunks are */
4995
            /*      processed. */
4996
            /* --------------------------------------------------------------------
4997
             */
4998
0
            int nDstYOff =
4999
0
                static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
5000
0
            if (nDstYOff == nDstHeight)
5001
0
                continue;
5002
0
            int nDstYOff2 = static_cast<int>(
5003
0
                0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
5004
5005
0
            if (nChunkYOff + nFullResYChunk == nHeight)
5006
0
                nDstYOff2 = nDstHeight;
5007
#if DEBUG_VERBOSE
5008
            CPLDebug("GDAL",
5009
                     "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
5010
                     nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
5011
                     nDstWidth, nDstYOff2 - nDstYOff);
5012
#endif
5013
5014
0
            auto poJob = std::make_unique<OvrJob>();
5015
0
            poJob->pfnResampleFn = pfnResampleFn;
5016
0
            poJob->bUseGenericResampleFn = bUseGenericResampleFn;
5017
0
            poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
5018
0
            poJob->args.nOvrXSize = poDstBand->GetXSize();
5019
0
            poJob->args.nOvrYSize = poDstBand->GetYSize();
5020
0
            const char *pszNBITS =
5021
0
                poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
5022
0
            poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
5023
0
            poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
5024
0
            poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
5025
0
            poJob->args.eWrkDataType = eWrkDataType;
5026
0
            poJob->pChunk = pChunk;
5027
0
            poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
5028
0
            poJob->nSrcWidth = nWidth;
5029
0
            poJob->nSrcHeight = nHeight;
5030
0
            poJob->args.nChunkXOff = 0;
5031
0
            poJob->args.nChunkXSize = nWidth;
5032
0
            poJob->args.nChunkYOff = nChunkYOffQueried;
5033
0
            poJob->args.nChunkYSize = nChunkYSizeQueried;
5034
0
            poJob->nDstWidth = nDstWidth;
5035
0
            poJob->args.nDstXOff = 0;
5036
0
            poJob->args.nDstXOff2 = nDstWidth;
5037
0
            poJob->args.nDstYOff = nDstYOff;
5038
0
            poJob->args.nDstYOff2 = nDstYOff2;
5039
0
            poJob->poDstBand = poDstBand;
5040
0
            poJob->args.pszResampling = pszResampling;
5041
0
            poJob->args.bHasNoData = bHasNoData;
5042
0
            poJob->args.dfNoDataValue = dfNoDataValue;
5043
0
            poJob->args.poColorTable = poColorTable;
5044
0
            poJob->args.eSrcDataType = eSrcDataType;
5045
0
            poJob->args.bPropagateNoData = bPropagateNoData;
5046
5047
0
            if (poJobQueue)
5048
0
            {
5049
0
                poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
5050
0
                poJob->SetSrcBufferHolder(oSrcBufferHolder);
5051
0
                poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
5052
0
                jobList.emplace_back(std::move(poJob));
5053
0
            }
5054
0
            else
5055
0
            {
5056
0
                JobResampleFunc(poJob.get());
5057
0
                eErr = poJob->eErr;
5058
0
                if (eErr == CE_None)
5059
0
                {
5060
0
                    eErr = WriteJobData(poJob.get());
5061
0
                }
5062
0
            }
5063
0
        }
5064
5065
0
        if (poJobQueue)
5066
0
        {
5067
0
            pChunk = nullptr;
5068
0
            pabyChunkNodataMask = nullptr;
5069
0
        }
5070
0
    }
5071
5072
0
    VSIFree(pChunk);
5073
0
    VSIFree(pabyChunkNodataMask);
5074
5075
    // Wait for all pending jobs to complete
5076
0
    while (!jobList.empty())
5077
0
    {
5078
0
        const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
5079
0
        if (l_eErr != CE_None && eErr == CE_None)
5080
0
            eErr = l_eErr;
5081
0
    }
5082
5083
    /* -------------------------------------------------------------------- */
5084
    /*      Renormalized overview mean / stddev if needed.                  */
5085
    /* -------------------------------------------------------------------- */
5086
0
    if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
5087
0
    {
5088
0
        GDALOverviewMagnitudeCorrection(
5089
0
            poSrcBand, nOverviewCount,
5090
0
            reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
5091
0
            GDALDummyProgress, nullptr);
5092
0
    }
5093
5094
    /* -------------------------------------------------------------------- */
5095
    /*      It can be important to flush out data to overviews.             */
5096
    /* -------------------------------------------------------------------- */
5097
0
    for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
5098
0
         ++iOverview)
5099
0
    {
5100
0
        eErr = papoOvrBands[iOverview]->FlushCache(false);
5101
0
    }
5102
5103
0
    if (eErr == CE_None)
5104
0
        pfnProgress(1.0, nullptr, pProgressData);
5105
5106
0
    return eErr;
5107
0
}
5108
5109
/************************************************************************/
5110
/*            GDALRegenerateOverviewsMultiBand()                        */
5111
/************************************************************************/
5112
5113
/**
5114
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
5115
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
5116
 *
5117
 * This function will generate one or more overview images from a base
5118
 * image using the requested downsampling algorithm.  Its primary use
5119
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
5120
 * can also be used to generate downsampled images in one file from another
5121
 * outside the overview architecture.
5122
 *
5123
 * The output bands need to exist in advance and share the same characteristics
5124
 * (type, dimensions)
5125
 *
5126
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
5127
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
5128
 *
5129
 * It does not support color tables or complex data types.
5130
 *
5131
 * The pseudo-algorithm used by the function is :
5132
 *    for each overview
5133
 *       iterate on lines of the source by a step of deltay
5134
 *           iterate on columns of the source  by a step of deltax
5135
 *               read the source data of size deltax * deltay for all the bands
5136
 *               generate the corresponding overview block for all the bands
5137
 *
5138
 * This function will properly honour NODATA_VALUES tuples (special dataset
5139
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
5140
 * considered as the nodata value and not each value of the triplet
5141
 * independently per band.
5142
 *
5143
 * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
5144
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
5145
 * overview computation.
5146
 *
5147
 * @param nBands the number of bands, size of papoSrcBands and size of
5148
 *               first dimension of papapoOverviewBands
5149
 * @param papoSrcBands the list of source bands to downsample
5150
 * @param nOverviews the number of downsampled overview levels being generated.
5151
 * @param papapoOverviewBands bidimensional array of bands. First dimension is
5152
 *                            indexed by nBands. Second dimension is indexed by
5153
 *                            nOverviews.
5154
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
5155
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
5156
 * @param pfnProgress progress report function.
5157
 * @param pProgressData progress function callback data.
5158
 * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
5159
 *                     key=value pairs, or NULL
5160
 *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
5161
 *                     options can be specified to express that overviews should
5162
 *                     be regenerated only in the specified subset of the source
5163
 *                     dataset.
5164
 * @return CE_None on success or CE_Failure on failure.
5165
 */
5166
5167
CPLErr GDALRegenerateOverviewsMultiBand(
5168
    int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
5169
    GDALRasterBand *const *const *papapoOverviewBands,
5170
    const char *pszResampling, GDALProgressFunc pfnProgress,
5171
    void *pProgressData, CSLConstList papszOptions)
5172
0
{
5173
0
    CPL_IGNORE_RET_VAL(papszOptions);
5174
5175
0
    if (pfnProgress == nullptr)
5176
0
        pfnProgress = GDALDummyProgress;
5177
5178
0
    if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
5179
0
        return CE_None;
5180
5181
    // Sanity checks.
5182
0
    if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
5183
0
        !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
5184
0
        !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
5185
0
        !EQUAL(pszResampling, "CUBICSPLINE") &&
5186
0
        !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
5187
0
        !EQUAL(pszResampling, "MODE"))
5188
0
    {
5189
0
        CPLError(CE_Failure, CPLE_NotSupported,
5190
0
                 "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
5191
0
                 "not supported",
5192
0
                 pszResampling);
5193
0
        return CE_Failure;
5194
0
    }
5195
5196
0
    int nKernelRadius = 0;
5197
0
    GDALResampleFunction pfnResampleFn =
5198
0
        GDALGetResampleFunction(pszResampling, &nKernelRadius);
5199
0
    if (pfnResampleFn == nullptr)
5200
0
        return CE_Failure;
5201
5202
0
    const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
5203
0
    const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
5204
0
    if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
5205
0
        return CE_None;
5206
0
    GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
5207
0
    for (int iBand = 1; iBand < nBands; ++iBand)
5208
0
    {
5209
0
        if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
5210
0
            papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
5211
0
        {
5212
0
            CPLError(
5213
0
                CE_Failure, CPLE_NotSupported,
5214
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5215
0
                "have the same dimensions");
5216
0
            return CE_Failure;
5217
0
        }
5218
0
        if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
5219
0
        {
5220
0
            CPLError(
5221
0
                CE_Failure, CPLE_NotSupported,
5222
0
                "GDALRegenerateOverviewsMultiBand: all the source bands must "
5223
0
                "have the same data type");
5224
0
            return CE_Failure;
5225
0
        }
5226
0
    }
5227
5228
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5229
0
    {
5230
0
        const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
5231
0
        const int nDstWidth = poOvrFirstBand->GetXSize();
5232
0
        const int nDstHeight = poOvrFirstBand->GetYSize();
5233
0
        for (int iBand = 1; iBand < nBands; ++iBand)
5234
0
        {
5235
0
            const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
5236
0
            if (poOvrBand->GetXSize() != nDstWidth ||
5237
0
                poOvrBand->GetYSize() != nDstHeight)
5238
0
            {
5239
0
                CPLError(
5240
0
                    CE_Failure, CPLE_NotSupported,
5241
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5242
0
                    "of the same level must have the same dimensions");
5243
0
                return CE_Failure;
5244
0
            }
5245
0
            if (poOvrBand->GetRasterDataType() != eDataType)
5246
0
            {
5247
0
                CPLError(
5248
0
                    CE_Failure, CPLE_NotSupported,
5249
0
                    "GDALRegenerateOverviewsMultiBand: all the overviews bands "
5250
0
                    "must have the same data type as the source bands");
5251
0
                return CE_Failure;
5252
0
            }
5253
0
        }
5254
0
    }
5255
5256
    // First pass to compute the total number of pixels to write.
5257
0
    double dfTotalPixelCount = 0;
5258
0
    const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
5259
0
    const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
5260
0
    const int nSrcXSize = atoi(CSLFetchNameValueDef(
5261
0
        papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
5262
0
    const int nSrcYSize = atoi(CSLFetchNameValueDef(
5263
0
        papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
5264
0
    for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
5265
0
    {
5266
0
        dfTotalPixelCount +=
5267
0
            static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
5268
0
            papapoOverviewBands[0][iOverview]->GetXSize() *
5269
0
            static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
5270
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5271
0
    }
5272
5273
0
    const GDALDataType eWrkDataType =
5274
0
        GDALGetOvrWorkDataType(pszResampling, eDataType);
5275
0
    const int nWrkDataTypeSize =
5276
0
        std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
5277
5278
0
    const bool bIsMask = papoSrcBands[0]->IsMaskBand();
5279
5280
    // If we have a nodata mask and we are doing something more complicated
5281
    // than nearest neighbouring, we have to fetch the nodata mask.
5282
0
    const bool bUseNoDataMask =
5283
0
        !STARTS_WITH_CI(pszResampling, "NEAR") &&
5284
0
        (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
5285
5286
0
    std::vector<bool> abHasNoData(nBands);
5287
0
    std::vector<double> adfNoDataValue(nBands);
5288
5289
0
    for (int iBand = 0; iBand < nBands; ++iBand)
5290
0
    {
5291
0
        int nHasNoData = 0;
5292
0
        adfNoDataValue[iBand] =
5293
0
            papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
5294
0
        abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
5295
0
    }
5296
0
    const bool bPropagateNoData =
5297
0
        CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
5298
5299
0
    const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
5300
0
    const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
5301
0
                                                       ? CPLGetNumCPUs()
5302
0
                                                       : atoi(pszThreads)));
5303
0
    auto poThreadPool =
5304
0
        nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
5305
0
    auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
5306
0
                                   : std::unique_ptr<CPLJobQueue>(nullptr);
5307
5308
    // Only configurable for debug / testing
5309
0
    const GIntBig nChunkMaxSize = []() -> GIntBig
5310
0
    {
5311
0
        const char *pszVal =
5312
0
            CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
5313
0
        if (pszVal)
5314
0
        {
5315
0
            GIntBig nRet = 0;
5316
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5317
0
            return std::max<GIntBig>(100, nRet);
5318
0
        }
5319
0
        return 10 * 1024 * 1024;
5320
0
    }();
5321
5322
    // Only configurable for debug / testing
5323
0
    const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
5324
0
    {
5325
0
        const char *pszVal = CPLGetConfigOption(
5326
0
            "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
5327
0
        if (pszVal)
5328
0
        {
5329
0
            GIntBig nRet = 0;
5330
0
            CPLParseMemorySize(pszVal, &nRet, nullptr);
5331
0
            return std::max<GIntBig>(100, nRet);
5332
0
        }
5333
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
5334
0
        if (nUsableRAM > 0)
5335
0
            return nUsableRAM / 10;
5336
        // Select a value to be able to at least downsample by 2 for a RGB
5337
        // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
5338
0
        return 100 * 1024 * 1024;
5339
0
    }();
5340
5341
    // Second pass to do the real job.
5342
0
    double dfCurPixelCount = 0;
5343
0
    CPLErr eErr = CE_None;
5344
0
    for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
5345
0
         ++iOverview)
5346
0
    {
5347
0
        int iSrcOverview = -1;  // -1 means the source bands.
5348
5349
0
        const int nDstTotalWidth =
5350
0
            papapoOverviewBands[0][iOverview]->GetXSize();
5351
0
        const int nDstTotalHeight =
5352
0
            papapoOverviewBands[0][iOverview]->GetYSize();
5353
5354
        // Compute the coordinates of the target region to refresh
5355
0
        constexpr double EPS = 1e-8;
5356
0
        const int nDstXOffStart = static_cast<int>(
5357
0
            static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
5358
0
            EPS);
5359
0
        const int nDstXOffEnd =
5360
0
            std::min(static_cast<int>(
5361
0
                         std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
5362
0
                                       nToplevelSrcWidth * nDstTotalWidth -
5363
0
                                   EPS)),
5364
0
                     nDstTotalWidth);
5365
0
        const int nDstWidth = nDstXOffEnd - nDstXOffStart;
5366
0
        const int nDstYOffStart =
5367
0
            static_cast<int>(static_cast<double>(nSrcYOff) /
5368
0
                                 nToplevelSrcHeight * nDstTotalHeight +
5369
0
                             EPS);
5370
0
        const int nDstYOffEnd =
5371
0
            std::min(static_cast<int>(
5372
0
                         std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
5373
0
                                       nToplevelSrcHeight * nDstTotalHeight -
5374
0
                                   EPS)),
5375
0
                     nDstTotalHeight);
5376
0
        const int nDstHeight = nDstYOffEnd - nDstYOffStart;
5377
5378
        // Try to use previous level of overview as the source to compute
5379
        // the next level.
5380
0
        int nSrcWidth = nToplevelSrcWidth;
5381
0
        int nSrcHeight = nToplevelSrcHeight;
5382
0
        if (iOverview > 0 &&
5383
0
            papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
5384
0
        {
5385
0
            nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
5386
0
            nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
5387
0
            iSrcOverview = iOverview - 1;
5388
0
        }
5389
5390
0
        const double dfXRatioDstToSrc =
5391
0
            static_cast<double>(nSrcWidth) / nDstTotalWidth;
5392
0
        const double dfYRatioDstToSrc =
5393
0
            static_cast<double>(nSrcHeight) / nDstTotalHeight;
5394
5395
0
        const int nOvrFactor =
5396
0
            std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
5397
0
                                 static_cast<int>(0.5 + dfYRatioDstToSrc)));
5398
5399
0
        int nDstChunkXSize = 0;
5400
0
        int nDstChunkYSize = 0;
5401
0
        papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
5402
0
                                                        &nDstChunkYSize);
5403
5404
0
        constexpr int PIXEL_MARGIN = 2;
5405
        // Try to extend the chunk size so that the memory needed to acquire
5406
        // source pixels goes up to 10 MB.
5407
        // This can help for drivers that support multi-threaded reading
5408
0
        const int nFullResYChunk = static_cast<int>(std::min<double>(
5409
0
            nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
5410
0
        const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
5411
0
            nSrcHeight,
5412
0
            nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5413
0
                                 nKernelRadius * nOvrFactor));
5414
0
        while (nDstChunkXSize < nDstWidth)
5415
0
        {
5416
0
            constexpr int INCREASE_FACTOR = 2;
5417
5418
0
            const int nFullResXChunk = static_cast<int>(std::min<double>(
5419
0
                nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
5420
0
                                              dfXRatioDstToSrc));
5421
5422
0
            const int nFullResXChunkQueried =
5423
0
                static_cast<int>(std::min<int64_t>(
5424
0
                    nSrcWidth,
5425
0
                    nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5426
0
                                         nKernelRadius * nOvrFactor));
5427
5428
0
            if (nBands > nChunkMaxSize / nFullResXChunkQueried /
5429
0
                             nFullResYChunkQueried / nWrkDataTypeSize)
5430
0
            {
5431
0
                break;
5432
0
            }
5433
5434
0
            nDstChunkXSize *= INCREASE_FACTOR;
5435
0
        }
5436
0
        nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
5437
5438
0
        const int nFullResXChunk = static_cast<int>(std::min<double>(
5439
0
            nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
5440
0
        const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
5441
0
            nSrcWidth,
5442
0
            nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
5443
0
                                 nKernelRadius * nOvrFactor));
5444
5445
        // Make sure that the RAM requirements to acquire the source data do
5446
        // not exceed nChunkMaxSizeForTempFile
5447
        // If so, reduce the destination chunk size, generate overviews in a
5448
        // temporary dataset, and copy that temporary dataset over the target
5449
        // overview bands (to avoid issues with lossy compression)
5450
0
        const bool bOverflowFullResXChunkYChunkQueried =
5451
0
            nBands > std::numeric_limits<int64_t>::max() /
5452
0
                         nFullResXChunkQueried / nFullResYChunkQueried /
5453
0
                         nWrkDataTypeSize;
5454
5455
0
        const auto nMemRequirement =
5456
0
            bOverflowFullResXChunkYChunkQueried
5457
0
                ? 0
5458
0
                : static_cast<GIntBig>(nFullResXChunkQueried) *
5459
0
                      nFullResYChunkQueried * nBands * nWrkDataTypeSize;
5460
        // Use a temporary dataset with a smaller destination chunk size
5461
0
        const auto nOverShootFactor =
5462
0
            nMemRequirement / nChunkMaxSizeForTempFile;
5463
5464
0
        constexpr int MIN_OVERSHOOT_FACTOR = 4;
5465
0
        const auto nSqrtOverShootFactor = std::max<GIntBig>(
5466
0
            MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
5467
0
                                      static_cast<double>(nOverShootFactor)))));
5468
0
        constexpr int DEFAULT_CHUNK_SIZE = 256;
5469
0
        constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
5470
0
        const int nReducedDstChunkXSize =
5471
0
            bOverflowFullResXChunkYChunkQueried
5472
0
                ? DEFAULT_CHUNK_SIZE
5473
0
                : std::max(1, static_cast<int>(nDstChunkXSize /
5474
0
                                               nSqrtOverShootFactor) &
5475
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5476
0
        const int nReducedDstChunkYSize =
5477
0
            bOverflowFullResXChunkYChunkQueried
5478
0
                ? DEFAULT_CHUNK_SIZE
5479
0
                : std::max(1, static_cast<int>(nDstChunkYSize /
5480
0
                                               nSqrtOverShootFactor) &
5481
0
                                  ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
5482
5483
0
        if (bOverflowFullResXChunkYChunkQueried ||
5484
0
            nMemRequirement > nChunkMaxSizeForTempFile)
5485
0
        {
5486
0
            const auto nDTSize =
5487
0
                std::max(1, GDALGetDataTypeSizeBytes(eDataType));
5488
0
            const bool bTmpDSMemRequirementOverflow =
5489
0
                nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
5490
0
                             nDstHeight / nDTSize;
5491
0
            const auto nTmpDSMemRequirement =
5492
0
                bTmpDSMemRequirementOverflow
5493
0
                    ? 0
5494
0
                    : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
5495
0
                          nDTSize;
5496
5497
            // make sure that one band buffer doesn't overflow size_t
5498
0
            const bool bChunkSizeOverflow =
5499
0
                static_cast<size_t>(nDTSize) >
5500
0
                std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
5501
0
            const size_t nChunkSize =
5502
0
                bChunkSizeOverflow
5503
0
                    ? 0
5504
0
                    : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
5505
5506
0
            const auto CreateVRT =
5507
0
                [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
5508
0
                 pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
5509
0
                 iSrcOverview, &abHasNoData,
5510
0
                 &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
5511
0
            {
5512
0
                auto poVRTDS = std::make_unique<VRTDataset>(
5513
0
                    nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
5514
0
                    nVRTBlockYSize);
5515
5516
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5517
0
                {
5518
0
                    auto poVRTSrc = std::make_unique<VRTSimpleSource>();
5519
0
                    poVRTSrc->SetResampling(pszResampling);
5520
0
                    poVRTDS->AddBand(eWrkDataType);
5521
0
                    auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
5522
0
                        poVRTDS->GetRasterBand(iBand + 1));
5523
5524
0
                    auto poSrcBand = papoSrcBands[iBand];
5525
0
                    if (iSrcOverview != -1)
5526
0
                        poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
5527
0
                    poVRTBand->ConfigureSource(
5528
0
                        poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
5529
0
                        nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
5530
                    // Add the source to the band
5531
0
                    poVRTBand->AddSource(poVRTSrc.release());
5532
0
                    if (abHasNoData[iBand])
5533
0
                        poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
5534
0
                }
5535
5536
0
                if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
5537
0
                    poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
5538
0
                {
5539
0
                    VRTSourcedRasterBand *poMaskVRTBand =
5540
0
                        cpl::down_cast<VRTSourcedRasterBand *>(
5541
0
                            poVRTDS->GetRasterBand(1)->GetMaskBand());
5542
0
                    auto poSrcBand = papoSrcBands[0];
5543
0
                    if (iSrcOverview != -1)
5544
0
                        poSrcBand = papapoOverviewBands[0][iSrcOverview];
5545
0
                    poMaskVRTBand->AddMaskBandSource(
5546
0
                        poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
5547
0
                        0, 0, nDstTotalWidth, nDstTotalHeight);
5548
0
                }
5549
5550
0
                return poVRTDS;
5551
0
            };
5552
5553
            // If the overview accommodates chunking, do so and recurse
5554
            // to avoid generating full size temporary files
5555
0
            if (!bOverflowFullResXChunkYChunkQueried &&
5556
0
                !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
5557
0
                (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
5558
0
            {
5559
                // Create a VRT with the smaller chunk to do the scaling
5560
0
                auto poVRTDS =
5561
0
                    CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5562
5563
0
                std::vector<GDALRasterBand *> apoVRTBand(nBands);
5564
0
                std::vector<GDALRasterBand *> apoDstBand(nBands);
5565
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5566
0
                {
5567
0
                    apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
5568
0
                    apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
5569
0
                }
5570
5571
                // Use a flag to avoid reading from the overview being built
5572
0
                GDALRasterIOExtraArg sExtraArg;
5573
0
                INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5574
0
                if (iSrcOverview == -1)
5575
0
                    sExtraArg.bUseOnlyThisScale = true;
5576
5577
                // A single band buffer for data transfer to the overview
5578
0
                std::vector<GByte> abyChunk;
5579
0
                try
5580
0
                {
5581
0
                    abyChunk.resize(nChunkSize);
5582
0
                }
5583
0
                catch (const std::exception &)
5584
0
                {
5585
0
                    CPLError(CE_Failure, CPLE_OutOfMemory,
5586
0
                             "Out of memory allocating temporary buffer");
5587
0
                    return CE_Failure;
5588
0
                }
5589
5590
                // Loop over output height, in chunks
5591
0
                for (int nDstYOff = nDstYOffStart;
5592
0
                     nDstYOff < nDstYOffEnd && eErr == CE_None;
5593
0
                     /* */)
5594
0
                {
5595
0
                    const int nDstYCount =
5596
0
                        std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5597
                    // Loop over output width, in output chunks
5598
0
                    for (int nDstXOff = nDstXOffStart;
5599
0
                         nDstXOff < nDstXOffEnd && eErr == CE_None;
5600
0
                         /* */)
5601
0
                    {
5602
0
                        const int nDstXCount =
5603
0
                            std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5604
                        // Read and transfer the chunk to the overview
5605
0
                        for (int iBand = 0; iBand < nBands && eErr == CE_None;
5606
0
                             ++iBand)
5607
0
                        {
5608
0
                            eErr = apoVRTBand[iBand]->RasterIO(
5609
0
                                GF_Read, nDstXOff, nDstYOff, nDstXCount,
5610
0
                                nDstYCount, abyChunk.data(), nDstXCount,
5611
0
                                nDstYCount, eDataType, 0, 0, &sExtraArg);
5612
0
                            if (eErr == CE_None)
5613
0
                            {
5614
0
                                eErr = apoDstBand[iBand]->RasterIO(
5615
0
                                    GF_Write, nDstXOff, nDstYOff, nDstXCount,
5616
0
                                    nDstYCount, abyChunk.data(), nDstXCount,
5617
0
                                    nDstYCount, eDataType, 0, 0, nullptr);
5618
0
                            }
5619
0
                        }
5620
5621
0
                        dfCurPixelCount +=
5622
0
                            static_cast<double>(nDstXCount) * nDstYCount;
5623
5624
0
                        nDstXOff += nDstXCount;
5625
0
                    }  // width
5626
5627
0
                    if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
5628
0
                                     nullptr, pProgressData))
5629
0
                    {
5630
0
                        CPLError(CE_Failure, CPLE_UserInterrupt,
5631
0
                                 "User terminated");
5632
0
                        eErr = CE_Failure;
5633
0
                    }
5634
5635
0
                    nDstYOff += nDstYCount;
5636
0
                }  // height
5637
5638
0
                if (CE_None != eErr)
5639
0
                {
5640
0
                    CPLError(CE_Failure, CPLE_AppDefined,
5641
0
                             "Error while writing overview");
5642
0
                    return CE_Failure;
5643
0
                }
5644
5645
0
                pfnProgress(1.0, nullptr, pProgressData);
5646
                // Flush the overviews we just generated
5647
0
                for (int iBand = 0; iBand < nBands; ++iBand)
5648
0
                    apoDstBand[iBand]->FlushCache(false);
5649
5650
0
                continue;  // Next overview
5651
0
            }              // chunking via temporary dataset
5652
5653
0
            std::unique_ptr<GDALDataset> poTmpDS;
5654
            // Config option mostly/only for autotest purposes
5655
0
            const char *pszGDAL_OVR_TEMP_DRIVER =
5656
0
                CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
5657
0
            if ((!bTmpDSMemRequirementOverflow &&
5658
0
                 nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
5659
0
                 !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
5660
0
                EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
5661
0
            {
5662
0
                auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
5663
0
                if (!poTmpDrv)
5664
0
                {
5665
0
                    eErr = CE_Failure;
5666
0
                    break;
5667
0
                }
5668
0
                poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
5669
0
                                               nDstTotalHeight, nBands,
5670
0
                                               eDataType, nullptr));
5671
0
            }
5672
0
            else
5673
0
            {
5674
                // Create a temporary file for the overview
5675
0
                auto poTmpDrv =
5676
0
                    GetGDALDriverManager()->GetDriverByName("GTiff");
5677
0
                if (!poTmpDrv)
5678
0
                {
5679
0
                    eErr = CE_Failure;
5680
0
                    break;
5681
0
                }
5682
0
                std::string osTmpFilename;
5683
0
                auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
5684
0
                if (poDstDS)
5685
0
                {
5686
0
                    osTmpFilename = poDstDS->GetDescription();
5687
0
                    VSIStatBufL sStatBuf;
5688
0
                    if (!osTmpFilename.empty() &&
5689
0
                        VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
5690
0
                        osTmpFilename += "_tmp_ovr.tif";
5691
0
                }
5692
0
                if (osTmpFilename.empty())
5693
0
                {
5694
0
                    osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
5695
0
                    osTmpFilename += ".tif";
5696
0
                }
5697
0
                CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
5698
0
                         osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
5699
0
                CPLStringList aosCO;
5700
0
                if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
5701
0
                          (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
5702
0
                {
5703
0
                    aosCO.SetNameValue("TILED", "YES");
5704
0
                    aosCO.SetNameValue("BLOCKXSIZE",
5705
0
                                       CPLSPrintf("%d", nReducedDstChunkXSize));
5706
0
                    aosCO.SetNameValue("BLOCKYSIZE",
5707
0
                                       CPLSPrintf("%d", nReducedDstChunkYSize));
5708
0
                }
5709
0
                if (const char *pszCOList =
5710
0
                        poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
5711
0
                {
5712
0
                    aosCO.SetNameValue(
5713
0
                        "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
5714
0
                }
5715
0
                poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
5716
0
                                               nDstHeight, nBands, eDataType,
5717
0
                                               aosCO.List()));
5718
0
                if (poTmpDS)
5719
0
                {
5720
0
                    poTmpDS->MarkSuppressOnClose();
5721
0
                    VSIUnlink(osTmpFilename.c_str());
5722
0
                }
5723
0
            }
5724
0
            if (!poTmpDS)
5725
0
            {
5726
0
                eErr = CE_Failure;
5727
0
                break;
5728
0
            }
5729
5730
            // Create a full size VRT to do the resampling without edge effects
5731
0
            auto poVRTDS =
5732
0
                CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
5733
5734
            // Allocate a band buffer with the overview chunk size
5735
0
            std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
5736
0
                VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
5737
0
                                    nDstChunkYSize));
5738
0
            if (pDstBuffer == nullptr)
5739
0
            {
5740
0
                eErr = CE_Failure;
5741
0
                break;
5742
0
            }
5743
5744
            // Use a flag to avoid reading the overview being built
5745
0
            GDALRasterIOExtraArg sExtraArg;
5746
0
            INIT_RASTERIO_EXTRA_ARG(sExtraArg);
5747
0
            if (iSrcOverview == -1)
5748
0
                sExtraArg.bUseOnlyThisScale = true;
5749
5750
            // Scale and copy data from the VRT to the temp file
5751
0
            for (int nDstYOff = nDstYOffStart;
5752
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5753
0
                 /* */)
5754
0
            {
5755
0
                const int nDstYCount =
5756
0
                    std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
5757
0
                for (int nDstXOff = nDstXOffStart;
5758
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5759
0
                     /* */)
5760
0
                {
5761
0
                    const int nDstXCount =
5762
0
                        std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
5763
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5764
0
                         ++iBand)
5765
0
                    {
5766
0
                        auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
5767
0
                        eErr = poSrcBand->RasterIO(
5768
0
                            GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
5769
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5770
0
                            eWrkDataType, 0, 0, &sExtraArg);
5771
0
                        if (eErr == CE_None)
5772
0
                        {
5773
                            // Write to the temporary dataset, shifted
5774
0
                            auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
5775
0
                            eErr = poOvrBand->RasterIO(
5776
0
                                GF_Write, nDstXOff - nDstXOffStart,
5777
0
                                nDstYOff - nDstYOffStart, nDstXCount,
5778
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5779
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5780
0
                        }
5781
0
                    }
5782
0
                    nDstXOff += nDstXCount;
5783
0
                }
5784
0
                nDstYOff += nDstYCount;
5785
0
            }
5786
5787
            // Copy from the temporary to the overview
5788
0
            for (int nDstYOff = nDstYOffStart;
5789
0
                 nDstYOff < nDstYOffEnd && eErr == CE_None;
5790
0
                 /* */)
5791
0
            {
5792
0
                const int nDstYCount =
5793
0
                    std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
5794
0
                for (int nDstXOff = nDstXOffStart;
5795
0
                     nDstXOff < nDstXOffEnd && eErr == CE_None;
5796
0
                     /* */)
5797
0
                {
5798
0
                    const int nDstXCount =
5799
0
                        std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
5800
0
                    for (int iBand = 0; iBand < nBands && eErr == CE_None;
5801
0
                         ++iBand)
5802
0
                    {
5803
0
                        auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
5804
0
                        eErr = poSrcBand->RasterIO(
5805
0
                            GF_Read, nDstXOff - nDstXOffStart,
5806
0
                            nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
5807
0
                            pDstBuffer.get(), nDstXCount, nDstYCount,
5808
0
                            eWrkDataType, 0, 0, nullptr);
5809
0
                        if (eErr == CE_None)
5810
0
                        {
5811
                            // Write to the destination overview bands
5812
0
                            auto poOvrBand =
5813
0
                                papapoOverviewBands[iBand][iOverview];
5814
0
                            eErr = poOvrBand->RasterIO(
5815
0
                                GF_Write, nDstXOff, nDstYOff, nDstXCount,
5816
0
                                nDstYCount, pDstBuffer.get(), nDstXCount,
5817
0
                                nDstYCount, eWrkDataType, 0, 0, nullptr);
5818
0
                        }
5819
0
                    }
5820
0
                    nDstXOff += nDstXCount;
5821
0
                }
5822
0
                nDstYOff += nDstYCount;
5823
0
            }
5824
5825
0
            if (eErr != CE_None)
5826
0
            {
5827
0
                CPLError(CE_Failure, CPLE_AppDefined,
5828
0
                         "Failed to write overview %d", iOverview);
5829
0
                return eErr;
5830
0
            }
5831
5832
            // Flush the data to overviews.
5833
0
            for (int iBand = 0; iBand < nBands; ++iBand)
5834
0
                papapoOverviewBands[iBand][iOverview]->FlushCache(false);
5835
5836
0
            continue;
5837
0
        }
5838
5839
        // Structure describing a resampling job
5840
0
        struct OvrJob
5841
0
        {
5842
            // Buffers to free when job is finished
5843
0
            std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
5844
0
            std::unique_ptr<PointerHolder> oSrcBufferHolder{};
5845
0
            std::unique_ptr<PointerHolder> oDstBufferHolder{};
5846
5847
0
            GDALRasterBand *poDstBand = nullptr;
5848
5849
            // Input parameters of pfnResampleFn
5850
0
            GDALResampleFunction pfnResampleFn = nullptr;
5851
0
            GDALOverviewResampleArgs args{};
5852
0
            const void *pChunk = nullptr;
5853
5854
            // Output values of resampling function
5855
0
            CPLErr eErr = CE_Failure;
5856
0
            void *pDstBuffer = nullptr;
5857
0
            GDALDataType eDstBufferDataType = GDT_Unknown;
5858
5859
0
            void NotifyFinished()
5860
0
            {
5861
0
                std::lock_guard guard(mutex);
5862
0
                bFinished = true;
5863
0
                cv.notify_one();
5864
0
            }
5865
5866
0
            bool IsFinished()
5867
0
            {
5868
0
                std::lock_guard guard(mutex);
5869
0
                return bFinished;
5870
0
            }
5871
5872
0
            void WaitFinished()
5873
0
            {
5874
0
                std::unique_lock oGuard(mutex);
5875
0
                while (!bFinished)
5876
0
                {
5877
0
                    cv.wait(oGuard);
5878
0
                }
5879
0
            }
5880
5881
0
          private:
5882
            // Synchronization
5883
0
            bool bFinished = false;
5884
0
            std::mutex mutex{};
5885
0
            std::condition_variable cv{};
5886
0
        };
5887
5888
        // Thread function to resample
5889
0
        const auto JobResampleFunc = [](void *pData)
5890
0
        {
5891
0
            OvrJob *poJob = static_cast<OvrJob *>(pData);
5892
5893
0
            poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
5894
0
                                               &(poJob->pDstBuffer),
5895
0
                                               &(poJob->eDstBufferDataType));
5896
5897
0
            poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
5898
5899
0
            poJob->NotifyFinished();
5900
0
        };
5901
5902
        // Function to write resample data to target band
5903
0
        const auto WriteJobData = [](const OvrJob *poJob)
        {
            // The destination window is [nDstXOff, nDstXOff2) x
            // [nDstYOff, nDstYOff2); the buffer has exactly that size.
            const auto &sArgs = poJob->args;
            const auto nOutXSize = sArgs.nDstXOff2 - sArgs.nDstXOff;
            const auto nOutYSize = sArgs.nDstYOff2 - sArgs.nDstYOff;

            return poJob->poDstBand->RasterIO(
                GF_Write, sArgs.nDstXOff, sArgs.nDstYOff, nOutXSize, nOutYSize,
                poJob->pDstBuffer, nOutXSize, nOutYSize,
                poJob->eDstBufferDataType, 0, 0, nullptr);
        };
5913
5914
        // Wait for completion of oldest job and serialize it
5915
0
        const auto WaitAndFinalizeOldestJob =
5916
0
            [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
5917
0
        {
5918
0
            auto poOldestJob = jobList.front().get();
5919
0
            poOldestJob->WaitFinished();
5920
0
            CPLErr l_eErr = poOldestJob->eErr;
5921
0
            if (l_eErr == CE_None)
5922
0
            {
5923
0
                l_eErr = WriteJobData(poOldestJob);
5924
0
            }
5925
5926
0
            jobList.pop_front();
5927
0
            return l_eErr;
5928
0
        };
5929
5930
        // Queue of jobs
5931
0
        std::list<std::unique_ptr<OvrJob>> jobList;
5932
5933
0
        std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
5934
0
        std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
5935
0
            apabyChunkNoDataMask(nBands);
5936
5937
        // Iterate on destination overview, block by block.
5938
0
        for (int nDstYOff = nDstYOffStart;
5939
0
             nDstYOff < nDstYOffEnd && eErr == CE_None;
5940
0
             nDstYOff += nDstChunkYSize)
5941
0
        {
5942
0
            int nDstYCount;
5943
0
            if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
5944
0
                nDstYCount = nDstChunkYSize;
5945
0
            else
5946
0
                nDstYCount = nDstYOffEnd - nDstYOff;
5947
5948
0
            int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
5949
0
            int nChunkYOff2 = static_cast<int>(
5950
0
                ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
5951
0
            if (nChunkYOff2 > nSrcHeight ||
5952
0
                nDstYOff + nDstYCount == nDstTotalHeight)
5953
0
                nChunkYOff2 = nSrcHeight;
5954
0
            int nYCount = nChunkYOff2 - nChunkYOff;
5955
0
            CPLAssert(nYCount <= nFullResYChunk);
5956
5957
0
            int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
5958
0
            int nChunkYSizeQueried =
5959
0
                nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
5960
0
            if (nChunkYOffQueried < 0)
5961
0
            {
5962
0
                nChunkYSizeQueried += nChunkYOffQueried;
5963
0
                nChunkYOffQueried = 0;
5964
0
            }
5965
0
            if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
5966
0
                nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
5967
0
            CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
5968
5969
0
            if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
5970
0
                             nullptr, pProgressData))
5971
0
            {
5972
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
5973
0
                eErr = CE_Failure;
5974
0
            }
5975
5976
            // Iterate on destination overview, block by block.
5977
0
            for (int nDstXOff = nDstXOffStart;
5978
0
                 nDstXOff < nDstXOffEnd && eErr == CE_None;
5979
0
                 nDstXOff += nDstChunkXSize)
5980
0
            {
5981
0
                int nDstXCount = 0;
5982
0
                if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
5983
0
                    nDstXCount = nDstChunkXSize;
5984
0
                else
5985
0
                    nDstXCount = nDstXOffEnd - nDstXOff;
5986
5987
0
                dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
5988
5989
0
                int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
5990
0
                int nChunkXOff2 = static_cast<int>(
5991
0
                    ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
5992
0
                if (nChunkXOff2 > nSrcWidth ||
5993
0
                    nDstXOff + nDstXCount == nDstTotalWidth)
5994
0
                    nChunkXOff2 = nSrcWidth;
5995
0
                const int nXCount = nChunkXOff2 - nChunkXOff;
5996
0
                CPLAssert(nXCount <= nFullResXChunk);
5997
5998
0
                int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
5999
0
                int nChunkXSizeQueried =
6000
0
                    nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
6001
0
                if (nChunkXOffQueried < 0)
6002
0
                {
6003
0
                    nChunkXSizeQueried += nChunkXOffQueried;
6004
0
                    nChunkXOffQueried = 0;
6005
0
                }
6006
0
                if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
6007
0
                    nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
6008
0
                CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
6009
#if DEBUG_VERBOSE
6010
                CPLDebug("GDAL",
6011
                         "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
6012
                         nChunkXOffQueried, nChunkYOffQueried,
6013
                         nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
6014
                         nDstYOff, nDstXCount, nDstYCount);
6015
#endif
6016
6017
                // Avoid accumulating too many tasks and exhaust RAM
6018
6019
                // Try to complete already finished jobs
6020
0
                while (eErr == CE_None && !jobList.empty())
6021
0
                {
6022
0
                    auto poOldestJob = jobList.front().get();
6023
0
                    if (!poOldestJob->IsFinished())
6024
0
                        break;
6025
0
                    eErr = poOldestJob->eErr;
6026
0
                    if (eErr == CE_None)
6027
0
                    {
6028
0
                        eErr = WriteJobData(poOldestJob);
6029
0
                    }
6030
6031
0
                    jobList.pop_front();
6032
0
                }
6033
6034
                // And in case we have saturated the number of threads,
6035
                // wait for completion of tasks to go below the threshold.
6036
0
                while (eErr == CE_None &&
6037
0
                       jobList.size() >= static_cast<size_t>(nThreads))
6038
0
                {
6039
0
                    eErr = WaitAndFinalizeOldestJob(jobList);
6040
0
                }
6041
6042
                // Read the source buffers for all the bands.
6043
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6044
0
                {
6045
                    // (Re)allocate buffers if needed
6046
0
                    if (apaChunk[iBand] == nullptr)
6047
0
                    {
6048
0
                        apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
6049
0
                            nFullResXChunkQueried, nFullResYChunkQueried,
6050
0
                            nWrkDataTypeSize));
6051
0
                        if (apaChunk[iBand] == nullptr)
6052
0
                        {
6053
0
                            eErr = CE_Failure;
6054
0
                        }
6055
0
                    }
6056
0
                    if (bUseNoDataMask &&
6057
0
                        apabyChunkNoDataMask[iBand] == nullptr)
6058
0
                    {
6059
0
                        apabyChunkNoDataMask[iBand].reset(
6060
0
                            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
6061
0
                                nFullResXChunkQueried, nFullResYChunkQueried)));
6062
0
                        if (apabyChunkNoDataMask[iBand] == nullptr)
6063
0
                        {
6064
0
                            eErr = CE_Failure;
6065
0
                        }
6066
0
                    }
6067
6068
0
                    if (eErr == CE_None)
6069
0
                    {
6070
0
                        GDALRasterBand *poSrcBand = nullptr;
6071
0
                        if (iSrcOverview == -1)
6072
0
                            poSrcBand = papoSrcBands[iBand];
6073
0
                        else
6074
0
                            poSrcBand =
6075
0
                                papapoOverviewBands[iBand][iSrcOverview];
6076
0
                        eErr = poSrcBand->RasterIO(
6077
0
                            GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6078
0
                            nChunkXSizeQueried, nChunkYSizeQueried,
6079
0
                            apaChunk[iBand].get(), nChunkXSizeQueried,
6080
0
                            nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
6081
6082
0
                        if (bUseNoDataMask && eErr == CE_None)
6083
0
                        {
6084
0
                            auto poMaskBand = poSrcBand->IsMaskBand()
6085
0
                                                  ? poSrcBand
6086
0
                                                  : poSrcBand->GetMaskBand();
6087
0
                            eErr = poMaskBand->RasterIO(
6088
0
                                GF_Read, nChunkXOffQueried, nChunkYOffQueried,
6089
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6090
0
                                apabyChunkNoDataMask[iBand].get(),
6091
0
                                nChunkXSizeQueried, nChunkYSizeQueried,
6092
0
                                GDT_Byte, 0, 0, nullptr);
6093
0
                        }
6094
0
                    }
6095
0
                }
6096
6097
                // Compute the resulting overview block.
6098
0
                for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
6099
0
                {
6100
0
                    auto poJob = std::make_unique<OvrJob>();
6101
0
                    poJob->pfnResampleFn = pfnResampleFn;
6102
0
                    poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
6103
0
                    poJob->args.eOvrDataType =
6104
0
                        poJob->poDstBand->GetRasterDataType();
6105
0
                    poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
6106
0
                    poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
6107
0
                    const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
6108
0
                        "NBITS", "IMAGE_STRUCTURE");
6109
0
                    poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
6110
0
                    poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
6111
0
                    poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
6112
0
                    poJob->args.eWrkDataType = eWrkDataType;
6113
0
                    poJob->pChunk = apaChunk[iBand].get();
6114
0
                    poJob->args.pabyChunkNodataMask =
6115
0
                        apabyChunkNoDataMask[iBand].get();
6116
0
                    poJob->args.nChunkXOff = nChunkXOffQueried;
6117
0
                    poJob->args.nChunkXSize = nChunkXSizeQueried;
6118
0
                    poJob->args.nChunkYOff = nChunkYOffQueried;
6119
0
                    poJob->args.nChunkYSize = nChunkYSizeQueried;
6120
0
                    poJob->args.nDstXOff = nDstXOff;
6121
0
                    poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
6122
0
                    poJob->args.nDstYOff = nDstYOff;
6123
0
                    poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
6124
0
                    poJob->args.pszResampling = pszResampling;
6125
0
                    poJob->args.bHasNoData = abHasNoData[iBand];
6126
0
                    poJob->args.dfNoDataValue = adfNoDataValue[iBand];
6127
0
                    poJob->args.eSrcDataType = eDataType;
6128
0
                    poJob->args.bPropagateNoData = bPropagateNoData;
6129
6130
0
                    if (poJobQueue)
6131
0
                    {
6132
0
                        poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
6133
0
                            apabyChunkNoDataMask[iBand].release()));
6134
6135
0
                        poJob->oSrcBufferHolder.reset(
6136
0
                            new PointerHolder(apaChunk[iBand].release()));
6137
6138
0
                        poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
6139
0
                        jobList.emplace_back(std::move(poJob));
6140
0
                    }
6141
0
                    else
6142
0
                    {
6143
0
                        JobResampleFunc(poJob.get());
6144
0
                        eErr = poJob->eErr;
6145
0
                        if (eErr == CE_None)
6146
0
                        {
6147
0
                            eErr = WriteJobData(poJob.get());
6148
0
                        }
6149
0
                    }
6150
0
                }
6151
0
            }
6152
0
        }
6153
6154
        // Wait for all pending jobs to complete
6155
0
        while (!jobList.empty())
6156
0
        {
6157
0
            const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
6158
0
            if (l_eErr != CE_None && eErr == CE_None)
6159
0
                eErr = l_eErr;
6160
0
        }
6161
6162
        // Flush the data to overviews.
6163
0
        for (int iBand = 0; iBand < nBands; ++iBand)
6164
0
        {
6165
0
            if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
6166
0
                CE_None)
6167
0
                eErr = CE_Failure;
6168
0
        }
6169
0
    }
6170
6171
0
    if (eErr == CE_None)
6172
0
        pfnProgress(1.0, nullptr, pProgressData);
6173
6174
0
    return eErr;
6175
0
}
6176
6177
/************************************************************************/
6178
/*            GDALRegenerateOverviewsMultiBand()                        */
6179
/************************************************************************/
6180
6181
/**
6182
 * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
6183
 * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
6184
 *
6185
 * This function will generate one or more overview images from a base
6186
 * image using the requested downsampling algorithm.  Its primary use
6187
 * is for generating overviews via GDALDataset::BuildOverviews(), but it
6188
 * can also be used to generate downsampled images in one file from another
6189
 * outside the overview architecture.
6190
 *
6191
 * The output bands need to exist in advance and share the same characteristics
6192
 * (type, dimensions)
6193
 *
6194
 * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
6195
 * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
6196
 *
6197
 * It does not support color tables or complex data types.
6198
 *
6199
 * The pseudo-algorithm used by the function is :
6200
 *    for each overview
6201
 *       iterate on lines of the source by a step of deltay
6202
 *           iterate on columns of the source  by a step of deltax
6203
 *               read the source data of size deltax * deltay for all the bands
6204
 *               generate the corresponding overview block for all the bands
6205
 *
6206
 * This function will honour properly NODATA_VALUES tuples (special dataset
6207
 * metadata) so that only a given RGB triplet (in case of a RGB image) will be
6208
 * considered as the nodata value and not each value of the triplet
6209
 * independently per band.
6210
 *
6211
 * The GDAL_NUM_THREADS configuration option can be set
6212
 * to "ALL_CPUS" or an integer value to specify the number of threads to use for
6213
 * overview computation.
6214
 *
6215
 * @param apoSrcBands the list of source bands to downsample
6216
 * @param aapoOverviewBands two-dimensional array of bands. First dimension is
6217
 *                          indexed by bands. Second dimension is indexed by
6218
 *                          overview levels. All aapoOverviewBands[i] arrays
6219
 *                          must have the same size (i.e. same number of
6220
 *                          overviews)
6221
 * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
6222
 * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
6223
 * @param pfnProgress progress report function.
6224
 * @param pProgressData progress function callback data.
6225
 * @param papszOptions NULL terminated list of options as
6226
 *                     key=value pairs, or NULL
6227
 *                     The XOFF, YOFF, XSIZE and YSIZE
6228
 *                     options can be specified to express that overviews should
6229
 *                     be regenerated only in the specified subset of the source
6230
 *                     dataset.
6231
 * @return CE_None on success or CE_Failure on failure.
6232
 * @since 3.10
6233
 */
6234
6235
CPLErr GDALRegenerateOverviewsMultiBand(
    const std::vector<GDALRasterBand *> &apoSrcBands,
    const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    const char *pszResampling, GDALProgressFunc pfnProgress,
    void *pProgressData, CSLConstList papszOptions)
{
    // Sanity checks: one overview list per source band, and every list must
    // hold the same number of overview levels.
    CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    for (size_t iBand = 1; iBand < aapoOverviewBands.size(); ++iBand)
    {
        CPLAssert(aapoOverviewBands[iBand].size() ==
                  aapoOverviewBands[0].size());
    }

    // Nothing to regenerate.
    if (aapoOverviewBands.empty())
        return CE_None;

    // The array-based overload expects one plain array of overview bands per
    // source band: flatten each inner vector into a temporary C array.
    const size_t nBandCount = aapoOverviewBands.size();
    std::vector<GDALRasterBand **> apapoBandArrays;
    apapoBandArrays.reserve(nBandCount);
    for (size_t iBand = 0; iBand < nBandCount; ++iBand)
    {
        const auto &apoLevels = aapoOverviewBands[iBand];
        GDALRasterBand **papoLevels = static_cast<GDALRasterBand **>(
            CPLMalloc(apoLevels.size() * sizeof(GDALRasterBand *)));
        std::copy(apoLevels.begin(), apoLevels.end(), papoLevels);
        apapoBandArrays.push_back(papoLevels);
    }

    // Delegate the actual work to the array-based overload.
    const int nBands = static_cast<int>(apoSrcBands.size());
    const int nOverviews = static_cast<int>(aapoOverviewBands[0].size());
    const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
        nBands, apoSrcBands.data(), nOverviews, apapoBandArrays.data(),
        pszResampling, pfnProgress, pProgressData, papszOptions);

    // Release the temporary arrays (the bands themselves are not owned).
    for (size_t iBand = 0; iBand < apapoBandArrays.size(); ++iBand)
        CPLFree(apapoBandArrays[iBand]);

    return eErr;
}
6270
6271
/************************************************************************/
6272
/*                        GDALComputeBandStats()                        */
6273
/************************************************************************/
6274
6275
/** Undocumented
6276
 * @param hSrcBand undocumented.
6277
 * @param nSampleStep Step between scanlines used to compute statistics.
6278
 *                    When nSampleStep is equal to 1, all scanlines will
6279
 *                    be processed.
6280
 * @param pdfMean undocumented.
6281
 * @param pdfStdDev undocumented.
6282
 * @param pfnProgress undocumented.
6283
 * @param pProgressData undocumented.
6284
 * @return undocumented
6285
 */
6286
CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
6287
                                        int nSampleStep, double *pdfMean,
6288
                                        double *pdfStdDev,
6289
                                        GDALProgressFunc pfnProgress,
6290
                                        void *pProgressData)
6291
6292
0
{
6293
0
    VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
6294
6295
0
    GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
6296
6297
0
    if (pfnProgress == nullptr)
6298
0
        pfnProgress = GDALDummyProgress;
6299
6300
0
    const int nWidth = poSrcBand->GetXSize();
6301
0
    const int nHeight = poSrcBand->GetYSize();
6302
6303
0
    if (nSampleStep >= nHeight || nSampleStep < 1)
6304
0
        nSampleStep = 1;
6305
6306
0
    GDALDataType eWrkType = GDT_Unknown;
6307
0
    float *pafData = nullptr;
6308
0
    GDALDataType eType = poSrcBand->GetRasterDataType();
6309
0
    const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6310
0
    if (bComplex)
6311
0
    {
6312
0
        pafData = static_cast<float *>(
6313
0
            VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6314
0
        eWrkType = GDT_CFloat32;
6315
0
    }
6316
0
    else
6317
0
    {
6318
0
        pafData =
6319
0
            static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6320
0
        eWrkType = GDT_Float32;
6321
0
    }
6322
6323
0
    if (nWidth == 0 || pafData == nullptr)
6324
0
    {
6325
0
        VSIFree(pafData);
6326
0
        return CE_Failure;
6327
0
    }
6328
6329
    /* -------------------------------------------------------------------- */
6330
    /*      Loop over all sample lines.                                     */
6331
    /* -------------------------------------------------------------------- */
6332
0
    double dfSum = 0.0;
6333
0
    double dfSum2 = 0.0;
6334
0
    int iLine = 0;
6335
0
    GIntBig nSamples = 0;
6336
6337
0
    do
6338
0
    {
6339
0
        if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6340
0
                         pProgressData))
6341
0
        {
6342
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6343
0
            CPLFree(pafData);
6344
0
            return CE_Failure;
6345
0
        }
6346
6347
0
        const CPLErr eErr =
6348
0
            poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
6349
0
                                1, eWrkType, 0, 0, nullptr);
6350
0
        if (eErr != CE_None)
6351
0
        {
6352
0
            CPLFree(pafData);
6353
0
            return eErr;
6354
0
        }
6355
6356
0
        for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6357
0
        {
6358
0
            float fValue = 0.0f;
6359
6360
0
            if (bComplex)
6361
0
            {
6362
                // Compute the magnitude of the complex value.
6363
0
                fValue =
6364
0
                    std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
6365
0
            }
6366
0
            else
6367
0
            {
6368
0
                fValue = pafData[iPixel];
6369
0
            }
6370
6371
0
            dfSum += fValue;
6372
0
            dfSum2 += static_cast<double>(fValue) * fValue;
6373
0
        }
6374
6375
0
        nSamples += nWidth;
6376
0
        iLine += nSampleStep;
6377
0
    } while (iLine < nHeight);
6378
6379
0
    if (!pfnProgress(1.0, nullptr, pProgressData))
6380
0
    {
6381
0
        CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6382
0
        CPLFree(pafData);
6383
0
        return CE_Failure;
6384
0
    }
6385
6386
    /* -------------------------------------------------------------------- */
6387
    /*      Produce the result values.                                      */
6388
    /* -------------------------------------------------------------------- */
6389
0
    if (pdfMean != nullptr)
6390
0
        *pdfMean = dfSum / nSamples;
6391
6392
0
    if (pdfStdDev != nullptr)
6393
0
    {
6394
0
        const double dfMean = dfSum / nSamples;
6395
6396
0
        *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
6397
0
    }
6398
6399
0
    CPLFree(pafData);
6400
6401
0
    return CE_None;
6402
0
}
6403
6404
/************************************************************************/
6405
/*                  GDALOverviewMagnitudeCorrection()                   */
6406
/*                                                                      */
6407
/*      Correct the mean and standard deviation of the overviews of     */
6408
/*      the given band to match the base layer approximately.           */
6409
/************************************************************************/
6410
6411
/** Undocumented
6412
 * @param hBaseBand undocumented.
6413
 * @param nOverviewCount undocumented.
6414
 * @param pahOverviews undocumented.
6415
 * @param pfnProgress undocumented.
6416
 * @param pProgressData undocumented.
6417
 * @return undocumented
6418
 */
6419
CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
6420
                                       int nOverviewCount,
6421
                                       GDALRasterBandH *pahOverviews,
6422
                                       GDALProgressFunc pfnProgress,
6423
                                       void *pProgressData)
6424
6425
0
{
6426
0
    VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
6427
6428
    /* -------------------------------------------------------------------- */
6429
    /*      Compute mean/stddev for source raster.                          */
6430
    /* -------------------------------------------------------------------- */
6431
0
    double dfOrigMean = 0.0;
6432
0
    double dfOrigStdDev = 0.0;
6433
0
    {
6434
0
        const CPLErr eErr =
6435
0
            GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
6436
0
                                 pfnProgress, pProgressData);
6437
6438
0
        if (eErr != CE_None)
6439
0
            return eErr;
6440
0
    }
6441
6442
    /* -------------------------------------------------------------------- */
6443
    /*      Loop on overview bands.                                         */
6444
    /* -------------------------------------------------------------------- */
6445
0
    for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
6446
0
    {
6447
0
        GDALRasterBand *poOverview =
6448
0
            GDALRasterBand::FromHandle(pahOverviews[iOverview]);
6449
0
        double dfOverviewMean, dfOverviewStdDev;
6450
6451
0
        const CPLErr eErr =
6452
0
            GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
6453
0
                                 &dfOverviewStdDev, pfnProgress, pProgressData);
6454
6455
0
        if (eErr != CE_None)
6456
0
            return eErr;
6457
6458
0
        double dfGain = 1.0;
6459
0
        if (dfOrigStdDev >= 0.0001)
6460
0
            dfGain = dfOrigStdDev / dfOverviewStdDev;
6461
6462
        /* --------------------------------------------------------------------
6463
         */
6464
        /*      Apply gain and offset. */
6465
        /* --------------------------------------------------------------------
6466
         */
6467
0
        const int nWidth = poOverview->GetXSize();
6468
0
        const int nHeight = poOverview->GetYSize();
6469
6470
0
        GDALDataType eWrkType = GDT_Unknown;
6471
0
        float *pafData = nullptr;
6472
0
        const GDALDataType eType = poOverview->GetRasterDataType();
6473
0
        const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
6474
0
        if (bComplex)
6475
0
        {
6476
0
            pafData = static_cast<float *>(
6477
0
                VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
6478
0
            eWrkType = GDT_CFloat32;
6479
0
        }
6480
0
        else
6481
0
        {
6482
0
            pafData = static_cast<float *>(
6483
0
                VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
6484
0
            eWrkType = GDT_Float32;
6485
0
        }
6486
6487
0
        if (pafData == nullptr)
6488
0
        {
6489
0
            return CE_Failure;
6490
0
        }
6491
6492
0
        for (int iLine = 0; iLine < nHeight; ++iLine)
6493
0
        {
6494
0
            if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
6495
0
                             pProgressData))
6496
0
            {
6497
0
                CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6498
0
                CPLFree(pafData);
6499
0
                return CE_Failure;
6500
0
            }
6501
6502
0
            if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
6503
0
                                     nWidth, 1, eWrkType, 0, 0,
6504
0
                                     nullptr) != CE_None)
6505
0
            {
6506
0
                CPLFree(pafData);
6507
0
                return CE_Failure;
6508
0
            }
6509
6510
0
            for (int iPixel = 0; iPixel < nWidth; ++iPixel)
6511
0
            {
6512
0
                if (bComplex)
6513
0
                {
6514
0
                    pafData[iPixel * 2] *= static_cast<float>(dfGain);
6515
0
                    pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
6516
0
                }
6517
0
                else
6518
0
                {
6519
0
                    pafData[iPixel] = static_cast<float>(
6520
0
                        (pafData[iPixel] - dfOverviewMean) * dfGain +
6521
0
                        dfOrigMean);
6522
0
                }
6523
0
            }
6524
6525
0
            if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
6526
0
                                     nWidth, 1, eWrkType, 0, 0,
6527
0
                                     nullptr) != CE_None)
6528
0
            {
6529
0
                CPLFree(pafData);
6530
0
                return CE_Failure;
6531
0
            }
6532
0
        }
6533
6534
0
        if (!pfnProgress(1.0, nullptr, pProgressData))
6535
0
        {
6536
0
            CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
6537
0
            CPLFree(pafData);
6538
0
            return CE_Failure;
6539
0
        }
6540
6541
0
        CPLFree(pafData);
6542
0
    }
6543
6544
0
    return CE_None;
6545
0
}