Coverage Report

Created: 2026-02-14 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/apps/gdalalg_raster_blend.cpp
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Project:  GDAL
4
 * Purpose:  "blend" step of "raster pipeline"
5
 * Author:   Even Rouault <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
9
 * Copyright (c) 2009, Frank Warmerdam
10
11
 * SPDX-License-Identifier: MIT
12
 ****************************************************************************/
13
14
#include "gdalalg_raster_blend.h"
15
16
#include "cpl_conv.h"
17
#include "gdal_priv.h"
18
#include "gdal_utils.h"
19
20
#include <algorithm>
21
#include <array>
22
#include <cassert>
23
#include <limits>
24
#include <utility>
25
26
#if defined(__x86_64) || defined(_M_X64)
27
#define HAVE_SSE2
28
#include <immintrin.h>
29
#endif
30
#ifdef HAVE_SSE2
31
#include "gdalsse_priv.h"
32
#endif
33
34
//! @cond Doxygen_Suppress
35
36
#ifndef _
37
0
#define _(x) (x)
38
#endif
39
40
/************************************************************************/
41
/*                           CompositionModes                           */
42
/************************************************************************/
43
std::map<CompositionMode, std::string> CompositionModes()
44
0
{
45
0
    return {
46
0
        {CompositionMode::SRC_OVER, "src-over"},
47
0
        {CompositionMode::HSV_VALUE, "hsv-value"},
48
0
        {CompositionMode::MULTIPLY, "multiply"},
49
0
        {CompositionMode::SCREEN, "screen"},
50
0
        {CompositionMode::OVERLAY, "overlay"},
51
0
        {CompositionMode::HARD_LIGHT, "hard-light"},
52
0
        {CompositionMode::DARKEN, "darken"},
53
0
        {CompositionMode::LIGHTEN, "lighten"},
54
0
        {CompositionMode::COLOR_BURN, "color-burn"},
55
0
        {CompositionMode::COLOR_DODGE, "color-dodge"},
56
0
    };
57
0
}
58
59
/************************************************************************/
60
/*                      CompositionModeToString()                       */
61
/************************************************************************/
62
63
std::string CompositionModeToString(CompositionMode mode)
64
0
{
65
0
    const auto &modes = CompositionModes();
66
0
    const auto &iter = modes.find(mode);
67
0
    if (iter != modes.end())
68
0
    {
69
0
        return iter->second;
70
0
    }
71
0
    CPLError(CE_Failure, CPLE_IllegalArg,
72
0
             "Invalid composition mode value: %d, returning 'src-over'",
73
0
             static_cast<int>(mode));
74
0
    return "src-over";
75
0
}
76
77
/************************************************************************/
78
/*                    CompositionModesIdentifiers()                     */
79
/************************************************************************/
80
81
std::vector<std::string> CompositionModesIdentifiers()
82
0
{
83
0
    const auto &modes = CompositionModes();
84
0
    std::vector<std::string> identifiers;
85
0
    for (const auto &pair : modes)
86
0
    {
87
0
        identifiers.push_back(pair.second);
88
0
    }
89
0
    return identifiers;
90
0
}
91
92
/************************************************************************/
93
/*                     CompositionModeFromString()                      */
94
/************************************************************************/
95
96
CompositionMode CompositionModeFromString(const std::string &str)
97
0
{
98
0
    const auto &modes = CompositionModes();
99
0
    auto iter =
100
0
        std::find_if(modes.begin(), modes.end(),
101
0
                     [&str](const auto &pair) { return pair.second == str; });
102
0
    if (iter != modes.end())
103
0
    {
104
0
        return iter->first;
105
0
    }
106
0
    CPLError(CE_Failure, CPLE_IllegalArg,
107
0
             "Invalid composition identifier: %s, returning SRC_OVER",
108
0
             str.c_str());
109
0
    return CompositionMode::SRC_OVER;
110
0
}
111
112
/************************************************************************/
113
/*                   MinBandCountForCompositionMode()                   */
114
/************************************************************************/
115
116
//! Returns the minimum number of bands required for the given composition mode
117
int MinBandCountForCompositionMode(CompositionMode mode)
118
0
{
119
0
    switch (mode)
120
0
    {
121
0
        case CompositionMode::HSV_VALUE:
122
0
            return 3;
123
0
        case CompositionMode::SRC_OVER:
124
0
        case CompositionMode::MULTIPLY:
125
0
        case CompositionMode::SCREEN:
126
0
        case CompositionMode::OVERLAY:
127
0
        case CompositionMode::HARD_LIGHT:
128
0
        case CompositionMode::DARKEN:
129
0
        case CompositionMode::LIGHTEN:
130
0
        case CompositionMode::COLOR_BURN:
131
0
        case CompositionMode::COLOR_DODGE:
132
0
            return 1;
133
0
    }
134
    // unreachable...
135
0
    return 1;
136
0
}
137
138
/************************************************************************/
139
/*                   MaxBandCountForCompositionMode()                   */
140
/************************************************************************/
141
142
/**
143
 *  Returns the maximum number of bands allowed for the given composition mode
144
 */
145
int MaxBandCountForCompositionMode(CompositionMode mode)
146
0
{
147
0
    switch (mode)
148
0
    {
149
0
        case CompositionMode::SRC_OVER:
150
0
        case CompositionMode::HSV_VALUE:
151
0
        case CompositionMode::MULTIPLY:
152
0
        case CompositionMode::SCREEN:
153
0
        case CompositionMode::OVERLAY:
154
0
        case CompositionMode::HARD_LIGHT:
155
0
        case CompositionMode::DARKEN:
156
0
        case CompositionMode::LIGHTEN:
157
0
        case CompositionMode::COLOR_BURN:
158
0
        case CompositionMode::COLOR_DODGE:
159
0
            return 4;
160
0
    }
161
    // unreachable...
162
0
    return 4;
163
0
}
164
165
/************************************************************************/
166
/*              BandCountIsCompatibleWithCompositionMode()              */
167
/************************************************************************/
168
169
//! Checks whether the number of bands is compatible with the given composition mode
170
bool BandCountIsCompatibleWithCompositionMode(int bandCount,
171
                                              CompositionMode mode)
172
0
{
173
0
    const int minBands = MinBandCountForCompositionMode(mode);
174
0
    const int maxBands = MaxBandCountForCompositionMode(mode);
175
0
    return minBands <= bandCount && bandCount <= maxBands;
176
0
}
177
178
/************************************************************************/
179
/*                            MulScale255()                             */
180
/************************************************************************/
181
182
/** Multiply 2 bytes considering them as ratios with 255 = 100%, and return their product unscaled to [0, 255], by ceiling */
183
inline GByte MulScale255(GByte a, GByte b)
184
0
{
185
0
    return static_cast<GByte>((a * b + 255) / 256);
186
0
}
187
188
/************************************************************************/
189
/*                         ProcessAlphaChannels                         */
190
/************************************************************************/
191
192
static inline void ProcessAlphaChannels(size_t i,
193
                                        const GByte *CPL_RESTRICT pabyA,
194
                                        const GByte *CPL_RESTRICT pabyOverlayA,
195
                                        int nOpacity, bool bSwappedOpacity,
196
                                        GByte &outA, GByte &outOverlaA,
197
                                        GByte &outFinalAlpha)
198
0
{
199
    // Apply opacity depending on whether overlay and base were swapped
200
0
    const GByte byOpacity = static_cast<GByte>(nOpacity);
201
0
    if (!bSwappedOpacity)
202
0
    {
203
0
        outOverlaA =
204
0
            pabyOverlayA ? MulScale255(pabyOverlayA[i], byOpacity) : byOpacity;
205
206
0
        outA = pabyA ? pabyA[i] : 255;
207
0
    }
208
0
    else
209
0
    {
210
0
        outOverlaA = pabyOverlayA ? pabyOverlayA[i] : 255;
211
0
        if (outOverlaA != 255)
212
0
        {
213
0
            outOverlaA = pabyOverlayA ? pabyOverlayA[i] : 255;
214
0
        }
215
0
        outA = pabyA ? MulScale255(pabyA[i], byOpacity) : byOpacity;
216
0
    }
217
218
    // Da'  = Sa + Da - Sa.Da
219
0
    outFinalAlpha =
220
0
        static_cast<GByte>(outOverlaA + outA - MulScale255(outOverlaA, outA));
221
0
}
222
223
/************************************************************************/
224
/*                            DivScale255()                             */
225
/************************************************************************/
226
227
/** Divide 2 bytes considering them as ratios with 255 = 100%, and return their quotient unscaled to [0, 255], by flooring
228
 *  \warning Caution: this function does not check that the result actually fits on a byte, and just casts the computed value to byte.
229
 */
230
inline GByte DivScale255(GByte a, GByte b)
231
0
{
232
0
    if (a == 0)
233
0
    {
234
0
        return 0;
235
0
    }
236
0
    else if (b == 0)
237
0
    {
238
0
        return 255;
239
0
    }
240
0
    else
241
0
    {
242
0
        const int nRes = (a * 255) / b;
243
0
#ifdef DEBUG
244
0
        CPLAssert(nRes <= 255);
245
0
#endif
246
0
        return static_cast<GByte>(nRes);
247
0
    }
248
0
}
249
250
/************************************************************************/
251
/*                         PremultiplyChannels                          */
252
/************************************************************************/
253
254
//! Premultiply RGB channels by alpha (A)
255
static inline void PremultiplyChannels(size_t i, const GByte *pabyR,
256
                                       const GByte *pabyG, const GByte *pabyB,
257
                                       GByte &outR, GByte &outG, GByte &outB,
258
                                       const GByte &A)
259
260
0
{
261
0
    if (A == 255)
262
0
    {
263
0
        outR = pabyR ? pabyR[i] : 255;
264
0
        outG = pabyG ? pabyG[i] : outR;  // in case only R is present
265
0
        outB = pabyB ? pabyB[i] : outR;  // in case only R is present
266
0
    }
267
0
    else
268
0
    {
269
0
        outR = pabyR ? MulScale255(pabyR[i], A) : A;
270
0
        outG = pabyG ? MulScale255(pabyG[i], A)
271
0
                     : outR;  // in case only R is present
272
0
        outB = pabyB ? MulScale255(pabyB[i], A)
273
0
                     : outR;  // in case only R is present
274
0
    }
275
0
}
276
277
/************************************************************************/
278
/*         GDALRasterBlendAlgorithm::GDALRasterBlendAlgorithm()         */
279
/************************************************************************/
280
281
GDALRasterBlendAlgorithm::GDALRasterBlendAlgorithm(bool standaloneStep)
282
0
    : GDALRasterPipelineStepAlgorithm(
283
0
          NAME, DESCRIPTION, HELP_URL,
284
0
          ConstructorOptions()
285
0
              .SetStandaloneStep(standaloneStep)
286
0
              .SetAddDefaultArguments(false)
287
0
              .SetInputDatasetHelpMsg(_("Input raster dataset"))
288
0
              .SetInputDatasetAlias("color-input")
289
0
              .SetInputDatasetMetaVar("COLOR-INPUT")
290
0
              .SetOutputDatasetHelpMsg(_("Output raster dataset")))
291
0
{
292
0
    const auto AddOverlayDatasetArg = [this]()
293
0
    {
294
0
        auto &arg = AddArg("overlay", 0, _("Overlay dataset"),
295
0
                           &m_overlayDataset, GDAL_OF_RASTER)
296
0
                        .SetPositional()
297
0
                        .SetRequired();
298
299
0
        SetAutoCompleteFunctionForFilename(arg, GDAL_OF_RASTER);
300
0
    };
301
302
0
    if (standaloneStep)
303
0
    {
304
0
        AddRasterInputArgs(false, false);
305
0
        AddOverlayDatasetArg();
306
0
        AddProgressArg();
307
0
        AddRasterOutputArgs(false);
308
0
    }
309
0
    else
310
0
    {
311
0
        AddRasterHiddenInputDatasetArg();
312
0
        AddOverlayDatasetArg();
313
0
    }
314
315
0
    const std::vector<std::string> compositionModeChoices{
316
0
        CompositionModesIdentifiers()};
317
0
    AddArg("operator", 0, _("Composition operator"), &m_operatorIdentifier)
318
0
        .SetChoices(compositionModeChoices)
319
0
        .SetDefault(CompositionModeToString(CompositionMode::SRC_OVER))
320
0
        .AddAction(
321
0
            [this]()
322
0
            { m_operator = CompositionModeFromString(m_operatorIdentifier); });
323
324
0
    AddArg("opacity", 0,
325
0
           _("Opacity percentage to apply to the overlay dataset (0=fully "
326
0
             "transparent, 100=full use of overlay opacity)"),
327
0
           &m_opacity)
328
0
        .SetDefault(m_opacity)
329
0
        .SetMinValueIncluded(0)
330
0
        .SetMaxValueIncluded(OPACITY_INPUT_RANGE);
331
332
0
    AddValidationAction([this]() { return ValidateGlobal(); });
333
0
}
334
335
namespace
336
{
337
338
/************************************************************************/
339
/*                             BlendDataset                             */
340
/************************************************************************/
341
342
class BlendDataset final : public GDALDataset
343
{
344
  public:
345
    BlendDataset(GDALDataset &oColorDS, GDALDataset &oOverlayDS,
346
                 const CompositionMode eOperator, int nOpacity255Scale,
347
                 bool bSwappedOpacity);
348
    ~BlendDataset() override;
349
350
    CPLErr GetGeoTransform(GDALGeoTransform &gt) const override
351
0
    {
352
0
        return m_oColorDS.GetGeoTransform(gt);
353
0
    }
354
355
    const OGRSpatialReference *GetSpatialRef() const override
356
0
    {
357
0
        return m_oColorDS.GetSpatialRef();
358
0
    }
359
360
    bool AcquireSourcePixels(int nXOff, int nYOff, int nXSize, int nYSize,
361
                             int nBufXSize, int nBufYSize,
362
                             GDALRasterIOExtraArg *psExtraArg);
363
364
  protected:
365
    CPLErr IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize,
366
                     int nYSize, void *pData, int nBufXSize, int nBufYSize,
367
                     GDALDataType eBufType, int nBandCount,
368
                     BANDMAP_TYPE panBandMap, GSpacing nPixelSpace,
369
                     GSpacing nLineSpace, GSpacing nBandSpace,
370
                     GDALRasterIOExtraArg *psExtraArg) override;
371
372
  private:
373
    friend class BlendBand;
374
    GDALDataset &m_oColorDS;
375
    GDALDataset &m_oOverlayDS;
376
    const CompositionMode m_operator;
377
    const int m_opacity255Scale;
378
    std::vector<std::unique_ptr<BlendDataset>> m_apoOverviews{};
379
    int m_nCachedXOff = 0;
380
    int m_nCachedYOff = 0;
381
    int m_nCachedXSize = 0;
382
    int m_nCachedYSize = 0;
383
    int m_nCachedBufXSize = 0;
384
    int m_nCachedBufYSize = 0;
385
    GDALRasterIOExtraArg m_sCachedExtraArg{};
386
    std::vector<GByte> m_abyBuffer{};
387
    bool m_ioError = false;
388
    bool m_bSwappedOpacity = false;
389
};
390
391
/************************************************************************/
392
/*                             rgb_to_hs()                              */
393
/************************************************************************/
394
395
// rgb comes in as [r,g,b] with values in the range [0,255]. The returned
396
// values will be with hue and saturation in the range [0,1].
397
398
// Derived from hsv_merge.py
399
400
static void rgb_to_hs(int r, int g, int b, float *h, float *s)
401
0
{
402
0
    int minc, maxc;
403
0
    if (r <= g)
404
0
    {
405
0
        if (r <= b)
406
0
        {
407
0
            minc = r;
408
0
            maxc = std::max(g, b);
409
0
        }
410
0
        else /* b < r */
411
0
        {
412
0
            minc = b;
413
0
            maxc = g;
414
0
        }
415
0
    }
416
0
    else /* g < r */
417
0
    {
418
0
        if (g <= b)
419
0
        {
420
0
            minc = g;
421
0
            maxc = std::max(r, b);
422
0
        }
423
0
        else /* b < g */
424
0
        {
425
0
            minc = b;
426
0
            maxc = r;
427
0
        }
428
0
    }
429
0
    const int maxc_minus_minc = maxc - minc;
430
0
    if (s)
431
0
        *s = maxc_minus_minc / static_cast<float>(std::max(1, maxc));
432
0
    if (h)
433
0
    {
434
0
        const float maxc_minus_minc_times_6 =
435
0
            maxc_minus_minc == 0 ? 1.0f : 6.0f * maxc_minus_minc;
436
0
        if (maxc == b)
437
0
            *h = 4.0f / 6.0f + (r - g) / maxc_minus_minc_times_6;
438
0
        else if (maxc == g)
439
0
            *h = 2.0f / 6.0f + (b - r) / maxc_minus_minc_times_6;
440
0
        else
441
0
        {
442
0
            const float tmp = (g - b) / maxc_minus_minc_times_6;
443
0
            *h = tmp < 0.0f ? tmp + 1.0f : tmp;
444
0
        }
445
0
    }
446
0
}
447
448
/************************************************************************/
449
/*                            choose_among()                            */
450
/************************************************************************/
451
452
template <typename T>
453
static inline T choose_among(int idx, T a0, T a1, T a2, T a3, T a4, T a5)
454
0
{
455
0
    switch (idx)
456
0
    {
457
0
        case 0:
458
0
            return a0;
459
0
        case 1:
460
0
            return a1;
461
0
        case 2:
462
0
            return a2;
463
0
        case 3:
464
0
            return a3;
465
0
        case 4:
466
0
            return a4;
467
0
        default:
468
0
            break;
469
0
    }
470
0
    return a5;
471
0
}
472
473
/************************************************************************/
474
/*                             hsv_to_rgb()                             */
475
/************************************************************************/
476
477
// hsv comes in as [h,s,v] with hue and saturation in the range [0,1],
478
// but value in the range [0,255].
479
480
// Derived from hsv_merge.py
481
482
static void hsv_to_rgb(float h, float s, GByte v, GByte *r, GByte *g, GByte *b)
483
0
{
484
0
    const int i = static_cast<int>(6.0f * h);
485
0
    const float f = 6.0f * h - i;
486
0
    const GByte p = static_cast<GByte>(v * (1.0f - s) + 0.5f);
487
0
    const GByte q = static_cast<GByte>(v * (1.0f - s * f) + 0.5f);
488
0
    const GByte t = static_cast<GByte>(v * (1.0f - s * (1.0f - f)) + 0.5f);
489
490
0
    if (r)
491
0
        *r = choose_among(i, v, q, p, p, t, v);
492
0
    if (g)
493
0
        *g = choose_among(i, t, v, v, q, p, p);
494
0
    if (b)
495
0
        *b = choose_among(i, p, p, t, v, v, q);
496
0
}
497
498
/************************************************************************/
499
/*                           XMM_RGB_to_HS()                            */
500
/************************************************************************/
501
502
#ifdef HAVE_SSE2
503
static inline void
504
XMM_RGB_to_HS(const GByte *CPL_RESTRICT pInR, const GByte *CPL_RESTRICT pInG,
505
              const GByte *CPL_RESTRICT pInB, const XMMReg4Float &zero,
506
              const XMMReg4Float &one, const XMMReg4Float &six,
507
              const XMMReg4Float &two_over_six,
508
              const XMMReg4Float &four_over_six, XMMReg4Float &h,
509
              XMMReg4Float &s)
510
0
{
511
0
    const auto r = XMMReg4Float::Load4Val(pInR);
512
0
    const auto g = XMMReg4Float::Load4Val(pInG);
513
0
    const auto b = XMMReg4Float::Load4Val(pInB);
514
0
    const auto minc = XMMReg4Float::Min(XMMReg4Float::Min(r, g), b);
515
0
    const auto maxc = XMMReg4Float::Max(XMMReg4Float::Max(r, g), b);
516
0
    const auto max_minus_min = maxc - minc;
517
0
    s = max_minus_min / XMMReg4Float::Max(one, maxc);
518
0
    const auto inv_max_minus_min_times_6_0 =
519
0
        XMMReg4Float::Ternary(XMMReg4Float::Equals(max_minus_min, zero), one,
520
0
                              six * max_minus_min)
521
0
            .inverse();
522
0
    const auto tmp = (g - b) * inv_max_minus_min_times_6_0;
523
0
    h = XMMReg4Float::Ternary(
524
0
        XMMReg4Float::Equals(maxc, b),
525
0
        four_over_six + (r - g) * inv_max_minus_min_times_6_0,
526
0
        XMMReg4Float::Ternary(
527
0
            XMMReg4Float::Equals(maxc, g),
528
0
            two_over_six + (b - r) * inv_max_minus_min_times_6_0,
529
0
            XMMReg4Float::Ternary(XMMReg4Float::Lesser(tmp, zero), tmp + one,
530
0
                                  tmp)));
531
0
}
532
#endif
533
534
/************************************************************************/
535
/*                          patch_value_line()                          */
536
/************************************************************************/
537
538
static
539
#ifdef __GNUC__
540
    __attribute__((__noinline__))
541
#endif
542
    void patch_value_line(int nCount, const GByte *CPL_RESTRICT pInR,
543
                          const GByte *CPL_RESTRICT pInG,
544
                          const GByte *CPL_RESTRICT pInB,
545
                          const GByte *CPL_RESTRICT pInGray,
546
                          GByte *CPL_RESTRICT pOutR, GByte *CPL_RESTRICT pOutG,
547
                          GByte *CPL_RESTRICT pOutB)
548
0
{
549
0
    int i = 0;
550
0
#ifdef HAVE_SSE2
551
0
    const auto zero = XMMReg4Float::Zero();
552
0
    const auto one = XMMReg4Float::Set1(1.0f);
553
0
    const auto six = XMMReg4Float::Set1(6.0f);
554
0
    const auto two_over_six = XMMReg4Float::Set1(2.0f / 6.0f);
555
0
    const auto four_over_six = two_over_six + two_over_six;
556
557
0
    constexpr int ELTS = 8;
558
0
    for (; i + (ELTS - 1) < nCount; i += ELTS)
559
0
    {
560
0
        XMMReg4Float h0, s0;
561
0
        XMM_RGB_to_HS(pInR + i, pInG + i, pInB + i, zero, one, six,
562
0
                      two_over_six, four_over_six, h0, s0);
563
0
        XMMReg4Float h1, s1;
564
0
        XMM_RGB_to_HS(pInR + i + ELTS / 2, pInG + i + ELTS / 2,
565
0
                      pInB + i + ELTS / 2, zero, one, six, two_over_six,
566
0
                      four_over_six, h1, s1);
567
568
0
        XMMReg4Float v0, v1;
569
0
        XMMReg4Float::Load8Val(pInGray + i, v0, v1);
570
571
0
        const auto half = XMMReg4Float::Set1(0.5f);
572
0
        const auto six_h0 = six * h0;
573
0
        const auto idx0 = six_h0.truncate_to_int();
574
0
        const auto f0 = six_h0 - idx0.cast_to_float();
575
0
        const auto p0 = (v0 * (one - s0) + half).truncate_to_int();
576
0
        const auto q0 = (v0 * (one - s0 * f0) + half).truncate_to_int();
577
0
        const auto t0 = (v0 * (one - s0 * (one - f0)) + half).truncate_to_int();
578
579
0
        const auto six_h1 = six * h1;
580
0
        const auto idx1 = six_h1.truncate_to_int();
581
0
        const auto f1 = six_h1 - idx1.cast_to_float();
582
0
        const auto p1 = (v1 * (one - s1) + half).truncate_to_int();
583
0
        const auto q1 = (v1 * (one - s1 * f1) + half).truncate_to_int();
584
0
        const auto t1 = (v1 * (one - s1 * (one - f1)) + half).truncate_to_int();
585
586
0
        const auto idx = XMMReg8Byte::Pack(idx0, idx1);
587
0
        const auto v =
588
0
            XMMReg8Byte::Pack(v0.truncate_to_int(), v1.truncate_to_int());
589
0
        const auto p = XMMReg8Byte::Pack(p0, p1);
590
0
        const auto q = XMMReg8Byte::Pack(q0, q1);
591
0
        const auto t = XMMReg8Byte::Pack(t0, t1);
592
593
0
        const auto equalsTo0 = XMMReg8Byte::Equals(idx, XMMReg8Byte::Zero());
594
0
        const auto one8Byte = XMMReg8Byte::Set1(1);
595
0
        const auto equalsTo1 = XMMReg8Byte::Equals(idx, one8Byte);
596
0
        const auto two8Byte = one8Byte + one8Byte;
597
0
        const auto equalsTo2 = XMMReg8Byte::Equals(idx, two8Byte);
598
0
        const auto four8Byte = two8Byte + two8Byte;
599
0
        const auto equalsTo4 = XMMReg8Byte::Equals(idx, four8Byte);
600
0
        const auto equalsTo3 = XMMReg8Byte::Equals(idx, four8Byte - one8Byte);
601
        // clang-format off
602
0
        if (pOutR)
603
0
        {
604
0
            const auto out_r =
605
0
                XMMReg8Byte::Ternary(equalsTo0, v,
606
0
                XMMReg8Byte::Ternary(equalsTo1, q,
607
0
                XMMReg8Byte::Ternary(XMMReg8Byte::Or(equalsTo2, equalsTo3), p,
608
0
                XMMReg8Byte::Ternary(equalsTo4, t, v))));
609
0
            out_r.Store8Val(pOutR + i);
610
0
        }
611
0
        if (pOutG)
612
0
        {
613
0
            const auto out_g =
614
0
                XMMReg8Byte::Ternary(equalsTo0, t,
615
0
                XMMReg8Byte::Ternary(XMMReg8Byte::Or(equalsTo1, equalsTo2), v,
616
0
                XMMReg8Byte::Ternary(equalsTo3, q, p)));
617
0
            out_g.Store8Val(pOutG + i);
618
0
        }
619
0
        if (pOutB)
620
0
        {
621
0
            const auto out_b =
622
0
                XMMReg8Byte::Ternary(XMMReg8Byte::Or(equalsTo0, equalsTo1), p,
623
0
                XMMReg8Byte::Ternary(equalsTo2, t,
624
0
                XMMReg8Byte::Ternary(XMMReg8Byte::Or(equalsTo3, equalsTo4),
625
0
                                     v, q)));
626
0
            out_b.Store8Val(pOutB + i);
627
0
        }
628
        // clang-format on
629
0
    }
630
0
#endif
631
632
0
    for (; i < nCount; ++i)
633
0
    {
634
0
        float h, s;
635
0
        rgb_to_hs(pInR[i], pInG[i], pInB[i], &h, &s);
636
0
        hsv_to_rgb(h, s, pInGray[i], pOutR ? pOutR + i : nullptr,
637
0
                   pOutG ? pOutG + i : nullptr, pOutB ? pOutB + i : nullptr);
638
0
    }
639
0
}
640
641
/************************************************************************/
642
/*                              BlendBand                               */
643
/************************************************************************/
644
645
class BlendBand final : public GDALRasterBand
646
{
647
  public:
648
    BlendBand(BlendDataset &oBlendDataset, int nBandIn)
649
0
        : m_oBlendDataset(oBlendDataset)
650
0
    {
651
0
        nBand = nBandIn;
652
0
        nRasterXSize = oBlendDataset.GetRasterXSize();
653
0
        nRasterYSize = oBlendDataset.GetRasterYSize();
654
0
        oBlendDataset.m_oColorDS.GetRasterBand(1)->GetBlockSize(&nBlockXSize,
655
0
                                                                &nBlockYSize);
656
0
        eDataType = GDT_UInt8;
657
0
    }
658
659
    GDALColorInterp GetColorInterpretation() override
660
0
    {
661
0
        if (m_oBlendDataset.GetRasterCount() <= 2 && nBand == 1)
662
0
            return GCI_GrayIndex;
663
0
        else if (m_oBlendDataset.GetRasterCount() == 2 || nBand == 4)
664
0
            return GCI_AlphaBand;
665
0
        else
666
0
            return static_cast<GDALColorInterp>(GCI_RedBand + nBand - 1);
667
0
    }
668
669
    int GetOverviewCount() override
670
0
    {
671
0
        return static_cast<int>(m_oBlendDataset.m_apoOverviews.size());
672
0
    }
673
674
    GDALRasterBand *GetOverview(int idx) override
675
0
    {
676
0
        return idx >= 0 && idx < GetOverviewCount()
677
0
                   ? m_oBlendDataset.m_apoOverviews[idx]->GetRasterBand(nBand)
678
0
                   : nullptr;
679
0
    }
680
681
  protected:
682
    CPLErr IReadBlock(int nBlockXOff, int nBlockYOff, void *pData) override
683
0
    {
684
0
        int nReqXSize = 0;
685
0
        int nReqYSize = 0;
686
0
        GetActualBlockSize(nBlockXOff, nBlockYOff, &nReqXSize, &nReqYSize);
687
0
        return RasterIO(GF_Read, nBlockXOff * nBlockXSize,
688
0
                        nBlockYOff * nBlockYSize, nReqXSize, nReqYSize, pData,
689
0
                        nReqXSize, nReqYSize, GDT_UInt8, 1, nBlockXSize,
690
0
                        nullptr);
691
0
    }
692
693
    CPLErr IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize,
694
                     int nYSize, void *pData, int nBufXSize, int nBufYSize,
695
                     GDALDataType eBufType, GSpacing nPixelSpace,
696
                     GSpacing nLineSpace,
697
                     GDALRasterIOExtraArg *psExtraArg) override;
698
699
  private:
700
    BlendDataset &m_oBlendDataset;
701
};
702
703
/************************************************************************/
704
/*                     BlendDataset::BlendDataset()                     */
705
/************************************************************************/
706
707
BlendDataset::BlendDataset(GDALDataset &oColorDS, GDALDataset &oOverlayDS,
708
                           const CompositionMode eOperator,
709
                           int nOpacity255Scale, bool bSwappedOpacity)
710
0
    : m_oColorDS(oColorDS), m_oOverlayDS(oOverlayDS), m_operator(eOperator),
711
0
      m_opacity255Scale(nOpacity255Scale), m_bSwappedOpacity(bSwappedOpacity)
712
0
{
713
0
    m_oColorDS.Reference();
714
0
    m_oOverlayDS.Reference();
715
716
0
    CPLAssert(oColorDS.GetRasterXSize() == oOverlayDS.GetRasterXSize());
717
0
    CPLAssert(oColorDS.GetRasterYSize() == oOverlayDS.GetRasterYSize());
718
0
    nRasterXSize = oColorDS.GetRasterXSize();
719
0
    nRasterYSize = oColorDS.GetRasterYSize();
720
0
    const int nOvrCount = oOverlayDS.GetRasterBand(1)->GetOverviewCount();
721
0
    bool bCanCreateOvr = true;
722
0
    for (int iBand = 1; iBand <= oColorDS.GetRasterCount(); ++iBand)
723
0
    {
724
0
        SetBand(iBand, std::make_unique<BlendBand>(*this, iBand));
725
0
        bCanCreateOvr =
726
0
            bCanCreateOvr &&
727
0
            (iBand > oColorDS.GetRasterCount() ||
728
0
             oColorDS.GetRasterBand(iBand)->GetOverviewCount() == nOvrCount) &&
729
0
            (iBand > oOverlayDS.GetRasterCount() ||
730
0
             oOverlayDS.GetRasterBand(iBand)->GetOverviewCount() == nOvrCount);
731
0
        const int nColorBandxIdx =
732
0
            iBand <= oColorDS.GetRasterCount() ? iBand : 1;
733
0
        const int nOverlayBandIdx =
734
0
            iBand <= oOverlayDS.GetRasterCount() ? iBand : 1;
735
0
        for (int iOvr = 0; iOvr < nOvrCount && bCanCreateOvr; ++iOvr)
736
0
        {
737
0
            const auto poColorOvrBand =
738
0
                oColorDS.GetRasterBand(nColorBandxIdx)->GetOverview(iOvr);
739
0
            const auto poGSOvrBand =
740
0
                oOverlayDS.GetRasterBand(nOverlayBandIdx)->GetOverview(iOvr);
741
0
            bCanCreateOvr =
742
0
                poColorOvrBand->GetDataset() != &oColorDS &&
743
0
                poColorOvrBand->GetDataset() == oColorDS.GetRasterBand(1)
744
0
                                                    ->GetOverview(iOvr)
745
0
                                                    ->GetDataset() &&
746
0
                poGSOvrBand->GetDataset() != &oOverlayDS &&
747
0
                poGSOvrBand->GetDataset() == oOverlayDS.GetRasterBand(1)
748
0
                                                 ->GetOverview(iOvr)
749
0
                                                 ->GetDataset() &&
750
0
                poColorOvrBand->GetXSize() == poGSOvrBand->GetXSize() &&
751
0
                poColorOvrBand->GetYSize() == poGSOvrBand->GetYSize();
752
0
        }
753
0
    }
754
755
0
    SetDescription(CPLSPrintf("Blend %s width %s", m_oColorDS.GetDescription(),
756
0
                              m_oOverlayDS.GetDescription()));
757
0
    if (nBands > 1)
758
0
    {
759
0
        SetMetadataItem("INTERLEAVE", "PIXEL", "IMAGE_STRUCTURE");
760
0
    }
761
762
0
    if (bCanCreateOvr)
763
0
    {
764
0
        for (int iOvr = 0; iOvr < nOvrCount; ++iOvr)
765
0
        {
766
0
            m_apoOverviews.push_back(std::make_unique<BlendDataset>(
767
0
                *(oColorDS.GetRasterBand(1)->GetOverview(iOvr)->GetDataset()),
768
0
                *(oOverlayDS.GetRasterBand(1)->GetOverview(iOvr)->GetDataset()),
769
0
                m_operator, nOpacity255Scale, bSwappedOpacity));
770
0
        }
771
0
    }
772
0
}
773
774
/************************************************************************/
775
/*                    ~BlendDataset::BlendDataset()                     */
776
/************************************************************************/
777
778
BlendDataset::~BlendDataset()
779
0
{
780
0
    m_oColorDS.ReleaseRef();
781
0
    m_oOverlayDS.ReleaseRef();
782
0
}
783
784
/************************************************************************/
785
/*                 BlendDataset::AcquireSourcePixels()                  */
786
/************************************************************************/
787
788
bool BlendDataset::AcquireSourcePixels(int nXOff, int nYOff, int nXSize,
789
                                       int nYSize, int nBufXSize, int nBufYSize,
790
                                       GDALRasterIOExtraArg *psExtraArg)
791
0
{
792
0
    if (nXOff == m_nCachedXOff && nYOff == m_nCachedYOff &&
793
0
        nXSize == m_nCachedXSize && nYSize == m_nCachedYSize &&
794
0
        nBufXSize == m_nCachedBufXSize && nBufYSize == m_nCachedBufYSize &&
795
0
        psExtraArg->eResampleAlg == m_sCachedExtraArg.eResampleAlg &&
796
0
        psExtraArg->bFloatingPointWindowValidity ==
797
0
            m_sCachedExtraArg.bFloatingPointWindowValidity &&
798
0
        (!psExtraArg->bFloatingPointWindowValidity ||
799
0
         (psExtraArg->dfXOff == m_sCachedExtraArg.dfXOff &&
800
0
          psExtraArg->dfYOff == m_sCachedExtraArg.dfYOff &&
801
0
          psExtraArg->dfXSize == m_sCachedExtraArg.dfXSize &&
802
0
          psExtraArg->dfYSize == m_sCachedExtraArg.dfYSize)))
803
0
    {
804
0
        return !m_abyBuffer.empty();
805
0
    }
806
807
0
    const int nColorCount = m_oColorDS.GetRasterCount();
808
0
    const int nOverlayCount = m_oOverlayDS.GetRasterCount();
809
0
    const int nCompsInBuffer = nColorCount + nOverlayCount;
810
0
    assert(nCompsInBuffer > 0);
811
812
0
    if (static_cast<size_t>(nBufXSize) >
813
0
        std::numeric_limits<size_t>::max() / nBufYSize / nCompsInBuffer)
814
0
    {
815
0
        CPLError(CE_Failure, CPLE_OutOfMemory,
816
0
                 "Out of memory allocating temporary buffer");
817
0
        m_abyBuffer.clear();
818
0
        m_ioError = true;
819
0
        return false;
820
0
    }
821
822
0
    const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
823
0
    try
824
0
    {
825
0
        if (m_abyBuffer.size() < nPixelCount * nCompsInBuffer)
826
0
            m_abyBuffer.resize(nPixelCount * nCompsInBuffer);
827
0
    }
828
0
    catch (const std::exception &)
829
0
    {
830
0
        CPLError(CE_Failure, CPLE_OutOfMemory,
831
0
                 "Out of memory allocating temporary buffer");
832
0
        m_abyBuffer.clear();
833
0
        m_ioError = true;
834
0
        return false;
835
0
    }
836
837
0
    const bool bOK =
838
0
        (m_oColorDS.RasterIO(GF_Read, nXOff, nYOff, nXSize, nYSize,
839
0
                             m_abyBuffer.data(), nBufXSize, nBufYSize,
840
0
                             GDT_UInt8, nColorCount, nullptr, 1, nBufXSize,
841
0
                             static_cast<GSpacing>(nPixelCount),
842
0
                             psExtraArg) == CE_None &&
843
0
         m_oOverlayDS.RasterIO(
844
0
             GF_Read, nXOff, nYOff, nXSize, nYSize,
845
0
             m_abyBuffer.data() + nPixelCount * nColorCount, nBufXSize,
846
0
             nBufYSize, GDT_UInt8, nOverlayCount, nullptr, 1, nBufXSize,
847
0
             static_cast<GSpacing>(nPixelCount), psExtraArg) == CE_None);
848
0
    if (bOK)
849
0
    {
850
0
        m_nCachedXOff = nXOff;
851
0
        m_nCachedYOff = nYOff;
852
0
        m_nCachedXSize = nXSize;
853
0
        m_nCachedYSize = nYSize;
854
0
        m_nCachedBufXSize = nBufXSize;
855
0
        m_nCachedBufYSize = nBufYSize;
856
0
        m_sCachedExtraArg = *psExtraArg;
857
0
    }
858
0
    else
859
0
    {
860
0
        m_abyBuffer.clear();
861
0
        m_ioError = true;
862
0
    }
863
0
    return bOK;
864
0
}
865
866
/************************************************************************/
867
/*                             gTabInvDstA                              */
868
/************************************************************************/
869
870
constexpr int SHIFT_DIV_DSTA = 8;
871
872
// Table of (255 * 256 + k/2) / k values for k in [0,255]
873
constexpr auto gTabInvDstA = []()
874
{
875
    std::array<uint16_t, 256> arr{};
876
877
    arr[0] = 0;
878
    for (int k = 1; k <= 255; ++k)
879
    {
880
        arr[k] = static_cast<uint16_t>(((255 << SHIFT_DIV_DSTA) + (k / 2)) / k);
881
    }
882
883
    return arr;
884
}();
885
886
/************************************************************************/
887
/*                        BlendMultiply_Generic                         */
888
/************************************************************************/
889
890
static void BlendMultiply_Generic(
891
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
892
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
893
    const GByte *CPL_RESTRICT pabyOverlayR,
894
    const GByte *CPL_RESTRICT pabyOverlayG,
895
    const GByte *CPL_RESTRICT pabyOverlayB,
896
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
897
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
898
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
899
0
{
900
901
    // Generic formulas from Mapserver
902
    // Dca' = Sca.Dca + Sca.(1 - Da) + Dca.(1 - Sa)
903
    // Da'  = Sa + Da - Sa.Da
904
905
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
906
    // TODO: optimize mathematically to avoid redundant computations
907
908
0
    GSpacing nDstOffset = 0;
909
910
0
    for (; i < N; ++i)
911
0
    {
912
913
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
914
0
        GByte nR, nG, nB, nA;
915
0
        GByte nFinalAlpha;
916
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
917
0
                             nA, nOverlayA, nFinalAlpha);
918
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
919
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
920
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
921
922
        // Result
923
0
        auto processComponent = [&nFinalAlpha](GByte C, GByte A, GByte OverlayC,
924
0
                                               GByte OverlayA) -> GByte
925
0
        {
926
0
            return DivScale255((MulScale255(C, OverlayC) +
927
0
                                MulScale255(C, 255 - OverlayA) +
928
0
                                MulScale255(OverlayC, 255 - A)),
929
0
                               nFinalAlpha);
930
0
        };
931
932
0
        pabyDst[nDstOffset] = processComponent(nR, nA, nOverlayR, nOverlayA);
933
934
        // Grayscale with alpha
935
0
        if (nOutputBands == 2)
936
0
        {
937
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
938
0
        }
939
0
        else
940
0
        {
941
            // RBG and RGBA
942
0
            if (nOutputBands >= 3)
943
0
            {
944
0
                pabyDst[nDstOffset + nBandSpace] =
945
0
                    processComponent(nG, nA, nOverlayG, nOverlayA);
946
0
                pabyDst[nDstOffset + 2 * nBandSpace] =
947
0
                    processComponent(nB, nA, nOverlayB, nOverlayA);
948
0
            }
949
950
            // RGBA
951
0
            if (nOutputBands == 4)
952
0
            {
953
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
954
0
            }
955
0
        }
956
0
        nDstOffset += nPixelSpace;
957
0
    }
958
0
}
959
960
/************************************************************************/
961
/*                         BlendScreen_Generic                          */
962
/************************************************************************/
963
964
static void BlendScreen_Generic(
965
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
966
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
967
    const GByte *CPL_RESTRICT pabyOverlayR,
968
    const GByte *CPL_RESTRICT pabyOverlayG,
969
    const GByte *CPL_RESTRICT pabyOverlayB,
970
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
971
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
972
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
973
0
{
974
975
    // Generic formulas from Mapserver
976
    // Dca' = Sca + Dca - Sca.Dca
977
    // Da'  = Sa + Da - Sa.Da
978
979
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
980
    // TODO: optimize mathematically to avoid redundant computations
981
982
0
    GSpacing nDstOffset = 0;
983
984
0
    for (; i < N; ++i)
985
0
    {
986
987
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
988
0
        GByte nR, nG, nB, nA;
989
0
        GByte nFinalAlpha;
990
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
991
0
                             nA, nOverlayA, nFinalAlpha);
992
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
993
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
994
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
995
996
        // Result
997
998
0
        auto processComponent = [&nFinalAlpha](GByte C, GByte OverlayC) -> GByte
999
0
        {
1000
0
            return DivScale255(C + OverlayC - MulScale255(C, OverlayC),
1001
0
                               nFinalAlpha);
1002
0
        };
1003
1004
0
        pabyDst[nDstOffset] = processComponent(nR, nOverlayR);
1005
1006
        // Grayscale with alpha
1007
0
        if (nOutputBands == 2)
1008
0
        {
1009
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1010
0
        }
1011
0
        else
1012
0
        {
1013
            // RBG and RGBA
1014
0
            if (nOutputBands >= 3)
1015
0
            {
1016
0
                pabyDst[nDstOffset + nBandSpace] =
1017
0
                    processComponent(nG, nOverlayG);
1018
0
                pabyDst[nDstOffset + 2 * nBandSpace] =
1019
0
                    processComponent(nB, nOverlayB);
1020
0
            }
1021
1022
            // RGBA
1023
0
            if (nOutputBands == 4)
1024
0
            {
1025
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1026
0
            }
1027
0
        }
1028
0
        nDstOffset += nPixelSpace;
1029
0
    }
1030
0
}
1031
1032
/************************************************************************/
1033
/*                         BlendOverlay_Generic                         */
1034
/************************************************************************/
1035
1036
static void BlendOverlay_Generic(
1037
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1038
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1039
    const GByte *CPL_RESTRICT pabyOverlayR,
1040
    const GByte *CPL_RESTRICT pabyOverlayG,
1041
    const GByte *CPL_RESTRICT pabyOverlayB,
1042
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1043
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1044
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1045
0
{
1046
1047
    // Generic formulas from Mapserver
1048
    // Where "D" is destination, "S" is source (overlay)
1049
    // if 2.Dca < Da
1050
    //   Dca' = 2.Sca.Dca + Sca.(1 - Da) + Dca.(1 - Sa)
1051
    // otherwise
1052
    //   Dca' = Sa.Da - 2.(Da - Dca).(Sa - Sca) + Sca.(1 - Da) + Dca.(1 - Sa)
1053
    //
1054
    // Da'  = Sa + Da - Sa.Da
1055
1056
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
1057
    // TODO: optimize mathematically to avoid redundant computations
1058
1059
0
    GSpacing nDstOffset = 0;
1060
1061
0
    for (; i < N; ++i)
1062
0
    {
1063
1064
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
1065
0
        GByte nR, nG, nB, nA;
1066
0
        GByte nFinalAlpha;
1067
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
1068
0
                             nA, nOverlayA, nFinalAlpha);
1069
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
1070
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
1071
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
1072
1073
        // Sa.Da
1074
0
        const GByte nAlphaMul{MulScale255(nOverlayA, nA)};
1075
1076
0
        auto processComponent_lessThan = [&nFinalAlpha](GByte C, GByte A,
1077
0
                                                        GByte OverlayC,
1078
0
                                                        GByte OverlayA) -> GByte
1079
0
        {
1080
0
            return DivScale255(2 * MulScale255(C, OverlayC) +
1081
0
                                   MulScale255(C, 255 - OverlayA) +
1082
0
                                   MulScale255(OverlayC, 255 - A),
1083
0
                               nFinalAlpha);
1084
0
        };
1085
1086
0
        auto processComponent_greaterEqual =
1087
0
            [&nFinalAlpha, &nAlphaMul](GByte C, GByte A, GByte OverlayC,
1088
0
                                       GByte OverlayA) -> GByte
1089
0
        {
1090
0
            return DivScale255(nAlphaMul -
1091
0
                                   2 * MulScale255(A - C, OverlayA - OverlayC) +
1092
0
                                   MulScale255(C, 255 - OverlayA) +
1093
0
                                   MulScale255(OverlayC, 255 - A),
1094
0
                               nFinalAlpha);
1095
0
        };
1096
1097
0
        if (2 * nR < nA)
1098
0
        {
1099
0
            pabyDst[nDstOffset] =
1100
0
                processComponent_lessThan(nR, nA, nOverlayR, nOverlayA);
1101
0
        }
1102
0
        else
1103
0
        {
1104
0
            pabyDst[nDstOffset] =
1105
0
                processComponent_greaterEqual(nR, nA, nOverlayR, nOverlayA);
1106
0
        }
1107
1108
        // Grayscale with alpha
1109
0
        if (nOutputBands == 2)
1110
0
        {
1111
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1112
0
        }
1113
0
        else
1114
0
        {
1115
            // RBG and RGBA
1116
0
            if (nOutputBands >= 3)
1117
0
            {
1118
1119
0
                if (2 * nG < nA)
1120
0
                {
1121
0
                    pabyDst[nDstOffset + nBandSpace] =
1122
0
                        processComponent_lessThan(nG, nA, nOverlayG, nOverlayA);
1123
0
                }
1124
0
                else
1125
0
                {
1126
0
                    pabyDst[nDstOffset + nBandSpace] =
1127
0
                        processComponent_greaterEqual(nG, nA, nOverlayG,
1128
0
                                                      nOverlayA);
1129
0
                }
1130
1131
0
                if (2 * nB < nA)
1132
0
                {
1133
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1134
0
                        processComponent_lessThan(nB, nA, nOverlayB, nOverlayA);
1135
0
                }
1136
0
                else
1137
0
                {
1138
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1139
0
                        processComponent_greaterEqual(nB, nA, nOverlayB,
1140
0
                                                      nOverlayA);
1141
0
                }
1142
0
            }
1143
1144
            // RGBA
1145
0
            if (nOutputBands == 4)
1146
0
            {
1147
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1148
0
            }
1149
0
        }
1150
0
        nDstOffset += nPixelSpace;
1151
0
    }
1152
0
}
1153
1154
/************************************************************************/
1155
/*                        BlendHardLight_Generic                        */
1156
/************************************************************************/
1157
1158
static void BlendHardLight_Generic(
1159
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1160
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1161
    const GByte *CPL_RESTRICT pabyOverlayR,
1162
    const GByte *CPL_RESTRICT pabyOverlayG,
1163
    const GByte *CPL_RESTRICT pabyOverlayB,
1164
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1165
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1166
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1167
0
{
1168
    // Hard Light is Overlay with roles of source and overlay swapped
1169
0
    BlendOverlay_Generic(pabyOverlayR, pabyOverlayG, pabyOverlayB, pabyOverlayA,
1170
0
                         pabyR, pabyG, pabyB, pabyA, pabyDst, nPixelSpace,
1171
0
                         nBandSpace, i, N, nOpacity, nOutputBands,
1172
0
                         !bSwappedOpacity);
1173
0
}
1174
1175
/************************************************************************/
1176
/*                         BlendDarken_Generic                          */
1177
/************************************************************************/
1178
1179
static void BlendDarken_Generic(
1180
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1181
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1182
    const GByte *CPL_RESTRICT pabyOverlayR,
1183
    const GByte *CPL_RESTRICT pabyOverlayG,
1184
    const GByte *CPL_RESTRICT pabyOverlayB,
1185
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1186
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1187
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1188
0
{
1189
    // Generic formulas from Mapserver
1190
    // Dca' = min(Sca.Da, Dca.Sa) + Sca.(1 - Da) + Dca.(1 - Sa)
1191
    // Da'  = Sa + Da - Sa.Da
1192
1193
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
1194
    // TODO: optimize mathematically to avoid redundant computations
1195
1196
0
    GSpacing nDstOffset = 0;
1197
1198
0
    for (; i < N; ++i)
1199
0
    {
1200
1201
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
1202
0
        GByte nR, nG, nB, nA;
1203
0
        GByte nFinalAlpha;
1204
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
1205
0
                             nA, nOverlayA, nFinalAlpha);
1206
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
1207
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
1208
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
1209
1210
        // Result
1211
1212
0
        auto processComponent = [&nFinalAlpha](GByte C, GByte A, GByte OverlayC,
1213
0
                                               GByte OverlayA) -> GByte
1214
0
        {
1215
0
            return DivScale255(
1216
0
                std::min(MulScale255(OverlayC, A), MulScale255(C, OverlayA)) +
1217
0
                    MulScale255(C, 255 - OverlayA) +
1218
0
                    MulScale255(OverlayC, 255 - A),
1219
0
                nFinalAlpha);
1220
0
        };
1221
1222
0
        pabyDst[nDstOffset] = processComponent(nR, nA, nOverlayR, nOverlayA);
1223
1224
        // Grayscale with alpha
1225
0
        if (nOutputBands == 2)
1226
0
        {
1227
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1228
0
        }
1229
0
        else
1230
0
        {
1231
            // RBG and RGBA
1232
0
            if (nOutputBands >= 3)
1233
0
            {
1234
0
                pabyDst[nDstOffset + nBandSpace] =
1235
0
                    processComponent(nG, nA, nOverlayG, nOverlayA);
1236
0
                pabyDst[nDstOffset + 2 * nBandSpace] =
1237
0
                    processComponent(nB, nA, nOverlayB, nOverlayA);
1238
0
            }
1239
1240
            // RGBA
1241
0
            if (nOutputBands == 4)
1242
0
            {
1243
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1244
0
            }
1245
0
        }
1246
0
        nDstOffset += nPixelSpace;
1247
0
    }
1248
0
}
1249
1250
/************************************************************************/
1251
/*                         BlendLighten_Generic                         */
1252
/************************************************************************/
1253
1254
static void BlendLighten_Generic(
1255
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1256
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1257
    const GByte *CPL_RESTRICT pabyOverlayR,
1258
    const GByte *CPL_RESTRICT pabyOverlayG,
1259
    const GByte *CPL_RESTRICT pabyOverlayB,
1260
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1261
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1262
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1263
0
{
1264
1265
    // Generic formulas from Mapserver
1266
    // Dca' = max(Sca.Da, Dca.Sa) + Sca.(1 - Da) + Dca.(1 - Sa)
1267
    // Da'  = Sa + Da - Sa.Da
1268
1269
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
1270
    // TODO: optimize mathematically to avoid redundant computations
1271
1272
0
    GSpacing nDstOffset = 0;
1273
1274
0
    for (; i < N; ++i)
1275
0
    {
1276
1277
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
1278
0
        GByte nR, nG, nB, nA;
1279
0
        GByte nFinalAlpha;
1280
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
1281
0
                             nA, nOverlayA, nFinalAlpha);
1282
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
1283
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
1284
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
1285
1286
        // Result
1287
1288
0
        auto processComponent = [&nFinalAlpha](GByte C, GByte A, GByte OverlayC,
1289
0
                                               GByte OverlayA) -> GByte
1290
0
        {
1291
0
            return DivScale255(
1292
0
                std::max(MulScale255(OverlayC, A), MulScale255(C, OverlayA)) +
1293
0
                    MulScale255(C, 255 - OverlayA) +
1294
0
                    MulScale255(OverlayC, 255 - A),
1295
0
                nFinalAlpha);
1296
0
        };
1297
1298
0
        pabyDst[nDstOffset] = processComponent(nR, nA, nOverlayR, nOverlayA);
1299
1300
        // Grayscale with alpha
1301
0
        if (nOutputBands == 2)
1302
0
        {
1303
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1304
0
        }
1305
0
        else
1306
0
        {
1307
            // RBG and RGBA
1308
0
            if (nOutputBands >= 3)
1309
0
            {
1310
0
                pabyDst[nDstOffset + nBandSpace] =
1311
0
                    processComponent(nG, nA, nOverlayG, nOverlayA);
1312
0
                pabyDst[nDstOffset + 2 * nBandSpace] =
1313
0
                    processComponent(nB, nA, nOverlayB, nOverlayA);
1314
0
            }
1315
1316
            // RGBA
1317
0
            if (nOutputBands == 4)
1318
0
            {
1319
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1320
0
            }
1321
0
        }
1322
0
        nDstOffset += nPixelSpace;
1323
0
    }
1324
0
}
1325
1326
/************************************************************************/
1327
/*                       BlendColorDodge_Generic                        */
1328
/************************************************************************/
1329
1330
static void BlendColorDodge_Generic(
1331
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1332
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1333
    const GByte *CPL_RESTRICT pabyOverlayR,
1334
    const GByte *CPL_RESTRICT pabyOverlayG,
1335
    const GByte *CPL_RESTRICT pabyOverlayB,
1336
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1337
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1338
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1339
0
{
1340
1341
    // Generic formulas from Mapserver
1342
    // if Sca.Da + Dca.Sa >= Sa.Da
1343
    //   Dca' = Sa.Da + Sca.(1 - Da) + Dca.(1 - Sa)
1344
    // otherwise
1345
    //   Dca' = Dca.Sa/(1-Sca/Sa) + Sca.(1 - Da) + Dca.(1 - Sa)
1346
    //
1347
    // Da'  = Sa + Da - Sa.Da
1348
1349
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
1350
    // TODO: optimize mathematically to avoid redundant computations
1351
1352
0
    GSpacing nDstOffset = 0;
1353
1354
0
    for (; i < N; ++i)
1355
0
    {
1356
1357
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
1358
0
        GByte nR, nG, nB, nA;
1359
0
        GByte nFinalAlpha;
1360
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
1361
0
                             nA, nOverlayA, nFinalAlpha);
1362
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
1363
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
1364
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
1365
1366
        // Result
1367
1368
0
        const GByte alphaMul255{MulScale255(nOverlayA, nA)};
1369
1370
        // Dca' = Sa.Da + Sca.(1 - Da) + Dca.(1 - Sa)
1371
0
        auto processComponent_greaterEqual =
1372
0
            [&nFinalAlpha, &alphaMul255](GByte C, GByte A, GByte OverlayC,
1373
0
                                         GByte OverlayA) -> GByte
1374
0
        {
1375
0
            return DivScale255(alphaMul255 + MulScale255(C, 255 - OverlayA) +
1376
0
                                   MulScale255(OverlayC, 255 - A),
1377
0
                               nFinalAlpha);
1378
0
        };
1379
1380
        // Dca.Sa/(1-Sca/Sa) + Sca.(1 - Da) + Dca.(1 - Sa)
1381
0
        auto processComponent_lessThan = [&nFinalAlpha](GByte C, GByte A,
1382
0
                                                        GByte OverlayC,
1383
0
                                                        GByte OverlayA) -> GByte
1384
0
        {
1385
0
            return DivScale255(
1386
0
                DivScale255(MulScale255(C, OverlayA),
1387
0
                            255 - DivScale255(OverlayC, OverlayA)) +
1388
0
                    MulScale255(C, 255 - OverlayA) +
1389
0
                    MulScale255(OverlayC, 255 - A),
1390
0
                nFinalAlpha);
1391
0
        };
1392
1393
0
        auto branchingCondition = [&alphaMul255](GByte C, GByte A,
1394
0
                                                 GByte OverlayC,
1395
0
                                                 GByte OverlayA) -> bool
1396
0
        {
1397
0
            return MulScale255(OverlayC, A) + MulScale255(C, OverlayA) >=
1398
0
                   alphaMul255;
1399
0
        };
1400
1401
0
        if (branchingCondition(nR, nA, nOverlayR, nOverlayA))
1402
0
        {
1403
0
            pabyDst[nDstOffset] =
1404
0
                processComponent_greaterEqual(nR, nA, nOverlayR, nOverlayA);
1405
0
        }
1406
0
        else
1407
0
        {
1408
0
            pabyDst[nDstOffset] =
1409
0
                processComponent_lessThan(nR, nA, nOverlayR, nOverlayA);
1410
0
        }
1411
1412
        // Grayscale with alpha
1413
0
        if (nOutputBands == 2)
1414
0
        {
1415
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1416
0
        }
1417
0
        else
1418
0
        {
1419
            // RBG and RGBA
1420
0
            if (nOutputBands >= 3)
1421
0
            {
1422
0
                if (branchingCondition(nG, nA, nOverlayG, nOverlayA))
1423
0
                {
1424
0
                    pabyDst[nDstOffset + nBandSpace] =
1425
0
                        processComponent_greaterEqual(nG, nA, nOverlayG,
1426
0
                                                      nOverlayA);
1427
0
                }
1428
0
                else
1429
0
                {
1430
0
                    pabyDst[nDstOffset + nBandSpace] =
1431
0
                        processComponent_lessThan(nG, nA, nOverlayG, nOverlayA);
1432
0
                }
1433
1434
0
                if (branchingCondition(nB, nA, nOverlayB, nOverlayA))
1435
0
                {
1436
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1437
0
                        processComponent_greaterEqual(nB, nA, nOverlayB,
1438
0
                                                      nOverlayA);
1439
0
                }
1440
0
                else
1441
0
                {
1442
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1443
0
                        processComponent_lessThan(nB, nA, nOverlayB, nOverlayA);
1444
0
                }
1445
0
            }
1446
1447
            // RGBA
1448
0
            if (nOutputBands == 4)
1449
0
            {
1450
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1451
0
            }
1452
0
        }
1453
0
        nDstOffset += nPixelSpace;
1454
0
    }
1455
0
}
1456
1457
/************************************************************************/
1458
/*                        BlendColorBurn_Generic                        */
1459
/************************************************************************/
1460
1461
static void BlendColorBurn_Generic(
1462
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1463
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1464
    const GByte *CPL_RESTRICT pabyOverlayR,
1465
    const GByte *CPL_RESTRICT pabyOverlayG,
1466
    const GByte *CPL_RESTRICT pabyOverlayB,
1467
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1468
    GSpacing nPixelSpace, GSpacing nBandSpace, size_t i, size_t N,
1469
    GByte nOpacity, int nOutputBands, bool bSwappedOpacity)
1470
0
{
1471
1472
    // if Sca.Da + Dca.Sa <= Sa.Da
1473
    //   Dca' = Sca.(1 - Da) + Dca.(1 - Sa)
1474
    // otherwise
1475
    //   Dca' = Sa.(Sca.Da + Dca.Sa - Sa.Da)/Sca + Sca.(1 - Da) + Dca.(1 - Sa)
1476
    // Da'  = Sa + Da - Sa.Da
1477
1478
    // TODO: optimize for the various cases (with/without alpha, grayscale/RGB)
1479
    // TODO: optimize mathematically to avoid redundant computations
1480
1481
0
    GSpacing nDstOffset = 0;
1482
1483
0
    for (; i < N; ++i)
1484
0
    {
1485
1486
0
        GByte nOverlayR, nOverlayG, nOverlayB, nOverlayA;
1487
0
        GByte nR, nG, nB, nA;
1488
0
        GByte nFinalAlpha;
1489
0
        ProcessAlphaChannels(i, pabyA, pabyOverlayA, nOpacity, bSwappedOpacity,
1490
0
                             nA, nOverlayA, nFinalAlpha);
1491
0
        PremultiplyChannels(i, pabyR, pabyG, pabyB, nR, nG, nB, nA);
1492
0
        PremultiplyChannels(i, pabyOverlayR, pabyOverlayG, pabyOverlayB,
1493
0
                            nOverlayR, nOverlayG, nOverlayB, nOverlayA);
1494
1495
        // Result
1496
1497
0
        const GByte alphaMul255{MulScale255(nOverlayA, nA)};
1498
1499
0
        auto processComponent_lessEqual =
1500
0
            [&nFinalAlpha](GByte C, GByte A, GByte OverlayC,
1501
0
                           GByte OverlayA) -> GByte
1502
0
        {
1503
0
            return DivScale255(MulScale255(C, 255 - OverlayA) +
1504
0
                                   MulScale255(OverlayC, 255 - A),
1505
0
                               nFinalAlpha);
1506
0
        };
1507
1508
        // The simplified formula below was tested and verified with the floating point equivalent [0..1] normalized version
1509
0
        auto processComponent_greater =
1510
0
            [&nFinalAlpha, &alphaMul255](GByte C, GByte A, GByte OverlayC,
1511
0
                                         GByte OverlayA) -> GByte
1512
0
        {
1513
0
            const GByte C_unpremultiplied = DivScale255(C, A);
1514
0
            const GByte Overlay_C_unpremultiplied =
1515
0
                DivScale255(OverlayC, OverlayA);
1516
0
            return DivScale255(
1517
0
                MulScale255(alphaMul255, (C_unpremultiplied +
1518
0
                                          Overlay_C_unpremultiplied - 255)) +
1519
0
                    MulScale255(C, 255 - OverlayA) +
1520
0
                    MulScale255(OverlayC, 255 - A),
1521
0
                nFinalAlpha);
1522
0
        };
1523
1524
0
        auto branchingCondition = [&alphaMul255](GByte C, GByte A,
1525
0
                                                 GByte OverlayC,
1526
0
                                                 GByte OverlayA) -> bool
1527
0
        {
1528
0
            return MulScale255(OverlayC, A) + MulScale255(C, OverlayA) <=
1529
0
                   alphaMul255;
1530
0
        };
1531
1532
0
        if (branchingCondition(nR, nA, nOverlayR, nOverlayA))
1533
0
        {
1534
0
            pabyDst[nDstOffset] =
1535
0
                processComponent_lessEqual(nR, nA, nOverlayR, nOverlayA);
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            pabyDst[nDstOffset] =
1540
0
                processComponent_greater(nR, nA, nOverlayR, nOverlayA);
1541
0
        }
1542
1543
        // Grayscale with alpha
1544
0
        if (nOutputBands == 2)
1545
0
        {
1546
0
            pabyDst[nDstOffset + nBandSpace] = nFinalAlpha;
1547
0
        }
1548
0
        else
1549
0
        {
1550
            // RBG and RGBA
1551
0
            if (nOutputBands >= 3)
1552
0
            {
1553
0
                if (branchingCondition(nG, nA, nOverlayG, nOverlayA))
1554
0
                {
1555
0
                    pabyDst[nDstOffset + nBandSpace] =
1556
0
                        processComponent_lessEqual(nG, nA, nOverlayG,
1557
0
                                                   nOverlayA);
1558
0
                }
1559
0
                else
1560
0
                {
1561
0
                    pabyDst[nDstOffset + nBandSpace] =
1562
0
                        processComponent_greater(nG, nA, nOverlayG, nOverlayA);
1563
0
                }
1564
1565
0
                if (branchingCondition(nB, nA, nOverlayB, nOverlayA))
1566
0
                {
1567
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1568
0
                        processComponent_lessEqual(nB, nA, nOverlayB,
1569
0
                                                   nOverlayA);
1570
0
                }
1571
0
                else
1572
0
                {
1573
0
                    pabyDst[nDstOffset + 2 * nBandSpace] =
1574
0
                        processComponent_greater(nB, nA, nOverlayB, nOverlayA);
1575
0
                }
1576
0
            }
1577
1578
            // RGBA
1579
0
            if (nOutputBands == 4)
1580
0
            {
1581
0
                pabyDst[nDstOffset + 3 * nBandSpace] = nFinalAlpha;
1582
0
            }
1583
0
        }
1584
0
        nDstOffset += nPixelSpace;
1585
0
    }
1586
0
}
1587
1588
/************************************************************************/
1589
/*                       BlendSrcOverRGBA_SSE2()                        */
1590
/************************************************************************/
1591
1592
#ifdef HAVE_SSE2
1593
#if defined(__GNUC__)
1594
__attribute__((noinline))
1595
#endif
1596
static int BlendSrcOverRGBA_SSE2(const GByte *CPL_RESTRICT pabyR,
1597
                                 const GByte *CPL_RESTRICT pabyG,
1598
                                 const GByte *CPL_RESTRICT pabyB,
1599
                                 const GByte *CPL_RESTRICT pabyA,
1600
                                 const GByte *CPL_RESTRICT pabyOverlayR,
1601
                                 const GByte *CPL_RESTRICT pabyOverlayG,
1602
                                 const GByte *CPL_RESTRICT pabyOverlayB,
1603
                                 const GByte *CPL_RESTRICT pabyOverlayA,
1604
                                 GByte *CPL_RESTRICT pabyDst,
1605
                                 GSpacing nBandSpace, int N, GByte nOpacity)
1606
0
{
1607
    // See scalar code after call to BlendSrcOverRGBA_SSE2() below for the
1608
    // non-obfuscated formulas...
1609
1610
0
    const auto load_and_unpack = [](const void *p)
1611
0
    {
1612
0
        const auto zero = _mm_setzero_si128();
1613
0
        auto overlayA = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
1614
0
        return std::make_pair(_mm_unpacklo_epi8(overlayA, zero),
1615
0
                              _mm_unpackhi_epi8(overlayA, zero));
1616
0
    };
1617
1618
0
    const auto pack_and_store = [](void *p, __m128i lo, __m128i hi)
1619
0
    {
1620
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(p),
1621
0
                         _mm_packus_epi16(lo, hi));
1622
0
    };
1623
1624
0
    const auto mul16bit_8bit_result = [](__m128i a, __m128i b)
1625
0
    {
1626
0
        const auto r255 = _mm_set1_epi16(255);
1627
0
        return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(a, b), r255), 8);
1628
0
    };
1629
1630
0
    const auto opacity = _mm_set1_epi16(nOpacity);
1631
0
    const auto r255 = _mm_set1_epi16(255);
1632
0
    const int16_t *tabInvDstASigned =
1633
0
        reinterpret_cast<const int16_t *>(gTabInvDstA.data());
1634
0
    constexpr int REG_WIDTH = static_cast<int>(sizeof(opacity));
1635
1636
0
    int i = 0;
1637
0
    for (; i <= N - REG_WIDTH; i += REG_WIDTH)
1638
0
    {
1639
0
        auto [overlayA_lo, overlayA_hi] = load_and_unpack(pabyOverlayA + i);
1640
0
        auto [srcA_lo, srcA_hi] = load_and_unpack(pabyA + i);
1641
0
        overlayA_lo = mul16bit_8bit_result(overlayA_lo, opacity);
1642
0
        overlayA_hi = mul16bit_8bit_result(overlayA_hi, opacity);
1643
0
        auto srcAMul255MinusOverlayA_lo =
1644
0
            mul16bit_8bit_result(srcA_lo, _mm_sub_epi16(r255, overlayA_lo));
1645
0
        auto srcAMul255MinusOverlayA_hi =
1646
0
            mul16bit_8bit_result(srcA_hi, _mm_sub_epi16(r255, overlayA_hi));
1647
0
        auto dstA_lo = _mm_add_epi16(overlayA_lo, srcAMul255MinusOverlayA_lo);
1648
0
        auto dstA_hi = _mm_add_epi16(overlayA_hi, srcAMul255MinusOverlayA_hi);
1649
1650
        // This would be the equivalent of a "_mm_i16gather_epi16" operation
1651
        // which does not exist...
1652
        // invDstA_{i} = [tabInvDstASigned[dstA_{i}] for i in range(8)]
1653
0
        auto invDstA_lo = _mm_undefined_si128();
1654
0
        auto invDstA_hi = _mm_undefined_si128();
1655
0
#define SET_INVDSTA(k)                                                         \
1656
0
    do                                                                         \
1657
0
    {                                                                          \
1658
0
        const int idxLo = _mm_extract_epi16(dstA_lo, k);                       \
1659
0
        const int idxHi = _mm_extract_epi16(dstA_hi, k);                       \
1660
0
        invDstA_lo = _mm_insert_epi16(invDstA_lo, tabInvDstASigned[idxLo], k); \
1661
0
        invDstA_hi = _mm_insert_epi16(invDstA_hi, tabInvDstASigned[idxHi], k); \
1662
0
    } while (0)
1663
1664
0
        SET_INVDSTA(0);
1665
0
        SET_INVDSTA(1);
1666
0
        SET_INVDSTA(2);
1667
0
        SET_INVDSTA(3);
1668
0
        SET_INVDSTA(4);
1669
0
        SET_INVDSTA(5);
1670
0
        SET_INVDSTA(6);
1671
0
        SET_INVDSTA(7);
1672
1673
0
        pack_and_store(pabyDst + i + 3 * nBandSpace, dstA_lo, dstA_hi);
1674
1675
0
#define PROCESS_COMPONENT(pabySrc, pabyOverlay, iBand)                         \
1676
0
    do                                                                         \
1677
0
    {                                                                          \
1678
0
        auto [src_lo, src_hi] = load_and_unpack((pabySrc) + i);                \
1679
0
        auto [overlay_lo, overlay_hi] = load_and_unpack((pabyOverlay) + i);    \
1680
0
        auto dst_lo = _mm_srli_epi16(                                          \
1681
0
            _mm_add_epi16(                                                     \
1682
0
                _mm_add_epi16(                                                 \
1683
0
                    _mm_mullo_epi16(overlay_lo, overlayA_lo),                  \
1684
0
                    _mm_mullo_epi16(src_lo, srcAMul255MinusOverlayA_lo)),      \
1685
0
                r255),                                                         \
1686
0
            8);                                                                \
1687
0
        auto dst_hi = _mm_srli_epi16(                                          \
1688
0
            _mm_add_epi16(                                                     \
1689
0
                _mm_add_epi16(                                                 \
1690
0
                    _mm_mullo_epi16(overlay_hi, overlayA_hi),                  \
1691
0
                    _mm_mullo_epi16(src_hi, srcAMul255MinusOverlayA_hi)),      \
1692
0
                r255),                                                         \
1693
0
            8);                                                                \
1694
0
        dst_lo = mul16bit_8bit_result(dst_lo, invDstA_lo);                     \
1695
0
        dst_hi = mul16bit_8bit_result(dst_hi, invDstA_hi);                     \
1696
0
        pack_and_store(pabyDst + i + (iBand) * nBandSpace, dst_lo, dst_hi);    \
1697
0
    } while (0)
1698
1699
0
        PROCESS_COMPONENT(pabyR, pabyOverlayR, 0);
1700
0
        PROCESS_COMPONENT(pabyG, pabyOverlayG, 1);
1701
0
        PROCESS_COMPONENT(pabyB, pabyOverlayB, 2);
1702
0
    }
1703
0
    return i;
1704
0
}
1705
#endif
1706
1707
template <bool bPixelSpaceIsOne>
1708
#if defined(__GNUC__)
1709
__attribute__((noinline))
1710
#endif
1711
static void BlendSrcOverRGBA_Generic(
1712
    const GByte *CPL_RESTRICT pabyR, const GByte *CPL_RESTRICT pabyG,
1713
    const GByte *CPL_RESTRICT pabyB, const GByte *CPL_RESTRICT pabyA,
1714
    const GByte *CPL_RESTRICT pabyOverlayR,
1715
    const GByte *CPL_RESTRICT pabyOverlayG,
1716
    const GByte *CPL_RESTRICT pabyOverlayB,
1717
    const GByte *CPL_RESTRICT pabyOverlayA, GByte *CPL_RESTRICT pabyDst,
1718
    [[maybe_unused]] GSpacing nPixelSpace, GSpacing nBandSpace, int i, int N,
1719
    GByte nOpacity)
1720
0
{
1721
#if !(defined(HAVE_SSE2) && defined(__GNUC__))
1722
    if constexpr (!bPixelSpaceIsOne)
1723
    {
1724
        assert(nPixelSpace != 1);
1725
    }
1726
#endif
1727
0
    [[maybe_unused]] GSpacing nDstOffset = 0;
1728
0
#if defined(HAVE_SSE2) && defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
1729
// No need to vectorize for SSE2 and clang
1730
0
#pragma clang loop vectorize(disable)
1731
0
#endif
1732
0
    for (; i < N; ++i)
1733
0
    {
1734
0
        const GByte nOverlayR = pabyOverlayR[i];
1735
0
        const GByte nOverlayG = pabyOverlayG[i];
1736
0
        const GByte nOverlayB = pabyOverlayB[i];
1737
0
        const GByte nOverlayA =
1738
0
            static_cast<GByte>((pabyOverlayA[i] * nOpacity + 255) / 256);
1739
0
        const GByte nR = pabyR[i];
1740
0
        const GByte nG = pabyG[i];
1741
0
        const GByte nB = pabyB[i];
1742
0
        const GByte nA = pabyA[i];
1743
0
        const GByte nSrcAMul255MinusOverlayA =
1744
0
            static_cast<GByte>((nA * (255 - nOverlayA) + 255) / 256);
1745
0
        const GByte nDstA =
1746
0
            static_cast<GByte>(nOverlayA + nSrcAMul255MinusOverlayA);
1747
0
        GByte nDstR = static_cast<GByte>(
1748
0
            (nOverlayR * nOverlayA + nR * nSrcAMul255MinusOverlayA + 255) /
1749
0
            256);
1750
0
        GByte nDstG = static_cast<GByte>(
1751
0
            (nOverlayG * nOverlayA + nG * nSrcAMul255MinusOverlayA + 255) /
1752
0
            256);
1753
0
        GByte nDstB = static_cast<GByte>(
1754
0
            (nOverlayB * nOverlayA + nB * nSrcAMul255MinusOverlayA + 255) /
1755
0
            256);
1756
        // nInvDstA = (255 << SHIFT_DIV_DSTA) / nDstA;
1757
0
        const uint16_t nInvDstA = gTabInvDstA[nDstA];
1758
0
        constexpr unsigned ROUND_OFFSET_DIV_DSTA = ((1 << SHIFT_DIV_DSTA) - 1);
1759
0
        nDstR = static_cast<GByte>((nDstR * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
1760
0
                                   SHIFT_DIV_DSTA);
1761
0
        nDstG = static_cast<GByte>((nDstG * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
1762
0
                                   SHIFT_DIV_DSTA);
1763
0
        nDstB = static_cast<GByte>((nDstB * nInvDstA + ROUND_OFFSET_DIV_DSTA) >>
1764
0
                                   SHIFT_DIV_DSTA);
1765
        if constexpr (bPixelSpaceIsOne)
1766
        {
1767
            pabyDst[i] = nDstR;
1768
            pabyDst[i + nBandSpace] = nDstG;
1769
            pabyDst[i + 2 * nBandSpace] = nDstB;
1770
            pabyDst[i + 3 * nBandSpace] = nDstA;
1771
        }
1772
        else
1773
0
        {
1774
0
            pabyDst[nDstOffset] = nDstR;
1775
0
            pabyDst[nDstOffset + nBandSpace] = nDstG;
1776
0
            pabyDst[nDstOffset + 2 * nBandSpace] = nDstB;
1777
0
            pabyDst[nDstOffset + 3 * nBandSpace] = nDstA;
1778
0
            nDstOffset += nPixelSpace;
1779
0
        }
1780
0
    }
1781
0
}
1782
1783
/************************************************************************/
1784
/*                      BlendDataset::IRasterIO()                       */
1785
/************************************************************************/
1786
1787
CPLErr BlendDataset::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
1788
                               int nXSize, int nYSize, void *pData,
1789
                               int nBufXSize, int nBufYSize,
1790
                               GDALDataType eBufType, int nBandCount,
1791
                               BANDMAP_TYPE panBandMap, GSpacing nPixelSpace,
1792
                               GSpacing nLineSpace, GSpacing nBandSpace,
1793
                               GDALRasterIOExtraArg *psExtraArg)
1794
0
{
1795
    // Try to pass the request to the most appropriate overview dataset.
1796
0
    if (nBufXSize < nXSize && nBufYSize < nYSize)
1797
0
    {
1798
0
        int bTried = FALSE;
1799
0
        const CPLErr eErr = TryOverviewRasterIO(
1800
0
            eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize, nBufYSize,
1801
0
            eBufType, nBandCount, panBandMap, nPixelSpace, nLineSpace,
1802
0
            nBandSpace, psExtraArg, &bTried);
1803
0
        if (bTried)
1804
0
            return eErr;
1805
0
    }
1806
1807
0
    GByte *const CPL_RESTRICT pabyDst = static_cast<GByte *>(pData);
1808
0
    const int nColorCount = m_oColorDS.GetRasterCount();
1809
0
    const int nOverlayCount = m_oOverlayDS.GetRasterCount();
1810
1811
    /************************************************************************/
1812
    /* HSV_VALUE                                                            */
1813
    /*************************************************************************/
1814
0
    if (nOverlayCount == 1 && m_opacity255Scale == 255 &&
1815
0
        m_operator == CompositionMode::HSV_VALUE && eRWFlag == GF_Read &&
1816
0
        eBufType == GDT_UInt8 && nBandCount == nBands &&
1817
0
        IsAllBands(nBands, panBandMap) &&
1818
0
        AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize,
1819
0
                            psExtraArg))
1820
0
    {
1821
0
        const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
1822
0
        const GByte *pabyR = m_abyBuffer.data();
1823
0
        const GByte *pabyG = m_abyBuffer.data() + nPixelCount;
1824
0
        const GByte *pabyB = m_abyBuffer.data() + nPixelCount * 2;
1825
0
        const GByte *pabyValue = m_abyBuffer.data() + nPixelCount * nColorCount;
1826
0
        size_t nSrcIdx = 0;
1827
0
        for (int j = 0; j < nBufYSize; ++j)
1828
0
        {
1829
0
            auto nDstOffset = j * nLineSpace;
1830
0
            if (nPixelSpace == 1 && nLineSpace >= nPixelSpace * nBufXSize &&
1831
0
                nBandSpace >= nLineSpace * nBufYSize)
1832
0
            {
1833
0
                patch_value_line(nBufXSize, pabyR + nSrcIdx, pabyG + nSrcIdx,
1834
0
                                 pabyB + nSrcIdx, pabyValue + nSrcIdx,
1835
0
                                 pabyDst + nDstOffset,
1836
0
                                 pabyDst + nDstOffset + nBandSpace,
1837
0
                                 pabyDst + nDstOffset + 2 * nBandSpace);
1838
0
                nSrcIdx += nBufXSize;
1839
0
            }
1840
0
            else
1841
0
            {
1842
0
                for (int i = 0; i < nBufXSize;
1843
0
                     ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
1844
0
                {
1845
0
                    float h, s;
1846
0
                    rgb_to_hs(pabyR[nSrcIdx], pabyG[nSrcIdx], pabyB[nSrcIdx],
1847
0
                              &h, &s);
1848
0
                    hsv_to_rgb(h, s, pabyValue[nSrcIdx],
1849
0
                               &pabyDst[nDstOffset + 0 * nBandSpace],
1850
0
                               &pabyDst[nDstOffset + 1 * nBandSpace],
1851
0
                               &pabyDst[nDstOffset + 2 * nBandSpace]);
1852
0
                }
1853
0
            }
1854
0
        }
1855
0
        if (nColorCount == 4)
1856
0
        {
1857
0
            for (int j = 0; j < nBufYSize; ++j)
1858
0
            {
1859
0
                auto nDstOffset = 3 * nBandSpace + j * nLineSpace;
1860
0
                const GByte *pabyA = m_abyBuffer.data() + nPixelCount * 3;
1861
0
                GDALCopyWords64(pabyA, GDT_UInt8, 1, pabyDst + nDstOffset,
1862
0
                                GDT_UInt8, static_cast<int>(nPixelSpace),
1863
0
                                nBufXSize);
1864
0
            }
1865
0
        }
1866
1867
0
        return CE_None;
1868
0
    }
1869
1870
    /************************************************************************/
1871
    /* SRC_OVER                                                             */
1872
    /************************************************************************/
1873
0
    else if (nOverlayCount == 4 && nColorCount == 4 &&
1874
0
             m_operator == CompositionMode::SRC_OVER && eRWFlag == GF_Read &&
1875
0
             eBufType == GDT_UInt8 && nBandCount == nBands &&
1876
0
             IsAllBands(nBands, panBandMap) &&
1877
0
             AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize, nBufXSize,
1878
0
                                 nBufYSize, psExtraArg))
1879
0
    {
1880
0
        const GByte nOpacity = static_cast<GByte>(m_opacity255Scale);
1881
0
        const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
1882
0
        const GByte *CPL_RESTRICT pabyR = m_abyBuffer.data();
1883
0
        const GByte *CPL_RESTRICT pabyG = m_abyBuffer.data() + nPixelCount;
1884
0
        const GByte *CPL_RESTRICT pabyB = m_abyBuffer.data() + nPixelCount * 2;
1885
0
        const GByte *CPL_RESTRICT pabyA = m_abyBuffer.data() + nPixelCount * 3;
1886
0
        const GByte *CPL_RESTRICT pabyOverlayR =
1887
0
            m_abyBuffer.data() + nPixelCount * nColorCount;
1888
0
        const GByte *CPL_RESTRICT pabyOverlayG =
1889
0
            m_abyBuffer.data() + nPixelCount * (nColorCount + 1);
1890
0
        const GByte *CPL_RESTRICT pabyOverlayB =
1891
0
            m_abyBuffer.data() + nPixelCount * (nColorCount + 2);
1892
0
        const GByte *CPL_RESTRICT pabyOverlayA =
1893
0
            m_abyBuffer.data() + nPixelCount * (nColorCount + 3);
1894
0
        size_t nSrcIdx = 0;
1895
0
        for (int j = 0; j < nBufYSize; ++j)
1896
0
        {
1897
0
            auto nDstOffset = j * nLineSpace;
1898
0
            int i = 0;
1899
0
#ifdef HAVE_SSE2
1900
0
            if (nPixelSpace == 1)
1901
0
            {
1902
0
                i = BlendSrcOverRGBA_SSE2(
1903
0
                    pabyR + nSrcIdx, pabyG + nSrcIdx, pabyB + nSrcIdx,
1904
0
                    pabyA + nSrcIdx, pabyOverlayR + nSrcIdx,
1905
0
                    pabyOverlayG + nSrcIdx, pabyOverlayB + nSrcIdx,
1906
0
                    pabyOverlayA + nSrcIdx, pabyDst + nDstOffset, nBandSpace,
1907
0
                    nBufXSize, nOpacity);
1908
0
                nSrcIdx += i;
1909
0
                nDstOffset += i;
1910
0
            }
1911
0
#endif
1912
#if !(defined(HAVE_SSE2) && defined(__GNUC__))
1913
            if (nPixelSpace == 1)
1914
            {
1915
                // Note: clang 20 does a rather good job at autovectorizing
1916
                // for SSE2, but BlendSrcOverRGBA_SSE2() performs better.
1917
                BlendSrcOverRGBA_Generic</* bPixelSpaceIsOne = */ true>(
1918
                    pabyR + nSrcIdx, pabyG + nSrcIdx, pabyB + nSrcIdx,
1919
                    pabyA + nSrcIdx, pabyOverlayR + nSrcIdx,
1920
                    pabyOverlayG + nSrcIdx, pabyOverlayB + nSrcIdx,
1921
                    pabyOverlayA + nSrcIdx, pabyDst + nDstOffset,
1922
                    /*nPixelSpace = */ 1, nBandSpace, i, nBufXSize, nOpacity);
1923
            }
1924
            else
1925
#endif
1926
0
            {
1927
0
                BlendSrcOverRGBA_Generic</* bPixelSpaceIsOne = */ false>(
1928
0
                    pabyR + nSrcIdx, pabyG + nSrcIdx, pabyB + nSrcIdx,
1929
0
                    pabyA + nSrcIdx, pabyOverlayR + nSrcIdx,
1930
0
                    pabyOverlayG + nSrcIdx, pabyOverlayB + nSrcIdx,
1931
0
                    pabyOverlayA + nSrcIdx, pabyDst + nDstOffset, nPixelSpace,
1932
0
                    nBandSpace, i, nBufXSize, nOpacity);
1933
0
            }
1934
0
            nSrcIdx += nBufXSize - i;
1935
0
        }
1936
0
        return CE_None;
1937
0
    }
1938
1939
    /************************************************************************/
1940
    /* OTHER OPERATORS                                                      */
1941
    /************************************************************************/
1942
1943
0
    else if ((m_operator == CompositionMode::MULTIPLY ||
1944
0
              m_operator == CompositionMode::OVERLAY ||
1945
0
              m_operator == CompositionMode::SCREEN ||
1946
0
              m_operator == CompositionMode::HARD_LIGHT ||
1947
0
              m_operator == CompositionMode::DARKEN ||
1948
0
              m_operator == CompositionMode::LIGHTEN ||
1949
0
              m_operator == CompositionMode::COLOR_BURN ||
1950
0
              m_operator == CompositionMode::COLOR_DODGE) &&
1951
0
             eRWFlag == GF_Read && eBufType == GDT_UInt8 &&
1952
0
             nBandCount == nBands && IsAllBands(nBands, panBandMap) &&
1953
0
             AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize, nBufXSize,
1954
0
                                 nBufYSize, psExtraArg))
1955
0
    {
1956
        // We should have optimized paths for 1, 2, 3 and 4 bands on input and overlay
1957
        // permutations but let's keep it simple for now.
1958
0
        const GByte nOpacity = static_cast<GByte>(m_opacity255Scale);
1959
0
        const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
1960
0
        const GByte *CPL_RESTRICT pabyR = m_abyBuffer.data();
1961
0
        GByte *CPL_RESTRICT pabyG = nullptr;
1962
0
        GByte *CPL_RESTRICT pabyB = nullptr;
1963
0
        GByte *CPL_RESTRICT pabyA = nullptr;
1964
0
        switch (nColorCount)
1965
0
        {
1966
0
            case 2:
1967
0
                pabyA = m_abyBuffer.data() + nPixelCount;
1968
0
                break;
1969
0
            case 3:
1970
0
                pabyG = m_abyBuffer.data() + nPixelCount;
1971
0
                pabyB = m_abyBuffer.data() + nPixelCount * 2;
1972
0
                break;
1973
0
            case 4:
1974
0
                pabyG = m_abyBuffer.data() + nPixelCount;
1975
0
                pabyB = m_abyBuffer.data() + nPixelCount * 2;
1976
0
                pabyA = m_abyBuffer.data() + nPixelCount * 3;
1977
0
                break;
1978
0
        }
1979
1980
0
        const GByte *CPL_RESTRICT pabyOverlayR =
1981
0
            m_abyBuffer.data() + nPixelCount * nColorCount;
1982
0
        GByte *CPL_RESTRICT pabyOverlayG = nullptr;
1983
0
        GByte *CPL_RESTRICT pabyOverlayB = nullptr;
1984
0
        GByte *CPL_RESTRICT pabyOverlayA = nullptr;
1985
0
        switch (nOverlayCount)
1986
0
        {
1987
0
            case 2:
1988
0
                pabyOverlayA =
1989
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 1);
1990
0
                break;
1991
0
            case 3:
1992
0
                pabyOverlayG =
1993
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 1);
1994
0
                pabyOverlayB =
1995
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 2);
1996
0
                break;
1997
0
            case 4:
1998
0
                pabyOverlayG =
1999
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 1);
2000
0
                pabyOverlayB =
2001
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 2);
2002
0
                pabyOverlayA =
2003
0
                    m_abyBuffer.data() + nPixelCount * (nColorCount + 3);
2004
0
                break;
2005
0
        }
2006
2007
0
        size_t nSrcIdx = 0;
2008
0
        for (int j = 0; j < nBufYSize; ++j)
2009
0
        {
2010
0
            auto nDstOffset = j * nLineSpace;
2011
0
            int i = 0;
2012
2013
0
            const GByte *CPL_RESTRICT pabyOverlayG_current =
2014
0
                pabyOverlayG ? pabyOverlayG + nSrcIdx : nullptr;
2015
0
            const GByte *CPL_RESTRICT pabyOverlayB_current =
2016
0
                pabyOverlayB ? pabyOverlayB + nSrcIdx : nullptr;
2017
0
            const GByte *CPL_RESTRICT pabyOverlayA_current =
2018
0
                pabyOverlayA ? pabyOverlayA + nSrcIdx : nullptr;
2019
2020
0
            const GByte *CPL_RESTRICT pabyG_current =
2021
0
                pabyG ? pabyG + nSrcIdx : nullptr;
2022
0
            const GByte *CPL_RESTRICT pabyB_current =
2023
0
                pabyB ? pabyB + nSrcIdx : nullptr;
2024
0
            const GByte *CPL_RESTRICT pabyA_current =
2025
0
                pabyA ? pabyA + nSrcIdx : nullptr;
2026
2027
            // Determine the number of  bands
2028
0
            const int nInputBands{1 + (pabyG ? 2 : 0) + (pabyA ? 1 : 0)};
2029
0
            const int nOverlayBands{1 + (pabyOverlayG ? 2 : 0) +
2030
0
                                    (pabyOverlayA ? 1 : 0)};
2031
0
            const int nOutputBands = std::max(nInputBands, nOverlayBands);
2032
2033
0
            if (m_operator == CompositionMode::SCREEN)
2034
0
                BlendScreen_Generic(
2035
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2036
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2037
0
                    pabyOverlayB_current, pabyOverlayA_current,
2038
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2039
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2040
0
            else if (m_operator == CompositionMode::MULTIPLY)
2041
0
                BlendMultiply_Generic(
2042
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2043
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2044
0
                    pabyOverlayB_current, pabyOverlayA_current,
2045
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2046
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2047
0
            else if (m_operator == CompositionMode::HARD_LIGHT)
2048
0
                BlendHardLight_Generic(
2049
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2050
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2051
0
                    pabyOverlayB_current, pabyOverlayA_current,
2052
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2053
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2054
0
            else if (m_operator == CompositionMode::OVERLAY)
2055
0
                BlendOverlay_Generic(
2056
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2057
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2058
0
                    pabyOverlayB_current, pabyOverlayA_current,
2059
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2060
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2061
0
            else if (m_operator == CompositionMode::DARKEN)
2062
0
                BlendDarken_Generic(
2063
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2064
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2065
0
                    pabyOverlayB_current, pabyOverlayA_current,
2066
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2067
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2068
0
            else if (m_operator == CompositionMode::LIGHTEN)
2069
0
                BlendLighten_Generic(
2070
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2071
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2072
0
                    pabyOverlayB_current, pabyOverlayA_current,
2073
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2074
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2075
0
            else if (m_operator == CompositionMode::COLOR_BURN)
2076
0
                BlendColorBurn_Generic(
2077
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2078
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2079
0
                    pabyOverlayB_current, pabyOverlayA_current,
2080
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2081
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2082
0
            else if (m_operator == CompositionMode::COLOR_DODGE)
2083
0
                BlendColorDodge_Generic(
2084
0
                    pabyR + nSrcIdx, pabyG_current, pabyB_current,
2085
0
                    pabyA_current, pabyOverlayR + nSrcIdx, pabyOverlayG_current,
2086
0
                    pabyOverlayB_current, pabyOverlayA_current,
2087
0
                    pabyDst + nDstOffset, nPixelSpace, nBandSpace, i, nBufXSize,
2088
0
                    nOpacity, nOutputBands, m_bSwappedOpacity);
2089
0
            else
2090
0
            {
2091
0
                CPLAssert(false);
2092
0
            }
2093
2094
0
            nSrcIdx += nBufXSize - i;
2095
0
        }
2096
0
        return CE_None;
2097
0
    }
2098
2099
    /************************************************************************/
2100
    /* ERRORS                                                               */
2101
    /************************************************************************/
2102
0
    else if (m_ioError)
2103
0
    {
2104
0
        return CE_Failure;
2105
0
    }
2106
0
    else
2107
0
    {
2108
0
        const CPLErr eErr = GDALDataset::IRasterIO(
2109
0
            eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize, nBufYSize,
2110
0
            eBufType, nBandCount, panBandMap, nPixelSpace, nLineSpace,
2111
0
            nBandSpace, psExtraArg);
2112
0
        m_ioError = eErr != CE_None;
2113
0
        return eErr;
2114
0
    }
2115
0
}
2116
2117
/************************************************************************/
2118
/*                       SrcOverRGBOneComponent()                       */
2119
/************************************************************************/
2120
2121
// GCC and clang do a god job a auto vectorizing the below function
2122
#if defined(__GNUC__) && !defined(__clang__)
2123
__attribute__((optimize("tree-vectorize")))
2124
#endif
2125
static void SrcOverRGB(const uint8_t *const __restrict pabyOverlay,
2126
                       const uint8_t *const __restrict pabySrc,
2127
                       uint8_t *const __restrict pabyDst, const size_t N,
2128
                       const uint8_t nOpacity)
2129
0
{
2130
0
    for (size_t i = 0; i < N; ++i)
2131
0
    {
2132
0
        const uint8_t nOverlay = pabyOverlay[i];
2133
0
        const uint8_t nSrc = pabySrc[i];
2134
0
        pabyDst[i] = static_cast<uint8_t>(
2135
0
            (nOverlay * nOpacity + nSrc * (255 - nOpacity) + 255) / 256);
2136
0
    }
2137
0
}
2138
2139
/************************************************************************/
2140
/*                        BlendBand::IRasterIO()                        */
2141
/************************************************************************/
2142
2143
CPLErr BlendBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
2144
                            int nXSize, int nYSize, void *pData, int nBufXSize,
2145
                            int nBufYSize, GDALDataType eBufType,
2146
                            GSpacing nPixelSpace, GSpacing nLineSpace,
2147
                            GDALRasterIOExtraArg *psExtraArg)
2148
0
{
2149
    // Try to pass the request to the most appropriate overview dataset.
2150
0
    if (nBufXSize < nXSize && nBufYSize < nYSize)
2151
0
    {
2152
0
        int bTried = FALSE;
2153
0
        const CPLErr eErr = TryOverviewRasterIO(
2154
0
            eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize, nBufYSize,
2155
0
            eBufType, nPixelSpace, nLineSpace, psExtraArg, &bTried);
2156
0
        if (bTried)
2157
0
            return eErr;
2158
0
    }
2159
2160
0
    const size_t nPixelCount = static_cast<size_t>(nBufXSize) * nBufYSize;
2161
0
    const int nColorCount = m_oBlendDataset.m_oColorDS.GetRasterCount();
2162
0
    const int nOverlayCount = m_oBlendDataset.m_oOverlayDS.GetRasterCount();
2163
0
    if (nBand == 4 && m_oBlendDataset.m_operator == CompositionMode::HSV_VALUE)
2164
0
    {
2165
0
        if (nColorCount == 3)
2166
0
        {
2167
0
            GByte ch = 255;
2168
0
            for (int iY = 0; iY < nBufYSize; ++iY)
2169
0
            {
2170
0
                GDALCopyWords64(&ch, GDT_UInt8, 0,
2171
0
                                static_cast<GByte *>(pData) + iY * nLineSpace,
2172
0
                                eBufType, static_cast<int>(nPixelSpace),
2173
0
                                nBufXSize);
2174
0
            }
2175
0
            return CE_None;
2176
0
        }
2177
0
        else
2178
0
        {
2179
0
            CPLAssert(nColorCount == 4);
2180
0
            return m_oBlendDataset.m_oColorDS.GetRasterBand(4)->RasterIO(
2181
0
                eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
2182
0
                nBufYSize, eBufType, nPixelSpace, nLineSpace, psExtraArg);
2183
0
        }
2184
0
    }
2185
0
    else if (nOverlayCount == 3 && nColorCount == 3 &&
2186
0
             m_oBlendDataset.m_operator == CompositionMode::SRC_OVER &&
2187
0
             eRWFlag == GF_Read && eBufType == GDT_UInt8 &&
2188
0
             m_oBlendDataset.AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize,
2189
0
                                                 nBufXSize, nBufYSize,
2190
0
                                                 psExtraArg))
2191
0
    {
2192
0
        const int nOpacity = m_oBlendDataset.m_opacity255Scale;
2193
0
        const GByte *const CPL_RESTRICT pabySrc =
2194
0
            m_oBlendDataset.m_abyBuffer.data() + nPixelCount * (nBand - 1);
2195
0
        const GByte *const CPL_RESTRICT pabyOverlay =
2196
0
            m_oBlendDataset.m_abyBuffer.data() +
2197
0
            nPixelCount * (nColorCount + nBand - 1);
2198
0
        GByte *const CPL_RESTRICT pabyDst = static_cast<GByte *>(pData);
2199
0
        size_t nSrcIdx = 0;
2200
0
        for (int j = 0; j < nBufYSize; ++j)
2201
0
        {
2202
0
            auto nDstOffset = j * nLineSpace;
2203
0
            if (nPixelSpace == 1)
2204
0
            {
2205
0
                SrcOverRGB(pabyOverlay + nSrcIdx, pabySrc + nSrcIdx,
2206
0
                           pabyDst + nDstOffset, nBufXSize,
2207
0
                           static_cast<uint8_t>(nOpacity));
2208
0
                nSrcIdx += nBufXSize;
2209
0
            }
2210
0
            else
2211
0
            {
2212
0
                for (int i = 0; i < nBufXSize;
2213
0
                     ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
2214
0
                {
2215
0
                    const int nOverlay = pabyOverlay[nSrcIdx];
2216
0
                    const int nSrc = pabySrc[nSrcIdx];
2217
0
                    pabyDst[nDstOffset] = static_cast<GByte>(
2218
0
                        (nOverlay * nOpacity + nSrc * (255 - nOpacity) + 255) /
2219
0
                        256);
2220
0
                }
2221
0
            }
2222
0
        }
2223
0
        return CE_None;
2224
0
    }
2225
0
    else if (eRWFlag == GF_Read && eBufType == GDT_UInt8 &&
2226
0
             m_oBlendDataset.AcquireSourcePixels(nXOff, nYOff, nXSize, nYSize,
2227
0
                                                 nBufXSize, nBufYSize,
2228
0
                                                 psExtraArg))
2229
0
    {
2230
0
        GByte *pabyDst = static_cast<GByte *>(pData);
2231
2232
0
        if (m_oBlendDataset.m_operator == CompositionMode::MULTIPLY ||
2233
0
            m_oBlendDataset.m_operator == CompositionMode::SCREEN ||
2234
0
            m_oBlendDataset.m_operator == CompositionMode::HARD_LIGHT ||
2235
0
            m_oBlendDataset.m_operator == CompositionMode::OVERLAY ||
2236
0
            m_oBlendDataset.m_operator == CompositionMode::DARKEN ||
2237
0
            m_oBlendDataset.m_operator == CompositionMode::LIGHTEN ||
2238
0
            m_oBlendDataset.m_operator == CompositionMode::COLOR_BURN ||
2239
0
            m_oBlendDataset.m_operator == CompositionMode::COLOR_DODGE)
2240
0
        {
2241
0
            CPLAssert(nBand <= 4);
2242
0
            const GByte *pabyR = m_oBlendDataset.m_abyBuffer.data();
2243
0
            const GByte *pabyG =
2244
0
                nColorCount >= 3
2245
0
                    ? m_oBlendDataset.m_abyBuffer.data() + nPixelCount
2246
0
                    : nullptr;
2247
0
            const GByte *pabyB =
2248
0
                nColorCount >= 3
2249
0
                    ? m_oBlendDataset.m_abyBuffer.data() + nPixelCount * 2
2250
0
                    : nullptr;
2251
2252
0
            GByte *pabyA = nullptr;
2253
2254
0
            if (nColorCount == 2)
2255
0
            {
2256
0
                pabyA = m_oBlendDataset.m_abyBuffer.data() + nPixelCount;
2257
0
            }
2258
0
            else if (nColorCount == 4)
2259
0
            {
2260
0
                pabyA = m_oBlendDataset.m_abyBuffer.data() + nPixelCount * 3;
2261
0
            }
2262
2263
            // Retrieve single band value as R
2264
0
            const GByte *pabyOverlayR =
2265
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount * nColorCount;
2266
2267
0
            const GByte *pabyOverlayG =
2268
0
                nOverlayCount >= 3 ? m_oBlendDataset.m_abyBuffer.data() +
2269
0
                                         nPixelCount * (nColorCount + 1)
2270
0
                                   : nullptr;
2271
0
            const GByte *pabyOverlayB =
2272
0
                nOverlayCount >= 3 ? m_oBlendDataset.m_abyBuffer.data() +
2273
0
                                         nPixelCount * (nColorCount + 2)
2274
0
                                   : nullptr;
2275
0
            const GByte *pabyOverlayA =
2276
0
                (nOverlayCount == 2 || nOverlayCount == 4)
2277
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2278
0
                          nPixelCount * (nColorCount + nOverlayCount - 1)
2279
0
                    : nullptr;
2280
2281
            // Determine the number of bands
2282
0
            const int nInputBands{1 + (pabyG ? 2 : 0) + (pabyA ? 1 : 0)};
2283
0
            const int nOverlayBands{1 + (pabyOverlayG ? 2 : 0) +
2284
0
                                    (pabyOverlayA ? 1 : 0)};
2285
0
            const int nOutputBands = std::max(nInputBands, nOverlayBands);
2286
0
            const bool bSwappedOpacity{m_oBlendDataset.m_bSwappedOpacity};
2287
2288
0
            size_t nSrcIdx = 0;
2289
0
            for (int j = 0; j < nBufYSize; ++j)
2290
0
            {
2291
0
                auto nDstOffset = j * nLineSpace;
2292
0
                for (int i = 0; i < nBufXSize;
2293
0
                     ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
2294
0
                {
2295
                    // TODO: This need to be optimized for requesting a single band
2296
0
                    std::vector<GByte> byteBuffer;
2297
0
                    byteBuffer.resize(std::max(nColorCount, nOverlayCount));
2298
0
                    if (m_oBlendDataset.m_operator == CompositionMode::SCREEN)
2299
0
                        BlendScreen_Generic(
2300
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2301
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2302
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2303
0
                            static_cast<GByte>(
2304
0
                                m_oBlendDataset.m_opacity255Scale),
2305
0
                            nOutputBands, bSwappedOpacity);
2306
0
                    else if (m_oBlendDataset.m_operator ==
2307
0
                             CompositionMode::HARD_LIGHT)
2308
0
                        BlendHardLight_Generic(
2309
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2310
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2311
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2312
0
                            static_cast<GByte>(
2313
0
                                m_oBlendDataset.m_opacity255Scale),
2314
0
                            nOutputBands, bSwappedOpacity);
2315
0
                    else if (m_oBlendDataset.m_operator ==
2316
0
                             CompositionMode::OVERLAY)
2317
0
                        BlendOverlay_Generic(
2318
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2319
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2320
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2321
0
                            static_cast<GByte>(
2322
0
                                m_oBlendDataset.m_opacity255Scale),
2323
0
                            nOutputBands, bSwappedOpacity);
2324
0
                    else if (m_oBlendDataset.m_operator ==
2325
0
                             CompositionMode::MULTIPLY)
2326
0
                        BlendMultiply_Generic(
2327
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2328
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2329
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2330
0
                            static_cast<GByte>(
2331
0
                                m_oBlendDataset.m_opacity255Scale),
2332
0
                            nOutputBands, bSwappedOpacity);
2333
0
                    else if (m_oBlendDataset.m_operator ==
2334
0
                             CompositionMode::DARKEN)
2335
0
                        BlendDarken_Generic(
2336
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2337
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2338
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2339
0
                            static_cast<GByte>(
2340
0
                                m_oBlendDataset.m_opacity255Scale),
2341
0
                            nOutputBands, bSwappedOpacity);
2342
0
                    else if (m_oBlendDataset.m_operator ==
2343
0
                             CompositionMode::LIGHTEN)
2344
0
                        BlendLighten_Generic(
2345
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2346
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2347
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2348
0
                            static_cast<GByte>(
2349
0
                                m_oBlendDataset.m_opacity255Scale),
2350
0
                            nOutputBands, bSwappedOpacity);
2351
0
                    else if (m_oBlendDataset.m_operator ==
2352
0
                             CompositionMode::COLOR_BURN)
2353
0
                        BlendColorBurn_Generic(
2354
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2355
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2356
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2357
0
                            static_cast<GByte>(
2358
0
                                m_oBlendDataset.m_opacity255Scale),
2359
0
                            nOutputBands, bSwappedOpacity);
2360
0
                    else if (m_oBlendDataset.m_operator ==
2361
0
                             CompositionMode::COLOR_DODGE)
2362
0
                        BlendColorDodge_Generic(
2363
0
                            pabyR, pabyG, pabyB, pabyA, pabyOverlayR,
2364
0
                            pabyOverlayG, pabyOverlayB, pabyOverlayA,
2365
0
                            byteBuffer.data(), nPixelSpace, 1, nSrcIdx, 1,
2366
0
                            static_cast<GByte>(
2367
0
                                m_oBlendDataset.m_opacity255Scale),
2368
0
                            nOutputBands, bSwappedOpacity);
2369
0
                    else
2370
0
                    {
2371
0
                        CPLAssert(false);
2372
0
                    }
2373
2374
0
                    pabyDst[nDstOffset] = byteBuffer[nBand - 1];
2375
0
                }
2376
0
            }
2377
0
        }
2378
0
        else if (m_oBlendDataset.m_operator == CompositionMode::SRC_OVER)
2379
0
        {
2380
0
            const auto RGBToGrayScale = [](int R, int G, int B)
2381
0
            {
2382
                // Equivalent to R * 0.299 + G * 0.587 + B * 0.114
2383
                // but using faster computation
2384
0
                return (R * 306 + G * 601 + B * 117) / 1024;
2385
0
            };
2386
2387
0
            const GByte *paby =
2388
0
                (nBand <= nColorCount) ? m_oBlendDataset.m_abyBuffer.data() +
2389
0
                                             nPixelCount * (nBand - 1)
2390
0
                : (nBand == 4 && nColorCount == 2)
2391
0
                    ? m_oBlendDataset.m_abyBuffer.data() + nPixelCount
2392
0
                    : nullptr;
2393
0
            const GByte *pabyA =
2394
0
                (nColorCount == 4)
2395
0
                    ? m_oBlendDataset.m_abyBuffer.data() + nPixelCount * 3
2396
0
                : nColorCount == 2
2397
0
                    ? m_oBlendDataset.m_abyBuffer.data() + nPixelCount
2398
0
                    : nullptr;
2399
0
            const GByte *pabyOverlay =
2400
0
                (nBand <= nOverlayCount)
2401
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2402
0
                          nPixelCount * (nColorCount + nBand - 1)
2403
0
                : (nBand <= 3) ? m_oBlendDataset.m_abyBuffer.data() +
2404
0
                                     nPixelCount * nColorCount
2405
0
                               : nullptr;
2406
0
            const GByte *pabyOverlayA =
2407
0
                (nOverlayCount == 2 || nOverlayCount == 4)
2408
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2409
0
                          nPixelCount * (nColorCount + nOverlayCount - 1)
2410
0
                    : nullptr;
2411
0
            const GByte *pabyOverlayR =
2412
0
                (nOverlayCount >= 3 && nColorCount < 3 && nBand <= 3)
2413
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2414
0
                          nPixelCount * nColorCount
2415
0
                    : nullptr;
2416
0
            const GByte *pabyOverlayG =
2417
0
                (nOverlayCount >= 3 && nColorCount < 3 && nBand <= 3)
2418
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2419
0
                          nPixelCount * (nColorCount + 1)
2420
0
                    : nullptr;
2421
0
            const GByte *pabyOverlayB =
2422
0
                (nOverlayCount >= 3 && nColorCount < 3 && nBand <= 3)
2423
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2424
0
                          nPixelCount * (nColorCount + 2)
2425
0
                    : nullptr;
2426
2427
0
            size_t nSrcIdx = 0;
2428
0
            for (int j = 0; j < nBufYSize; ++j)
2429
0
            {
2430
0
                auto nDstOffset = j * nLineSpace;
2431
0
                for (int i = 0; i < nBufXSize;
2432
0
                     ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
2433
0
                {
2434
                    // Corrected to take into account m_opacity255Scale
2435
0
                    const int nOverlayA =
2436
0
                        pabyOverlayA ? ((pabyOverlayA[nSrcIdx] *
2437
0
                                             m_oBlendDataset.m_opacity255Scale +
2438
0
                                         255) /
2439
0
                                        256)
2440
0
                                     : m_oBlendDataset.m_opacity255Scale;
2441
2442
0
                    const int nSrcA = pabyA ? pabyA[nSrcIdx] : 255;
2443
2444
0
                    const int nSrcAMul255MinusOverlayA =
2445
0
                        (nSrcA * (255 - nOverlayA) + 255) / 256;
2446
0
                    const int nDstA = nOverlayA + nSrcAMul255MinusOverlayA;
2447
0
                    if (nBand != 4)
2448
0
                    {
2449
0
                        const int nOverlay =
2450
0
                            (pabyOverlayR && pabyOverlayG && pabyOverlayB)
2451
0
                                ? RGBToGrayScale(pabyOverlayR[nSrcIdx],
2452
0
                                                 pabyOverlayG[nSrcIdx],
2453
0
                                                 pabyOverlayB[nSrcIdx])
2454
0
                            : pabyOverlay ? pabyOverlay[nSrcIdx]
2455
0
                                          : 255;
2456
2457
0
                        const int nSrc = paby ? paby[nSrcIdx] : 255;
2458
0
                        int nDst = (nOverlay * nOverlayA +
2459
0
                                    nSrc * nSrcAMul255MinusOverlayA + 255) /
2460
0
                                   256;
2461
0
                        if (nDstA != 0 && nDstA != 255)
2462
0
                            nDst = (nDst * 255 + (nDstA / 2)) / nDstA;
2463
0
                        pabyDst[nDstOffset] =
2464
0
                            static_cast<GByte>(std::min(nDst, 255));
2465
0
                    }
2466
0
                    else
2467
0
                    {
2468
0
                        pabyDst[nDstOffset] = static_cast<GByte>(nDstA);
2469
0
                    }
2470
0
                }
2471
0
            }
2472
0
        }
2473
0
        else if (nOverlayCount == 1 && m_oBlendDataset.m_opacity255Scale == 255)
2474
0
        {
2475
0
            const GByte *pabyR = m_oBlendDataset.m_abyBuffer.data();
2476
0
            const GByte *pabyG =
2477
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount;
2478
0
            const GByte *pabyB =
2479
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount * 2;
2480
0
            CPLAssert(m_oBlendDataset.m_operator == CompositionMode::HSV_VALUE);
2481
0
            size_t nSrcIdx = 0;
2482
0
            const GByte *pabyValue =
2483
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount * nColorCount;
2484
0
            for (int j = 0; j < nBufYSize; ++j)
2485
0
            {
2486
0
                auto nDstOffset = j * nLineSpace;
2487
0
                if (nPixelSpace == 1 && nLineSpace >= nPixelSpace * nBufXSize)
2488
0
                {
2489
0
                    patch_value_line(
2490
0
                        nBufXSize, pabyR + nSrcIdx, pabyG + nSrcIdx,
2491
0
                        pabyB + nSrcIdx, pabyValue + nSrcIdx,
2492
0
                        nBand == 1 ? pabyDst + nDstOffset : nullptr,
2493
0
                        nBand == 2 ? pabyDst + nDstOffset : nullptr,
2494
0
                        nBand == 3 ? pabyDst + nDstOffset : nullptr);
2495
0
                    nSrcIdx += nBufXSize;
2496
0
                }
2497
0
                else
2498
0
                {
2499
0
                    for (int i = 0; i < nBufXSize;
2500
0
                         ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
2501
0
                    {
2502
0
                        float h, s;
2503
0
                        rgb_to_hs(pabyR[nSrcIdx], pabyG[nSrcIdx],
2504
0
                                  pabyB[nSrcIdx], &h, &s);
2505
0
                        if (nBand == 1)
2506
0
                        {
2507
0
                            hsv_to_rgb(h, s, pabyValue[nSrcIdx],
2508
0
                                       &pabyDst[nDstOffset], nullptr, nullptr);
2509
0
                        }
2510
0
                        else if (nBand == 2)
2511
0
                        {
2512
0
                            hsv_to_rgb(h, s, pabyValue[nSrcIdx], nullptr,
2513
0
                                       &pabyDst[nDstOffset], nullptr);
2514
0
                        }
2515
0
                        else
2516
0
                        {
2517
0
                            CPLAssert(nBand == 3);
2518
0
                            hsv_to_rgb(h, s, pabyValue[nSrcIdx], nullptr,
2519
0
                                       nullptr, &pabyDst[nDstOffset]);
2520
0
                        }
2521
0
                    }
2522
0
                }
2523
0
            }
2524
0
        }
2525
0
        else
2526
0
        {
2527
0
            CPLAssert(m_oBlendDataset.m_operator == CompositionMode::HSV_VALUE);
2528
0
            CPLAssert(nBand <= 3);
2529
0
            const GByte *pabyR = m_oBlendDataset.m_abyBuffer.data();
2530
0
            const GByte *pabyG =
2531
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount;
2532
0
            const GByte *pabyB =
2533
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount * 2;
2534
0
            const GByte *pabyValue =
2535
0
                m_oBlendDataset.m_abyBuffer.data() + nPixelCount * nColorCount;
2536
0
            const GByte *pabyOverlayR =
2537
0
                nOverlayCount >= 3 ? m_oBlendDataset.m_abyBuffer.data() +
2538
0
                                         nPixelCount * nColorCount
2539
0
                                   : nullptr;
2540
0
            const GByte *pabyOverlayG =
2541
0
                nOverlayCount >= 3 ? m_oBlendDataset.m_abyBuffer.data() +
2542
0
                                         nPixelCount * (nColorCount + 1)
2543
0
                                   : nullptr;
2544
0
            const GByte *pabyOverlayB =
2545
0
                nOverlayCount >= 3 ? m_oBlendDataset.m_abyBuffer.data() +
2546
0
                                         nPixelCount * (nColorCount + 2)
2547
0
                                   : nullptr;
2548
0
            const GByte *pabyOverlayA =
2549
0
                (nOverlayCount == 2 || nOverlayCount == 4)
2550
0
                    ? m_oBlendDataset.m_abyBuffer.data() +
2551
0
                          nPixelCount * (nColorCount + nOverlayCount - 1)
2552
0
                    : nullptr;
2553
2554
0
            size_t nSrcIdx = 0;
2555
0
            for (int j = 0; j < nBufYSize; ++j)
2556
0
            {
2557
0
                auto nDstOffset = j * nLineSpace;
2558
0
                for (int i = 0; i < nBufXSize;
2559
0
                     ++i, ++nSrcIdx, nDstOffset += nPixelSpace)
2560
0
                {
2561
0
                    const int nColorR = pabyR[nSrcIdx];
2562
0
                    const int nColorG = pabyG[nSrcIdx];
2563
0
                    const int nColorB = pabyB[nSrcIdx];
2564
0
                    const int nOverlayV =
2565
0
                        (pabyOverlayR && pabyOverlayG && pabyOverlayB)
2566
0
                            ? std::max({pabyOverlayR[nSrcIdx],
2567
0
                                        pabyOverlayG[nSrcIdx],
2568
0
                                        pabyOverlayB[nSrcIdx]})
2569
0
                            : pabyValue[nSrcIdx];
2570
0
                    const int nOverlayA =
2571
0
                        pabyOverlayA ? ((pabyOverlayA[nSrcIdx] *
2572
0
                                             m_oBlendDataset.m_opacity255Scale +
2573
0
                                         255) /
2574
0
                                        256)
2575
0
                                     : m_oBlendDataset.m_opacity255Scale;
2576
0
                    const int nColorValue =
2577
0
                        std::max({nColorR, nColorG, nColorB});
2578
2579
0
                    float h, s;
2580
0
                    rgb_to_hs(pabyR[nSrcIdx], pabyG[nSrcIdx], pabyB[nSrcIdx],
2581
0
                              &h, &s);
2582
2583
0
                    const GByte nTargetValue = static_cast<GByte>(
2584
0
                        (nOverlayV * nOverlayA +
2585
0
                         nColorValue * (255 - nOverlayA) + 255) /
2586
0
                        256);
2587
2588
0
                    if (nBand == 1)
2589
0
                    {
2590
0
                        hsv_to_rgb(h, s, nTargetValue, &pabyDst[nDstOffset],
2591
0
                                   nullptr, nullptr);
2592
0
                    }
2593
0
                    else if (nBand == 2)
2594
0
                    {
2595
0
                        hsv_to_rgb(h, s, nTargetValue, nullptr,
2596
0
                                   &pabyDst[nDstOffset], nullptr);
2597
0
                    }
2598
0
                    else
2599
0
                    {
2600
0
                        CPLAssert(nBand == 3);
2601
0
                        hsv_to_rgb(h, s, nTargetValue, nullptr, nullptr,
2602
0
                                   &pabyDst[nDstOffset]);
2603
0
                    }
2604
0
                }
2605
0
            }
2606
0
        }
2607
2608
0
        return CE_None;
2609
0
    }
2610
0
    else if (m_oBlendDataset.m_ioError)
2611
0
    {
2612
0
        return CE_Failure;
2613
0
    }
2614
0
    else
2615
0
    {
2616
0
        const CPLErr eErr = GDALRasterBand::IRasterIO(
2617
0
            eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize, nBufYSize,
2618
0
            eBufType, nPixelSpace, nLineSpace, psExtraArg);
2619
0
        m_oBlendDataset.m_ioError = eErr != CE_None;
2620
0
        return eErr;
2621
0
    }
2622
0
}
2623
2624
}  // namespace
2625
2626
/************************************************************************/
2627
/*              GDALRasterBlendAlgorithm::ValidateGlobal()              */
2628
/************************************************************************/
2629
2630
bool GDALRasterBlendAlgorithm::ValidateGlobal()
2631
0
{
2632
0
    auto poSrcDS =
2633
0
        m_inputDataset.empty() ? nullptr : m_inputDataset[0].GetDatasetRef();
2634
0
    auto poOverlayDS = m_overlayDataset.GetDatasetRef();
2635
0
    if (poSrcDS)
2636
0
    {
2637
0
        if (poSrcDS->GetRasterCount() == 0 || poSrcDS->GetRasterCount() > 4 ||
2638
0
            poSrcDS->GetRasterBand(1)->GetRasterDataType() != GDT_UInt8)
2639
0
        {
2640
0
            ReportError(CE_Failure, CPLE_NotSupported,
2641
0
                        "Only 1-band, 2-band, 3-band or 4-band Byte dataset "
2642
0
                        "supported as input");
2643
0
            return false;
2644
0
        }
2645
0
    }
2646
0
    if (poOverlayDS)
2647
0
    {
2648
0
        if (poOverlayDS->GetRasterCount() == 0 ||
2649
0
            poOverlayDS->GetRasterCount() > 4 ||
2650
0
            poOverlayDS->GetRasterBand(1)->GetRasterDataType() != GDT_UInt8)
2651
0
        {
2652
0
            ReportError(CE_Failure, CPLE_NotSupported,
2653
0
                        "Only 1-band, 2-band, 3-band or 4-band Byte dataset "
2654
0
                        "supported as overlay");
2655
0
            return false;
2656
0
        }
2657
0
    }
2658
2659
0
    if (poSrcDS && poOverlayDS)
2660
0
    {
2661
0
        if (poSrcDS->GetRasterXSize() != poOverlayDS->GetRasterXSize() ||
2662
0
            poSrcDS->GetRasterYSize() != poOverlayDS->GetRasterYSize())
2663
0
        {
2664
0
            ReportError(CE_Failure, CPLE_IllegalArg,
2665
0
                        "Input dataset and overlay dataset must have "
2666
0
                        "the same dimensions");
2667
0
            return false;
2668
0
        }
2669
2670
0
        if (!BandCountIsCompatibleWithCompositionMode(poSrcDS->GetRasterCount(),
2671
0
                                                      m_operator))
2672
0
        {
2673
0
            const int minRequiredBands{
2674
0
                MinBandCountForCompositionMode(m_operator)};
2675
0
            const int maxRequiredBands{
2676
0
                MaxBandCountForCompositionMode(m_operator)};
2677
0
            if (minRequiredBands != maxRequiredBands)
2678
0
                ReportError(CE_Failure, CPLE_IllegalArg,
2679
0
                            "Input dataset has %d band(s), but operator %s "
2680
0
                            "requires between %d and %d bands",
2681
0
                            poSrcDS->GetRasterCount(),
2682
0
                            CompositionModeToString(m_operator).c_str(),
2683
0
                            minRequiredBands, maxRequiredBands);
2684
0
            else
2685
0
                ReportError(CE_Failure, CPLE_IllegalArg,
2686
0
                            "Input dataset has %d band(s), but operator %s "
2687
0
                            "requires %d bands",
2688
0
                            poSrcDS->GetRasterCount(),
2689
0
                            CompositionModeToString(m_operator).c_str(),
2690
0
                            minRequiredBands);
2691
0
            return false;
2692
0
        }
2693
0
    }
2694
2695
    // Check that for LIGHTEN and DARKEN, the source dataset and destination dataset
2696
    // have the same number of color bands (do not consider alpha)
2697
0
    if (poSrcDS && poOverlayDS &&
2698
0
        (m_operator == CompositionMode::LIGHTEN ||
2699
0
         m_operator == CompositionMode::DARKEN))
2700
0
    {
2701
0
        const int nSrcColorBands =
2702
0
            (poSrcDS->GetRasterCount() == 2 || poSrcDS->GetRasterCount() == 4)
2703
0
                ? poSrcDS->GetRasterCount() - 1
2704
0
                : poSrcDS->GetRasterCount();
2705
0
        const int nOverlayColorBands = (poOverlayDS->GetRasterCount() == 2 ||
2706
0
                                        poOverlayDS->GetRasterCount() == 4)
2707
0
                                           ? poOverlayDS->GetRasterCount() - 1
2708
0
                                           : poOverlayDS->GetRasterCount();
2709
0
        if (nSrcColorBands != nOverlayColorBands)
2710
0
        {
2711
0
            ReportError(
2712
0
                CE_Failure, CPLE_IllegalArg,
2713
0
                "For LIGHTEN and DARKEN operators, the source dataset "
2714
0
                "and overlay dataset must have the same number of "
2715
0
                "bands (without considering alpha). They have %d and %d "
2716
0
                "bands respectively",
2717
0
                nSrcColorBands, nOverlayColorBands);
2718
0
            return false;
2719
0
        }
2720
0
    }
2721
2722
0
    return true;
2723
0
}
2724
2725
/************************************************************************/
2726
/*                 GDALRasterBlendAlgorithm::RunStep()                  */
2727
/************************************************************************/
2728
2729
bool GDALRasterBlendAlgorithm::RunStep(GDALPipelineStepRunContext &)
2730
0
{
2731
0
    auto poSrcDS = m_inputDataset[0].GetDatasetRef();
2732
0
    CPLAssert(poSrcDS);
2733
2734
0
    auto poOverlayDS = m_overlayDataset.GetDatasetRef();
2735
0
    CPLAssert(poOverlayDS);
2736
2737
    // If any of the dataset single band has a color table implicitly convert it to RGBA by calling
2738
    // GDALTranslate with -expand RGBA
2739
0
    auto convertToRGBAifNeeded =
2740
0
        [](GDALDataset *&poDS,
2741
0
           std::unique_ptr<GDALDataset, GDALDatasetUniquePtrReleaser> &ds)
2742
0
        -> bool
2743
0
    {
2744
0
        if (poDS->GetRasterCount() == 1 &&
2745
0
            poDS->GetRasterBand(1)->GetColorTable() != nullptr)
2746
0
        {
2747
0
            CPLStringList aosOptions;
2748
0
            aosOptions.AddString("-of");
2749
0
            aosOptions.AddString("VRT");
2750
0
            aosOptions.AddString("-expand");
2751
0
            aosOptions.AddString("RGBA");
2752
0
            GDALTranslateOptions *translateOptions =
2753
0
                GDALTranslateOptionsNew(aosOptions.List(), nullptr);
2754
2755
0
            ds.reset(GDALDataset::FromHandle(GDALTranslate(
2756
0
                "", GDALDataset::ToHandle(poDS), translateOptions, nullptr)));
2757
2758
0
            GDALTranslateOptionsFree(translateOptions);
2759
2760
0
            if (ds != nullptr)
2761
0
            {
2762
0
                poDS = ds.get();
2763
0
                return true;
2764
0
            }
2765
0
            else
2766
0
            {
2767
0
                return false;
2768
0
            }
2769
0
        }
2770
0
        return true;
2771
0
    };
2772
2773
0
    if (!convertToRGBAifNeeded(poSrcDS, m_poTmpSrcDS))
2774
0
    {
2775
0
        ReportError(CE_Failure, CPLE_AppDefined,
2776
0
                    "Conversion of source dataset color table to RGBA failed");
2777
0
        return false;
2778
0
    }
2779
2780
0
    if (!convertToRGBAifNeeded(poOverlayDS, m_poTmpOverlayDS))
2781
0
    {
2782
0
        ReportError(CE_Failure, CPLE_AppDefined,
2783
0
                    "Conversion of overlay dataset color table to RGBA failed");
2784
0
        return false;
2785
0
    }
2786
2787
0
    if (!ValidateGlobal())
2788
0
        return false;
2789
2790
0
    const int nOpacity255Scale =
2791
0
        (m_opacity * 255 + OPACITY_INPUT_RANGE / 2) / OPACITY_INPUT_RANGE;
2792
2793
0
    bool bSwappedOpacity = false;
2794
    // Many algorithms are commutative regarding the two inputs but BlendDataset assume
2795
    // RGB(A) is in the source (and not in the overlay).
2796
0
    if ((m_operator == CompositionMode::MULTIPLY ||
2797
0
         m_operator == CompositionMode::SCREEN ||
2798
0
         m_operator == CompositionMode::HARD_LIGHT ||
2799
0
         m_operator == CompositionMode::OVERLAY) &&
2800
0
        (poSrcDS->GetRasterCount() < poOverlayDS->GetRasterCount()))
2801
0
    {
2802
0
        bSwappedOpacity = true;
2803
0
        std::swap(poSrcDS, poOverlayDS);
2804
0
    }
2805
2806
0
    m_outputDataset.Set(std::make_unique<BlendDataset>(
2807
0
        *poSrcDS, *poOverlayDS, m_operator, nOpacity255Scale, bSwappedOpacity));
2808
2809
0
    return true;
2810
0
}
2811
2812
0
GDALRasterBlendAlgorithmStandalone::~GDALRasterBlendAlgorithmStandalone() =
2813
    default;
2814
2815
//! @endcond