Coverage Report

Created: 2026-02-14 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/apps/gdalalg_vector_partition.cpp
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Project:  GDAL
4
 * Purpose:  "partition" step of "vector pipeline"
5
 * Author:   Even Rouault <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "gdalalg_vector_partition.h"
14
15
#include "cpl_vsi.h"
16
#include "cpl_mem_cache.h"
17
18
#include <algorithm>
19
#include <set>
20
#include <string_view>
21
22
#ifndef _
23
0
#define _(x) (x)
24
#endif
25
26
//! @cond Doxygen_Suppress
27
28
// Mode passed to VSIMkdirRecursive() when creating output directories
// (octal literal).
constexpr int DIRECTORY_CREATION_MODE = 0755;

// Value found in a partition key when the partitioned field is null;
// rendered as "__NULL__" when expanding {FIELD_VALUE} in a filename pattern.
constexpr const char *NULL_MARKER = "__HIVE_DEFAULT_PARTITION__";

// Filename patterns used when the user does not supply one: one for the
// hive scheme, and two for the other scheme depending on whether the
// partition key is empty.
constexpr const char *DEFAULT_PATTERN_HIVE = "part_%010d";
constexpr const char *DEFAULT_PATTERN_FLAT_NO_FIELD = "{LAYER_NAME}_%010d";
constexpr const char *DEFAULT_PATTERN_FLAT = "{LAYER_NAME}_{FIELD_VALUE}_%010d";

// Lower bound of the decimal digit range; also the padding character used
// when the part number specification asks for leading zeroes.
constexpr char DIGIT_ZERO = '0';
37
38
/************************************************************************/
39
/*                       GetConstructorOptions()                        */
40
/************************************************************************/
41
42
/* static */
43
GDALVectorPartitionAlgorithm::ConstructorOptions
44
GDALVectorPartitionAlgorithm::GetConstructorOptions(bool standaloneStep)
45
0
{
46
0
    GDALVectorPartitionAlgorithm::ConstructorOptions options;
47
0
    options.SetStandaloneStep(standaloneStep);
48
0
    options.SetAddInputLayerNameArgument(false);
49
0
    options.SetAddDefaultArguments(false);
50
0
    return options;
51
0
}
52
53
/************************************************************************/
54
/*     GDALVectorPartitionAlgorithm::GDALVectorPartitionAlgorithm()     */
55
/************************************************************************/
56
57
// Declare all the arguments of the "partition" step.
//
// standaloneStep selects how the input is declared: regular vector input
// arguments when true, a hidden input dataset argument otherwise.
//
// Besides per-argument validation (pattern syntax, max-file-size parsing),
// a global validation action requires that at least one of 'field',
// 'feature-limit' or 'max-file-size' is specified.
GDALVectorPartitionAlgorithm::GDALVectorPartitionAlgorithm(bool standaloneStep)
    : GDALVectorPipelineStepAlgorithm(NAME, DESCRIPTION, HELP_URL,
                                      GetConstructorOptions(standaloneStep))
{
    if (standaloneStep)
    {
        AddVectorInputArgs(false);
    }
    else
    {
        AddVectorHiddenInputDatasetArg();
    }
    AddProgressArg();

    AddArg(GDAL_ARG_NAME_OUTPUT, 'o', _("Output directory"), &m_output)
        .SetRequired()
        .SetIsInput()
        .SetMinCharCount(1)
        .SetPositional();

    // --overwrite and --append are mutually exclusive
    constexpr const char *OVERWRITE_APPEND_EXCLUSION_GROUP = "overwrite-append";
    AddOverwriteArg(&m_overwrite)
        .SetMutualExclusionGroup(OVERWRITE_APPEND_EXCLUSION_GROUP);
    AddAppendLayerArg(&m_appendLayer)
        .SetMutualExclusionGroup(OVERWRITE_APPEND_EXCLUSION_GROUP);
    AddUpdateArg(&m_update).SetHidden();

    AddOutputFormatArg(&m_format, /* bStreamAllowed = */ false,
                       /* bGDALGAllowed = */ false)
        .AddMetadataItem(GAAMDI_REQUIRED_CAPABILITIES,
                         {GDAL_DCAP_VECTOR, GDAL_DCAP_CREATE});
    AddCreationOptionsArg(&m_creationOptions);
    AddLayerCreationOptionsArg(&m_layerCreationOptions);

    AddArg("field", 0,
           _("Attribute or geometry field(s) on which to partition"),
           &m_fields);
    AddArg("scheme", 0, _("Partitioning scheme"), &m_scheme)
        .SetChoices(SCHEME_HIVE, SCHEME_FLAT)
        .SetDefault(m_scheme);
    AddArg("pattern", 0,
           _("Filename pattern ('part_%010d' for scheme=hive, "
             "'{LAYER_NAME}_{FIELD_VALUE}_%010d' for scheme=flat)"),
           &m_pattern)
        .SetMinCharCount(1)
        .AddValidationAction(
            [this]()
            {
                // An empty pattern means "use the default for the scheme"
                if (m_pattern.empty())
                    return true;

                const auto nPercentPos = m_pattern.find('%');
                if (nPercentPos == std::string::npos)
                {
                    ReportError(CE_Failure, CPLE_IllegalArg, "%s",
                                "Missing '%' character in pattern");
                    return false;
                }
                if (nPercentPos + 1 < m_pattern.size() &&
                    m_pattern.find('%', nPercentPos + 1) != std::string::npos)
                {
                    ReportError(
                        CE_Failure, CPLE_IllegalArg, "%s",
                        "A single '%' character is expected in pattern");
                    return false;
                }

                // Scan the optional digits following '%' up to the
                // mandatory 'd' conversion character. The width is
                // accumulated with saturation so that an arbitrarily long
                // run of digits cannot overflow (which atoi() would not
                // protect against).
                constexpr int MAX_PART_DIGIT_COUNT = 10;
                int nWidth = 0;
                bool bConversionCharFound = false;
                for (size_t i = nPercentPos + 1; i < m_pattern.size(); ++i)
                {
                    const char ch = m_pattern[i];
                    if (ch >= DIGIT_ZERO && ch <= '9')
                    {
                        if (nWidth <= MAX_PART_DIGIT_COUNT)
                            nWidth = nWidth * 10 + (ch - DIGIT_ZERO);
                    }
                    else
                    {
                        // Anything other than 'd' right after the digits
                        // is an invalid specification.
                        bConversionCharFound = (ch == 'd');
                        break;
                    }
                }
                if (!bConversionCharFound)
                {
                    ReportError(
                        CE_Failure, CPLE_IllegalArg, "%s",
                        "pattern value must include a single "
                        "'%[0]?[1-9]?[0]?d' part number specification");
                    return false;
                }

                m_partDigitCount = nWidth;
                if (nWidth > MAX_PART_DIGIT_COUNT)
                {
                    ReportError(CE_Failure, CPLE_IllegalArg,
                                "Number of digits in part number "
                                "specification should be in [1,10] range");
                    return false;
                }
                // A '0' right after '%' requests zero padding rather than
                // space padding.
                m_partDigitLeadingZeroes =
                    m_pattern[nPercentPos + 1] == DIGIT_ZERO;
                return true;
            });
    AddArg("feature-limit", 0, _("Maximum number of features per file"),
           &m_featureLimit)
        .SetMinValueExcluded(0);
    AddArg("max-file-size", 0,
           _("Maximum file size (MB or GB suffix can be used)"),
           &m_maxFileSizeStr)
        .AddValidationAction(
            [this]()
            {
                bool bParsedOK = false;
                {
                    // Silence the parser's own errors: a single message is
                    // emitted below instead.
                    CPLErrorStateBackuper oBackuper(CPLQuietErrorHandler);
                    bParsedOK = CPLParseMemorySize(m_maxFileSizeStr.c_str(),
                                                   &m_maxFileSize,
                                                   nullptr) == CE_None &&
                                m_maxFileSize > 0;
                }
                if (!bParsedOK)
                {
                    ReportError(CE_Failure, CPLE_IllegalArg,
                                "Invalid value for max-file-size");
                    return false;
                }
                if (m_maxFileSize < 1024 * 1024)
                {
                    ReportError(CE_Failure, CPLE_IllegalArg,
                                "max-file-size should be at least one MB");
                    return false;
                }
                return true;
            });
    AddArg("omit-partitioned-field", 0,
           _("Whether to omit partitioned fields from target layer definition"),
           &m_omitPartitionedFields);
    AddArg("skip-errors", 0, _("Skip errors when writing features"),
           &m_skipErrors);

    // Hidden for now

    AddArg("max-cache-size", 0,
           _("Maximum number of datasets simultaneously opened"),
           &m_maxCacheSize)
        .SetMinValueIncluded(0)  // 0 = unlimited
        .SetDefault(m_maxCacheSize)
        .SetHidden();

    AddArg("transaction-size", 0,
           _("Maximum number of features per transaction"), &m_transactionSize)
        .SetMinValueIncluded(1)
        .SetDefault(m_transactionSize)
        .SetHidden();

    // Without any partitioning criterion there would be nothing to do
    AddValidationAction(
        [this]()
        {
            if (m_fields.empty() && m_featureLimit == 0 && m_maxFileSize == 0)
            {
                ReportError(
                    CE_Failure, CPLE_IllegalArg,
                    "When 'field' argument is not specified, "
                    "'feature-limit' and/or 'max-file-size' must be specified");
                return false;
            }
            return true;
        });
}
228
229
/************************************************************************/
230
/*                           PercentEncode()                            */
231
/************************************************************************/
232
233
// Append to 'out' a percent-encoded copy of 's'.
//
// Bytes are copied verbatim only when they are printable ASCII (strictly
// between space and DEL) and are not one of ':', '/', '\\', '>', '%' or '='.
// Every other byte, including space, control characters, DEL and any byte
// >= 0x80, is written as '%' followed by two uppercase hexadecimal digits.
static void PercentEncode(std::string &out, const std::string_view &s)
{
    static constexpr char HEX_DIGITS[] = "0123456789ABCDEF";
    for (const unsigned char c : s)
    {
        // Upper bound is exclusive so that DEL (0x7F) gets encoded too
        const bool bKeepAsIs = c > 32 && c < 127 && c != ':' && c != '/' &&
                               c != '\\' && c != '>' && c != '%' && c != '=';
        if (bKeepAsIs)
        {
            out += static_cast<char>(c);
        }
        else
        {
            out += '%';
            out += HEX_DIGITS[c >> 4];
            out += HEX_DIGITS[c & 0x0F];
        }
    }
}

// Convenience overload returning the percent-encoded form of 's' as a new
// string. See the two-argument overload for the encoding rules.
static std::string PercentEncode(const std::string_view &s)
{
    std::string out;
    PercentEncode(out, s);
    return out;
}
255
256
/************************************************************************/
257
/*                      GetEstimatedFeatureSize()                       */
258
/************************************************************************/
259
260
static size_t GetEstimatedFeatureSize(
261
    const OGRFeature *poFeature, const std::vector<bool> &abPartitionedFields,
262
    const bool omitPartitionedFields,
263
    const std::vector<OGRFieldType> &aeSrcFieldTypes, bool bIsBinary)
264
0
{
265
0
    size_t nSize = 16;
266
0
    const int nFieldCount = poFeature->GetFieldCount();
267
0
    nSize += 4 * nFieldCount;
268
0
    for (int i = 0; i < nFieldCount; ++i)
269
0
    {
270
0
        if (!(omitPartitionedFields && abPartitionedFields[i]))
271
0
        {
272
0
            switch (aeSrcFieldTypes[i])
273
0
            {
274
0
                case OFTInteger:
275
0
                    nSize += bIsBinary ? sizeof(int) : 11;
276
0
                    break;
277
0
                case OFTInteger64:
278
0
                    nSize += bIsBinary ? sizeof(int64_t) : 21;
279
0
                    break;
280
0
                case OFTReal:
281
                    // Decimal representation
282
0
                    nSize += bIsBinary ? sizeof(double) : 15;
283
0
                    break;
284
0
                case OFTString:
285
0
                    nSize += 4 + strlen(poFeature->GetFieldAsStringUnsafe(i));
286
0
                    break;
287
0
                case OFTBinary:
288
0
                {
289
0
                    int nCount = 0;
290
0
                    CPL_IGNORE_RET_VAL(poFeature->GetFieldAsBinary(i, &nCount));
291
0
                    nSize += 4 + nCount;
292
0
                    break;
293
0
                }
294
0
                case OFTIntegerList:
295
0
                {
296
0
                    int nCount = 0;
297
0
                    CPL_IGNORE_RET_VAL(
298
0
                        poFeature->GetFieldAsIntegerList(i, &nCount));
299
0
                    nSize += 4 + (bIsBinary ? sizeof(int) : 11) * nCount;
300
0
                    break;
301
0
                }
302
0
                case OFTInteger64List:
303
0
                {
304
0
                    int nCount = 0;
305
0
                    CPL_IGNORE_RET_VAL(
306
0
                        poFeature->GetFieldAsInteger64List(i, &nCount));
307
0
                    nSize += 4 + (bIsBinary ? sizeof(int64_t) : 21) * nCount;
308
0
                    break;
309
0
                }
310
0
                case OFTRealList:
311
0
                {
312
0
                    int nCount = 0;
313
0
                    CPL_IGNORE_RET_VAL(
314
0
                        poFeature->GetFieldAsDoubleList(i, &nCount));
315
0
                    nSize += 4 + (bIsBinary ? sizeof(double) : 15) * nCount;
316
0
                    break;
317
0
                }
318
0
                case OFTStringList:
319
0
                {
320
0
                    CSLConstList papszIter = poFeature->GetFieldAsStringList(i);
321
0
                    nSize += 4;
322
0
                    for (; papszIter && *papszIter; ++papszIter)
323
0
                        nSize += 4 + strlen(*papszIter);
324
0
                    break;
325
0
                }
326
0
                case OFTTime:
327
                    // Decimal representation
328
0
                    nSize += 4 + sizeof("HH:MM:SS.sss");
329
0
                    break;
330
0
                case OFTDate:
331
                    // Decimal representation
332
0
                    nSize += 4 + sizeof("YYYY-MM-DD");
333
0
                    break;
334
0
                case OFTDateTime:
335
                    // Decimal representation
336
0
                    nSize += 4 + sizeof("YYYY-MM-DDTHH:MM:SS.sss+HH:MM");
337
0
                    break;
338
0
                case OFTWideString:
339
0
                case OFTWideStringList:
340
0
                    break;
341
0
            }
342
0
        }
343
0
    }
344
345
0
    const int nGeomFieldCount = poFeature->GetGeomFieldCount();
346
0
    nSize += 4 * nGeomFieldCount;
347
0
    for (int i = 0; i < nGeomFieldCount; ++i)
348
0
    {
349
0
        const auto poGeom = poFeature->GetGeomFieldRef(i);
350
0
        if (poGeom)
351
0
            nSize += poGeom->WkbSize();
352
0
    }
353
354
0
    return nSize;
355
0
}
356
357
/************************************************************************/
358
/*                       GetCurrentOutputLayer()                        */
359
/************************************************************************/
360
361
constexpr int MIN_FILE_SIZE = 65536;
362
363
namespace
364
{
365
struct Layer
366
{
367
    bool bUseTransactions = false;
368
    std::unique_ptr<GDALDataset> poDS{};
369
    OGRLayer *poLayer = nullptr;
370
    GIntBig nFeatureCount = 0;
371
    int nFileCounter = 1;
372
    GIntBig nFileSize = MIN_FILE_SIZE;
373
374
    ~Layer()
375
0
    {
376
0
        if (poDS)
377
0
        {
378
0
            CPL_IGNORE_RET_VAL(poDS->CommitTransaction());
379
0
        }
380
0
    }
381
};
382
}  // namespace
383
384
static bool GetCurrentOutputLayer(
385
    GDALAlgorithm *const alg, const OGRFeatureDefn *const poSrcFeatureDefn,
386
    OGRLayer *const poSrcLayer, const std::string &osKey,
387
    const std::vector<OGRwkbGeometryType> &aeGeomTypes,
388
    const std::string &osLayerDir, const std::string &osScheme,
389
    const std::string &osPatternIn, bool partDigitLeadingZeroes,
390
    size_t partDigitCount, const int featureLimit, const GIntBig maxFileSize,
391
    const bool omitPartitionedFields,
392
    const std::vector<bool> &abPartitionedFields,
393
    const std::vector<bool> &abPartitionedGeomFields, const char *pszExtension,
394
    GDALDriver *const poOutDriver, const CPLStringList &datasetCreationOptions,
395
    const CPLStringList &layerCreationOptions,
396
    const OGRFeatureDefn *const poFeatureDefnWithoutPartitionedFields,
397
    const int nSpatialIndexPerFeatureConstant,
398
    const int nSpatialIndexPerLog2FeatureCountConstant, bool bUseTransactions,
399
    lru11::Cache<std::string, std::shared_ptr<Layer>> &oCacheOutputLayer,
400
    std::shared_ptr<Layer> &outputLayer)
401
0
{
402
0
    const std::string osPattern =
403
0
        !osPatternIn.empty() ? osPatternIn
404
0
        : osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
405
0
            ? DEFAULT_PATTERN_HIVE
406
0
        : osKey.empty() ? DEFAULT_PATTERN_FLAT_NO_FIELD
407
0
                        : DEFAULT_PATTERN_FLAT;
408
409
0
    bool bLimitReached = false;
410
0
    bool bOpenOrCreateNewFile = true;
411
0
    if (oCacheOutputLayer.tryGet(osKey, outputLayer))
412
0
    {
413
0
        if (featureLimit > 0 && outputLayer->nFeatureCount >= featureLimit)
414
0
        {
415
0
            bLimitReached = true;
416
0
        }
417
0
        else if (maxFileSize > 0 &&
418
0
                 outputLayer->nFileSize +
419
0
                         (nSpatialIndexPerFeatureConstant > 0
420
0
                              ? (outputLayer->nFeatureCount *
421
0
                                     nSpatialIndexPerFeatureConstant +
422
0
                                 static_cast<int>(std::ceil(
423
0
                                     log2(outputLayer->nFeatureCount)))) *
424
0
                                    nSpatialIndexPerLog2FeatureCountConstant
425
0
                              : 0) >=
426
0
                     maxFileSize)
427
0
        {
428
0
            bLimitReached = true;
429
0
        }
430
0
        else
431
0
        {
432
0
            bOpenOrCreateNewFile = false;
433
0
        }
434
0
    }
435
0
    else
436
0
    {
437
0
        outputLayer = std::make_unique<Layer>();
438
0
        outputLayer->bUseTransactions = bUseTransactions;
439
0
    }
440
441
0
    const auto SubstituteVariables = [&osKey, poSrcLayer](const std::string &s)
442
0
    {
443
0
        CPLString ret(s);
444
0
        ret.replaceAll("{LAYER_NAME}",
445
0
                       PercentEncode(poSrcLayer->GetDescription()));
446
447
0
        if (ret.find("{FIELD_VALUE}") != std::string::npos)
448
0
        {
449
0
            std::string fieldValue;
450
0
            const CPLStringList aosTokens(
451
0
                CSLTokenizeString2(osKey.c_str(), "/", 0));
452
0
            for (int i = 0; i < aosTokens.size(); ++i)
453
0
            {
454
0
                const CPLStringList aosFieldNameValue(
455
0
                    CSLTokenizeString2(aosTokens[i], "=", 0));
456
0
                if (!fieldValue.empty())
457
0
                    fieldValue += '_';
458
0
                fieldValue +=
459
0
                    aosFieldNameValue.size() == 2
460
0
                        ? (strcmp(aosFieldNameValue[1], NULL_MARKER) == 0
461
0
                               ? std::string("__NULL__")
462
0
                               : aosFieldNameValue[1])
463
0
                        : std::string("__EMPTY__");
464
0
            }
465
0
            ret.replaceAll("{FIELD_VALUE}", fieldValue);
466
0
        }
467
0
        return ret;
468
0
    };
469
470
0
    const auto nPercentPos = osPattern.find('%');
471
0
    CPLAssert(nPercentPos !=
472
0
              std::string::npos);  // checked by validation action
473
0
    const std::string osPatternPrefix =
474
0
        SubstituteVariables(osPattern.substr(0, nPercentPos));
475
0
    const auto nAfterDPos = osPattern.find('d', nPercentPos + 1) + 1;
476
0
    const std::string osPatternSuffix =
477
0
        nAfterDPos < osPattern.size()
478
0
            ? SubstituteVariables(osPattern.substr(nAfterDPos))
479
0
            : std::string();
480
481
0
    const auto GetBasenameFromCounter = [partDigitCount, partDigitLeadingZeroes,
482
0
                                         &osPatternPrefix,
483
0
                                         &osPatternSuffix](int nCounter)
484
0
    {
485
0
        const std::string sCounter(CPLSPrintf("%d", nCounter));
486
0
        std::string s(osPatternPrefix);
487
0
        if (sCounter.size() < partDigitCount)
488
0
        {
489
0
            s += std::string(partDigitCount - sCounter.size(),
490
0
                             partDigitLeadingZeroes ? DIGIT_ZERO : ' ');
491
0
        }
492
0
        s += sCounter;
493
0
        s += osPatternSuffix;
494
0
        return s;
495
0
    };
496
497
0
    if (bOpenOrCreateNewFile)
498
0
    {
499
0
        std::string osDatasetDir =
500
0
            osScheme == GDALVectorPartitionAlgorithm::SCHEME_HIVE
501
0
                ? CPLFormFilenameSafe(osLayerDir.c_str(), osKey.c_str(),
502
0
                                      nullptr)
503
0
                : osLayerDir;
504
0
        outputLayer->nFeatureCount = 0;
505
506
0
        bool bCreateNewFile = true;
507
0
        if (bLimitReached)
508
0
        {
509
0
            ++outputLayer->nFileCounter;
510
0
        }
511
0
        else
512
0
        {
513
0
            outputLayer->nFileCounter = 1;
514
515
0
            VSIStatBufL sStat;
516
0
            if (VSIStatL(osDatasetDir.c_str(), &sStat) != 0)
517
0
            {
518
0
                if (VSIMkdirRecursive(osDatasetDir.c_str(),
519
0
                                      DIRECTORY_CREATION_MODE) != 0)
520
0
                {
521
0
                    alg->ReportError(CE_Failure, CPLE_AppDefined,
522
0
                                     "Cannot create directory '%s'",
523
0
                                     osDatasetDir.c_str());
524
0
                    return false;
525
0
                }
526
0
            }
527
528
0
            int nMaxCounter = 0;
529
0
            std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
530
0
                VSIOpenDir(osDatasetDir.c_str(), 0, nullptr), VSICloseDir);
531
0
            if (psDir)
532
0
            {
533
0
                while (const auto *psEntry = VSIGetNextDirEntry(psDir.get()))
534
0
                {
535
0
                    const std::string osName(
536
0
                        CPLGetBasenameSafe(psEntry->pszName));
537
0
                    if (cpl::starts_with(osName, osPatternPrefix) &&
538
0
                        cpl::ends_with(osName, osPatternSuffix))
539
0
                    {
540
0
                        nMaxCounter = std::max(
541
0
                            nMaxCounter,
542
0
                            atoi(osName
543
0
                                     .substr(osPatternPrefix.size(),
544
0
                                             osName.size() -
545
0
                                                 osPatternPrefix.size() -
546
0
                                                 osPatternSuffix.size())
547
0
                                     .c_str()));
548
0
                    }
549
0
                }
550
0
            }
551
552
0
            if (nMaxCounter > 0)
553
0
            {
554
0
                outputLayer->nFileCounter = nMaxCounter;
555
556
0
                const std::string osFilename = CPLFormFilenameSafe(
557
0
                    osDatasetDir.c_str(),
558
0
                    GetBasenameFromCounter(nMaxCounter).c_str(), pszExtension);
559
0
                auto poDS = std::unique_ptr<GDALDataset>(GDALDataset::Open(
560
0
                    osFilename.c_str(),
561
0
                    GDAL_OF_VECTOR | GDAL_OF_UPDATE | GDAL_OF_VERBOSE_ERROR));
562
0
                if (!poDS)
563
0
                    return false;
564
0
                auto poDstLayer = poDS->GetLayer(0);
565
0
                if (!poDstLayer)
566
0
                {
567
0
                    alg->ReportError(CE_Failure, CPLE_AppDefined,
568
0
                                     "No layer in %s", osFilename.c_str());
569
0
                    return false;
570
0
                }
571
572
                // Check if the existing output layer has the expected layer
573
                // definition
574
0
                const auto poRefFeatureDefn =
575
0
                    poFeatureDefnWithoutPartitionedFields
576
0
                        ? poFeatureDefnWithoutPartitionedFields
577
0
                        : poSrcFeatureDefn;
578
0
                const auto poDstFeatureDefn = poDstLayer->GetLayerDefn();
579
0
                bool bSameDefinition = (poDstFeatureDefn->GetFieldCount() ==
580
0
                                        poRefFeatureDefn->GetFieldCount());
581
0
                for (int i = 0;
582
0
                     bSameDefinition && i < poRefFeatureDefn->GetFieldCount();
583
0
                     ++i)
584
0
                {
585
0
                    const auto poRefFieldDefn =
586
0
                        poRefFeatureDefn->GetFieldDefn(i);
587
0
                    const auto poDstFieldDefn =
588
0
                        poDstFeatureDefn->GetFieldDefn(i);
589
0
                    bSameDefinition =
590
0
                        EQUAL(poRefFieldDefn->GetNameRef(),
591
0
                              poDstFieldDefn->GetNameRef()) &&
592
0
                        poRefFieldDefn->GetType() == poDstFieldDefn->GetType();
593
0
                }
594
0
                bSameDefinition =
595
0
                    bSameDefinition && (poDstFeatureDefn->GetGeomFieldCount() ==
596
0
                                        poRefFeatureDefn->GetGeomFieldCount());
597
0
                for (int i = 0; bSameDefinition &&
598
0
                                i < poRefFeatureDefn->GetGeomFieldCount();
599
0
                     ++i)
600
0
                {
601
0
                    const auto poRefFieldDefn =
602
0
                        poRefFeatureDefn->GetGeomFieldDefn(i);
603
0
                    const auto poDstFieldDefn =
604
0
                        poDstFeatureDefn->GetGeomFieldDefn(i);
605
0
                    bSameDefinition =
606
0
                        (poRefFeatureDefn->GetGeomFieldCount() == 1 ||
607
0
                         EQUAL(poRefFieldDefn->GetNameRef(),
608
0
                               poDstFieldDefn->GetNameRef()));
609
0
                }
610
611
0
                if (!bSameDefinition)
612
0
                {
613
0
                    alg->ReportError(CE_Failure, CPLE_AppDefined,
614
0
                                     "%s does not have the same feature "
615
0
                                     "definition as the source layer",
616
0
                                     osFilename.c_str());
617
0
                    return false;
618
0
                }
619
620
0
                if (VSIStatL(osFilename.c_str(), &sStat) == 0)
621
0
                {
622
0
                    outputLayer->nFileSize = sStat.st_size;
623
0
                }
624
625
0
                GIntBig nFeatureCount = 0;
626
0
                if (((featureLimit == 0 ||
627
0
                      (nFeatureCount = poDstLayer->GetFeatureCount(true)) <
628
0
                          featureLimit)) &&
629
0
                    (maxFileSize == 0 || outputLayer->nFileSize < maxFileSize))
630
0
                {
631
0
                    bCreateNewFile = false;
632
0
                    outputLayer->poDS = std::move(poDS);
633
0
                    outputLayer->poLayer = poDstLayer;
634
0
                    outputLayer->nFeatureCount = nFeatureCount;
635
636
0
                    if (bUseTransactions)
637
0
                    {
638
0
                        if (outputLayer->poDS->StartTransaction() !=
639
0
                            OGRERR_NONE)
640
0
                        {
641
0
                            return false;
642
0
                        }
643
0
                    }
644
0
                }
645
0
                else
646
0
                {
647
0
                    ++outputLayer->nFileCounter;
648
0
                }
649
0
            }
650
0
        }
651
652
0
        if (bCreateNewFile)
653
0
        {
654
0
            outputLayer->nFileSize = MIN_FILE_SIZE;
655
656
0
            if (bUseTransactions && outputLayer->poDS &&
657
0
                outputLayer->poDS->CommitTransaction() != OGRERR_NONE)
658
0
            {
659
0
                return false;
660
0
            }
661
662
0
            const std::string osFilename = CPLFormFilenameSafe(
663
0
                osDatasetDir.c_str(),
664
0
                GetBasenameFromCounter(outputLayer->nFileCounter).c_str(),
665
0
                pszExtension);
666
0
            outputLayer->poDS.reset(
667
0
                poOutDriver->Create(osFilename.c_str(), 0, 0, 0, GDT_Unknown,
668
0
                                    datasetCreationOptions.List()));
669
0
            if (!outputLayer->poDS)
670
0
            {
671
0
                alg->ReportError(CE_Failure, CPLE_AppDefined,
672
0
                                 "Cannot create dataset '%s'",
673
0
                                 osFilename.c_str());
674
0
                return false;
675
0
            }
676
677
0
            CPLStringList modLayerCreationOptions(layerCreationOptions);
678
0
            const char *pszSrcFIDColumn = poSrcLayer->GetFIDColumn();
679
0
            if (pszSrcFIDColumn[0])
680
0
            {
681
0
                const char *pszLCO = poOutDriver->GetMetadataItem(
682
0
                    GDAL_DS_LAYER_CREATIONOPTIONLIST);
683
0
                if (pszLCO && strstr(pszLCO, "'FID'") &&
684
0
                    layerCreationOptions.FetchNameValue("FID") == nullptr)
685
0
                    modLayerCreationOptions.SetNameValue("FID",
686
0
                                                         pszSrcFIDColumn);
687
0
            }
688
689
0
            std::unique_ptr<OGRGeomFieldDefn> poFirstGeomFieldDefn;
690
0
            if (poSrcFeatureDefn->GetGeomFieldCount())
691
0
            {
692
0
                poFirstGeomFieldDefn = std::make_unique<OGRGeomFieldDefn>(
693
0
                    *poSrcFeatureDefn->GetGeomFieldDefn(0));
694
0
                if (abPartitionedGeomFields[0])
695
0
                {
696
0
                    if (aeGeomTypes[0] == wkbNone)
697
0
                        poFirstGeomFieldDefn.reset();
698
0
                    else
699
0
                        whileUnsealing(poFirstGeomFieldDefn.get())
700
0
                            ->SetType(aeGeomTypes[0]);
701
0
                }
702
0
            }
703
0
            auto poLayer = outputLayer->poDS->CreateLayer(
704
0
                poSrcLayer->GetDescription(), poFirstGeomFieldDefn.get(),
705
0
                modLayerCreationOptions.List());
706
0
            if (!poLayer)
707
0
            {
708
0
                return false;
709
0
            }
710
0
            outputLayer->poLayer = poLayer;
711
0
            int iField = -1;
712
0
            for (const auto *poFieldDefn : poSrcFeatureDefn->GetFields())
713
0
            {
714
0
                ++iField;
715
0
                if (omitPartitionedFields && abPartitionedFields[iField])
716
0
                    continue;
717
0
                if (poLayer->CreateField(poFieldDefn) != OGRERR_NONE)
718
0
                {
719
0
                    alg->ReportError(CE_Failure, CPLE_AppDefined,
720
0
                                     "Cannot create field '%s'",
721
0
                                     poFieldDefn->GetNameRef());
722
0
                    return false;
723
0
                }
724
0
            }
725
0
            int iGeomField = -1;
726
0
            for (const auto *poGeomFieldDefn :
727
0
                 poSrcFeatureDefn->GetGeomFields())
728
0
            {
729
0
                ++iGeomField;
730
0
                if (iGeomField > 0)
731
0
                {
732
0
                    OGRGeomFieldDefn oClone(poGeomFieldDefn);
733
0
                    if (abPartitionedGeomFields[iGeomField])
734
0
                    {
735
0
                        if (aeGeomTypes[iGeomField] == wkbNone)
736
0
                            continue;
737
0
                        whileUnsealing(&oClone)->SetType(
738
0
                            aeGeomTypes[iGeomField]);
739
0
                    }
740
0
                    if (poLayer->CreateGeomField(&oClone) != OGRERR_NONE)
741
0
                    {
742
0
                        alg->ReportError(CE_Failure, CPLE_AppDefined,
743
0
                                         "Cannot create geometry field '%s'",
744
0
                                         poGeomFieldDefn->GetNameRef());
745
0
                        return false;
746
0
                    }
747
0
                }
748
0
            }
749
750
0
            if (bUseTransactions)
751
0
            {
752
0
                if (outputLayer->poDS->StartTransaction() != OGRERR_NONE)
753
0
                    return false;
754
0
            }
755
0
        }
756
757
0
        const auto nCounter = CPLGetErrorCounter();
758
0
        oCacheOutputLayer.insert(osKey, outputLayer);
759
        // In case insertion caused an eviction and old dataset
760
        // flushing caused an error
761
0
        if (CPLGetErrorCounter() != nCounter)
762
0
            return false;
763
0
    }
764
765
0
    return true;
766
0
}
767
768
/************************************************************************/
769
/*               GDALVectorPartitionAlgorithm::RunStep()                */
770
/************************************************************************/
771
772
// Partition every layer of the input dataset into several output files
// located below the m_output directory.
//
// Main stages:
//  1. Resolve the output driver (and its file extension), either from
//     m_format or from the driver of the input dataset.
//  2. Prepare the output directory, honouring m_overwrite / m_appendLayer.
//  3. For each source layer, resolve the partitioning fields, then iterate
//     over the features and write each of them to the output layer matching
//     its partitioning key. When the output driver advertises neither update
//     nor append capability, a first pass collects the distinct keys so that
//     the layer can be processed in several passes, each one handling at
//     most m_maxCacheSize keys.
//  4. For Parquet output, run the "create-metadata-file" algorithm of the
//     Parquet driver on the files that have been written.
//
// ctxt provides the optional progress callback and its user data.
// Returns true on success, false as soon as an error is detected.
bool GDALVectorPartitionAlgorithm::RunStep(GDALPipelineStepRunContext &ctxt)
{
    auto poSrcDS = m_inputDataset[0].GetDatasetRef();
    CPLAssert(poSrcDS);

    /* ---------------------------------------------------------------- */
    /*      Resolve output driver and file extension                    */
    /* ---------------------------------------------------------------- */

    // By default, reuse the driver of the source dataset.
    auto poOutDriver = poSrcDS->GetDriver();
    const char *pszExtensions =
        poOutDriver ? poOutDriver->GetMetadataItem(GDAL_DMD_EXTENSIONS)
                    : nullptr;
    if (m_format.empty())
    {
        if (!pszExtensions)
        {
            ReportError(CE_Failure, CPLE_AppDefined,
                        "Cannot infer output format. Please specify "
                        "'output-format' argument");
            return false;
        }
    }
    else
    {
        poOutDriver = GetGDALDriverManager()->GetDriverByName(m_format.c_str());
        if (!(poOutDriver && (pszExtensions = poOutDriver->GetMetadataItem(
                                  GDAL_DMD_EXTENSIONS)) != nullptr))
        {
            ReportError(CE_Failure, CPLE_AppDefined,
                        "Output driver has no known file extension");
            return false;
        }
    }
    CPLAssert(poOutDriver);

    const bool bFormatSupportsAppend =
        poOutDriver->GetMetadataItem(GDAL_DCAP_UPDATE) ||
        poOutDriver->GetMetadataItem(GDAL_DCAP_APPEND);
    if (m_appendLayer && !bFormatSupportsAppend)
    {
        ReportError(CE_Failure, CPLE_AppDefined,
                    "Driver '%s' does not support update",
                    poOutDriver->GetDescription());
        return false;
    }

    const bool bParquetOutput = EQUAL(poOutDriver->GetDescription(), "PARQUET");
    if (bParquetOutput && m_scheme == SCHEME_HIVE)
    {
        // Required for Parquet Hive partitioning
        m_omitPartitionedFields = true;
    }

    // Only the first extension advertised by the driver is used.
    const CPLStringList aosExtensions(CSLTokenizeString(pszExtensions));
    const char *pszExtension = aosExtensions[0];

    const CPLStringList datasetCreationOptions(m_creationOptions);
    const CPLStringList layerCreationOptions(m_layerCreationOptions);

    // We don't have driver metadata for that (and that would be a bit
    // tricky because some formats are half-text/half-binary), so...
    const bool bOutputFormatIsBinary =
        bParquetOutput || EQUAL(poOutDriver->GetDescription(), "GPKG") ||
        EQUAL(poOutDriver->GetDescription(), "SQLite") ||
        EQUAL(poOutDriver->GetDescription(), "FlatGeoBuf");

    // Below values have been experimentally determined and are not based
    // on rocket science...
    int nSpatialIndexPerFeatureConstant = 0;
    int nSpatialIndexPerLog2FeatureCountConstant = 0;
    if (CPLTestBool(
            layerCreationOptions.FetchNameValueDef("SPATIAL_INDEX", "YES")))
    {
        if (EQUAL(poOutDriver->GetDescription(), "GPKG"))
        {
            nSpatialIndexPerFeatureConstant =
                static_cast<int>(sizeof(double) * 4 + sizeof(uint32_t));
            nSpatialIndexPerLog2FeatureCountConstant = 1;
        }
        else if (EQUAL(poOutDriver->GetDescription(), "FlatGeoBuf"))
        {
            nSpatialIndexPerFeatureConstant = 1;
            nSpatialIndexPerLog2FeatureCountConstant =
                static_cast<int>(sizeof(double) * 4 + sizeof(uint64_t));
        }
    }

    // Transactions are only used for the SQLite-based drivers, and not when
    // errors on individual features must be skipped.
    const bool bUseTransactions =
        (EQUAL(poOutDriver->GetDescription(), "GPKG") ||
         EQUAL(poOutDriver->GetDescription(), "SQLite")) &&
        !m_skipErrors;

    /* ---------------------------------------------------------------- */
    /*      Prepare output directory                                    */
    /* ---------------------------------------------------------------- */

    VSIStatBufL sStat;
    if (VSIStatL(m_output.c_str(), &sStat) == 0)
    {
        if (m_overwrite)
        {
            bool emptyDir = true;
            bool hasDirLevel1WithEqual = false;

            // Do a sanity check to verify that this looks like a directory
            // generated by partition

            if (m_scheme == SCHEME_HIVE)
            {
                // Accept removal only if the directory is empty, or if an
                // entry with exactly one directory separator in its name
                // has a '=' character after that separator (i.e. something
                // like "layer/field=value").
                std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
                    VSIOpenDir(m_output.c_str(), -1, nullptr), VSICloseDir);
                if (psDir)
                {
                    while (const auto *psEntry =
                               VSIGetNextDirEntry(psDir.get()))
                    {
                        emptyDir = false;
                        if (VSI_ISDIR(psEntry->nMode))
                        {
                            std::string_view v(psEntry->pszName);
                            if (std::count_if(
                                    v.begin(), v.end(), [](char c)
                                    { return c == '/' || c == '\\'; }) == 1)
                            {
                                const auto nPosDirSep = v.find_first_of("/\\");
                                const auto nPosEqual = v.find('=', nPosDirSep);
                                if (nPosEqual != std::string::npos)
                                {
                                    hasDirLevel1WithEqual = true;
                                    break;
                                }
                            }
                        }
                    }
                }

                if (!hasDirLevel1WithEqual && !emptyDir)
                {
                    ReportError(
                        CE_Failure, CPLE_AppDefined,
                        "Rejecting removing '%s' as it does not look like "
                        "a directory generated by this utility. If you are "
                        "sure, remove it manually and re-run",
                        m_output.c_str());
                    return false;
                }
            }
            else
            {
                // Non-Hive scheme: the presence of any sub-directory is
                // considered as suspicious.
                bool hasSubDir = false;
                std::unique_ptr<VSIDIR, decltype(&VSICloseDir)> psDir(
                    VSIOpenDir(m_output.c_str(), 0, nullptr), VSICloseDir);
                if (psDir)
                {
                    while (const auto *psEntry =
                               VSIGetNextDirEntry(psDir.get()))
                    {
                        if (VSI_ISDIR(psEntry->nMode))
                        {
                            hasSubDir = true;
                            break;
                        }
                    }
                }

                if (hasSubDir)
                {
                    ReportError(
                        CE_Failure, CPLE_AppDefined,
                        "Rejecting removing '%s' as it does not look like "
                        "a directory generated by this utility. If you are "
                        "sure, remove it manually and re-run",
                        m_output.c_str());
                    return false;
                }
            }

            if (VSIRmdirRecursive(m_output.c_str()) != 0)
            {
                ReportError(CE_Failure, CPLE_AppDefined, "Cannot remove '%s'",
                            m_output.c_str());
                return false;
            }
        }
        else if (!m_appendLayer)
        {
            ReportError(CE_Failure, CPLE_AppDefined,
                        "'%s' already exists. Specify --overwrite or --append",
                        m_output.c_str());
            return false;
        }
    }
    // Stat again, as the directory may just have been removed above.
    if (VSIStatL(m_output.c_str(), &sStat) != 0)
    {
        if (VSIMkdir(m_output.c_str(), DIRECTORY_CREATION_MODE) != 0)
        {
            ReportError(CE_Failure, CPLE_AppDefined,
                        "Cannot create directory '%s'", m_output.c_str());
            return false;
        }
    }

    /* ---------------------------------------------------------------- */
    /*      Process each source layer                                   */
    /* ---------------------------------------------------------------- */

    for (OGRLayer *poSrcLayer : poSrcDS->GetLayers())
    {
        // With the Hive scheme, each layer gets its own sub-directory.
        const std::string osLayerDir =
            m_scheme == SCHEME_HIVE
                ? CPLFormFilenameSafe(
                      m_output.c_str(),
                      PercentEncode(poSrcLayer->GetDescription()).c_str(),
                      nullptr)
                : m_output;
        if (m_scheme == SCHEME_HIVE &&
            VSIStatL(osLayerDir.c_str(), &sStat) != 0)
        {
            if (VSIMkdir(osLayerDir.c_str(), DIRECTORY_CREATION_MODE) != 0)
            {
                ReportError(CE_Failure, CPLE_AppDefined,
                            "Cannot create directory '%s'", osLayerDir.c_str());
                return false;
            }
        }

        const auto poSrcFeatureDefn = poSrcLayer->GetLayerDefn();

        // Description of a field used for partitioning
        struct Field
        {
            // Index of the attribute field, or of the geometry field when
            // bIsGeom is true
            int nIdx{};
            bool bIsGeom = false;
            // Percent-encoded field name, as used in the partitioning key
            std::string encodedFieldName{};
            // Only meaningful for attribute fields
            OGRFieldType eType{};
        };

        std::vector<Field> asFields;
        std::vector<bool> abPartitionedFields(poSrcFeatureDefn->GetFieldCount(),
                                              false);
        std::vector<bool> abPartitionedGeomFields(
            poSrcFeatureDefn->GetGeomFieldCount(), false);
        for (const std::string &fieldName : m_fields)
        {
            int nIdx = poSrcFeatureDefn->GetFieldIndex(fieldName.c_str());
            if (nIdx < 0)
            {
                // Not an attribute field: try as a geometry field.
                // "OGR_GEOMETRY" designates the first geometry field.
                if (fieldName == "OGR_GEOMETRY" &&
                    poSrcFeatureDefn->GetGeomFieldCount() > 0)
                    nIdx = 0;
                else
                    nIdx =
                        poSrcFeatureDefn->GetGeomFieldIndex(fieldName.c_str());
                if (nIdx < 0)
                {
                    ReportError(CE_Failure, CPLE_AppDefined,
                                "Cannot find field '%s' in layer '%s'",
                                fieldName.c_str(),
                                poSrcLayer->GetDescription());
                    return false;
                }
                else
                {
                    abPartitionedGeomFields[nIdx] = true;
                    Field f;
                    f.nIdx = nIdx;
                    f.bIsGeom = true;
                    if (fieldName.empty())
                        f.encodedFieldName = "OGR_GEOMETRY";
                    else
                        f.encodedFieldName = PercentEncode(fieldName);
                    asFields.push_back(std::move(f));
                }
            }
            else
            {
                const auto eType =
                    poSrcFeatureDefn->GetFieldDefn(nIdx)->GetType();
                if (eType != OFTString && eType != OFTInteger &&
                    eType != OFTInteger64)
                {
                    ReportError(
                        CE_Failure, CPLE_NotSupported,
                        "Field '%s' not valid for partitioning. Only fields of "
                        "type String, Integer or Integer64, or geometry fields,"
                        " are accepted",
                        fieldName.c_str());
                    return false;
                }
                abPartitionedFields[nIdx] = true;
                Field f;
                f.nIdx = nIdx;
                f.bIsGeom = false;
                f.encodedFieldName = PercentEncode(fieldName);
                f.eType = eType;
                asFields.push_back(std::move(f));
            }
        }

        std::vector<OGRFieldType> aeSrcFieldTypes;
        for (const auto *poFieldDefn : poSrcFeatureDefn->GetFields())
        {
            aeSrcFieldTypes.push_back(poFieldDefn->GetType());
        }

        std::unique_ptr<OGRFeatureDefn> poFeatureDefnWithoutPartitionedFields(
            poSrcFeatureDefn->Clone());
        if (m_omitPartitionedFields)
        {
            // Sort fields by descending index (so we can delete them easily)
            std::vector<Field> sortedFields(asFields);
            std::sort(sortedFields.begin(), sortedFields.end(),
                      [](const Field &a, const Field &b)
                      { return a.nIdx > b.nIdx; });
            for (const auto &field : sortedFields)
            {
                if (!field.bIsGeom)
                    poFeatureDefnWithoutPartitionedFields->DeleteFieldDefn(
                        field.nIdx);
            }
        }
        // Always compute the source -> target field map, even when no field
        // has been removed: the SetFrom() code path below is also taken when
        // a partitioned geometry is null, and must never be handed the data
        // pointer of an empty vector.
        const std::vector<int> anMapForSetFrom =
            poFeatureDefnWithoutPartitionedFields->ComputeMapForSetFrom(
                poSrcFeatureDefn);

        // Cache of the output layers, indexed by partitioning key.
        lru11::Cache<std::string, std::shared_ptr<Layer>> oCacheOutputLayer(
            m_maxCacheSize, 0);
        std::shared_ptr<Layer> outputLayer = std::make_unique<Layer>();
        outputLayer->bUseTransactions = bUseTransactions;

        // The feature count is only needed to report progress.
        GIntBig nTotalFeatures = 1;
        GIntBig nFeatureIter = 0;
        if (ctxt.m_pfnProgress)
            nTotalFeatures = poSrcLayer->GetFeatureCount(true);
        const double dfInvTotalFeatures =
            1.0 / static_cast<double>(std::max<GIntBig>(1, nTotalFeatures));

        std::string osAttrQueryString;
        if (const char *pszAttrQueryString = poSrcLayer->GetAttrQueryString())
            osAttrQueryString = pszAttrQueryString;

        // Build the partitioning key of a feature, of the form
        // "field1=value1/field2=value2", together with the geometry type of
        // each partitioned geometry field (wkbNone for a null geometry).
        // The returned references point to osKeyTmp and aeGeomTypesTmp,
        // which are overwritten by the next call.
        std::string osKeyTmp;
        std::vector<OGRwkbGeometryType> aeGeomTypesTmp;
        const auto BuildKey =
            [&osKeyTmp, &aeGeomTypesTmp](const std::vector<Field> &fields,
                                         const OGRFeature *poFeature)
            -> std::pair<const std::string &,
                         const std::vector<OGRwkbGeometryType> &>
        {
            osKeyTmp.clear();
            aeGeomTypesTmp.resize(poFeature->GetDefnRef()->GetGeomFieldCount());
            for (const auto &field : fields)
            {
                if (!osKeyTmp.empty())
                    osKeyTmp += '/';
                osKeyTmp += field.encodedFieldName;
                osKeyTmp += '=';
                if (field.bIsGeom)
                {
                    const auto poGeom = poFeature->GetGeomFieldRef(field.nIdx);
                    if (poGeom)
                    {
                        aeGeomTypesTmp[field.nIdx] = poGeom->getGeometryType();
                        osKeyTmp += poGeom->getGeometryName();
                        if (poGeom->Is3D())
                            osKeyTmp += 'Z';
                        if (poGeom->IsMeasured())
                            osKeyTmp += 'M';
                    }
                    else
                    {
                        aeGeomTypesTmp[field.nIdx] = wkbNone;
                        osKeyTmp += NULL_MARKER;
                    }
                }
                else if (poFeature->IsFieldSetAndNotNull(field.nIdx))
                {
                    if (field.eType == OFTString)
                    {
                        PercentEncode(
                            osKeyTmp,
                            poFeature->GetFieldAsStringUnsafe(field.nIdx));
                    }
                    else if (field.eType == OFTInteger)
                    {
                        osKeyTmp += CPLSPrintf(
                            "%d",
                            poFeature->GetFieldAsIntegerUnsafe(field.nIdx));
                    }
                    else
                    {
                        osKeyTmp += CPLSPrintf(
                            CPL_FRMT_GIB,
                            poFeature->GetFieldAsInteger64Unsafe(field.nIdx));
                    }
                }
                else
                {
                    osKeyTmp += NULL_MARKER;
                }
            }
            return {osKeyTmp, aeGeomTypesTmp};
        };

        // When the output format can be neither updated nor appended to,
        // each output file must be fully written while it is in the cache.
        // Collect the distinct keys to split the work in several passes.
        std::set<std::string> oSetKeys;
        if (!bFormatSupportsAppend)
        {
            CPLDebug(
                "GDAL",
                "First pass to determine all distinct partitioned values...");

            if (asFields.size() == 1 && !asFields[0].bIsGeom)
            {
                // Single attribute field: delegate to a SELECT DISTINCT
                // request rather than iterating over all the features.
                std::string osSQL = "SELECT DISTINCT \"";
                osSQL += CPLString(m_fields[0]).replaceAll('"', "\"\"");
                osSQL += "\" FROM \"";
                osSQL += CPLString(poSrcLayer->GetDescription())
                             .replaceAll('"', "\"\"");
                osSQL += '"';
                if (!osAttrQueryString.empty())
                {
                    osSQL += " WHERE ";
                    osSQL += osAttrQueryString;
                }
                auto poSQLLayer =
                    poSrcDS->ExecuteSQL(osSQL.c_str(), nullptr, nullptr);
                if (!poSQLLayer)
                    return false;
                // The result layer has a single field, at index 0, whereas
                // asFields[0].nIdx is an index in the source layer: use a
                // re-indexed copy to build the keys.
                std::vector<Field> asSingleField{asFields[0]};
                asSingleField[0].nIdx = 0;
                for (auto &poFeature : *poSQLLayer)
                {
                    const auto sPair = BuildKey(asSingleField, poFeature.get());
                    const std::string &osKey = sPair.first;
                    oSetKeys.insert(osKey);
#ifdef DEBUG_VERBOSE
                    CPLDebug("GDAL", "Found %s", osKey.c_str());
#endif
                }
                poSrcDS->ReleaseResultSet(poSQLLayer);

                // Re-apply the attribute filter on the source layer
                // (presumably because the SQL request may have altered it).
                if (!osAttrQueryString.empty())
                {
                    poSrcLayer->SetAttributeFilter(osAttrQueryString.c_str());
                }
            }
            else
            {
                for (auto &poFeature : *poSrcLayer)
                {
                    const auto sPair = BuildKey(asFields, poFeature.get());
                    const std::string &osKey = sPair.first;
                    if (oSetKeys.insert(osKey).second)
                    {
#ifdef DEBUG_VERBOSE
                        CPLDebug("GDAL", "Found %s", osKey.c_str());
#endif
                    }
                }
            }
            CPLDebug("GDAL",
                     "End of first pass: %d unique partitioning keys found -> "
                     "%d pass(es) needed",
                     static_cast<int>(oSetKeys.size()),
                     static_cast<int>((oSetKeys.size() + m_maxCacheSize - 1) /
                                      m_maxCacheSize));

            // If we have less distinct values as the maximum cache size, we
            // can do a single iteration.
            if (oSetKeys.size() <= static_cast<size_t>(m_maxCacheSize))
                oSetKeys.clear();
        }

        // Names of the output datasets (only collected for Parquet output)
        std::set<std::string> oSetOutputDatasets;
        auto oSetKeysIter = oSetKeys.begin();
        while (true)
        {
            // Determine which keys are allowed for the current pass
            std::set<std::string> oSetKeysAllowedInThisPass;
            if (!oSetKeys.empty())
            {
                while (oSetKeysAllowedInThisPass.size() <
                           static_cast<size_t>(m_maxCacheSize) &&
                       oSetKeysIter != oSetKeys.end())
                {
                    oSetKeysAllowedInThisPass.insert(*oSetKeysIter);
                    ++oSetKeysIter;
                }
                if (oSetKeysAllowedInThisPass.empty())
                    break;
            }

            for (auto &poFeature : *poSrcLayer)
            {
                const auto sPair = BuildKey(asFields, poFeature.get());
                const std::string &osKey = sPair.first;
                const auto &aeGeomTypes = sPair.second;

                // An empty set means that all keys are handled in this pass.
                if (!oSetKeysAllowedInThisPass.empty() &&
                    !cpl::contains(oSetKeysAllowedInThisPass, osKey))
                {
                    continue;
                }

                if (!GetCurrentOutputLayer(
                        this, poSrcFeatureDefn, poSrcLayer, osKey, aeGeomTypes,
                        osLayerDir, m_scheme, m_pattern,
                        m_partDigitLeadingZeroes, m_partDigitCount,
                        m_featureLimit, m_maxFileSize, m_omitPartitionedFields,
                        abPartitionedFields, abPartitionedGeomFields,
                        pszExtension, poOutDriver, datasetCreationOptions,
                        layerCreationOptions,
                        poFeatureDefnWithoutPartitionedFields.get(),
                        poFeature->GetGeometryRef()
                            ? nSpatialIndexPerFeatureConstant
                            : 0,
                        nSpatialIndexPerLog2FeatureCountConstant,
                        bUseTransactions, oCacheOutputLayer, outputLayer))
                {
                    return false;
                }

                if (bParquetOutput)
                {
                    oSetOutputDatasets.insert(
                        outputLayer->poDS->GetDescription());
                }

                if (m_appendLayer)
                    poFeature->SetFID(OGRNullFID);

                OGRErr eErr;
                if (m_omitPartitionedFields ||
                    std::find(aeGeomTypes.begin(), aeGeomTypes.end(),
                              wkbNone) != aeGeomTypes.end())
                {
                    // The feature must be converted to the definition of the
                    // output layer. SetFrom() does not preserve the FID, so
                    // it is set again afterwards.
                    OGRFeature oFeat(outputLayer->poLayer->GetLayerDefn());
                    oFeat.SetFrom(poFeature.get(), anMapForSetFrom.data());
                    oFeat.SetFID(poFeature->GetFID());
                    eErr = outputLayer->poLayer->CreateFeature(&oFeat);
                }
                else
                {
                    // Write the source feature directly, after re-targeting
                    // it to the output layer definition.
                    poFeature->SetFDefnUnsafe(
                        outputLayer->poLayer->GetLayerDefn());
                    eErr = outputLayer->poLayer->CreateFeature(poFeature.get());
                }
                if (eErr != OGRERR_NONE)
                {
                    ReportError(m_skipErrors ? CE_Warning : CE_Failure,
                                CPLE_AppDefined,
                                "Cannot insert feature " CPL_FRMT_GIB,
                                poFeature->GetFID());
                    if (m_skipErrors)
                        continue;
                    return false;
                }
                ++outputLayer->nFeatureCount;

                // Periodically commit and start a new transaction
                if (bUseTransactions &&
                    (outputLayer->nFeatureCount % m_transactionSize) == 0)
                {
                    if (outputLayer->poDS->CommitTransaction() != OGRERR_NONE ||
                        outputLayer->poDS->StartTransaction() != OGRERR_NONE)
                    {
                        return false;
                    }
                }

                // Compute a rough estimate of the space taken by the feature
                if (m_maxFileSize > 0)
                {
                    outputLayer->nFileSize += GetEstimatedFeatureSize(
                        poFeature.get(), abPartitionedFields,
                        m_omitPartitionedFields, aeSrcFieldTypes,
                        bOutputFormatIsBinary);
                }

                ++nFeatureIter;
                if (ctxt.m_pfnProgress &&
                    !ctxt.m_pfnProgress(
                        std::min(1.0, static_cast<double>(nFeatureIter) *
                                          dfInvTotalFeatures),
                        "", ctxt.m_pProgressData))
                {
                    ReportError(CE_Failure, CPLE_UserInterrupt,
                                "Interrupted by user");
                    return false;
                }
            }

            // All keys have been handled (also true in single-pass mode,
            // where oSetKeys is empty).
            if (oSetKeysIter == oSetKeys.end())
                break;
        }

        // Release the output layers, and detect errors that would be
        // emitted while doing so.
        const auto nCounter = CPLGetErrorCounter();
        outputLayer.reset();
        oCacheOutputLayer.clear();
        if (CPLGetErrorCounter() != nCounter)
            return false;

        // For Parquet output, create special "_metadata" file that contains
        // the schema and references the individual files
        if (bParquetOutput && !oSetOutputDatasets.empty())
        {
            auto poAlg =
                GDALGlobalAlgorithmRegistry::GetSingleton().Instantiate(
                    "driver", "parquet", "create-metadata-file");
            if (poAlg)
            {
                auto inputArg = poAlg->GetArg(GDAL_ARG_NAME_INPUT);
                auto outputArg = poAlg->GetArg(GDAL_ARG_NAME_OUTPUT);
                if (inputArg && inputArg->GetType() == GAAT_DATASET_LIST &&
                    outputArg && outputArg->GetType() == GAAT_DATASET)
                {
                    std::vector<std::string> asInputFilenames;
                    asInputFilenames.insert(asInputFilenames.end(),
                                            oSetOutputDatasets.begin(),
                                            oSetOutputDatasets.end());
                    inputArg->Set(asInputFilenames);
                    outputArg->Set(CPLFormFilenameSafe(osLayerDir.c_str(),
                                                       "_metadata", nullptr));
                    if (!poAlg->Run())
                        return false;
                }
            }
        }
    }

    return true;
}
1392
1393
/************************************************************************/
1394
/*               GDALVectorPartitionAlgorithm::RunImpl()                */
1395
/************************************************************************/
1396
1397
bool GDALVectorPartitionAlgorithm::RunImpl(GDALProgressFunc pfnProgress,
1398
                                           void *pProgressData)
1399
0
{
1400
0
    GDALPipelineStepRunContext stepCtxt;
1401
0
    stepCtxt.m_pfnProgress = pfnProgress;
1402
0
    stepCtxt.m_pProgressData = pProgressData;
1403
0
    return RunStep(stepCtxt);
1404
0
}
1405
1406
// Destructor of the standalone variant, defaulted out of line.
GDALVectorPartitionAlgorithmStandalone::~GDALVectorPartitionAlgorithmStandalone()
    = default;
1408
//! @endcond