Coverage Report

Created: 2025-06-13 06:29

/src/gdal/ogr/ogrsf_frmts/generic/ograrrowarrayhelper.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Project:  OpenGIS Simple Features Reference Implementation
4
 * Purpose:  Helper to fill ArrowArray
5
 * Author:   Even Rouault <even dot rouault at spatialys.com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2022, Even Rouault <even dot rouault at spatialys.com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "ograrrowarrayhelper.h"
14
#include "ogrlayerarrow.h"
15
#include "ogr_p.h"
16
17
#include <limits>
18
19
//! @cond Doxygen_Suppress
20
21
/************************************************************************/
22
/*                           GetMemLimit()                              */
23
/************************************************************************/
24
25
/*static*/ uint32_t OGRArrowArrayHelper::GetMemLimit()
26
0
{
27
0
    uint32_t nMemLimit =
28
0
        static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
29
    // Just for tests
30
0
    const char *pszOGR_ARROW_MEM_LIMIT =
31
0
        CPLGetConfigOption("OGR_ARROW_MEM_LIMIT", nullptr);
32
0
    if (pszOGR_ARROW_MEM_LIMIT)
33
0
        nMemLimit = atoi(pszOGR_ARROW_MEM_LIMIT);
34
0
    else
35
0
    {
36
0
        const auto nUsableRAM = CPLGetUsablePhysicalRAM();
37
0
        if (nUsableRAM > 0 && static_cast<uint64_t>(nUsableRAM / 4) < nMemLimit)
38
0
            nMemLimit = static_cast<uint32_t>(nUsableRAM / 4);
39
0
    }
40
0
    return nMemLimit;
41
0
}
42
43
/************************************************************************/
44
/*                       GetMaxFeaturesInBatch()                        */
45
/************************************************************************/
46
47
/* static */
48
int OGRArrowArrayHelper::GetMaxFeaturesInBatch(
49
    const CPLStringList &aosArrowArrayStreamOptions)
50
0
{
51
0
    int l_nMaxBatchSize = atoi(aosArrowArrayStreamOptions.FetchNameValueDef(
52
0
        "MAX_FEATURES_IN_BATCH", "65536"));
53
0
    if (l_nMaxBatchSize <= 0)
54
0
        l_nMaxBatchSize = 1;
55
0
    if (l_nMaxBatchSize > INT_MAX - 1)
56
0
        l_nMaxBatchSize = INT_MAX - 1;
57
58
0
    return l_nMaxBatchSize;
59
0
}
60
61
/************************************************************************/
62
/*                       OGRArrowArrayHelper()                          */
63
/************************************************************************/
64
65
/* Initializes out_array as a parent array with one child per exported
 * column, and pre-allocates the buffers of each child for
 * m_nMaxBatchSize elements.
 *
 * Children are laid out in this order: the FID column (only when the
 * INCLUDE_FID option is true, the default), then every non-ignored
 * attribute field, then every non-ignored geometry field.
 *
 * @param poDS dataset used to look up coded field domains of integer
 *             fields. May be nullptr, in which case no dictionary is built.
 * @param poFeatureDefn feature definition describing the fields. Must not
 *                      be nullptr.
 * @param aosArrowArrayStreamOptions options. The ones read here are
 *        INCLUDE_FID, MAX_FEATURES_IN_BATCH, TIMEZONE and
 *        GAS_OPT_DATETIME_AS_STRING.
 * @param out_array array to fill. It is zeroed first. If any buffer
 *        allocation fails, it is released and zeroed again, so that its
 *        release member is null on return.
 */
OGRArrowArrayHelper::OGRArrowArrayHelper(
    GDALDataset *poDS, OGRFeatureDefn *poFeatureDefn,
    const CPLStringList &aosArrowArrayStreamOptions,
    struct ArrowArray *out_array)
    : m_bIncludeFID(CPLTestBool(
          aosArrowArrayStreamOptions.FetchNameValueDef("INCLUDE_FID", "YES"))),
      m_nMaxBatchSize(GetMaxFeaturesInBatch(aosArrowArrayStreamOptions)),
      m_nFieldCount(poFeatureDefn->GetFieldCount()),
      m_nGeomFieldCount(poFeatureDefn->GetGeomFieldCount()),
      m_out_array(out_array)
{
    memset(out_array, 0, sizeof(*out_array));

    m_mapOGRFieldToArrowField.resize(m_nFieldCount, -1);
    m_mapOGRGeomFieldToArrowField.resize(m_nGeomFieldCount, -1);
    m_abNullableFields.resize(m_nFieldCount);
    m_anTZFlags.resize(m_nFieldCount);

    // A negative value means "no override": use the flag of each field.
    int nTZFlagOverride = -1;
    const char *pszTZOverride =
        aosArrowArrayStreamOptions.FetchNameValue("TIMEZONE");
    if (pszTZOverride)
    {
        if (EQUAL(pszTZOverride, "unknown") || EQUAL(pszTZOverride, ""))
        {
            nTZFlagOverride = OGR_TZFLAG_UNKNOWN;
        }
        else
        {
            // we don't really care about the actual timezone, since we
            // will convert OGRField::Date to UTC in all cases
            nTZFlagOverride = OGR_TZFLAG_UTC;
        }
    }
    const bool bDateTimeAsString =
        aosArrowArrayStreamOptions.FetchBool(GAS_OPT_DATETIME_AS_STRING, false);

    // First pass: assign an Arrow child index to each exported column.
    if (m_bIncludeFID)
    {
        m_nChildren++;
    }
    // cppcheck-suppress knownConditionTrueFalse
    for (int i = 0; i < m_nFieldCount; i++)
    {
        const auto poFieldDefn = poFeatureDefn->GetFieldDefn(i);
        m_abNullableFields[i] = CPL_TO_BOOL(poFieldDefn->IsNullable());
        m_anTZFlags[i] =
            nTZFlagOverride >= 0 ? nTZFlagOverride : poFieldDefn->GetTZFlag();
        if (!poFieldDefn->IsIgnored())
        {
            m_mapOGRFieldToArrowField[i] = m_nChildren;
            m_nChildren++;
        }
    }
    // cppcheck-suppress knownConditionTrueFalse
    for (int i = 0; i < m_nGeomFieldCount; i++)
    {
        if (!poFeatureDefn->GetGeomFieldDefn(i)->IsIgnored())
        {
            m_mapOGRGeomFieldToArrowField[i] = m_nChildren;
            m_nChildren++;
        }
    }

    m_anArrowFieldMaxAlloc.resize(m_nChildren);

    // Set up the parent array.
    out_array->release = OGRLayer::ReleaseArray;
    out_array->length = m_nMaxBatchSize;
    out_array->null_count = 0;
    out_array->n_children = m_nChildren;
    out_array->children = static_cast<struct ArrowArray **>(
        CPLCalloc(m_nChildren, sizeof(struct ArrowArray *)));
    out_array->n_buffers = 1;
    out_array->buffers =
        static_cast<const void **>(CPLCalloc(1, sizeof(void *)));

    // Creates the zero-initialized child at index iArrowField, registers it
    // in out_array and sets its release callback and length.
    const auto CreateChild = [this, out_array](int iArrowField)
    {
        auto psNewChild = static_cast<struct ArrowArray *>(
            CPLCalloc(1, sizeof(struct ArrowArray)));
        out_array->children[iArrowField] = psNewChild;
        psNewChild->release = OGRLayer::ReleaseArray;
        psNewChild->length = m_nMaxBatchSize;
        return psNewChild;
    };

    // Gives psVarChild the 3-buffer layout used for variable-length content:
    // buffers[1] holds m_nMaxBatchSize + 1 zeroed uint32_t offsets, and
    // buffers[2] an initial data area of nDefaultEltSize bytes per element,
    // whose size is recorded in m_anArrowFieldMaxAlloc. Returns false if an
    // allocation fails.
    const auto AllocVarLenBuffers =
        [this](struct ArrowArray *psVarChild, int iArrowField,
               size_t nDefaultEltSize)
    {
        psVarChild->n_buffers = 3;
        psVarChild->buffers =
            static_cast<const void **>(CPLCalloc(3, sizeof(void *)));

        const size_t nOffsetsSize = sizeof(uint32_t) * (1 + m_nMaxBatchSize);
        psVarChild->buffers[1] = VSI_MALLOC_ALIGNED_AUTO_VERBOSE(nOffsetsSize);
        if (psVarChild->buffers[1] == nullptr)
            return false;
        memset(const_cast<void *>(psVarChild->buffers[1]), 0, nOffsetsSize);

        m_anArrowFieldMaxAlloc[iArrowField] = nDefaultEltSize * m_nMaxBatchSize;
        psVarChild->buffers[2] = VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
            m_anArrowFieldMaxAlloc[iArrowField]);
        return psVarChild->buffers[2] != nullptr;
    };

    // Allocate buffers

    if (m_bIncludeFID)
    {
        // The FID is always the first child.
        auto psChild = CreateChild(0);
        psChild->n_buffers = 2;
        psChild->buffers =
            static_cast<const void **>(CPLCalloc(2, sizeof(void *)));
        m_panFIDValues = static_cast<int64_t *>(
            VSI_MALLOC_ALIGNED_AUTO_VERBOSE(sizeof(int64_t) * m_nMaxBatchSize));
        if (m_panFIDValues == nullptr)
            goto error;
        psChild->buffers[1] = m_panFIDValues;
    }

    // cppcheck-suppress knownConditionTrueFalse
    for (int i = 0; i < m_nFieldCount; i++)
    {
        const int iArrowField = m_mapOGRFieldToArrowField[i];
        if (iArrowField < 0)
            continue;  // ignored field

        const auto poFieldDefn = poFeatureDefn->GetFieldDefn(i);
        auto psChild = CreateChild(iArrowField);

        const auto eSubType = poFieldDefn->GetSubType();
        // Size of one element for fixed-width types. Stays 0 for
        // variable-length types (handled inside the switch) and for types
        // for which no buffer is allocated here.
        size_t nEltSize = 0;
        switch (poFieldDefn->GetType())
        {
            case OFTInteger:
            {
                if (eSubType == OFSTBoolean)
                {
                    nEltSize = sizeof(uint8_t);
                }
                else if (eSubType == OFSTInt16)
                {
                    nEltSize = sizeof(int16_t);
                }
                else
                {
                    nEltSize = sizeof(int32_t);
                }

                // Attach a dictionary when the field references a coded
                // domain. A false return from FillDict() is not an error:
                // the field is then exported without dictionary.
                const auto &osDomainName = poFieldDefn->GetDomainName();
                if (!osDomainName.empty() && poDS != nullptr)
                {
                    const auto poFieldDomain =
                        poDS->GetFieldDomain(osDomainName);
                    if (poFieldDomain &&
                        poFieldDomain->GetDomainType() == OFDT_CODED)
                    {
                        const OGRCodedFieldDomain *poCodedDomain =
                            static_cast<const OGRCodedFieldDomain *>(
                                poFieldDomain);
                        FillDict(psChild, poCodedDomain);
                    }
                }

                break;
            }

            case OFTInteger64:
            {
                nEltSize = sizeof(int64_t);
                break;
            }

            case OFTReal:
            {
                if (eSubType == OFSTFloat32)
                {
                    nEltSize = sizeof(float);
                }
                else
                {
                    nEltSize = sizeof(double);
                }
                break;
            }

            case OFTDateTime:
            {
                if (!bDateTimeAsString)
                {
                    nEltSize = sizeof(int64_t);
                    break;
                }
                // Exported as a string: same layout as OFTString.
                [[fallthrough]];
            }

            case OFTString:
            case OFTBinary:
            {
                constexpr size_t DEFAULT_STRING_SIZE = 10;
                if (!AllocVarLenBuffers(psChild, iArrowField,
                                        DEFAULT_STRING_SIZE))
                    goto error;
                break;
            }

            case OFTDate:
            case OFTTime:
            {
                nEltSize = sizeof(int32_t);
                break;
            }

            default:
                break;
        }

        if (nEltSize != 0)
        {
            psChild->n_buffers = 2;
            psChild->buffers =
                static_cast<const void **>(CPLCalloc(2, sizeof(void *)));
            psChild->buffers[1] =
                VSI_MALLOC_ALIGNED_AUTO_VERBOSE(nEltSize * m_nMaxBatchSize);
            if (psChild->buffers[1] == nullptr)
                goto error;
            memset(const_cast<void *>(psChild->buffers[1]), 0,
                   nEltSize * m_nMaxBatchSize);
        }
    }

    // cppcheck-suppress knownConditionTrueFalse
    for (int i = 0; i < m_nGeomFieldCount; i++)
    {
        const int iArrowField = m_mapOGRGeomFieldToArrowField[i];
        if (iArrowField < 0)
            continue;  // ignored geometry field

        auto psChild = CreateChild(iArrowField);
        constexpr size_t DEFAULT_WKB_SIZE = 100;
        if (!AllocVarLenBuffers(psChild, iArrowField, DEFAULT_WKB_SIZE))
            goto error;
    }

    return;

error:
    out_array->release(out_array);
    // The FID buffer was handed over to the first child, which has just been
    // released: do not keep a pointer to it.
    m_panFIDValues = nullptr;
    memset(out_array, 0, sizeof(*out_array));
}
333
334
/************************************************************************/
335
/*                             FillDict()                               */
336
/************************************************************************/
337
338
/* static */
339
bool OGRArrowArrayHelper::FillDict(struct ArrowArray *psChild,
340
                                   const OGRCodedFieldDomain *poCodedDomain)
341
0
{
342
0
    int nLastCode = -1;
343
0
    uint32_t nCountChars = 0;
344
0
    int nCountNull = 0;
345
0
    for (const OGRCodedValue *psIter = poCodedDomain->GetEnumeration();
346
0
         psIter->pszCode; ++psIter)
347
0
    {
348
0
        if (CPLGetValueType(psIter->pszCode) != CPL_VALUE_INTEGER)
349
0
        {
350
0
            return false;
351
0
        }
352
0
        int nCode = atoi(psIter->pszCode);
353
0
        if (nCode <= nLastCode || nCode - nLastCode > 100)
354
0
        {
355
0
            return false;
356
0
        }
357
0
        for (int i = nLastCode + 1; i < nCode; ++i)
358
0
        {
359
0
            nCountNull++;
360
0
        }
361
0
        if (psIter->pszValue)
362
0
        {
363
0
            const size_t nLen = strlen(psIter->pszValue);
364
0
            if (nLen > std::numeric_limits<uint32_t>::max() - nCountChars)
365
0
                return false;
366
0
            nCountChars += static_cast<uint32_t>(nLen);
367
0
        }
368
0
        else
369
0
        {
370
0
            nCountNull++;
371
0
        }
372
0
        nLastCode = nCode;
373
0
    }
374
0
    const int nLength = 1 + nLastCode;
375
376
0
    auto psDict = static_cast<struct ArrowArray *>(
377
0
        CPLCalloc(1, sizeof(struct ArrowArray)));
378
0
    psChild->dictionary = psDict;
379
380
0
    psDict->release = OGRLayer::ReleaseArray;
381
0
    psDict->length = nLength;
382
0
    psDict->n_buffers = 3;
383
0
    psDict->buffers = static_cast<const void **>(CPLCalloc(3, sizeof(void *)));
384
0
    psDict->null_count = nCountNull;
385
0
    uint8_t *pabyNull = nullptr;
386
0
    if (nCountNull)
387
0
    {
388
0
        pabyNull = static_cast<uint8_t *>(
389
0
            VSI_MALLOC_ALIGNED_AUTO_VERBOSE((nLength + 7) / 8));
390
0
        if (pabyNull == nullptr)
391
0
        {
392
0
            psDict->release(psDict);
393
0
            CPLFree(psDict);
394
0
            psChild->dictionary = nullptr;
395
0
            return false;
396
0
        }
397
0
        memset(pabyNull, 0xFF, (nLength + 7) / 8);
398
0
        psDict->buffers[0] = pabyNull;
399
0
    }
400
401
0
    uint32_t *panOffsets = static_cast<uint32_t *>(
402
0
        VSI_MALLOC_ALIGNED_AUTO_VERBOSE(sizeof(uint32_t) * (1 + nLength)));
403
0
    if (panOffsets == nullptr)
404
0
    {
405
0
        psDict->release(psDict);
406
0
        CPLFree(psDict);
407
0
        psChild->dictionary = nullptr;
408
0
        return false;
409
0
    }
410
0
    psDict->buffers[1] = panOffsets;
411
412
0
    char *pachValues =
413
0
        static_cast<char *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(nCountChars));
414
0
    if (pachValues == nullptr)
415
0
    {
416
0
        psDict->release(psDict);
417
0
        CPLFree(psDict);
418
0
        psChild->dictionary = nullptr;
419
0
        return false;
420
0
    }
421
0
    psDict->buffers[2] = pachValues;
422
423
0
    nLastCode = -1;
424
0
    uint32_t nOffset = 0;
425
0
    for (const OGRCodedValue *psIter = poCodedDomain->GetEnumeration();
426
0
         psIter->pszCode; ++psIter)
427
0
    {
428
0
        if (CPLGetValueType(psIter->pszCode) != CPL_VALUE_INTEGER)
429
0
        {
430
0
            psDict->release(psDict);
431
0
            CPLFree(psDict);
432
0
            psChild->dictionary = nullptr;
433
0
            return false;
434
0
        }
435
0
        int nCode = atoi(psIter->pszCode);
436
0
        if (nCode <= nLastCode || nCode - nLastCode > 100)
437
0
        {
438
0
            psDict->release(psDict);
439
0
            CPLFree(psDict);
440
0
            psChild->dictionary = nullptr;
441
0
            return false;
442
0
        }
443
0
        for (int i = nLastCode + 1; i < nCode; ++i)
444
0
        {
445
0
            panOffsets[i] = nOffset;
446
0
            if (pabyNull)
447
0
                pabyNull[i / 8] &= static_cast<uint8_t>(~(1 << (i % 8)));
448
0
        }
449
0
        panOffsets[nCode] = nOffset;
450
0
        if (psIter->pszValue)
451
0
        {
452
0
            const size_t nLen = strlen(psIter->pszValue);
453
0
            memcpy(pachValues + nOffset, psIter->pszValue, nLen);
454
0
            nOffset += static_cast<uint32_t>(nLen);
455
0
        }
456
0
        else if (pabyNull)
457
0
        {
458
0
            pabyNull[nCode / 8] &= static_cast<uint8_t>(~(1 << (nCode % 8)));
459
0
        }
460
0
        nLastCode = nCode;
461
0
    }
462
0
    panOffsets[nLength] = nOffset;
463
464
0
    return true;
465
0
}
466
467
//! @endcond