Coverage Report

Created: 2025-06-09 08:44

/src/gdal/frmts/zarr/vsikerchunk_parquet_ref.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Project:  GDAL
4
 * Purpose:  Zarr driver. Virtual file system for
5
 *           https://fsspec.github.io/kerchunk/spec.html#parquet-references
6
 * Author:   Even Rouault <even dot rouault at spatialys.com>
7
 *
8
 ******************************************************************************
9
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
10
 *
11
 * SPDX-License-Identifier: MIT
12
 ****************************************************************************/
13
14
#include "vsikerchunk.h"
15
16
#include "cpl_json.h"
17
#include "cpl_mem_cache.h"
18
#include "cpl_vsi_virtual.h"
19
20
#include "gdal_priv.h"
21
#include "ogrsf_frmts.h"
22
23
#include <algorithm>
24
#include <cinttypes>
25
#include <functional>
26
#include <limits>
27
#include <mutex>
28
#include <set>
29
#include <utility>
30
31
extern "C" int CPL_DLL GDALIsInGlobalDestructor();
32
33
/************************************************************************/
34
/*                         VSIZarrArrayInfo                             */
35
/************************************************************************/
36
37
/** Chunk layout of one Zarr array, as derived from its ".zarray" entry. */
struct VSIZarrArrayInfo
{
    //! Number of chunks along each dimension of the array
    //! (one value per dimension, in the order of the 'shape' entry).
    std::vector<uint64_t> anChunkCount{};
};
41
42
/************************************************************************/
43
/*                    VSIKerchunkParquetRefFile                         */
44
/************************************************************************/
45
46
struct VSIKerchunkParquetRefFile
47
{
48
    int m_nRecordSize = 0;
49
    std::map<std::string, std::vector<GByte>> m_oMapKeys{};
50
    std::map<std::string, VSIZarrArrayInfo> m_oMapArrayInfo{};
51
};
52
53
/************************************************************************/
54
/*                    VSIKerchunkParquetRefFileSystem                   */
55
/************************************************************************/
56
57
class VSIKerchunkParquetRefFileSystem final : public VSIFilesystemHandler
58
{
59
  public:
60
    VSIKerchunkParquetRefFileSystem()
61
26
    {
62
26
        IsFileSystemInstantiated() = true;
63
26
    }
64
65
    ~VSIKerchunkParquetRefFileSystem();
66
67
    static bool &IsFileSystemInstantiated()
68
52
    {
69
52
        static bool bIsFileSystemInstantiated = false;
70
52
        return bIsFileSystemInstantiated;
71
52
    }
72
73
    VSIVirtualHandle *Open(const char *pszFilename, const char *pszAccess,
74
                           bool bSetError, CSLConstList papszOptions) override;
75
76
    int Stat(const char *pszFilename, VSIStatBufL *pStatBuf,
77
             int nFlags) override;
78
79
    char **ReadDirEx(const char *pszDirname, int nMaxFiles) override;
80
81
    void CleanCache();
82
83
  private:
84
    lru11::Cache<std::string, std::shared_ptr<VSIKerchunkParquetRefFile>,
85
                 std::mutex>
86
        m_oCache{};
87
88
    std::mutex m_oParquetCacheMutex{};
89
    lru11::Cache<std::string, std::shared_ptr<GDALDataset>> *m_poParquetCache{};
90
91
    static std::pair<std::string, std::string>
92
    SplitFilename(const char *pszFilename);
93
94
    std::shared_ptr<VSIKerchunkParquetRefFile>
95
    Load(const std::string &osRootFilename);
96
97
    struct ChunkInfo
98
    {
99
        std::string osParquetFileDirectory{};
100
        std::unique_ptr<OGRFeature> poFeature{};
101
        int iPathField = -1;
102
        int iOffsetField = -1;
103
        int iSizeField = -1;
104
        int iRawField = -1;
105
    };
106
107
    ChunkInfo
108
    GetChunkInfo(const std::string &osRootFilename,
109
                 const std::shared_ptr<VSIKerchunkParquetRefFile> &refFile,
110
                 const std::string &osKey);
111
112
    CPL_DISALLOW_COPY_ASSIGN(VSIKerchunkParquetRefFileSystem)
113
};
114
115
/************************************************************************/
116
/*               ~VSIKerchunkParquetRefFileSystem()                     */
117
/************************************************************************/
118
119
VSIKerchunkParquetRefFileSystem::~VSIKerchunkParquetRefFileSystem()
{
    // Release the cached Parquet datasets (a no-op during library unloading,
    // see CleanCache()), then record that no instance exists anymore.
    CleanCache();
    IsFileSystemInstantiated() = false;
}
124
125
/************************************************************************/
126
/*            VSIKerchunkParquetRefFileSystem::CleanCache()             */
127
/************************************************************************/
128
129
void VSIKerchunkParquetRefFileSystem::CleanCache()
130
12.5k
{
131
    // If we are in the unloading of the library do not try to close
132
    // datasets to avoid crashes and prefer leaking memory...
133
12.5k
    if (!GDALIsInGlobalDestructor())
134
12.5k
    {
135
12.5k
        std::lock_guard<std::mutex> oLock(m_oParquetCacheMutex);
136
12.5k
        if (m_poParquetCache)
137
0
        {
138
0
            m_poParquetCache->clear();
139
0
            delete m_poParquetCache;
140
0
            m_poParquetCache = nullptr;
141
0
        }
142
12.5k
    }
143
12.5k
}
144
145
/************************************************************************/
146
/*            VSIKerchunkParquetRefFileSystem::SplitFilename()          */
147
/************************************************************************/
148
149
/*static*/
150
std::pair<std::string, std::string>
151
VSIKerchunkParquetRefFileSystem::SplitFilename(const char *pszFilename)
152
1.75k
{
153
1.75k
    if (!STARTS_WITH(pszFilename, PARQUET_REF_FS_PREFIX))
154
0
        return {std::string(), std::string()};
155
156
1.75k
    std::string osRootFilename;
157
158
1.75k
    pszFilename += strlen(PARQUET_REF_FS_PREFIX);
159
160
1.75k
    if (*pszFilename == '{')
161
0
    {
162
        // Parse /vsikerchunk_parquet_ref/{/path/to/some/parquet_root}[key]
163
0
        int nLevel = 1;
164
0
        ++pszFilename;
165
0
        for (; *pszFilename; ++pszFilename)
166
0
        {
167
0
            if (*pszFilename == '{')
168
0
            {
169
0
                ++nLevel;
170
0
            }
171
0
            else if (*pszFilename == '}')
172
0
            {
173
0
                --nLevel;
174
0
                if (nLevel == 0)
175
0
                {
176
0
                    ++pszFilename;
177
0
                    break;
178
0
                }
179
0
            }
180
0
            osRootFilename += *pszFilename;
181
0
        }
182
0
        if (nLevel != 0)
183
0
        {
184
0
            CPLError(CE_Failure, CPLE_AppDefined,
185
0
                     "Invalid %s syntax: should be "
186
0
                     "%s{/path/to/some/file}[/optional_key]",
187
0
                     PARQUET_REF_FS_PREFIX, PARQUET_REF_FS_PREFIX);
188
0
            return {std::string(), std::string()};
189
0
        }
190
191
0
        return {osRootFilename,
192
0
                *pszFilename == '/' ? pszFilename + 1 : pszFilename};
193
0
    }
194
1.75k
    else
195
1.75k
    {
196
1.75k
        CPLError(CE_Failure, CPLE_AppDefined,
197
1.75k
                 "Invalid %s syntax: should be "
198
1.75k
                 "%s{/path/to/root/dir}[/optional_key]",
199
1.75k
                 PARQUET_REF_FS_PREFIX, PARQUET_REF_FS_PREFIX);
200
1.75k
        return {std::string(), std::string()};
201
1.75k
    }
202
1.75k
}
203
204
/************************************************************************/
205
/*              VSIKerchunkParquetRefFileSystem::Load()                 */
206
/************************************************************************/
207
208
std::shared_ptr<VSIKerchunkParquetRefFile>
209
VSIKerchunkParquetRefFileSystem::Load(const std::string &osRootFilename)
210
0
{
211
0
    std::shared_ptr<VSIKerchunkParquetRefFile> refFile;
212
0
    if (m_oCache.tryGet(osRootFilename, refFile))
213
0
        return refFile;
214
215
0
    CPLJSONDocument oDoc;
216
217
0
    const std::string osZMetataFilename =
218
0
        CPLFormFilenameSafe(osRootFilename.c_str(), ".zmetadata", nullptr);
219
0
    if (!oDoc.Load(osZMetataFilename))
220
0
    {
221
0
        CPLError(CE_Failure, CPLE_AppDefined,
222
0
                 "VSIKerchunkParquetRefFileSystem: cannot open %s",
223
0
                 osZMetataFilename.c_str());
224
0
        return nullptr;
225
0
    }
226
227
0
    const auto oRoot = oDoc.GetRoot();
228
0
    const auto oRecordSize = oRoot.GetObj("record_size");
229
0
    if (!oRecordSize.IsValid() ||
230
0
        oRecordSize.GetType() != CPLJSONObject::Type::Integer)
231
0
    {
232
0
        CPLError(CE_Failure, CPLE_AppDefined,
233
0
                 "VSIKerchunkParquetRefFileSystem: key 'record_size' missing "
234
0
                 "or not of type integer");
235
0
        return nullptr;
236
0
    }
237
238
0
    const auto oMetadata = oRoot.GetObj("metadata");
239
0
    if (!oMetadata.IsValid() ||
240
0
        oMetadata.GetType() != CPLJSONObject::Type::Object)
241
0
    {
242
0
        CPLError(CE_Failure, CPLE_AppDefined,
243
0
                 "VSIKerchunkParquetRefFileSystem: key 'metadata' missing "
244
0
                 "or not of type dict");
245
0
        return nullptr;
246
0
    }
247
248
0
    refFile = std::make_shared<VSIKerchunkParquetRefFile>();
249
0
    refFile->m_nRecordSize = oRecordSize.ToInteger();
250
0
    if (refFile->m_nRecordSize < 1)
251
0
    {
252
0
        CPLError(CE_Failure, CPLE_AppDefined,
253
0
                 "VSIKerchunkParquetRefFileSystem: Invalid 'record_size'");
254
0
        return nullptr;
255
0
    }
256
257
0
    for (const auto &oEntry : oMetadata.GetChildren())
258
0
    {
259
0
        const std::string osKeyName = oEntry.GetName();
260
0
        if (oEntry.GetType() == CPLJSONObject::Type::Object)
261
0
        {
262
0
            const std::string osSerialized =
263
0
                oEntry.Format(CPLJSONObject::PrettyFormat::Plain);
264
0
            std::vector<GByte> abyValue;
265
0
            abyValue.insert(
266
0
                abyValue.end(),
267
0
                reinterpret_cast<const GByte *>(osSerialized.data()),
268
0
                reinterpret_cast<const GByte *>(osSerialized.data()) +
269
0
                    osSerialized.size());
270
271
0
            refFile->m_oMapKeys[osKeyName] = std::move(abyValue);
272
273
0
            if (cpl::ends_with(osKeyName, "/.zarray"))
274
0
            {
275
0
                const auto oShape = oEntry.GetArray("shape");
276
0
                const auto oChunks = oEntry.GetArray("chunks");
277
0
                if (!oShape.IsValid())
278
0
                {
279
0
                    CPLError(CE_Failure, CPLE_AppDefined,
280
0
                             "VSIKerchunkParquetRefFileSystem: "
281
0
                             "missing 'shape' entry for key '%s'",
282
0
                             osKeyName.c_str());
283
0
                    return nullptr;
284
0
                }
285
0
                else if (!oChunks.IsValid())
286
0
                {
287
0
                    CPLError(CE_Failure, CPLE_AppDefined,
288
0
                             "VSIKerchunkParquetRefFileSystem: "
289
0
                             "missing 'chunks' entry for key '%s'",
290
0
                             osKeyName.c_str());
291
0
                    return nullptr;
292
0
                }
293
0
                else if (oShape.Size() != oChunks.Size())
294
0
                {
295
0
                    CPLError(CE_Failure, CPLE_AppDefined,
296
0
                             "VSIKerchunkParquetRefFileSystem: "
297
0
                             "'shape' and 'chunks' entries have not the same "
298
0
                             "number of values for key '%s'",
299
0
                             osKeyName.c_str());
300
0
                    return nullptr;
301
0
                }
302
0
                else if (oShape.Size() > 32)
303
0
                {
304
0
                    CPLError(CE_Failure, CPLE_AppDefined,
305
0
                             "VSIKerchunkParquetRefFileSystem: "
306
0
                             "'shape' has too many dimensions for key '%s'",
307
0
                             osKeyName.c_str());
308
0
                    return nullptr;
309
0
                }
310
0
                else
311
0
                {
312
0
                    VSIZarrArrayInfo arrayInfo;
313
0
                    uint64_t nTotalChunks = 1;
314
0
                    for (int i = 0; i < oShape.Size(); ++i)
315
0
                    {
316
0
                        const uint64_t nSize = oShape[i].ToLong();
317
0
                        const uint64_t nChunkSize = oChunks[i].ToLong();
318
0
                        if (nSize == 0)
319
0
                        {
320
0
                            CPLError(CE_Failure, CPLE_AppDefined,
321
0
                                     "VSIKerchunkParquetRefFileSystem: "
322
0
                                     "shape[%d]=0 in "
323
0
                                     "array definition for key '%s'",
324
0
                                     i, osKeyName.c_str());
325
0
                            return nullptr;
326
0
                        }
327
0
                        else if (nChunkSize == 0)
328
0
                        {
329
0
                            CPLError(CE_Failure, CPLE_AppDefined,
330
0
                                     "VSIKerchunkParquetRefFileSystem: "
331
0
                                     "chunks[%d]=0 in "
332
0
                                     "array definition for key '%s'",
333
0
                                     i, osKeyName.c_str());
334
0
                            return nullptr;
335
0
                        }
336
0
                        const auto nChunkCount =
337
0
                            DIV_ROUND_UP(nSize, nChunkSize);
338
0
                        if (nChunkCount >
339
0
                            std::numeric_limits<uint64_t>::max() / nTotalChunks)
340
0
                        {
341
0
                            CPLError(
342
0
                                CE_Failure, CPLE_AppDefined,
343
0
                                "VSIKerchunkParquetRefFileSystem: "
344
0
                                "product(shape[]) > UINT64_MAX for key '%s'",
345
0
                                osKeyName.c_str());
346
0
                            return nullptr;
347
0
                        }
348
0
                        nTotalChunks *= nChunkCount;
349
0
                        arrayInfo.anChunkCount.push_back(nChunkCount);
350
0
                    }
351
0
                    const std::string osArrayDir = osKeyName.substr(
352
0
                        0, osKeyName.size() - strlen("/.zarray"));
353
0
                    refFile->m_oMapArrayInfo[osArrayDir] = std::move(arrayInfo);
354
0
                }
355
0
            }
356
0
        }
357
0
        else
358
0
        {
359
0
            CPLError(CE_Failure, CPLE_AppDefined,
360
0
                     "VSIKerchunkParquetRefFileSystem: invalid value type for "
361
0
                     "key '%s'",
362
0
                     osKeyName.c_str());
363
0
            return nullptr;
364
0
        }
365
0
    }
366
367
0
    m_oCache.insert(osRootFilename, refFile);
368
0
    return refFile;
369
0
}
370
371
/************************************************************************/
372
/*           VSIKerchunkParquetRefFileSystem::GetChunkInfo()            */
373
/************************************************************************/
374
375
/** Looks up the reference of a chunk in the Parquet files of the store.
 *
 * osKey is expected to be of the form "array/path/i.j.k". The chunk indices
 * are converted to a linear index (last dimension varying fastest), which
 * selects the file refs.(index / record_size).parq in the array directory
 * and the feature of id (index % record_size) inside it.
 *
 * @param osRootFilename Root directory of the reference store.
 * @param refFile Parsed .zmetadata of the store.
 * @param osKey Key of the chunk.
 * @return a structure whose poFeature member is non-null when a reference
 *         has been found.
 */
VSIKerchunkParquetRefFileSystem::ChunkInfo
VSIKerchunkParquetRefFileSystem::GetChunkInfo(
    const std::string &osRootFilename,
    const std::shared_ptr<VSIKerchunkParquetRefFile> &refFile,
    const std::string &osKey)
{
    ChunkInfo info;

    const std::string osArrayPath = CPLGetPathSafe(osKey.c_str());
    const std::string osIndices = CPLGetFilename(osKey.c_str());
    const auto oIterArray = refFile->m_oMapArrayInfo.find(osArrayPath);

    // The key must designate a known array, and its last component must
    // start with a digit.
    const bool bStartsWithDigit =
        !osIndices.empty() && osIndices[0] >= '0' && osIndices[0] <= '9';
    if (oIterArray == refFile->m_oMapArrayInfo.end() || !bStartsWithDigit)
        return info;

    const auto &anChunkCount = oIterArray->second.anChunkCount;
    const CPLStringList aosIndices(
        CSLTokenizeString2(osIndices.c_str(), ".", 0));

    // Either there is one index per dimension, or the array has no
    // dimension and its single chunk is named "0".
    const bool bOneIndexPerDim =
        static_cast<size_t>(aosIndices.size()) == anChunkCount.size();
    const bool bZeroDimChunk = anChunkCount.empty() &&
                               aosIndices.size() == 1 &&
                               strcmp(aosIndices[0], "0") == 0;
    if (!bOneIndexPerDim && !bZeroDimChunk)
        return info;

    // Parse and validate each index.
    std::vector<uint64_t> anIndices;
    for (size_t iDim = 0; iDim < anChunkCount.size(); ++iDim)
    {
        const char *pszToken = aosIndices[iDim];
        char *pszEnd = nullptr;
        const uint64_t nIndex = std::strtoull(pszToken, &pszEnd, 10);
        anIndices.push_back(nIndex);
        const bool bFullyParsed = pszEnd == pszToken + strlen(pszToken);
        if (pszToken[0] == '-' || !bFullyParsed ||
            nIndex >= anChunkCount[iDim])
        {
            return info;
        }
    }

    // Linear index, with the last dimension varying the fastest.
    uint64_t nLinearIndex = 0;
    uint64_t nStride = 1;
    for (size_t iDim = anIndices.size(); iDim-- > 0;)
    {
        nLinearIndex += anIndices[iDim] * nStride;
        nStride *= anChunkCount[iDim];
    }

    CPLDebugOnly("VSIKerchunkParquetRefFileSystem",
                 "Linear chunk index %" PRIu64, nLinearIndex);

    const uint64_t nParquetIdx = nLinearIndex / refFile->m_nRecordSize;
    const int nIdxInParquet =
        static_cast<int>(nLinearIndex % refFile->m_nRecordSize);

    const std::string osArrayDirectory = CPLFormFilenameSafe(
        osRootFilename.c_str(), osArrayPath.c_str(), nullptr);
    const std::string osParquetFilename = CPLFormFilenameSafe(
        osArrayDirectory.c_str(),
        CPLSPrintf("refs.%" PRIu64 ".parq", nParquetIdx), nullptr);
    CPLDebugOnly("VSIKerchunkParquetRefFileSystem",
                 "Looking for entry %d in Parquet file %s", nIdxInParquet,
                 osParquetFilename.c_str());

    // The lock is held until the end of the function, i.e. also while the
    // dataset is being read.
    std::lock_guard<std::mutex> oLock(m_oParquetCacheMutex);
    std::shared_ptr<GDALDataset> poDS;
    if (m_poParquetCache == nullptr)
    {
        m_poParquetCache =
            std::make_unique<
                lru11::Cache<std::string, std::shared_ptr<GDALDataset>>>()
                .release();
    }
    if (!m_poParquetCache->tryGet(osParquetFilename, poDS))
    {
        const char *const apszAllowedDrivers[] = {"PARQUET", "ADBC", nullptr};
        CPLConfigOptionSetter oSetter("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL",
                                      "NO", false);
        poDS.reset(GDALDataset::Open(osParquetFilename.c_str(),
                                     GDAL_OF_VECTOR | GDAL_OF_VERBOSE_ERROR,
                                     apszAllowedDrivers, nullptr, nullptr));
        if (poDS)
            m_poParquetCache->insert(osParquetFilename, poDS);
    }

    if (!poDS || poDS->GetLayerCount() != 1)
        return info;

    auto poLayer = poDS->GetLayer(0);
    const auto poDefn = poLayer->GetLayerDefn();
    info.iPathField = poDefn->GetFieldIndex("path");
    info.iOffsetField = poDefn->GetFieldIndex("offset");
    info.iSizeField = poDefn->GetFieldIndex("size");
    info.iRawField = poDefn->GetFieldIndex("raw");

    const auto IsIntOrInt64 = [](OGRFieldType eType)
    { return eType == OFTInteger || eType == OFTInteger64; };

    const bool bAllFieldsFound = info.iPathField >= 0 &&
                                 info.iOffsetField >= 0 &&
                                 info.iSizeField >= 0 && info.iRawField >= 0;
    const bool bExpectedStructure =
        bAllFieldsFound &&
        poDefn->GetFieldDefn(info.iPathField)->GetType() == OFTString &&
        IsIntOrInt64(poDefn->GetFieldDefn(info.iOffsetField)->GetType()) &&
        IsIntOrInt64(poDefn->GetFieldDefn(info.iSizeField)->GetType()) &&
        poDefn->GetFieldDefn(info.iRawField)->GetType() == OFTBinary;
    if (!bExpectedStructure)
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "%s has an unexpected field structure",
                 osParquetFilename.c_str());
        return info;
    }

    info.osParquetFileDirectory = CPLGetPathSafe(osParquetFilename.c_str());
    info.poFeature.reset(poLayer->GetFeature(nIdxInParquet));
    return info;
}
493
494
/************************************************************************/
495
/*               VSIKerchunkParquetRefFileSystem::Open()                */
496
/************************************************************************/
497
498
/** Opens a key of a reference store for reading.
 *
 * Three cases are handled:
 * - the key is an entry of the 'metadata' dictionary of .zmetadata: an
 *   in-memory file over its serialized JSON value is returned;
 * - the key is a chunk whose reference has a 'raw' value: an in-memory file
 *   taking ownership of that value is returned;
 * - the key is a chunk whose reference has a 'path' (and optionally 'offset'
 *   and 'size'): the designated file, or the designated byte range of it
 *   through /vsisubfile/, is opened.
 *
 * @param pszFilename File name, in the syntax accepted by SplitFilename().
 * @param pszAccess Access mode. Only "r" and "rb" are supported.
 * @return a file handle, or nullptr on error or if the key does not exist.
 */
VSIVirtualHandle *VSIKerchunkParquetRefFileSystem::Open(
    const char *pszFilename, const char *pszAccess, bool /* bSetError */,
    CSLConstList /* papszOptions */)
{
    CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Open(%s)", pszFilename);
    if (strcmp(pszAccess, "r") != 0 && strcmp(pszAccess, "rb") != 0)
        return nullptr;

    const auto [osRootFilename, osKey] = SplitFilename(pszFilename);
    if (osRootFilename.empty())
        return nullptr;

    const auto refFile = Load(osRootFilename);
    if (!refFile)
        return nullptr;

    const auto oIter = refFile->m_oMapKeys.find(osKey);
    if (oIter != refFile->m_oMapKeys.end())
    {
        // Metadata entry: the returned file points to the buffer owned by
        // refFile.
        // NOTE(review): the buffer is only kept alive by the entry of
        // m_oCache; confirm the entry cannot be evicted while the handle
        // is still in use.
        const auto &abyValue = oIter->second;
        return VSIFileFromMemBuffer(nullptr,
                                    const_cast<GByte *>(abyValue.data()),
                                    abyValue.size(),
                                    /* bTakeOwnership = */ false);
    }

    const auto info = GetChunkInfo(osRootFilename, refFile, osKey);
    if (!info.poFeature)
        return nullptr;

    if (info.poFeature->IsFieldSetAndNotNull(info.iRawField))
    {
        auto psField = info.poFeature->GetRawFieldRef(info.iRawField);
        // Borrow binary data to feature
        GByte *abyData = psField->Binary.paData;
        int nSize = psField->Binary.nCount;
        psField->Binary.paData = nullptr;
        psField->Binary.nCount = 0;
        // and transmit its ownership to the VSIMem file
        return VSIFileFromMemBuffer(nullptr, abyData, nSize,
                                    /* bTakeOwnership = */ true);
    }

    // Read both values on 64 bits: the fields may be of type Integer64, and
    // Stat() reports the size from GetFieldAsInteger64() as well.
    const int64_t nOffset = static_cast<int64_t>(
        info.poFeature->GetFieldAsInteger64(info.iOffsetField));
    const int64_t nSize = static_cast<int64_t>(
        info.poFeature->GetFieldAsInteger64(info.iSizeField));
    if (nOffset < 0 || nSize < 0)
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "VSIKerchunkParquetRefFileSystem: invalid negative offset "
                 "or size for key '%s'",
                 osKey.c_str());
        return nullptr;
    }

    std::string osVSIPath = VSIKerchunkMorphURIToVSIPath(
        info.poFeature->GetFieldAsString(info.iPathField),
        info.osParquetFileDirectory);
    if (osVSIPath.empty())
        return nullptr;

    // A null size means that the whole file is referenced.
    std::string osPath;
    if (nSize > 0)
    {
        osPath = CPLSPrintf("/vsisubfile/%" PRIu64 "_%" PRIu64 ",%s",
                            static_cast<uint64_t>(nOffset),
                            static_cast<uint64_t>(nSize), osVSIPath.c_str());
    }
    else
    {
        osPath = std::move(osVSIPath);
    }
    CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Opening %s",
                 osPath.c_str());
    CPLConfigOptionSetter oSetter("GDAL_DISABLE_READDIR_ON_OPEN", "EMPTY_DIR",
                                  false);
    return VSIFOpenL(osPath.c_str(), "rb");
}
564
565
/************************************************************************/
566
/*               VSIKerchunkParquetRefFileSystem::Stat()                */
567
/************************************************************************/
568
569
int VSIKerchunkParquetRefFileSystem::Stat(const char *pszFilename,
570
                                          VSIStatBufL *pStatBuf, int nFlags)
571
513
{
572
513
    CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Stat(%s)", pszFilename);
573
513
    memset(pStatBuf, 0, sizeof(VSIStatBufL));
574
575
513
    const auto [osRootFilename, osKey] = SplitFilename(pszFilename);
576
513
    if (osRootFilename.empty())
577
513
        return -1;
578
579
0
    const auto refFile = Load(osRootFilename);
580
0
    if (!refFile)
581
0
        return -1;
582
583
0
    if (osKey.empty())
584
0
    {
585
0
        pStatBuf->st_mode = S_IFDIR;
586
0
        return 0;
587
0
    }
588
589
0
    const auto oIter = refFile->m_oMapKeys.find(osKey);
590
0
    if (oIter == refFile->m_oMapKeys.end())
591
0
    {
592
0
        const auto info = GetChunkInfo(osRootFilename, refFile, osKey);
593
0
        if (info.poFeature)
594
0
        {
595
0
            if (info.poFeature->IsFieldSetAndNotNull(info.iRawField))
596
0
            {
597
0
                int nSize = 0;
598
0
                info.poFeature->GetFieldAsBinary(info.iRawField, &nSize);
599
0
                pStatBuf->st_size = nSize;
600
0
            }
601
0
            else
602
0
            {
603
0
                pStatBuf->st_size =
604
0
                    info.poFeature->GetFieldAsInteger64(info.iSizeField);
605
0
                if (pStatBuf->st_size == 0)
606
0
                {
607
0
                    const std::string osVSIPath = VSIKerchunkMorphURIToVSIPath(
608
0
                        info.poFeature->GetFieldAsString(info.iPathField),
609
0
                        info.osParquetFileDirectory);
610
0
                    if (osVSIPath.empty())
611
0
                        return -1;
612
0
                    return VSIStatExL(osVSIPath.c_str(), pStatBuf, nFlags);
613
0
                }
614
0
            }
615
0
            pStatBuf->st_mode = S_IFREG;
616
0
            return 0;
617
0
        }
618
619
0
        if (cpl::contains(refFile->m_oMapKeys, osKey + "/.zgroup") ||
620
0
            cpl::contains(refFile->m_oMapKeys, osKey + "/.zarray"))
621
0
        {
622
0
            pStatBuf->st_mode = S_IFDIR;
623
0
            return 0;
624
0
        }
625
626
0
        return -1;
627
0
    }
628
629
0
    const auto &abyValue = oIter->second;
630
0
    pStatBuf->st_size = abyValue.size();
631
0
    pStatBuf->st_mode = S_IFREG;
632
633
0
    return 0;
634
0
}
635
636
/************************************************************************/
637
/*             VSIKerchunkParquetRefFileSystem::ReadDirEx()             */
638
/************************************************************************/
639
640
char **VSIKerchunkParquetRefFileSystem::ReadDirEx(const char *pszDirname,
641
                                                  int nMaxFiles)
642
0
{
643
0
    CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "ReadDir(%s)", pszDirname);
644
645
0
    const auto [osRootFilename, osAskedKey] = SplitFilename(pszDirname);
646
0
    if (osRootFilename.empty())
647
0
        return nullptr;
648
649
0
    const auto refFile = Load(osRootFilename);
650
0
    if (!refFile)
651
0
        return nullptr;
652
653
0
    std::set<std::string> set;
654
0
    for (const auto &[key, value] : refFile->m_oMapKeys)
655
0
    {
656
0
        if (osAskedKey.empty())
657
0
        {
658
0
            const auto nPos = key.find('/');
659
0
            if (nPos == std::string::npos)
660
0
                set.insert(key);
661
0
            else
662
0
                set.insert(key.substr(0, nPos));
663
0
        }
664
0
        else if (key.size() > osAskedKey.size() &&
665
0
                 cpl::starts_with(key, osAskedKey) &&
666
0
                 key[osAskedKey.size()] == '/')
667
0
        {
668
0
            std::string subKey = key.substr(osAskedKey.size() + 1);
669
0
            const auto nPos = subKey.find('/');
670
0
            if (nPos == std::string::npos)
671
0
                set.insert(std::move(subKey));
672
0
            else
673
0
                set.insert(subKey.substr(0, nPos));
674
0
        }
675
0
    }
676
677
0
    CPLStringList aosRet;
678
0
    for (const std::string &v : set)
679
0
    {
680
        // CPLDebugOnly("VSIKerchunkParquetRefFileSystem", ".. %s", v.c_str());
681
0
        aosRet.AddString(v.c_str());
682
0
    }
683
684
    // Synthetize file names for x.y.z chunks
685
0
    const auto oIterArray = refFile->m_oMapArrayInfo.find(osAskedKey);
686
0
    if (oIterArray != refFile->m_oMapArrayInfo.end())
687
0
    {
688
0
        const auto &oArrayInfo = oIterArray->second;
689
0
        if (oArrayInfo.anChunkCount.empty())
690
0
        {
691
0
            aosRet.AddString("0");
692
0
        }
693
0
        else
694
0
        {
695
0
            std::string osCurElt;
696
0
            std::function<bool(size_t)> Enumerate;
697
0
            if (nMaxFiles <= 0)
698
0
                nMaxFiles = 100 * 1024 * 1024;
699
700
0
            Enumerate = [nMaxFiles, &aosRet, &oArrayInfo, &osCurElt,
701
0
                         &Enumerate](size_t iDim)
702
0
            {
703
0
                const size_t sizeBefore = osCurElt.size();
704
0
                for (uint64_t i = 0; i < oArrayInfo.anChunkCount[iDim]; ++i)
705
0
                {
706
0
                    osCurElt += CPLSPrintf("%" PRIu64, i);
707
0
                    if (iDim + 1 < oArrayInfo.anChunkCount.size())
708
0
                    {
709
0
                        osCurElt += '.';
710
0
                        if (!Enumerate(iDim + 1))
711
0
                            return false;
712
0
                    }
713
0
                    else
714
0
                    {
715
0
                        if (aosRet.size() >= nMaxFiles)
716
0
                            return false;
717
0
                        aosRet.AddString(osCurElt);
718
0
                    }
719
0
                    osCurElt.resize(sizeBefore);
720
0
                }
721
0
                return true;
722
0
            };
723
724
0
            Enumerate(0);
725
0
        }
726
0
    }
727
728
0
    return aosRet.StealList();
729
0
}
730
731
/************************************************************************/
732
/*               VSIInstallKerchunkParquetRefFileSystem()               */
733
/************************************************************************/
734
735
void VSIInstallKerchunkParquetRefFileSystem()
736
26
{
737
26
    static std::mutex oMutex;
738
26
    std::lock_guard<std::mutex> oLock(oMutex);
739
    // cppcheck-suppress knownConditionTrueFalse
740
26
    if (!VSIKerchunkParquetRefFileSystem::IsFileSystemInstantiated())
741
26
    {
742
26
        VSIFileManager::InstallHandler(
743
26
            PARQUET_REF_FS_PREFIX,
744
26
            std::make_unique<VSIKerchunkParquetRefFileSystem>().release());
745
26
    }
746
26
}
747
748
/************************************************************************/
749
/*                VSIKerchunkParquetRefFileSystemCleanCache()           */
750
/************************************************************************/
751
752
void VSIKerchunkParquetRefFileSystemCleanCache()
753
12.5k
{
754
12.5k
    auto poFS = dynamic_cast<VSIKerchunkParquetRefFileSystem *>(
755
12.5k
        VSIFileManager::GetHandler(PARQUET_REF_FS_PREFIX));
756
12.5k
    if (poFS)
757
12.5k
        poFS->CleanCache();
758
12.5k
}