/src/gdal/frmts/zarr/vsikerchunk_parquet_ref.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: GDAL |
4 | | * Purpose: Zarr driver. Virtual file system for |
5 | | * https://fsspec.github.io/kerchunk/spec.html#parquet-references |
6 | | * Author: Even Rouault <even dot rouault at spatialys.com> |
7 | | * |
8 | | ****************************************************************************** |
9 | | * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com> |
10 | | * |
11 | | * SPDX-License-Identifier: MIT |
12 | | ****************************************************************************/ |
13 | | |
14 | | #include "vsikerchunk.h" |
15 | | |
16 | | #include "cpl_json.h" |
17 | | #include "cpl_mem_cache.h" |
18 | | #include "cpl_vsi_virtual.h" |
19 | | |
20 | | #include "gdal_priv.h" |
21 | | #include "ogrsf_frmts.h" |
22 | | |
23 | | #include <algorithm> |
24 | | #include <cinttypes> |
25 | | #include <functional> |
26 | | #include <limits> |
27 | | #include <mutex> |
28 | | #include <set> |
29 | | #include <utility> |
30 | | |
31 | | extern "C" int CPL_DLL GDALIsInGlobalDestructor(); |
32 | | |
33 | | /************************************************************************/ |
34 | | /* VSIZarrArrayInfo */ |
35 | | /************************************************************************/ |
36 | | |
/** Chunk-grid description of one Zarr array found in the metadata. */
struct VSIZarrArrayInfo
{
    /** Number of chunks along each dimension of the array, in dimension
     * order. Left empty when the array has no dimension. */
    std::vector<uint64_t> anChunkCount{};
};
41 | | |
42 | | /************************************************************************/ |
43 | | /* VSIKerchunkParquetRefFile */ |
44 | | /************************************************************************/ |
45 | | |
46 | | struct VSIKerchunkParquetRefFile |
47 | | { |
48 | | int m_nRecordSize = 0; |
49 | | std::map<std::string, std::vector<GByte>> m_oMapKeys{}; |
50 | | std::map<std::string, VSIZarrArrayInfo> m_oMapArrayInfo{}; |
51 | | }; |
52 | | |
53 | | /************************************************************************/ |
54 | | /* VSIKerchunkParquetRefFileSystem */ |
55 | | /************************************************************************/ |
56 | | |
57 | | class VSIKerchunkParquetRefFileSystem final : public VSIFilesystemHandler |
58 | | { |
59 | | public: |
60 | | VSIKerchunkParquetRefFileSystem() |
61 | 26 | { |
62 | 26 | IsFileSystemInstantiated() = true; |
63 | 26 | } |
64 | | |
65 | | ~VSIKerchunkParquetRefFileSystem(); |
66 | | |
67 | | static bool &IsFileSystemInstantiated() |
68 | 52 | { |
69 | 52 | static bool bIsFileSystemInstantiated = false; |
70 | 52 | return bIsFileSystemInstantiated; |
71 | 52 | } |
72 | | |
73 | | VSIVirtualHandle *Open(const char *pszFilename, const char *pszAccess, |
74 | | bool bSetError, CSLConstList papszOptions) override; |
75 | | |
76 | | int Stat(const char *pszFilename, VSIStatBufL *pStatBuf, |
77 | | int nFlags) override; |
78 | | |
79 | | char **ReadDirEx(const char *pszDirname, int nMaxFiles) override; |
80 | | |
81 | | void CleanCache(); |
82 | | |
83 | | private: |
84 | | lru11::Cache<std::string, std::shared_ptr<VSIKerchunkParquetRefFile>, |
85 | | std::mutex> |
86 | | m_oCache{}; |
87 | | |
88 | | std::mutex m_oParquetCacheMutex{}; |
89 | | lru11::Cache<std::string, std::shared_ptr<GDALDataset>> *m_poParquetCache{}; |
90 | | |
91 | | static std::pair<std::string, std::string> |
92 | | SplitFilename(const char *pszFilename); |
93 | | |
94 | | std::shared_ptr<VSIKerchunkParquetRefFile> |
95 | | Load(const std::string &osRootFilename); |
96 | | |
97 | | struct ChunkInfo |
98 | | { |
99 | | std::string osParquetFileDirectory{}; |
100 | | std::unique_ptr<OGRFeature> poFeature{}; |
101 | | int iPathField = -1; |
102 | | int iOffsetField = -1; |
103 | | int iSizeField = -1; |
104 | | int iRawField = -1; |
105 | | }; |
106 | | |
107 | | ChunkInfo |
108 | | GetChunkInfo(const std::string &osRootFilename, |
109 | | const std::shared_ptr<VSIKerchunkParquetRefFile> &refFile, |
110 | | const std::string &osKey); |
111 | | |
112 | | CPL_DISALLOW_COPY_ASSIGN(VSIKerchunkParquetRefFileSystem) |
113 | | }; |
114 | | |
115 | | /************************************************************************/ |
116 | | /* ~VSIKerchunkParquetRefFileSystem() */ |
117 | | /************************************************************************/ |
118 | | |
119 | | VSIKerchunkParquetRefFileSystem::~VSIKerchunkParquetRefFileSystem() |
120 | 0 | { |
121 | 0 | CleanCache(); |
122 | 0 | IsFileSystemInstantiated() = false; |
123 | 0 | } |
124 | | |
125 | | /************************************************************************/ |
126 | | /* VSIKerchunkParquetRefFileSystem::CleanCache() */ |
127 | | /************************************************************************/ |
128 | | |
129 | | void VSIKerchunkParquetRefFileSystem::CleanCache() |
130 | 12.5k | { |
131 | | // If we are in the unloading of the library do not try to close |
132 | | // datasets to avoid crashes and prefer leaking memory... |
133 | 12.5k | if (!GDALIsInGlobalDestructor()) |
134 | 12.5k | { |
135 | 12.5k | std::lock_guard<std::mutex> oLock(m_oParquetCacheMutex); |
136 | 12.5k | if (m_poParquetCache) |
137 | 0 | { |
138 | 0 | m_poParquetCache->clear(); |
139 | 0 | delete m_poParquetCache; |
140 | 0 | m_poParquetCache = nullptr; |
141 | 0 | } |
142 | 12.5k | } |
143 | 12.5k | } |
144 | | |
145 | | /************************************************************************/ |
146 | | /* VSIKerchunkParquetRefFileSystem::SplitFilename() */ |
147 | | /************************************************************************/ |
148 | | |
149 | | /*static*/ |
150 | | std::pair<std::string, std::string> |
151 | | VSIKerchunkParquetRefFileSystem::SplitFilename(const char *pszFilename) |
152 | 1.75k | { |
153 | 1.75k | if (!STARTS_WITH(pszFilename, PARQUET_REF_FS_PREFIX)) |
154 | 0 | return {std::string(), std::string()}; |
155 | | |
156 | 1.75k | std::string osRootFilename; |
157 | | |
158 | 1.75k | pszFilename += strlen(PARQUET_REF_FS_PREFIX); |
159 | | |
160 | 1.75k | if (*pszFilename == '{') |
161 | 0 | { |
162 | | // Parse /vsikerchunk_parquet_ref/{/path/to/some/parquet_root}[key] |
163 | 0 | int nLevel = 1; |
164 | 0 | ++pszFilename; |
165 | 0 | for (; *pszFilename; ++pszFilename) |
166 | 0 | { |
167 | 0 | if (*pszFilename == '{') |
168 | 0 | { |
169 | 0 | ++nLevel; |
170 | 0 | } |
171 | 0 | else if (*pszFilename == '}') |
172 | 0 | { |
173 | 0 | --nLevel; |
174 | 0 | if (nLevel == 0) |
175 | 0 | { |
176 | 0 | ++pszFilename; |
177 | 0 | break; |
178 | 0 | } |
179 | 0 | } |
180 | 0 | osRootFilename += *pszFilename; |
181 | 0 | } |
182 | 0 | if (nLevel != 0) |
183 | 0 | { |
184 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
185 | 0 | "Invalid %s syntax: should be " |
186 | 0 | "%s{/path/to/some/file}[/optional_key]", |
187 | 0 | PARQUET_REF_FS_PREFIX, PARQUET_REF_FS_PREFIX); |
188 | 0 | return {std::string(), std::string()}; |
189 | 0 | } |
190 | | |
191 | 0 | return {osRootFilename, |
192 | 0 | *pszFilename == '/' ? pszFilename + 1 : pszFilename}; |
193 | 0 | } |
194 | 1.75k | else |
195 | 1.75k | { |
196 | 1.75k | CPLError(CE_Failure, CPLE_AppDefined, |
197 | 1.75k | "Invalid %s syntax: should be " |
198 | 1.75k | "%s{/path/to/root/dir}[/optional_key]", |
199 | 1.75k | PARQUET_REF_FS_PREFIX, PARQUET_REF_FS_PREFIX); |
200 | 1.75k | return {std::string(), std::string()}; |
201 | 1.75k | } |
202 | 1.75k | } |
203 | | |
204 | | /************************************************************************/ |
205 | | /* VSIKerchunkParquetRefFileSystem::Load() */ |
206 | | /************************************************************************/ |
207 | | |
208 | | std::shared_ptr<VSIKerchunkParquetRefFile> |
209 | | VSIKerchunkParquetRefFileSystem::Load(const std::string &osRootFilename) |
210 | 0 | { |
211 | 0 | std::shared_ptr<VSIKerchunkParquetRefFile> refFile; |
212 | 0 | if (m_oCache.tryGet(osRootFilename, refFile)) |
213 | 0 | return refFile; |
214 | | |
215 | 0 | CPLJSONDocument oDoc; |
216 | |
|
217 | 0 | const std::string osZMetataFilename = |
218 | 0 | CPLFormFilenameSafe(osRootFilename.c_str(), ".zmetadata", nullptr); |
219 | 0 | if (!oDoc.Load(osZMetataFilename)) |
220 | 0 | { |
221 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
222 | 0 | "VSIKerchunkParquetRefFileSystem: cannot open %s", |
223 | 0 | osZMetataFilename.c_str()); |
224 | 0 | return nullptr; |
225 | 0 | } |
226 | | |
227 | 0 | const auto oRoot = oDoc.GetRoot(); |
228 | 0 | const auto oRecordSize = oRoot.GetObj("record_size"); |
229 | 0 | if (!oRecordSize.IsValid() || |
230 | 0 | oRecordSize.GetType() != CPLJSONObject::Type::Integer) |
231 | 0 | { |
232 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
233 | 0 | "VSIKerchunkParquetRefFileSystem: key 'record_size' missing " |
234 | 0 | "or not of type integer"); |
235 | 0 | return nullptr; |
236 | 0 | } |
237 | | |
238 | 0 | const auto oMetadata = oRoot.GetObj("metadata"); |
239 | 0 | if (!oMetadata.IsValid() || |
240 | 0 | oMetadata.GetType() != CPLJSONObject::Type::Object) |
241 | 0 | { |
242 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
243 | 0 | "VSIKerchunkParquetRefFileSystem: key 'metadata' missing " |
244 | 0 | "or not of type dict"); |
245 | 0 | return nullptr; |
246 | 0 | } |
247 | | |
248 | 0 | refFile = std::make_shared<VSIKerchunkParquetRefFile>(); |
249 | 0 | refFile->m_nRecordSize = oRecordSize.ToInteger(); |
250 | 0 | if (refFile->m_nRecordSize < 1) |
251 | 0 | { |
252 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
253 | 0 | "VSIKerchunkParquetRefFileSystem: Invalid 'record_size'"); |
254 | 0 | return nullptr; |
255 | 0 | } |
256 | | |
257 | 0 | for (const auto &oEntry : oMetadata.GetChildren()) |
258 | 0 | { |
259 | 0 | const std::string osKeyName = oEntry.GetName(); |
260 | 0 | if (oEntry.GetType() == CPLJSONObject::Type::Object) |
261 | 0 | { |
262 | 0 | const std::string osSerialized = |
263 | 0 | oEntry.Format(CPLJSONObject::PrettyFormat::Plain); |
264 | 0 | std::vector<GByte> abyValue; |
265 | 0 | abyValue.insert( |
266 | 0 | abyValue.end(), |
267 | 0 | reinterpret_cast<const GByte *>(osSerialized.data()), |
268 | 0 | reinterpret_cast<const GByte *>(osSerialized.data()) + |
269 | 0 | osSerialized.size()); |
270 | |
|
271 | 0 | refFile->m_oMapKeys[osKeyName] = std::move(abyValue); |
272 | |
|
273 | 0 | if (cpl::ends_with(osKeyName, "/.zarray")) |
274 | 0 | { |
275 | 0 | const auto oShape = oEntry.GetArray("shape"); |
276 | 0 | const auto oChunks = oEntry.GetArray("chunks"); |
277 | 0 | if (!oShape.IsValid()) |
278 | 0 | { |
279 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
280 | 0 | "VSIKerchunkParquetRefFileSystem: " |
281 | 0 | "missing 'shape' entry for key '%s'", |
282 | 0 | osKeyName.c_str()); |
283 | 0 | return nullptr; |
284 | 0 | } |
285 | 0 | else if (!oChunks.IsValid()) |
286 | 0 | { |
287 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
288 | 0 | "VSIKerchunkParquetRefFileSystem: " |
289 | 0 | "missing 'chunks' entry for key '%s'", |
290 | 0 | osKeyName.c_str()); |
291 | 0 | return nullptr; |
292 | 0 | } |
293 | 0 | else if (oShape.Size() != oChunks.Size()) |
294 | 0 | { |
295 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
296 | 0 | "VSIKerchunkParquetRefFileSystem: " |
297 | 0 | "'shape' and 'chunks' entries have not the same " |
298 | 0 | "number of values for key '%s'", |
299 | 0 | osKeyName.c_str()); |
300 | 0 | return nullptr; |
301 | 0 | } |
302 | 0 | else if (oShape.Size() > 32) |
303 | 0 | { |
304 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
305 | 0 | "VSIKerchunkParquetRefFileSystem: " |
306 | 0 | "'shape' has too many dimensions for key '%s'", |
307 | 0 | osKeyName.c_str()); |
308 | 0 | return nullptr; |
309 | 0 | } |
310 | 0 | else |
311 | 0 | { |
312 | 0 | VSIZarrArrayInfo arrayInfo; |
313 | 0 | uint64_t nTotalChunks = 1; |
314 | 0 | for (int i = 0; i < oShape.Size(); ++i) |
315 | 0 | { |
316 | 0 | const uint64_t nSize = oShape[i].ToLong(); |
317 | 0 | const uint64_t nChunkSize = oChunks[i].ToLong(); |
318 | 0 | if (nSize == 0) |
319 | 0 | { |
320 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
321 | 0 | "VSIKerchunkParquetRefFileSystem: " |
322 | 0 | "shape[%d]=0 in " |
323 | 0 | "array definition for key '%s'", |
324 | 0 | i, osKeyName.c_str()); |
325 | 0 | return nullptr; |
326 | 0 | } |
327 | 0 | else if (nChunkSize == 0) |
328 | 0 | { |
329 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
330 | 0 | "VSIKerchunkParquetRefFileSystem: " |
331 | 0 | "chunks[%d]=0 in " |
332 | 0 | "array definition for key '%s'", |
333 | 0 | i, osKeyName.c_str()); |
334 | 0 | return nullptr; |
335 | 0 | } |
336 | 0 | const auto nChunkCount = |
337 | 0 | DIV_ROUND_UP(nSize, nChunkSize); |
338 | 0 | if (nChunkCount > |
339 | 0 | std::numeric_limits<uint64_t>::max() / nTotalChunks) |
340 | 0 | { |
341 | 0 | CPLError( |
342 | 0 | CE_Failure, CPLE_AppDefined, |
343 | 0 | "VSIKerchunkParquetRefFileSystem: " |
344 | 0 | "product(shape[]) > UINT64_MAX for key '%s'", |
345 | 0 | osKeyName.c_str()); |
346 | 0 | return nullptr; |
347 | 0 | } |
348 | 0 | nTotalChunks *= nChunkCount; |
349 | 0 | arrayInfo.anChunkCount.push_back(nChunkCount); |
350 | 0 | } |
351 | 0 | const std::string osArrayDir = osKeyName.substr( |
352 | 0 | 0, osKeyName.size() - strlen("/.zarray")); |
353 | 0 | refFile->m_oMapArrayInfo[osArrayDir] = std::move(arrayInfo); |
354 | 0 | } |
355 | 0 | } |
356 | 0 | } |
357 | 0 | else |
358 | 0 | { |
359 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
360 | 0 | "VSIKerchunkParquetRefFileSystem: invalid value type for " |
361 | 0 | "key '%s'", |
362 | 0 | osKeyName.c_str()); |
363 | 0 | return nullptr; |
364 | 0 | } |
365 | 0 | } |
366 | | |
367 | 0 | m_oCache.insert(osRootFilename, refFile); |
368 | 0 | return refFile; |
369 | 0 | } |
370 | | |
371 | | /************************************************************************/ |
372 | | /* VSIKerchunkParquetRefFileSystem::GetChunkInfo() */ |
373 | | /************************************************************************/ |
374 | | |
375 | | VSIKerchunkParquetRefFileSystem::ChunkInfo |
376 | | VSIKerchunkParquetRefFileSystem::GetChunkInfo( |
377 | | const std::string &osRootFilename, |
378 | | const std::shared_ptr<VSIKerchunkParquetRefFile> &refFile, |
379 | | const std::string &osKey) |
380 | 0 | { |
381 | 0 | ChunkInfo info; |
382 | |
|
383 | 0 | const std::string osArrayPath = CPLGetPathSafe(osKey.c_str()); |
384 | 0 | const auto oIterArray = refFile->m_oMapArrayInfo.find(osArrayPath); |
385 | 0 | const std::string osIndices = CPLGetFilename(osKey.c_str()); |
386 | 0 | if (oIterArray != refFile->m_oMapArrayInfo.end() && !osIndices.empty() && |
387 | 0 | osIndices[0] >= '0' && osIndices[0] <= '9') |
388 | 0 | { |
389 | 0 | const auto &oArrayInfo = oIterArray->second; |
390 | 0 | const CPLStringList aosIndices( |
391 | 0 | CSLTokenizeString2(osIndices.c_str(), ".", 0)); |
392 | 0 | if ((static_cast<size_t>(aosIndices.size()) == |
393 | 0 | oArrayInfo.anChunkCount.size()) || |
394 | 0 | (aosIndices.size() == 1 && strcmp(aosIndices[0], "0") == 0 && |
395 | 0 | oArrayInfo.anChunkCount.empty())) |
396 | 0 | { |
397 | 0 | std::vector<uint64_t> anIndices; |
398 | 0 | for (size_t i = 0; i < oArrayInfo.anChunkCount.size(); ++i) |
399 | 0 | { |
400 | 0 | char *endptr = nullptr; |
401 | 0 | anIndices.push_back(std::strtoull(aosIndices[i], &endptr, 10)); |
402 | 0 | if (aosIndices[i][0] == '-' || |
403 | 0 | endptr != aosIndices[i] + strlen(aosIndices[i]) || |
404 | 0 | anIndices[i] >= oArrayInfo.anChunkCount[i]) |
405 | 0 | { |
406 | 0 | return info; |
407 | 0 | } |
408 | 0 | } |
409 | | |
410 | 0 | uint64_t nLinearIndex = 0; |
411 | 0 | uint64_t nMulFactor = 1; |
412 | 0 | for (size_t i = anIndices.size(); i > 0;) |
413 | 0 | { |
414 | 0 | --i; |
415 | 0 | nLinearIndex += anIndices[i] * nMulFactor; |
416 | 0 | nMulFactor *= oArrayInfo.anChunkCount[i]; |
417 | 0 | } |
418 | |
|
419 | 0 | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", |
420 | 0 | "Linear chunk index %" PRIu64, nLinearIndex); |
421 | |
|
422 | 0 | const uint64_t nParquetIdx = nLinearIndex / refFile->m_nRecordSize; |
423 | 0 | const int nIdxInParquet = |
424 | 0 | static_cast<int>(nLinearIndex % refFile->m_nRecordSize); |
425 | |
|
426 | 0 | const std::string osParquetFilename = CPLFormFilenameSafe( |
427 | 0 | CPLFormFilenameSafe(osRootFilename.c_str(), osArrayPath.c_str(), |
428 | 0 | nullptr) |
429 | 0 | .c_str(), |
430 | 0 | CPLSPrintf("refs.%" PRIu64 ".parq", nParquetIdx), nullptr); |
431 | 0 | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", |
432 | 0 | "Looking for entry %d in Parquet file %s", |
433 | 0 | nIdxInParquet, osParquetFilename.c_str()); |
434 | |
|
435 | 0 | std::lock_guard<std::mutex> oLock(m_oParquetCacheMutex); |
436 | 0 | std::shared_ptr<GDALDataset> poDS; |
437 | 0 | if (!m_poParquetCache) |
438 | 0 | { |
439 | 0 | m_poParquetCache = std::make_unique<lru11::Cache< |
440 | 0 | std::string, std::shared_ptr<GDALDataset>>>() |
441 | 0 | .release(); |
442 | 0 | } |
443 | 0 | if (!m_poParquetCache->tryGet(osParquetFilename, poDS)) |
444 | 0 | { |
445 | 0 | const char *const apszAllowedDrivers[] = {"PARQUET", "ADBC", |
446 | 0 | nullptr}; |
447 | 0 | CPLConfigOptionSetter oSetter( |
448 | 0 | "OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", "NO", false); |
449 | 0 | poDS.reset( |
450 | 0 | GDALDataset::Open(osParquetFilename.c_str(), |
451 | 0 | GDAL_OF_VECTOR | GDAL_OF_VERBOSE_ERROR, |
452 | 0 | apszAllowedDrivers, nullptr, nullptr)); |
453 | 0 | if (poDS) |
454 | 0 | m_poParquetCache->insert(osParquetFilename, poDS); |
455 | 0 | } |
456 | |
|
457 | 0 | if (poDS && poDS->GetLayerCount() == 1) |
458 | 0 | { |
459 | 0 | const auto IsIntOrInt64 = [](OGRFieldType eType) |
460 | 0 | { return eType == OFTInteger || eType == OFTInteger64; }; |
461 | 0 | auto poLayer = poDS->GetLayer(0); |
462 | 0 | const auto poDefn = poLayer->GetLayerDefn(); |
463 | 0 | info.iPathField = poDefn->GetFieldIndex("path"); |
464 | 0 | info.iOffsetField = poDefn->GetFieldIndex("offset"); |
465 | 0 | info.iSizeField = poDefn->GetFieldIndex("size"); |
466 | 0 | info.iRawField = poDefn->GetFieldIndex("raw"); |
467 | 0 | if (info.iPathField >= 0 && info.iOffsetField >= 0 && |
468 | 0 | info.iSizeField >= 0 && info.iRawField >= 0 && |
469 | 0 | poDefn->GetFieldDefn(info.iPathField)->GetType() == |
470 | 0 | OFTString && |
471 | 0 | IsIntOrInt64( |
472 | 0 | poDefn->GetFieldDefn(info.iOffsetField)->GetType()) && |
473 | 0 | IsIntOrInt64( |
474 | 0 | poDefn->GetFieldDefn(info.iSizeField)->GetType()) && |
475 | 0 | poDefn->GetFieldDefn(info.iRawField)->GetType() == |
476 | 0 | OFTBinary) |
477 | 0 | { |
478 | 0 | info.osParquetFileDirectory = |
479 | 0 | CPLGetPathSafe(osParquetFilename.c_str()); |
480 | 0 | info.poFeature.reset(poLayer->GetFeature(nIdxInParquet)); |
481 | 0 | } |
482 | 0 | else |
483 | 0 | { |
484 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
485 | 0 | "%s has an unexpected field structure", |
486 | 0 | osParquetFilename.c_str()); |
487 | 0 | } |
488 | 0 | } |
489 | 0 | } |
490 | 0 | } |
491 | 0 | return info; |
492 | 0 | } |
493 | | |
494 | | /************************************************************************/ |
495 | | /* VSIKerchunkParquetRefFileSystem::Open() */ |
496 | | /************************************************************************/ |
497 | | |
498 | | VSIVirtualHandle *VSIKerchunkParquetRefFileSystem::Open( |
499 | | const char *pszFilename, const char *pszAccess, bool /* bSetError */, |
500 | | CSLConstList /* papszOptions */) |
501 | 1.24k | { |
502 | 1.24k | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Open(%s)", pszFilename); |
503 | 1.24k | if (strcmp(pszAccess, "r") != 0 && strcmp(pszAccess, "rb") != 0) |
504 | 0 | return nullptr; |
505 | | |
506 | 1.24k | const auto [osRootFilename, osKey] = SplitFilename(pszFilename); |
507 | 1.24k | if (osRootFilename.empty()) |
508 | 1.24k | return nullptr; |
509 | | |
510 | 0 | const auto refFile = Load(osRootFilename); |
511 | 0 | if (!refFile) |
512 | 0 | return nullptr; |
513 | | |
514 | 0 | const auto oIter = refFile->m_oMapKeys.find(osKey); |
515 | 0 | if (oIter == refFile->m_oMapKeys.end()) |
516 | 0 | { |
517 | 0 | const auto info = GetChunkInfo(osRootFilename, refFile, osKey); |
518 | 0 | if (info.poFeature) |
519 | 0 | { |
520 | 0 | if (info.poFeature->IsFieldSetAndNotNull(info.iRawField)) |
521 | 0 | { |
522 | 0 | auto psField = info.poFeature->GetRawFieldRef(info.iRawField); |
523 | | // Borrow binary data to feature |
524 | 0 | GByte *abyData = psField->Binary.paData; |
525 | 0 | int nSize = psField->Binary.nCount; |
526 | 0 | psField->Binary.paData = nullptr; |
527 | 0 | psField->Binary.nCount = 0; |
528 | | // and transmit its ownership to the VSIMem file |
529 | 0 | return VSIFileFromMemBuffer(nullptr, abyData, nSize, |
530 | 0 | /* bTakeOwnership = */ true); |
531 | 0 | } |
532 | 0 | else |
533 | 0 | { |
534 | 0 | const uint64_t nOffset = |
535 | 0 | info.poFeature->GetFieldAsInteger64(info.iOffsetField); |
536 | 0 | const int nSize = |
537 | 0 | info.poFeature->GetFieldAsInteger(info.iSizeField); |
538 | |
|
539 | 0 | std::string osVSIPath = VSIKerchunkMorphURIToVSIPath( |
540 | 0 | info.poFeature->GetFieldAsString(info.iPathField), |
541 | 0 | info.osParquetFileDirectory); |
542 | 0 | if (osVSIPath.empty()) |
543 | 0 | return nullptr; |
544 | | |
545 | 0 | const std::string osPath = |
546 | 0 | nSize ? CPLSPrintf("/vsisubfile/%" PRIu64 "_%u,%s", nOffset, |
547 | 0 | nSize, osVSIPath.c_str()) |
548 | 0 | : std::move(osVSIPath); |
549 | 0 | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Opening %s", |
550 | 0 | osPath.c_str()); |
551 | 0 | CPLConfigOptionSetter oSetter("GDAL_DISABLE_READDIR_ON_OPEN", |
552 | 0 | "EMPTY_DIR", false); |
553 | 0 | return VSIFOpenL(osPath.c_str(), "rb"); |
554 | 0 | } |
555 | 0 | } |
556 | | |
557 | 0 | return nullptr; |
558 | 0 | } |
559 | | |
560 | 0 | const auto &abyValue = oIter->second; |
561 | 0 | return VSIFileFromMemBuffer(nullptr, const_cast<GByte *>(abyValue.data()), |
562 | 0 | abyValue.size(), /* bTakeOwnership = */ false); |
563 | 0 | } |
564 | | |
565 | | /************************************************************************/ |
566 | | /* VSIKerchunkParquetRefFileSystem::Stat() */ |
567 | | /************************************************************************/ |
568 | | |
569 | | int VSIKerchunkParquetRefFileSystem::Stat(const char *pszFilename, |
570 | | VSIStatBufL *pStatBuf, int nFlags) |
571 | 513 | { |
572 | 513 | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "Stat(%s)", pszFilename); |
573 | 513 | memset(pStatBuf, 0, sizeof(VSIStatBufL)); |
574 | | |
575 | 513 | const auto [osRootFilename, osKey] = SplitFilename(pszFilename); |
576 | 513 | if (osRootFilename.empty()) |
577 | 513 | return -1; |
578 | | |
579 | 0 | const auto refFile = Load(osRootFilename); |
580 | 0 | if (!refFile) |
581 | 0 | return -1; |
582 | | |
583 | 0 | if (osKey.empty()) |
584 | 0 | { |
585 | 0 | pStatBuf->st_mode = S_IFDIR; |
586 | 0 | return 0; |
587 | 0 | } |
588 | | |
589 | 0 | const auto oIter = refFile->m_oMapKeys.find(osKey); |
590 | 0 | if (oIter == refFile->m_oMapKeys.end()) |
591 | 0 | { |
592 | 0 | const auto info = GetChunkInfo(osRootFilename, refFile, osKey); |
593 | 0 | if (info.poFeature) |
594 | 0 | { |
595 | 0 | if (info.poFeature->IsFieldSetAndNotNull(info.iRawField)) |
596 | 0 | { |
597 | 0 | int nSize = 0; |
598 | 0 | info.poFeature->GetFieldAsBinary(info.iRawField, &nSize); |
599 | 0 | pStatBuf->st_size = nSize; |
600 | 0 | } |
601 | 0 | else |
602 | 0 | { |
603 | 0 | pStatBuf->st_size = |
604 | 0 | info.poFeature->GetFieldAsInteger64(info.iSizeField); |
605 | 0 | if (pStatBuf->st_size == 0) |
606 | 0 | { |
607 | 0 | const std::string osVSIPath = VSIKerchunkMorphURIToVSIPath( |
608 | 0 | info.poFeature->GetFieldAsString(info.iPathField), |
609 | 0 | info.osParquetFileDirectory); |
610 | 0 | if (osVSIPath.empty()) |
611 | 0 | return -1; |
612 | 0 | return VSIStatExL(osVSIPath.c_str(), pStatBuf, nFlags); |
613 | 0 | } |
614 | 0 | } |
615 | 0 | pStatBuf->st_mode = S_IFREG; |
616 | 0 | return 0; |
617 | 0 | } |
618 | | |
619 | 0 | if (cpl::contains(refFile->m_oMapKeys, osKey + "/.zgroup") || |
620 | 0 | cpl::contains(refFile->m_oMapKeys, osKey + "/.zarray")) |
621 | 0 | { |
622 | 0 | pStatBuf->st_mode = S_IFDIR; |
623 | 0 | return 0; |
624 | 0 | } |
625 | | |
626 | 0 | return -1; |
627 | 0 | } |
628 | | |
629 | 0 | const auto &abyValue = oIter->second; |
630 | 0 | pStatBuf->st_size = abyValue.size(); |
631 | 0 | pStatBuf->st_mode = S_IFREG; |
632 | |
|
633 | 0 | return 0; |
634 | 0 | } |
635 | | |
636 | | /************************************************************************/ |
637 | | /* VSIKerchunkParquetRefFileSystem::ReadDirEx() */ |
638 | | /************************************************************************/ |
639 | | |
640 | | char **VSIKerchunkParquetRefFileSystem::ReadDirEx(const char *pszDirname, |
641 | | int nMaxFiles) |
642 | 0 | { |
643 | 0 | CPLDebugOnly("VSIKerchunkParquetRefFileSystem", "ReadDir(%s)", pszDirname); |
644 | |
|
645 | 0 | const auto [osRootFilename, osAskedKey] = SplitFilename(pszDirname); |
646 | 0 | if (osRootFilename.empty()) |
647 | 0 | return nullptr; |
648 | | |
649 | 0 | const auto refFile = Load(osRootFilename); |
650 | 0 | if (!refFile) |
651 | 0 | return nullptr; |
652 | | |
653 | 0 | std::set<std::string> set; |
654 | 0 | for (const auto &[key, value] : refFile->m_oMapKeys) |
655 | 0 | { |
656 | 0 | if (osAskedKey.empty()) |
657 | 0 | { |
658 | 0 | const auto nPos = key.find('/'); |
659 | 0 | if (nPos == std::string::npos) |
660 | 0 | set.insert(key); |
661 | 0 | else |
662 | 0 | set.insert(key.substr(0, nPos)); |
663 | 0 | } |
664 | 0 | else if (key.size() > osAskedKey.size() && |
665 | 0 | cpl::starts_with(key, osAskedKey) && |
666 | 0 | key[osAskedKey.size()] == '/') |
667 | 0 | { |
668 | 0 | std::string subKey = key.substr(osAskedKey.size() + 1); |
669 | 0 | const auto nPos = subKey.find('/'); |
670 | 0 | if (nPos == std::string::npos) |
671 | 0 | set.insert(std::move(subKey)); |
672 | 0 | else |
673 | 0 | set.insert(subKey.substr(0, nPos)); |
674 | 0 | } |
675 | 0 | } |
676 | |
|
677 | 0 | CPLStringList aosRet; |
678 | 0 | for (const std::string &v : set) |
679 | 0 | { |
680 | | // CPLDebugOnly("VSIKerchunkParquetRefFileSystem", ".. %s", v.c_str()); |
681 | 0 | aosRet.AddString(v.c_str()); |
682 | 0 | } |
683 | | |
684 | | // Synthetize file names for x.y.z chunks |
685 | 0 | const auto oIterArray = refFile->m_oMapArrayInfo.find(osAskedKey); |
686 | 0 | if (oIterArray != refFile->m_oMapArrayInfo.end()) |
687 | 0 | { |
688 | 0 | const auto &oArrayInfo = oIterArray->second; |
689 | 0 | if (oArrayInfo.anChunkCount.empty()) |
690 | 0 | { |
691 | 0 | aosRet.AddString("0"); |
692 | 0 | } |
693 | 0 | else |
694 | 0 | { |
695 | 0 | std::string osCurElt; |
696 | 0 | std::function<bool(size_t)> Enumerate; |
697 | 0 | if (nMaxFiles <= 0) |
698 | 0 | nMaxFiles = 100 * 1024 * 1024; |
699 | |
|
700 | 0 | Enumerate = [nMaxFiles, &aosRet, &oArrayInfo, &osCurElt, |
701 | 0 | &Enumerate](size_t iDim) |
702 | 0 | { |
703 | 0 | const size_t sizeBefore = osCurElt.size(); |
704 | 0 | for (uint64_t i = 0; i < oArrayInfo.anChunkCount[iDim]; ++i) |
705 | 0 | { |
706 | 0 | osCurElt += CPLSPrintf("%" PRIu64, i); |
707 | 0 | if (iDim + 1 < oArrayInfo.anChunkCount.size()) |
708 | 0 | { |
709 | 0 | osCurElt += '.'; |
710 | 0 | if (!Enumerate(iDim + 1)) |
711 | 0 | return false; |
712 | 0 | } |
713 | 0 | else |
714 | 0 | { |
715 | 0 | if (aosRet.size() >= nMaxFiles) |
716 | 0 | return false; |
717 | 0 | aosRet.AddString(osCurElt); |
718 | 0 | } |
719 | 0 | osCurElt.resize(sizeBefore); |
720 | 0 | } |
721 | 0 | return true; |
722 | 0 | }; |
723 | |
|
724 | 0 | Enumerate(0); |
725 | 0 | } |
726 | 0 | } |
727 | |
|
728 | 0 | return aosRet.StealList(); |
729 | 0 | } |
730 | | |
731 | | /************************************************************************/ |
732 | | /* VSIInstallKerchunkParquetRefFileSystem() */ |
733 | | /************************************************************************/ |
734 | | |
735 | | void VSIInstallKerchunkParquetRefFileSystem() |
736 | 26 | { |
737 | 26 | static std::mutex oMutex; |
738 | 26 | std::lock_guard<std::mutex> oLock(oMutex); |
739 | | // cppcheck-suppress knownConditionTrueFalse |
740 | 26 | if (!VSIKerchunkParquetRefFileSystem::IsFileSystemInstantiated()) |
741 | 26 | { |
742 | 26 | VSIFileManager::InstallHandler( |
743 | 26 | PARQUET_REF_FS_PREFIX, |
744 | 26 | std::make_unique<VSIKerchunkParquetRefFileSystem>().release()); |
745 | 26 | } |
746 | 26 | } |
747 | | |
748 | | /************************************************************************/ |
749 | | /* VSIKerchunkParquetRefFileSystemCleanCache() */ |
750 | | /************************************************************************/ |
751 | | |
752 | | void VSIKerchunkParquetRefFileSystemCleanCache() |
753 | 12.5k | { |
754 | 12.5k | auto poFS = dynamic_cast<VSIKerchunkParquetRefFileSystem *>( |
755 | 12.5k | VSIFileManager::GetHandler(PARQUET_REF_FS_PREFIX)); |
756 | 12.5k | if (poFS) |
757 | 12.5k | poFS->CleanCache(); |
758 | 12.5k | } |