/src/gdal/port/cpl_csv.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: CPL - Common Portability Library |
4 | | * Purpose: CSV (comma separated value) file access. |
5 | | * Author: Frank Warmerdam, warmerdam@pobox.com |
6 | | * |
7 | | ****************************************************************************** |
8 | | * Copyright (c) 1999, Frank Warmerdam |
9 | | * Copyright (c) 2009-2012, Even Rouault <even dot rouault at spatialys.com> |
10 | | * |
11 | | * SPDX-License-Identifier: MIT |
12 | | ****************************************************************************/ |
13 | | |
14 | | #include "cpl_port.h" |
15 | | #include "cpl_csv.h" |
16 | | |
17 | | #include <cstddef> |
18 | | #include <cstdlib> |
19 | | #include <cstring> |
20 | | #include <fcntl.h> |
21 | | |
22 | | #include "cpl_conv.h" |
23 | | #include "cpl_error.h" |
24 | | #include "cpl_multiproc.h" |
25 | | #include "gdal_csv.h" |
26 | | |
27 | | #include <algorithm> |
28 | | |
29 | | /* ==================================================================== */ |
30 | | /* The CSVTable is a persistent set of info about an open CSV */ |
31 | | /* table. While it doesn't currently maintain a record index, */ |
32 | | /* or in-memory copy of the table, it could be changed to do so */ |
33 | | /* in the future. */ |
34 | | /* ==================================================================== */ |
35 | | typedef struct ctb |
36 | | { |
37 | | VSILFILE *fp; |
38 | | struct ctb *psNext; |
39 | | char *pszFilename; |
40 | | char **papszFieldNames; |
41 | | int *panFieldNamesLength; |
42 | | char **papszRecFields; |
43 | | int nFields; |
44 | | int iLastLine; |
45 | | bool bNonUniqueKey; |
46 | | |
47 | | /* Cache for whole file */ |
48 | | int nLineCount; |
49 | | char **papszLines; |
50 | | int *panLineIndex; |
51 | | char *pszRawData; |
52 | | } CSVTable; |
53 | | |
54 | | static void CSVDeaccessInternal(CSVTable **ppsCSVTableList, bool bCanUseTLS, |
55 | | const char *pszFilename); |
56 | | |
57 | | /************************************************************************/ |
58 | | /* CSVFreeTLS() */ |
59 | | /************************************************************************/ |
60 | | static void CSVFreeTLS(void *pData) |
61 | 0 | { |
62 | 0 | CSVDeaccessInternal(static_cast<CSVTable **>(pData), false, nullptr); |
63 | 0 | CPLFree(pData); |
64 | 0 | } |
65 | | |
66 | | /* It would likely be better to share this list between threads, but |
67 | | that will require some rework. */ |
68 | | |
69 | | /************************************************************************/ |
70 | | /* CSVAccess() */ |
71 | | /* */ |
72 | | /* This function will fetch a handle to the requested table. */ |
73 | | /* If not found in the ``open table list'' the table will be */ |
74 | | /* opened and added to the list. Eventually this function may */ |
75 | | /* become public with an abstracted return type so that */ |
76 | | /* applications can set options about the table. For now this */ |
77 | | /* isn't done. */ |
78 | | /************************************************************************/ |
79 | | |
80 | | static CSVTable *CSVAccess(const char *pszFilename) |
81 | | |
82 | 0 | { |
83 | | /* -------------------------------------------------------------------- */ |
84 | | /* Fetch the table, and allocate the thread-local pointer to it */ |
85 | | /* if there isn't already one. */ |
86 | | /* -------------------------------------------------------------------- */ |
87 | 0 | int bMemoryError = FALSE; |
88 | 0 | CSVTable **ppsCSVTableList = |
89 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
90 | 0 | if (bMemoryError) |
91 | 0 | return nullptr; |
92 | 0 | if (ppsCSVTableList == nullptr) |
93 | 0 | { |
94 | 0 | ppsCSVTableList = |
95 | 0 | static_cast<CSVTable **>(VSI_CALLOC_VERBOSE(1, sizeof(CSVTable *))); |
96 | 0 | if (ppsCSVTableList == nullptr) |
97 | 0 | return nullptr; |
98 | 0 | CPLSetTLSWithFreeFunc(CTLS_CSVTABLEPTR, ppsCSVTableList, CSVFreeTLS); |
99 | 0 | } |
100 | | |
101 | | /* -------------------------------------------------------------------- */ |
102 | | /* Is the table already in the list. */ |
103 | | /* -------------------------------------------------------------------- */ |
104 | 0 | for (CSVTable *psTable = *ppsCSVTableList; psTable != nullptr; |
105 | 0 | psTable = psTable->psNext) |
106 | 0 | { |
107 | 0 | if (EQUAL(psTable->pszFilename, pszFilename)) |
108 | 0 | { |
109 | | /* |
110 | | * Eventually we should consider promoting to the front of |
111 | | * the list to accelerate frequently accessed tables. |
112 | | */ |
113 | 0 | return psTable; |
114 | 0 | } |
115 | 0 | } |
116 | | |
117 | | /* -------------------------------------------------------------------- */ |
118 | | /* If not, try to open it. */ |
119 | | /* -------------------------------------------------------------------- */ |
120 | 0 | VSILFILE *fp = VSIFOpenL(pszFilename, "rb"); |
121 | 0 | if (fp == nullptr) |
122 | 0 | return nullptr; |
123 | | |
124 | | /* -------------------------------------------------------------------- */ |
125 | | /* Create an information structure about this table, and add to */ |
126 | | /* the front of the list. */ |
127 | | /* -------------------------------------------------------------------- */ |
128 | 0 | CSVTable *const psTable = |
129 | 0 | static_cast<CSVTable *>(VSI_CALLOC_VERBOSE(sizeof(CSVTable), 1)); |
130 | 0 | if (psTable == nullptr) |
131 | 0 | { |
132 | 0 | VSIFCloseL(fp); |
133 | 0 | return nullptr; |
134 | 0 | } |
135 | | |
136 | 0 | psTable->fp = fp; |
137 | 0 | psTable->pszFilename = VSI_STRDUP_VERBOSE(pszFilename); |
138 | 0 | if (psTable->pszFilename == nullptr) |
139 | 0 | { |
140 | 0 | VSIFree(psTable); |
141 | 0 | VSIFCloseL(fp); |
142 | 0 | return nullptr; |
143 | 0 | } |
144 | 0 | psTable->bNonUniqueKey = false; // As far as we know now. |
145 | 0 | psTable->psNext = *ppsCSVTableList; |
146 | |
|
147 | 0 | *ppsCSVTableList = psTable; |
148 | | |
149 | | /* -------------------------------------------------------------------- */ |
150 | | /* Read the table header record containing the field names. */ |
151 | | /* -------------------------------------------------------------------- */ |
152 | 0 | psTable->papszFieldNames = CSVReadParseLineL(fp); |
153 | 0 | psTable->nFields = CSLCount(psTable->papszFieldNames); |
154 | 0 | psTable->panFieldNamesLength = |
155 | 0 | static_cast<int *>(CPLMalloc(sizeof(int) * psTable->nFields)); |
156 | 0 | for (int i = 0; |
157 | 0 | i < psTable->nFields && |
158 | | /* null-pointer check to avoid a false positive from CLang S.A. */ |
159 | 0 | psTable->papszFieldNames != nullptr; |
160 | 0 | i++) |
161 | 0 | { |
162 | 0 | psTable->panFieldNamesLength[i] = |
163 | 0 | static_cast<int>(strlen(psTable->papszFieldNames[i])); |
164 | 0 | } |
165 | |
|
166 | 0 | return psTable; |
167 | 0 | } |
168 | | |
169 | | /************************************************************************/ |
170 | | /* CSVDeaccess() */ |
171 | | /************************************************************************/ |
172 | | |
173 | | static void CSVDeaccessInternal(CSVTable **ppsCSVTableList, bool bCanUseTLS, |
174 | | const char *pszFilename) |
175 | | |
176 | 0 | { |
177 | 0 | if (ppsCSVTableList == nullptr) |
178 | 0 | return; |
179 | | |
180 | | /* -------------------------------------------------------------------- */ |
181 | | /* A NULL means deaccess all tables. */ |
182 | | /* -------------------------------------------------------------------- */ |
183 | 0 | if (pszFilename == nullptr) |
184 | 0 | { |
185 | 0 | while (*ppsCSVTableList != nullptr) |
186 | 0 | CSVDeaccessInternal(ppsCSVTableList, bCanUseTLS, |
187 | 0 | (*ppsCSVTableList)->pszFilename); |
188 | |
|
189 | 0 | return; |
190 | 0 | } |
191 | | |
192 | | /* -------------------------------------------------------------------- */ |
193 | | /* Find this table. */ |
194 | | /* -------------------------------------------------------------------- */ |
195 | 0 | CSVTable *psLast = nullptr; |
196 | 0 | CSVTable *psTable = *ppsCSVTableList; |
197 | 0 | for (; psTable != nullptr && !EQUAL(psTable->pszFilename, pszFilename); |
198 | 0 | psTable = psTable->psNext) |
199 | 0 | { |
200 | 0 | psLast = psTable; |
201 | 0 | } |
202 | |
|
203 | 0 | if (psTable == nullptr) |
204 | 0 | { |
205 | 0 | if (bCanUseTLS) |
206 | 0 | CPLDebug("CPL_CSV", "CPLDeaccess( %s ) - no match.", pszFilename); |
207 | 0 | return; |
208 | 0 | } |
209 | | |
210 | | /* -------------------------------------------------------------------- */ |
211 | | /* Remove the link from the list. */ |
212 | | /* -------------------------------------------------------------------- */ |
213 | 0 | if (psLast != nullptr) |
214 | 0 | psLast->psNext = psTable->psNext; |
215 | 0 | else |
216 | 0 | *ppsCSVTableList = psTable->psNext; |
217 | | |
218 | | /* -------------------------------------------------------------------- */ |
219 | | /* Free the table. */ |
220 | | /* -------------------------------------------------------------------- */ |
221 | 0 | if (psTable->fp != nullptr) |
222 | 0 | VSIFCloseL(psTable->fp); |
223 | |
|
224 | 0 | CSLDestroy(psTable->papszFieldNames); |
225 | 0 | CPLFree(psTable->panFieldNamesLength); |
226 | 0 | CSLDestroy(psTable->papszRecFields); |
227 | 0 | CPLFree(psTable->pszFilename); |
228 | 0 | CPLFree(psTable->panLineIndex); |
229 | 0 | CPLFree(psTable->pszRawData); |
230 | 0 | CPLFree(psTable->papszLines); |
231 | |
|
232 | 0 | CPLFree(psTable); |
233 | |
|
234 | 0 | if (bCanUseTLS) |
235 | 0 | CPLReadLine(nullptr); |
236 | 0 | } |
237 | | |
238 | | void CSVDeaccess(const char *pszFilename) |
239 | 0 | { |
240 | | /* -------------------------------------------------------------------- */ |
241 | | /* Fetch the table, and allocate the thread-local pointer to it */ |
242 | | /* if there isn't already one. */ |
243 | | /* -------------------------------------------------------------------- */ |
244 | 0 | int bMemoryError = FALSE; |
245 | 0 | CSVTable **ppsCSVTableList = |
246 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
247 | |
|
248 | 0 | CSVDeaccessInternal(ppsCSVTableList, true, pszFilename); |
249 | 0 | } |
250 | | |
251 | | /************************************************************************/ |
252 | | /* CSVSplitLine() */ |
253 | | /* */ |
254 | | /* Tokenize a CSV line into fields in the form of a string */ |
255 | | /* list. This is used instead of the CPLTokenizeString() */ |
256 | | /* because it provides correct CSV escaping and quoting */ |
257 | | /* semantics. */ |
258 | | /************************************************************************/ |
259 | | |
260 | | static char **CSVSplitLine(const char *pszString, const char *pszDelimiter, |
261 | | bool bKeepLeadingAndClosingQuotes, |
262 | | bool bMergeDelimiter) |
263 | | |
264 | 0 | { |
265 | 0 | CPLStringList aosRetList; |
266 | 0 | if (pszString == nullptr) |
267 | 0 | return static_cast<char **>(CPLCalloc(sizeof(char *), 1)); |
268 | | |
269 | 0 | char *pszToken = static_cast<char *>(CPLCalloc(10, 1)); |
270 | 0 | int nTokenMax = 10; |
271 | 0 | const size_t nDelimiterLength = strlen(pszDelimiter); |
272 | |
|
273 | 0 | const char *pszIter = pszString; |
274 | 0 | while (*pszIter != '\0') |
275 | 0 | { |
276 | 0 | bool bInString = false; |
277 | |
|
278 | 0 | int nTokenLen = 0; |
279 | | |
280 | | // Try to find the next delimiter, marking end of token. |
281 | 0 | do |
282 | 0 | { |
283 | | // End if this is a delimiter skip it and break. |
284 | 0 | if (!bInString && |
285 | 0 | strncmp(pszIter, pszDelimiter, nDelimiterLength) == 0) |
286 | 0 | { |
287 | 0 | pszIter += nDelimiterLength; |
288 | 0 | if (bMergeDelimiter) |
289 | 0 | { |
290 | 0 | while (strncmp(pszIter, pszDelimiter, nDelimiterLength) == |
291 | 0 | 0) |
292 | 0 | pszIter += nDelimiterLength; |
293 | 0 | } |
294 | 0 | break; |
295 | 0 | } |
296 | | |
297 | 0 | if (*pszIter == '"') |
298 | 0 | { |
299 | 0 | if (!bInString && nTokenLen > 0) |
300 | 0 | { |
301 | | // do not treat in a special way double quotes that appear |
302 | | // in the middle of a field (similarly to OpenOffice) |
303 | | // Like in records: 1,50°46'06.6"N 116°42'04.4,foo |
304 | 0 | } |
305 | 0 | else if (!bInString || pszIter[1] != '"') |
306 | 0 | { |
307 | 0 | bInString = !bInString; |
308 | 0 | if (!bKeepLeadingAndClosingQuotes) |
309 | 0 | continue; |
310 | 0 | } |
311 | 0 | else // Doubled quotes in string resolve to one quote. |
312 | 0 | { |
313 | 0 | pszIter++; |
314 | 0 | } |
315 | 0 | } |
316 | | |
317 | 0 | if (nTokenLen >= nTokenMax - 2) |
318 | 0 | { |
319 | 0 | nTokenMax = nTokenMax * 2 + 10; |
320 | 0 | pszToken = static_cast<char *>(CPLRealloc(pszToken, nTokenMax)); |
321 | 0 | } |
322 | |
|
323 | 0 | pszToken[nTokenLen] = *pszIter; |
324 | 0 | nTokenLen++; |
325 | 0 | } while (*(++pszIter) != '\0'); |
326 | | |
327 | 0 | pszToken[nTokenLen] = '\0'; |
328 | 0 | aosRetList.AddString(pszToken); |
329 | | |
330 | | // If the last token is an empty token, then we have to catch |
331 | | // it now, otherwise we won't reenter the loop and it will be lost. |
332 | 0 | if (*pszIter == '\0' && |
333 | 0 | pszIter - pszString >= static_cast<int>(nDelimiterLength) && |
334 | 0 | strncmp(pszIter - nDelimiterLength, pszDelimiter, |
335 | 0 | nDelimiterLength) == 0) |
336 | 0 | { |
337 | 0 | aosRetList.AddString(""); |
338 | 0 | } |
339 | 0 | } |
340 | |
|
341 | 0 | CPLFree(pszToken); |
342 | |
|
343 | 0 | if (aosRetList.Count() == 0) |
344 | 0 | return static_cast<char **>(CPLCalloc(sizeof(char *), 1)); |
345 | 0 | else |
346 | 0 | return aosRetList.StealList(); |
347 | 0 | } |
348 | | |
349 | | /************************************************************************/ |
350 | | /* CSVFindNextLine() */ |
351 | | /* */ |
352 | | /* Find the start of the next line, while at the same time zero */ |
353 | | /* terminating this line. Take into account that there may be */ |
354 | | /* newline indicators within quoted strings, and that quotes */ |
355 | | /* can be escaped with a backslash. */ |
356 | | /************************************************************************/ |
357 | | |
/* Terminate the record starting at pszThisLine and locate the next one.
 *
 * Scans forward to the first CR or LF that lies outside a double-quoted
 * region (a quote preceded by a backslash is not counted), overwrites that
 * CR/LF and any that immediately follow with NUL, and returns a pointer to
 * the first character after them. Returns nullptr when nothing follows.
 *
 * The buffer is modified in place and must be NUL terminated.
 */
static char *CSVFindNextLine(char *pszThisLine)

{
    // size_t rather than int: neither the index nor the quote counter may
    // overflow on buffers larger than INT_MAX bytes.
    size_t i = 0;  // i is used after the for loop.

    for (size_t nQuoteCount = 0; pszThisLine[i] != '\0'; i++)
    {
        const char ch = pszThisLine[i];

        if (ch == '\"' && (i == 0 || pszThisLine[i - 1] != '\\'))
            nQuoteCount++;

        // An even quote count means we are outside any quoted string.
        if ((ch == '\n' || ch == '\r') && (nQuoteCount % 2) == 0)
            break;
    }

    // Swallow the whole run of line terminators.
    while (pszThisLine[i] == '\n' || pszThisLine[i] == '\r')
        pszThisLine[i++] = '\0';

    if (pszThisLine[i] == '\0')
        return nullptr;

    return pszThisLine + i;
}
381 | | |
382 | | /************************************************************************/ |
383 | | /* CSVIngest() */ |
384 | | /* */ |
385 | | /* Load entire file into memory and setup index if possible. */ |
386 | | /************************************************************************/ |
387 | | |
388 | | // TODO(schwehr): Clean up all the casting in CSVIngest. |
389 | | static void CSVIngest(CSVTable *psTable) |
390 | | |
391 | 0 | { |
392 | 0 | if (psTable->pszRawData != nullptr) |
393 | 0 | return; |
394 | | |
395 | | /* -------------------------------------------------------------------- */ |
396 | | /* Ingest whole file. */ |
397 | | /* -------------------------------------------------------------------- */ |
398 | 0 | if (VSIFSeekL(psTable->fp, 0, SEEK_END) != 0) |
399 | 0 | { |
400 | 0 | CPLError(CE_Failure, CPLE_FileIO, |
401 | 0 | "Failed using seek end and tell to get file length: %s", |
402 | 0 | psTable->pszFilename); |
403 | 0 | return; |
404 | 0 | } |
405 | 0 | const vsi_l_offset nFileLen = VSIFTellL(psTable->fp); |
406 | 0 | if (static_cast<long>(nFileLen) == -1) |
407 | 0 | { |
408 | 0 | CPLError(CE_Failure, CPLE_FileIO, |
409 | 0 | "Failed using seek end and tell to get file length: %s", |
410 | 0 | psTable->pszFilename); |
411 | 0 | return; |
412 | 0 | } |
413 | 0 | VSIRewindL(psTable->fp); |
414 | |
|
415 | 0 | psTable->pszRawData = static_cast<char *>( |
416 | 0 | VSI_MALLOC_VERBOSE(static_cast<size_t>(nFileLen) + 1)); |
417 | 0 | if (psTable->pszRawData == nullptr) |
418 | 0 | return; |
419 | 0 | if (VSIFReadL(psTable->pszRawData, 1, static_cast<size_t>(nFileLen), |
420 | 0 | psTable->fp) != static_cast<size_t>(nFileLen)) |
421 | 0 | { |
422 | 0 | CPLFree(psTable->pszRawData); |
423 | 0 | psTable->pszRawData = nullptr; |
424 | |
|
425 | 0 | CPLError(CE_Failure, CPLE_FileIO, "Read of file %s failed.", |
426 | 0 | psTable->pszFilename); |
427 | 0 | return; |
428 | 0 | } |
429 | | |
430 | 0 | psTable->pszRawData[nFileLen] = '\0'; |
431 | | |
432 | | /* -------------------------------------------------------------------- */ |
433 | | /* Get count of newlines so we can allocate line array. */ |
434 | | /* -------------------------------------------------------------------- */ |
435 | 0 | int nMaxLineCount = 0; |
436 | 0 | for (int i = 0; i < static_cast<int>(nFileLen); i++) |
437 | 0 | { |
438 | 0 | if (psTable->pszRawData[i] == 10) |
439 | 0 | nMaxLineCount++; |
440 | 0 | } |
441 | |
|
442 | 0 | psTable->papszLines = |
443 | 0 | static_cast<char **>(VSI_CALLOC_VERBOSE(sizeof(char *), nMaxLineCount)); |
444 | 0 | if (psTable->papszLines == nullptr) |
445 | 0 | return; |
446 | | |
447 | | /* -------------------------------------------------------------------- */ |
448 | | /* Build a list of record pointers into the raw data buffer */ |
449 | | /* based on line terminators. Zero terminate the line */ |
450 | | /* strings. */ |
451 | | /* -------------------------------------------------------------------- */ |
452 | | /* skip header line */ |
453 | 0 | char *pszThisLine = CSVFindNextLine(psTable->pszRawData); |
454 | |
|
455 | 0 | int iLine = 0; |
456 | 0 | while (pszThisLine != nullptr && iLine < nMaxLineCount) |
457 | 0 | { |
458 | 0 | if (pszThisLine[0] != '#') |
459 | 0 | psTable->papszLines[iLine++] = pszThisLine; |
460 | 0 | pszThisLine = CSVFindNextLine(pszThisLine); |
461 | 0 | } |
462 | |
|
463 | 0 | psTable->nLineCount = iLine; |
464 | | |
465 | | /* -------------------------------------------------------------------- */ |
466 | | /* Allocate and populate index array. Ensure they are in */ |
467 | | /* ascending order so that binary searches can be done on the */ |
468 | | /* array. */ |
469 | | /* -------------------------------------------------------------------- */ |
470 | 0 | psTable->panLineIndex = static_cast<int *>( |
471 | 0 | VSI_MALLOC_VERBOSE(sizeof(int) * psTable->nLineCount)); |
472 | 0 | if (psTable->panLineIndex == nullptr) |
473 | 0 | return; |
474 | | |
475 | 0 | for (int i = 0; i < psTable->nLineCount; i++) |
476 | 0 | { |
477 | 0 | psTable->panLineIndex[i] = atoi(psTable->papszLines[i]); |
478 | |
|
479 | 0 | if (i > 0 && psTable->panLineIndex[i] < psTable->panLineIndex[i - 1]) |
480 | 0 | { |
481 | 0 | CPLFree(psTable->panLineIndex); |
482 | 0 | psTable->panLineIndex = nullptr; |
483 | 0 | break; |
484 | 0 | } |
485 | 0 | } |
486 | |
|
487 | 0 | psTable->iLastLine = -1; |
488 | | |
489 | | /* -------------------------------------------------------------------- */ |
490 | | /* We should never need the file handle against, so close it. */ |
491 | | /* -------------------------------------------------------------------- */ |
492 | 0 | VSIFCloseL(psTable->fp); |
493 | 0 | psTable->fp = nullptr; |
494 | 0 | } |
495 | | |
496 | | static void CSVIngest(const char *pszFilename) |
497 | | |
498 | 0 | { |
499 | 0 | CSVTable *psTable = CSVAccess(pszFilename); |
500 | 0 | if (psTable == nullptr) |
501 | 0 | { |
502 | 0 | CPLError(CE_Failure, CPLE_FileIO, "Failed to open file: %s", |
503 | 0 | pszFilename); |
504 | 0 | return; |
505 | 0 | } |
506 | 0 | CSVIngest(psTable); |
507 | 0 | } |
508 | | |
509 | | /************************************************************************/ |
510 | | /* CSVDetectSeperator() */ |
511 | | /************************************************************************/ |
512 | | |
513 | | /** Detect which field separator is used. |
514 | | * |
515 | | * Currently, it can detect comma, semicolon, space, tabulation or pipe. |
516 | | * In case of ambiguity, starting with GDAL 3.7.1, the separator with the |
517 | | * most occurrences will be selected (and a warning emitted). |
518 | | * If no separator found, comma will be considered as the separator. |
519 | | * |
520 | | * @return ',', ';', ' ', tabulation character or '|'. |
521 | | */ |
522 | | char CSVDetectSeperator(const char *pszLine) |
523 | 0 | { |
524 | 0 | bool bInString = false; |
525 | 0 | int nCountComma = 0; |
526 | 0 | int nCountSemicolon = 0; |
527 | 0 | int nCountTab = 0; |
528 | 0 | int nCountPipe = 0; |
529 | 0 | int nCountSpace = 0; |
530 | |
|
531 | 0 | for (; *pszLine != '\0'; pszLine++) |
532 | 0 | { |
533 | 0 | if (!bInString && *pszLine == ',') |
534 | 0 | { |
535 | 0 | nCountComma++; |
536 | 0 | } |
537 | 0 | else if (!bInString && *pszLine == ';') |
538 | 0 | { |
539 | 0 | nCountSemicolon++; |
540 | 0 | } |
541 | 0 | else if (!bInString && *pszLine == '\t') |
542 | 0 | { |
543 | 0 | nCountTab++; |
544 | 0 | } |
545 | 0 | else if (!bInString && *pszLine == '|') |
546 | 0 | { |
547 | 0 | nCountPipe++; |
548 | 0 | } |
549 | 0 | else if (!bInString && *pszLine == ' ') |
550 | 0 | { |
551 | 0 | nCountSpace++; |
552 | 0 | } |
553 | 0 | else if (*pszLine == '"') |
554 | 0 | { |
555 | 0 | if (!bInString || pszLine[1] != '"') |
556 | 0 | { |
557 | 0 | bInString = !bInString; |
558 | 0 | continue; |
559 | 0 | } |
560 | 0 | else /* doubled quotes in string resolve to one quote */ |
561 | 0 | { |
562 | 0 | pszLine++; |
563 | 0 | } |
564 | 0 | } |
565 | 0 | } |
566 | |
|
567 | 0 | const int nMaxCountExceptSpace = |
568 | 0 | std::max(std::max(nCountComma, nCountSemicolon), |
569 | 0 | std::max(nCountTab, nCountPipe)); |
570 | 0 | char chDelimiter = ','; |
571 | 0 | if (nMaxCountExceptSpace == 0) |
572 | 0 | { |
573 | 0 | if (nCountSpace > 0) |
574 | 0 | chDelimiter = ' '; |
575 | 0 | } |
576 | 0 | else |
577 | 0 | { |
578 | 0 | bool bWarn = false; |
579 | 0 | if (nCountComma == nMaxCountExceptSpace) |
580 | 0 | { |
581 | 0 | chDelimiter = ','; |
582 | 0 | bWarn = (nCountSemicolon > 0 || nCountTab > 0 || nCountPipe > 0); |
583 | 0 | } |
584 | 0 | else if (nCountSemicolon == nMaxCountExceptSpace) |
585 | 0 | { |
586 | 0 | chDelimiter = ';'; |
587 | 0 | bWarn = (nCountComma > 0 || nCountTab > 0 || nCountPipe > 0); |
588 | 0 | } |
589 | 0 | else if (nCountTab == nMaxCountExceptSpace) |
590 | 0 | { |
591 | 0 | chDelimiter = '\t'; |
592 | 0 | bWarn = (nCountComma > 0 || nCountSemicolon > 0 || nCountPipe > 0); |
593 | 0 | } |
594 | 0 | else /* if( nCountPipe == nMaxCountExceptSpace ) */ |
595 | 0 | { |
596 | 0 | chDelimiter = '|'; |
597 | 0 | bWarn = (nCountComma > 0 || nCountSemicolon > 0 || nCountTab > 0); |
598 | 0 | } |
599 | 0 | if (bWarn) |
600 | 0 | { |
601 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
602 | 0 | "Selecting '%c' as CSV field separator, but " |
603 | 0 | "other candidate separator(s) have been found.", |
604 | 0 | chDelimiter); |
605 | 0 | } |
606 | 0 | } |
607 | |
|
608 | 0 | return chDelimiter; |
609 | 0 | } |
610 | | |
611 | | /************************************************************************/ |
612 | | /* CSVReadParseLineGeneric() */ |
613 | | /* */ |
614 | | /* Read one line, and return split into fields. The return */ |
615 | | /* result is a stringlist, in the sense of the CSL functions. */ |
616 | | /************************************************************************/ |
617 | | |
618 | | static char ** |
619 | | CSVReadParseLineGeneric(void *fp, const char *(*pfnReadLine)(void *, size_t), |
620 | | size_t nMaxLineSize, const char *pszDelimiter, |
621 | | bool bHonourStrings, bool bKeepLeadingAndClosingQuotes, |
622 | | bool bMergeDelimiter, bool bSkipBOM) |
623 | 0 | { |
624 | 0 | const char *pszLine = pfnReadLine(fp, nMaxLineSize); |
625 | 0 | if (pszLine == nullptr) |
626 | 0 | return nullptr; |
627 | | |
628 | 0 | if (bSkipBOM) |
629 | 0 | { |
630 | | // Skip BOM. |
631 | 0 | const GByte *pabyData = reinterpret_cast<const GByte *>(pszLine); |
632 | 0 | if (pabyData[0] == 0xEF && pabyData[1] == 0xBB && pabyData[2] == 0xBF) |
633 | 0 | pszLine += 3; |
634 | 0 | } |
635 | | |
636 | | // Special fix to read NdfcFacilities.xls with un-balanced double quotes. |
637 | 0 | if (!bHonourStrings) |
638 | 0 | { |
639 | 0 | return CSLTokenizeStringComplex(pszLine, pszDelimiter, FALSE, TRUE); |
640 | 0 | } |
641 | | |
642 | | // If there are no quotes, then this is the simple case. |
643 | | // Parse, and return tokens. |
644 | 0 | if (strchr(pszLine, '\"') == nullptr) |
645 | 0 | return CSVSplitLine(pszLine, pszDelimiter, bKeepLeadingAndClosingQuotes, |
646 | 0 | bMergeDelimiter); |
647 | | |
648 | 0 | const size_t nDelimiterLength = strlen(pszDelimiter); |
649 | 0 | bool bInString = false; // keep in that scope ! |
650 | 0 | std::string osWorkLine(pszLine); // keep in that scope ! |
651 | 0 | size_t i = 0; // keep in that scope ! |
652 | |
|
653 | 0 | try |
654 | 0 | { |
655 | 0 | while (true) |
656 | 0 | { |
657 | 0 | for (; i < osWorkLine.size(); ++i) |
658 | 0 | { |
659 | 0 | if (osWorkLine[i] == '\"') |
660 | 0 | { |
661 | 0 | if (!bInString) |
662 | 0 | { |
663 | | // Only consider " as the start of a quoted string |
664 | | // if it is the first character of the line, or |
665 | | // if it is immediately after the field delimiter. |
666 | 0 | if (i == 0 || |
667 | 0 | (i >= nDelimiterLength && |
668 | 0 | osWorkLine.compare(i - nDelimiterLength, |
669 | 0 | nDelimiterLength, pszDelimiter, |
670 | 0 | nDelimiterLength) == 0)) |
671 | 0 | { |
672 | 0 | bInString = true; |
673 | 0 | } |
674 | 0 | } |
675 | 0 | else if (i + 1 < osWorkLine.size() && |
676 | 0 | osWorkLine[i + 1] == '"') |
677 | 0 | { |
678 | | // Escaped double quote in a quoted string |
679 | 0 | ++i; |
680 | 0 | } |
681 | 0 | else |
682 | 0 | { |
683 | 0 | bInString = false; |
684 | 0 | } |
685 | 0 | } |
686 | 0 | } |
687 | |
|
688 | 0 | if (!bInString) |
689 | 0 | { |
690 | 0 | return CSVSplitLine(osWorkLine.c_str(), pszDelimiter, |
691 | 0 | bKeepLeadingAndClosingQuotes, |
692 | 0 | bMergeDelimiter); |
693 | 0 | } |
694 | | |
695 | 0 | const char *pszNewLine = pfnReadLine(fp, nMaxLineSize); |
696 | 0 | if (pszNewLine == nullptr) |
697 | 0 | break; |
698 | | |
699 | 0 | osWorkLine.append("\n"); |
700 | 0 | osWorkLine.append(pszNewLine); |
701 | 0 | } |
702 | 0 | } |
703 | 0 | catch (const std::exception &e) |
704 | 0 | { |
705 | 0 | CPLError(CE_Failure, CPLE_OutOfMemory, "%s", e.what()); |
706 | 0 | } |
707 | | |
708 | 0 | if (bInString) |
709 | 0 | { |
710 | 0 | CPLError(CE_Failure, CPLE_AppDefined, |
711 | 0 | "CSV file has unbalanced number of double-quotes. Corrupted " |
712 | 0 | "data will likely be returned"); |
713 | 0 | } |
714 | |
|
715 | 0 | return nullptr; |
716 | 0 | } |
717 | | |
718 | | /************************************************************************/ |
719 | | /* CSVReadParseLine() */ |
720 | | /* */ |
721 | | /* Read one line, and return split into fields. The return */ |
722 | | /* result is a stringlist, in the sense of the CSL functions. */ |
723 | | /* */ |
724 | | /* Deprecated. Replaced by CSVReadParseLineL(). */ |
725 | | /************************************************************************/ |
726 | | |
727 | | char **CSVReadParseLine(FILE *fp) |
728 | 0 | { |
729 | 0 | return CSVReadParseLine2(fp, ','); |
730 | 0 | } |
731 | | |
732 | | static const char *ReadLineClassicalFile(void *fp, size_t /* nMaxLineSize */) |
733 | 0 | { |
734 | 0 | return CPLReadLine(static_cast<FILE *>(fp)); |
735 | 0 | } |
736 | | |
737 | | char **CSVReadParseLine2(FILE *fp, char chDelimiter) |
738 | 0 | { |
739 | 0 | CPLAssert(fp != nullptr); |
740 | 0 | if (fp == nullptr) |
741 | 0 | return nullptr; |
742 | | |
743 | 0 | char szDelimiter[2] = {chDelimiter, 0}; |
744 | 0 | return CSVReadParseLineGeneric(fp, ReadLineClassicalFile, |
745 | 0 | 0, // nMaxLineSize, |
746 | 0 | szDelimiter, |
747 | 0 | true, // bHonourStrings |
748 | 0 | false, // bKeepLeadingAndClosingQuotes |
749 | 0 | false, // bMergeDelimiter |
750 | 0 | true /* bSkipBOM */); |
751 | 0 | } |
752 | | |
753 | | /************************************************************************/ |
754 | | /* CSVReadParseLineL() */ |
755 | | /* */ |
756 | | /* Read one line, and return split into fields. The return */ |
757 | | /* result is a stringlist, in the sense of the CSL functions. */ |
758 | | /* */ |
759 | | /* Replaces CSVReadParseLine(). These functions use the VSI */ |
760 | | /* layer to allow reading from other file containers. */ |
761 | | /************************************************************************/ |
762 | | |
763 | | char **CSVReadParseLineL(VSILFILE *fp) |
764 | 0 | { |
765 | 0 | return CSVReadParseLine2L(fp, ','); |
766 | 0 | } |
767 | | |
768 | | char **CSVReadParseLine2L(VSILFILE *fp, char chDelimiter) |
769 | | |
770 | 0 | { |
771 | 0 | CPLAssert(fp != nullptr); |
772 | 0 | if (fp == nullptr) |
773 | 0 | return nullptr; |
774 | | |
775 | 0 | char szDelimiter[2] = {chDelimiter, 0}; |
776 | 0 | return CSVReadParseLine3L(fp, |
777 | 0 | 0, // nMaxLineSize |
778 | 0 | szDelimiter, |
779 | 0 | true, // bHonourStrings |
780 | 0 | false, // bKeepLeadingAndClosingQuotes |
781 | 0 | false, // bMergeDelimiter |
782 | 0 | true /* bSkipBOM */); |
783 | 0 | } |
784 | | |
785 | | /************************************************************************/ |
786 | | /* ReadLineLargeFile() */ |
787 | | /************************************************************************/ |
788 | | |
789 | | static const char *ReadLineLargeFile(void *fp, size_t nMaxLineSize) |
790 | 0 | { |
791 | 0 | int nBufLength = 0; |
792 | 0 | return CPLReadLine3L(static_cast<VSILFILE *>(fp), |
793 | 0 | nMaxLineSize == 0 ? -1 |
794 | 0 | : static_cast<int>(nMaxLineSize), |
795 | 0 | &nBufLength, nullptr); |
796 | 0 | } |
797 | | |
798 | | /************************************************************************/ |
799 | | /* CSVReadParseLine3L() */ |
800 | | /* */ |
801 | | /* Read one line, and return split into fields. The return */ |
802 | | /* result is a stringlist, in the sense of the CSL functions. */ |
803 | | /************************************************************************/ |
804 | | |
805 | | /** Read one line, and return split into fields. |
806 | | * The return result is a stringlist, in the sense of the CSL functions. |
807 | | * |
808 | | * @param fp File handle. Must not be NULL |
809 | | * @param nMaxLineSize Maximum line size, or 0 for unlimited. |
810 | | * @param pszDelimiter Delimiter sequence for readers (can be multiple bytes) |
811 | | * @param bHonourStrings Should be true, unless double quotes should not be |
812 | | * considered when separating fields. |
813 | | * @param bKeepLeadingAndClosingQuotes Whether the leading and closing double |
814 | | * quote characters should be kept. |
815 | | * @param bMergeDelimiter Whether consecutive delimiters should be considered |
816 | | * as a single one. Should generally be set to false. |
817 | | * @param bSkipBOM Whether leading UTF-8 BOM should be skipped. |
818 | | */ |
819 | | char **CSVReadParseLine3L(VSILFILE *fp, size_t nMaxLineSize, |
820 | | const char *pszDelimiter, bool bHonourStrings, |
821 | | bool bKeepLeadingAndClosingQuotes, |
822 | | bool bMergeDelimiter, bool bSkipBOM) |
823 | | |
824 | 0 | { |
825 | 0 | return CSVReadParseLineGeneric( |
826 | 0 | fp, ReadLineLargeFile, nMaxLineSize, pszDelimiter, bHonourStrings, |
827 | 0 | bKeepLeadingAndClosingQuotes, bMergeDelimiter, bSkipBOM); |
828 | 0 | } |
829 | | |
830 | | /************************************************************************/ |
831 | | /* CSVCompare() */ |
832 | | /* */ |
833 | | /* Compare a field to a search value using a particular */ |
834 | | /* criteria. */ |
835 | | /************************************************************************/ |
836 | | |
837 | | static bool CSVCompare(const char *pszFieldValue, const char *pszTarget, |
838 | | CSVCompareCriteria eCriteria) |
839 | | |
840 | 0 | { |
841 | 0 | if (eCriteria == CC_ExactString) |
842 | 0 | { |
843 | 0 | return (strcmp(pszFieldValue, pszTarget) == 0); |
844 | 0 | } |
845 | 0 | else if (eCriteria == CC_ApproxString) |
846 | 0 | { |
847 | 0 | return EQUAL(pszFieldValue, pszTarget); |
848 | 0 | } |
849 | 0 | else if (eCriteria == CC_Integer) |
850 | 0 | { |
851 | 0 | return (CPLGetValueType(pszFieldValue) == CPL_VALUE_INTEGER && |
852 | 0 | atoi(pszFieldValue) == atoi(pszTarget)); |
853 | 0 | } |
854 | | |
855 | 0 | return false; |
856 | 0 | } |
857 | | |
858 | | /************************************************************************/ |
859 | | /* CSVScanLines() */ |
860 | | /* */ |
861 | | /* Read the file scanline for lines where the key field equals */ |
862 | | /* the indicated value with the suggested comparison criteria. */ |
863 | | /* Return the first matching line split into fields. */ |
864 | | /* */ |
865 | | /* Deprecated. Replaced by CSVScanLinesL(). */ |
866 | | /************************************************************************/ |
867 | | |
868 | | char **CSVScanLines(FILE *fp, int iKeyField, const char *pszValue, |
869 | | CSVCompareCriteria eCriteria) |
870 | | |
871 | 0 | { |
872 | 0 | CPLAssert(pszValue != nullptr); |
873 | 0 | CPLAssert(iKeyField >= 0); |
874 | 0 | CPLAssert(fp != nullptr); |
875 | | |
876 | 0 | bool bSelected = false; |
877 | 0 | const int nTestValue = atoi(pszValue); |
878 | 0 | char **papszFields = nullptr; |
879 | |
|
880 | 0 | while (!bSelected) |
881 | 0 | { |
882 | 0 | papszFields = CSVReadParseLine(fp); |
883 | 0 | if (papszFields == nullptr) |
884 | 0 | return nullptr; |
885 | | |
886 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
887 | 0 | { |
888 | | /* not selected */ |
889 | 0 | } |
890 | 0 | else if (eCriteria == CC_Integer && |
891 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
892 | 0 | { |
893 | 0 | bSelected = true; |
894 | 0 | } |
895 | 0 | else |
896 | 0 | { |
897 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
898 | 0 | } |
899 | |
|
900 | 0 | if (!bSelected) |
901 | 0 | { |
902 | 0 | CSLDestroy(papszFields); |
903 | 0 | papszFields = nullptr; |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | 0 | return papszFields; |
908 | 0 | } |
909 | | |
910 | | /************************************************************************/ |
911 | | /* CSVScanLinesL() */ |
912 | | /* */ |
913 | | /* Read the file scanline for lines where the key field equals */ |
914 | | /* the indicated value with the suggested comparison criteria. */ |
915 | | /* Return the first matching line split into fields. */ |
916 | | /************************************************************************/ |
917 | | |
918 | | char **CSVScanLinesL(VSILFILE *fp, int iKeyField, const char *pszValue, |
919 | | CSVCompareCriteria eCriteria) |
920 | | |
921 | 0 | { |
922 | 0 | CPLAssert(pszValue != nullptr); |
923 | 0 | CPLAssert(iKeyField >= 0); |
924 | 0 | CPLAssert(fp != nullptr); |
925 | | |
926 | 0 | bool bSelected = false; |
927 | 0 | const int nTestValue = atoi(pszValue); |
928 | 0 | char **papszFields = nullptr; |
929 | |
|
930 | 0 | while (!bSelected) |
931 | 0 | { |
932 | 0 | papszFields = CSVReadParseLineL(fp); |
933 | 0 | if (papszFields == nullptr) |
934 | 0 | return nullptr; |
935 | | |
936 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
937 | 0 | { |
938 | | /* not selected */ |
939 | 0 | } |
940 | 0 | else if (eCriteria == CC_Integer && |
941 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
942 | 0 | { |
943 | 0 | bSelected = true; |
944 | 0 | } |
945 | 0 | else |
946 | 0 | { |
947 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
948 | 0 | } |
949 | |
|
950 | 0 | if (!bSelected) |
951 | 0 | { |
952 | 0 | CSLDestroy(papszFields); |
953 | 0 | papszFields = nullptr; |
954 | 0 | } |
955 | 0 | } |
956 | | |
957 | 0 | return papszFields; |
958 | 0 | } |
959 | | |
960 | | /************************************************************************/ |
961 | | /* CSVScanLinesIndexed() */ |
962 | | /* */ |
963 | | /* Read the file scanline for lines where the key field equals */ |
964 | | /* the indicated value with the suggested comparison criteria. */ |
965 | | /* Return the first matching line split into fields. */ |
966 | | /************************************************************************/ |
967 | | |
968 | | static char **CSVScanLinesIndexed(CSVTable *psTable, int nKeyValue) |
969 | | |
970 | 0 | { |
971 | 0 | CPLAssert(psTable->panLineIndex != nullptr); |
972 | | |
973 | | /* -------------------------------------------------------------------- */ |
974 | | /* Find target record with binary search. */ |
975 | | /* -------------------------------------------------------------------- */ |
976 | 0 | int iTop = psTable->nLineCount - 1; |
977 | 0 | int iBottom = 0; |
978 | 0 | int iResult = -1; |
979 | |
|
980 | 0 | while (iTop >= iBottom) |
981 | 0 | { |
982 | 0 | const int iMiddle = (iTop + iBottom) / 2; |
983 | 0 | if (psTable->panLineIndex[iMiddle] > nKeyValue) |
984 | 0 | iTop = iMiddle - 1; |
985 | 0 | else if (psTable->panLineIndex[iMiddle] < nKeyValue) |
986 | 0 | iBottom = iMiddle + 1; |
987 | 0 | else |
988 | 0 | { |
989 | 0 | iResult = iMiddle; |
990 | | // if a key is not unique, select the first instance of it. |
991 | 0 | while (iResult > 0 && |
992 | 0 | psTable->panLineIndex[iResult - 1] == nKeyValue) |
993 | 0 | { |
994 | 0 | psTable->bNonUniqueKey = true; |
995 | 0 | iResult--; |
996 | 0 | } |
997 | 0 | break; |
998 | 0 | } |
999 | 0 | } |
1000 | |
|
1001 | 0 | if (iResult == -1) |
1002 | 0 | return nullptr; |
1003 | | |
1004 | | /* -------------------------------------------------------------------- */ |
1005 | | /* Parse target line, and update iLastLine indicator. */ |
1006 | | /* -------------------------------------------------------------------- */ |
1007 | 0 | psTable->iLastLine = iResult; |
1008 | |
|
1009 | 0 | return CSVSplitLine(psTable->papszLines[iResult], ",", false, false); |
1010 | 0 | } |
1011 | | |
1012 | | /************************************************************************/ |
1013 | | /* CSVScanLinesIngested() */ |
1014 | | /* */ |
1015 | | /* Read the file scanline for lines where the key field equals */ |
1016 | | /* the indicated value with the suggested comparison criteria. */ |
1017 | | /* Return the first matching line split into fields. */ |
1018 | | /************************************************************************/ |
1019 | | |
1020 | | static char **CSVScanLinesIngested(CSVTable *psTable, int iKeyField, |
1021 | | const char *pszValue, |
1022 | | CSVCompareCriteria eCriteria) |
1023 | | |
1024 | 0 | { |
1025 | 0 | CPLAssert(pszValue != nullptr); |
1026 | 0 | CPLAssert(iKeyField >= 0); |
1027 | | |
1028 | 0 | const int nTestValue = atoi(pszValue); |
1029 | | |
1030 | | /* -------------------------------------------------------------------- */ |
1031 | | /* Short cut for indexed files. */ |
1032 | | /* -------------------------------------------------------------------- */ |
1033 | 0 | if (iKeyField == 0 && eCriteria == CC_Integer && |
1034 | 0 | psTable->panLineIndex != nullptr) |
1035 | 0 | return CSVScanLinesIndexed(psTable, nTestValue); |
1036 | | |
1037 | | /* -------------------------------------------------------------------- */ |
1038 | | /* Scan from in-core lines. */ |
1039 | | /* -------------------------------------------------------------------- */ |
1040 | 0 | char **papszFields = nullptr; |
1041 | 0 | bool bSelected = false; |
1042 | |
|
1043 | 0 | while (!bSelected && psTable->iLastLine + 1 < psTable->nLineCount) |
1044 | 0 | { |
1045 | 0 | psTable->iLastLine++; |
1046 | 0 | papszFields = CSVSplitLine(psTable->papszLines[psTable->iLastLine], ",", |
1047 | 0 | false, false); |
1048 | |
|
1049 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
1050 | 0 | { |
1051 | | /* not selected */ |
1052 | 0 | } |
1053 | 0 | else if (eCriteria == CC_Integer && |
1054 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
1055 | 0 | { |
1056 | 0 | bSelected = true; |
1057 | 0 | } |
1058 | 0 | else |
1059 | 0 | { |
1060 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
1061 | 0 | } |
1062 | |
|
1063 | 0 | if (!bSelected) |
1064 | 0 | { |
1065 | 0 | CSLDestroy(papszFields); |
1066 | 0 | papszFields = nullptr; |
1067 | 0 | } |
1068 | 0 | } |
1069 | |
|
1070 | 0 | return papszFields; |
1071 | 0 | } |
1072 | | |
1073 | | /************************************************************************/ |
1074 | | /* CSVRewind() */ |
1075 | | /* */ |
1076 | | /* Rewind a CSV file based on a passed in filename. */ |
1077 | | /* This is aimed at being used with CSVGetNextLine(). */ |
1078 | | /************************************************************************/ |
1079 | | |
1080 | | void CSVRewind(const char *pszFilename) |
1081 | | |
1082 | 0 | { |
1083 | | /* -------------------------------------------------------------------- */ |
1084 | | /* Get access to the table. */ |
1085 | | /* -------------------------------------------------------------------- */ |
1086 | 0 | CPLAssert(pszFilename != nullptr); |
1087 | | |
1088 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1089 | 0 | if (psTable != nullptr) |
1090 | 0 | psTable->iLastLine = -1; |
1091 | 0 | } |
1092 | | |
1093 | | /************************************************************************/ |
1094 | | /* CSVGetNextLine() */ |
1095 | | /* */ |
1096 | | /* Fetch the next line of a CSV file based on a passed in */ |
1097 | | /* filename. Returns NULL at end of file, or if file is not */ |
1098 | | /* really established. */ |
1099 | | /* This ingests the whole file into memory if not already done. */ |
1100 | | /* When reaching end of file, CSVRewind() may be used to read */ |
1101 | | /* again from the beginning. */ |
1102 | | /************************************************************************/ |
1103 | | |
1104 | | char **CSVGetNextLine(const char *pszFilename) |
1105 | | |
1106 | 0 | { |
1107 | | |
1108 | | /* -------------------------------------------------------------------- */ |
1109 | | /* Get access to the table. */ |
1110 | | /* -------------------------------------------------------------------- */ |
1111 | 0 | CPLAssert(pszFilename != nullptr); |
1112 | | |
1113 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1114 | 0 | if (psTable == nullptr) |
1115 | 0 | return nullptr; |
1116 | | |
1117 | 0 | CSVIngest(psTable->pszFilename); |
1118 | | |
1119 | | /* -------------------------------------------------------------------- */ |
1120 | | /* If we use CSVGetNextLine() we can pretty much assume we have */ |
1121 | | /* a non-unique key. */ |
1122 | | /* -------------------------------------------------------------------- */ |
1123 | 0 | psTable->bNonUniqueKey = true; |
1124 | | |
1125 | | /* -------------------------------------------------------------------- */ |
1126 | | /* Do we have a next line available? This only works for */ |
1127 | | /* ingested tables I believe. */ |
1128 | | /* -------------------------------------------------------------------- */ |
1129 | 0 | if (psTable->iLastLine + 1 >= psTable->nLineCount) |
1130 | 0 | return nullptr; |
1131 | | |
1132 | 0 | psTable->iLastLine++; |
1133 | 0 | CSLDestroy(psTable->papszRecFields); |
1134 | 0 | psTable->papszRecFields = CSVSplitLine( |
1135 | 0 | psTable->papszLines[psTable->iLastLine], ",", false, false); |
1136 | |
|
1137 | 0 | return psTable->papszRecFields; |
1138 | 0 | } |
1139 | | |
1140 | | /************************************************************************/ |
1141 | | /* CSVScanFile() */ |
1142 | | /* */ |
1143 | | /* Scan a whole file using criteria similar to above, but also */ |
1144 | | /* taking care of file opening and closing. */ |
1145 | | /************************************************************************/ |
1146 | | |
1147 | | static char **CSVScanFile(CSVTable *const psTable, int iKeyField, |
1148 | | const char *pszValue, CSVCompareCriteria eCriteria) |
1149 | 0 | { |
1150 | 0 | CSVIngest(psTable->pszFilename); |
1151 | | |
1152 | | /* -------------------------------------------------------------------- */ |
1153 | | /* Does the current record match the criteria? If so, just */ |
1154 | | /* return it again. */ |
1155 | | /* -------------------------------------------------------------------- */ |
1156 | 0 | if (iKeyField >= 0 && iKeyField < CSLCount(psTable->papszRecFields) && |
1157 | 0 | CSVCompare(psTable->papszRecFields[iKeyField], pszValue, eCriteria) && |
1158 | 0 | !psTable->bNonUniqueKey) |
1159 | 0 | { |
1160 | 0 | return psTable->papszRecFields; |
1161 | 0 | } |
1162 | | |
1163 | | /* -------------------------------------------------------------------- */ |
1164 | | /* Scan the file from the beginning, replacing the ``current */ |
1165 | | /* record'' in our structure with the one that is found. */ |
1166 | | /* -------------------------------------------------------------------- */ |
1167 | 0 | psTable->iLastLine = -1; |
1168 | 0 | CSLDestroy(psTable->papszRecFields); |
1169 | |
|
1170 | 0 | if (psTable->pszRawData != nullptr) |
1171 | 0 | psTable->papszRecFields = |
1172 | 0 | CSVScanLinesIngested(psTable, iKeyField, pszValue, eCriteria); |
1173 | 0 | else |
1174 | 0 | { |
1175 | 0 | VSIRewindL(psTable->fp); |
1176 | 0 | CPLReadLineL(psTable->fp); /* throw away the header line */ |
1177 | |
|
1178 | 0 | psTable->papszRecFields = |
1179 | 0 | CSVScanLinesL(psTable->fp, iKeyField, pszValue, eCriteria); |
1180 | 0 | } |
1181 | |
|
1182 | 0 | return psTable->papszRecFields; |
1183 | 0 | } |
1184 | | |
1185 | | char **CSVScanFile(const char *pszFilename, int iKeyField, const char *pszValue, |
1186 | | CSVCompareCriteria eCriteria) |
1187 | | |
1188 | 0 | { |
1189 | | /* -------------------------------------------------------------------- */ |
1190 | | /* Get access to the table. */ |
1191 | | /* -------------------------------------------------------------------- */ |
1192 | 0 | CPLAssert(pszFilename != nullptr); |
1193 | | |
1194 | 0 | if (iKeyField < 0) |
1195 | 0 | return nullptr; |
1196 | | |
1197 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1198 | 0 | if (psTable == nullptr) |
1199 | 0 | return nullptr; |
1200 | | |
1201 | 0 | return CSVScanFile(psTable, iKeyField, pszValue, eCriteria); |
1202 | 0 | } |
1203 | | |
1204 | | /************************************************************************/ |
1205 | | /* CSVGetFieldId() */
1206 | | /* */ |
1207 | | /* Read the first record of a CSV file (rewinding to be sure), */ |
1208 | | /* and find the field with the indicated name. Returns -1 if */ |
1209 | | /* it fails to find the field name. Comparison is case */ |
1210 | | /* insensitive, but otherwise exact. After this function has */ |
1211 | | /* been called the file pointer will be positioned just after */ |
1212 | | /* the first record. */ |
1213 | | /* */ |
1214 | | /* Deprecated. Replaced by CSVGetFieldIdL(). */
1215 | | /************************************************************************/ |
1216 | | |
1217 | | int CSVGetFieldId(FILE *fp, const char *pszFieldName) |
1218 | | |
1219 | 0 | { |
1220 | 0 | CPLAssert(fp != nullptr && pszFieldName != nullptr); |
1221 | | |
1222 | 0 | VSIRewind(fp); |
1223 | |
|
1224 | 0 | char **papszFields = CSVReadParseLine(fp); |
1225 | 0 | for (int i = 0; papszFields != nullptr && papszFields[i] != nullptr; i++) |
1226 | 0 | { |
1227 | 0 | if (EQUAL(papszFields[i], pszFieldName)) |
1228 | 0 | { |
1229 | 0 | CSLDestroy(papszFields); |
1230 | 0 | return i; |
1231 | 0 | } |
1232 | 0 | } |
1233 | | |
1234 | 0 | CSLDestroy(papszFields); |
1235 | |
|
1236 | 0 | return -1; |
1237 | 0 | } |
1238 | | |
1239 | | /************************************************************************/ |
1240 | | /* CSVGetFieldIdL() */
1241 | | /* */ |
1242 | | /* Read the first record of a CSV file (rewinding to be sure), */ |
1243 | | /* and find the field with the indicated name. Returns -1 if */ |
1244 | | /* it fails to find the field name. Comparison is case */ |
1245 | | /* insensitive, but otherwise exact. After this function has */ |
1246 | | /* been called the file pointer will be positioned just after */ |
1247 | | /* the first record. */ |
1248 | | /************************************************************************/ |
1249 | | |
1250 | | int CSVGetFieldIdL(VSILFILE *fp, const char *pszFieldName) |
1251 | | |
1252 | 0 | { |
1253 | 0 | CPLAssert(fp != nullptr && pszFieldName != nullptr); |
1254 | | |
1255 | 0 | VSIRewindL(fp); |
1256 | |
|
1257 | 0 | char **papszFields = CSVReadParseLineL(fp); |
1258 | 0 | for (int i = 0; papszFields != nullptr && papszFields[i] != nullptr; i++) |
1259 | 0 | { |
1260 | 0 | if (EQUAL(papszFields[i], pszFieldName)) |
1261 | 0 | { |
1262 | 0 | CSLDestroy(papszFields); |
1263 | 0 | return i; |
1264 | 0 | } |
1265 | 0 | } |
1266 | | |
1267 | 0 | CSLDestroy(papszFields); |
1268 | |
|
1269 | 0 | return -1; |
1270 | 0 | } |
1271 | | |
1272 | | /************************************************************************/ |
1273 | | /* CSVGetFileFieldId() */ |
1274 | | /* */ |
1275 | | /* Same as CSVGetFieldId(), except that we get the file based */
1276 | | /* on filename, rather than having an existing handle. */ |
1277 | | /************************************************************************/ |
1278 | | |
1279 | | static int CSVGetFileFieldId(CSVTable *const psTable, const char *pszFieldName) |
1280 | | |
1281 | 0 | { |
1282 | | /* -------------------------------------------------------------------- */ |
1283 | | /* Find the requested field. */ |
1284 | | /* -------------------------------------------------------------------- */ |
1285 | 0 | const int nFieldNameLength = static_cast<int>(strlen(pszFieldName)); |
1286 | 0 | for (int i = 0; psTable->papszFieldNames != nullptr && |
1287 | 0 | psTable->papszFieldNames[i] != nullptr; |
1288 | 0 | i++) |
1289 | 0 | { |
1290 | 0 | if (psTable->panFieldNamesLength[i] == nFieldNameLength && |
1291 | 0 | EQUALN(psTable->papszFieldNames[i], pszFieldName, nFieldNameLength)) |
1292 | 0 | { |
1293 | 0 | return i; |
1294 | 0 | } |
1295 | 0 | } |
1296 | | |
1297 | 0 | return -1; |
1298 | 0 | } |
1299 | | |
1300 | | int CSVGetFileFieldId(const char *pszFilename, const char *pszFieldName) |
1301 | | |
1302 | 0 | { |
1303 | | /* -------------------------------------------------------------------- */ |
1304 | | /* Get access to the table. */ |
1305 | | /* -------------------------------------------------------------------- */ |
1306 | 0 | CPLAssert(pszFilename != nullptr); |
1307 | | |
1308 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1309 | 0 | if (psTable == nullptr) |
1310 | 0 | return -1; |
1311 | 0 | return CSVGetFileFieldId(psTable, pszFieldName); |
1312 | 0 | } |
1313 | | |
1314 | | /************************************************************************/ |
1315 | | /* CSVScanFileByName() */ |
1316 | | /* */ |
1317 | | /* Same as CSVScanFile(), but using a field name instead of a */ |
1318 | | /* field number. */ |
1319 | | /************************************************************************/ |
1320 | | |
1321 | | char **CSVScanFileByName(const char *pszFilename, const char *pszKeyFieldName, |
1322 | | const char *pszValue, CSVCompareCriteria eCriteria) |
1323 | | |
1324 | 0 | { |
1325 | 0 | const int iKeyField = CSVGetFileFieldId(pszFilename, pszKeyFieldName); |
1326 | 0 | if (iKeyField == -1) |
1327 | 0 | return nullptr; |
1328 | | |
1329 | 0 | return CSVScanFile(pszFilename, iKeyField, pszValue, eCriteria); |
1330 | 0 | } |
1331 | | |
1332 | | /************************************************************************/ |
1333 | | /* CSVGetField() */ |
1334 | | /* */ |
1335 | | /* The all-in-one function to fetch a particular field value */ |
1336 | | /* from a CSV file. Note this function will return an empty */ |
1337 | | /* string, rather than NULL if it fails to find the desired */ |
1338 | | /* value for some reason. The caller can't establish that the */ |
1339 | | /* fetch failed. */ |
1340 | | /************************************************************************/ |
1341 | | |
1342 | | const char *CSVGetField(const char *pszFilename, const char *pszKeyFieldName, |
1343 | | const char *pszKeyFieldValue, |
1344 | | CSVCompareCriteria eCriteria, |
1345 | | const char *pszTargetField) |
1346 | | |
1347 | 0 | { |
1348 | | /* -------------------------------------------------------------------- */ |
1349 | | /* Find the table. */ |
1350 | | /* -------------------------------------------------------------------- */ |
1351 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1352 | 0 | if (psTable == nullptr) |
1353 | 0 | return ""; |
1354 | | |
1355 | 0 | const int iKeyField = CSVGetFileFieldId(psTable, pszKeyFieldName); |
1356 | 0 | if (iKeyField == -1) |
1357 | 0 | return ""; |
1358 | | |
1359 | | /* -------------------------------------------------------------------- */ |
1360 | | /* Find the correct record. */ |
1361 | | /* -------------------------------------------------------------------- */ |
1362 | 0 | char **papszRecord = |
1363 | 0 | CSVScanFile(psTable, iKeyField, pszKeyFieldValue, eCriteria); |
1364 | 0 | if (papszRecord == nullptr) |
1365 | 0 | return ""; |
1366 | | |
1367 | | /* -------------------------------------------------------------------- */ |
1368 | | /* Figure out which field we want out of this. */ |
1369 | | /* -------------------------------------------------------------------- */ |
1370 | 0 | const int iTargetField = CSVGetFileFieldId(psTable, pszTargetField); |
1371 | 0 | if (iTargetField < 0) |
1372 | 0 | return ""; |
1373 | | |
1374 | 0 | for (int i = 0; papszRecord[i] != nullptr; ++i) |
1375 | 0 | { |
1376 | 0 | if (i == iTargetField) |
1377 | 0 | return papszRecord[iTargetField]; |
1378 | 0 | } |
1379 | 0 | return ""; |
1380 | 0 | } |
1381 | | |
1382 | | /************************************************************************/ |
1383 | | /* GDALDefaultCSVFilename() */ |
1384 | | /************************************************************************/ |
1385 | | |
1386 | | typedef struct |
1387 | | { |
1388 | | char szPath[512]; |
1389 | | bool bCSVFinderInitialized; |
1390 | | } DefaultCSVFileNameTLS; |
1391 | | |
1392 | | const char *GDALDefaultCSVFilename(const char *pszBasename) |
1393 | | |
1394 | 0 | { |
1395 | | /* -------------------------------------------------------------------- */ |
1396 | | /* Do we already have this file accessed? If so, just return */ |
1397 | | /* the existing path without any further probing. */ |
1398 | | /* -------------------------------------------------------------------- */ |
1399 | 0 | int bMemoryError = FALSE; |
1400 | 0 | CSVTable **ppsCSVTableList = |
1401 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
1402 | 0 | if (ppsCSVTableList != nullptr) |
1403 | 0 | { |
1404 | 0 | const size_t nBasenameLen = strlen(pszBasename); |
1405 | |
|
1406 | 0 | for (const CSVTable *psTable = *ppsCSVTableList; psTable != nullptr; |
1407 | 0 | psTable = psTable->psNext) |
1408 | 0 | { |
1409 | 0 | const size_t nFullLen = strlen(psTable->pszFilename); |
1410 | |
|
1411 | 0 | if (nFullLen > nBasenameLen && |
1412 | 0 | strcmp(psTable->pszFilename + nFullLen - nBasenameLen, |
1413 | 0 | pszBasename) == 0 && |
1414 | 0 | strchr("/\\", |
1415 | 0 | psTable->pszFilename[+nFullLen - nBasenameLen - 1]) != |
1416 | 0 | nullptr) |
1417 | 0 | { |
1418 | 0 | return psTable->pszFilename; |
1419 | 0 | } |
1420 | 0 | } |
1421 | 0 | } |
1422 | | |
1423 | | /* -------------------------------------------------------------------- */ |
1424 | | /* Otherwise we need to look harder for it. */ |
1425 | | /* -------------------------------------------------------------------- */ |
1426 | 0 | DefaultCSVFileNameTLS *pTLSData = static_cast<DefaultCSVFileNameTLS *>( |
1427 | 0 | CPLGetTLSEx(CTLS_CSVDEFAULTFILENAME, &bMemoryError)); |
1428 | 0 | if (pTLSData == nullptr && !bMemoryError) |
1429 | 0 | { |
1430 | 0 | pTLSData = static_cast<DefaultCSVFileNameTLS *>( |
1431 | 0 | VSI_CALLOC_VERBOSE(1, sizeof(DefaultCSVFileNameTLS))); |
1432 | 0 | if (pTLSData) |
1433 | 0 | CPLSetTLS(CTLS_CSVDEFAULTFILENAME, pTLSData, TRUE); |
1434 | 0 | } |
1435 | 0 | if (pTLSData == nullptr) |
1436 | 0 | return "/not_existing_dir/not_existing_path"; |
1437 | | |
1438 | 0 | const char *pszResult = CPLFindFile("gdal", pszBasename); |
1439 | |
|
1440 | 0 | if (pszResult != nullptr) |
1441 | 0 | return pszResult; |
1442 | | |
1443 | 0 | if (!pTLSData->bCSVFinderInitialized) |
1444 | 0 | { |
1445 | 0 | pTLSData->bCSVFinderInitialized = true; |
1446 | |
|
1447 | 0 | if (CPLGetConfigOption("GDAL_DATA", nullptr) != nullptr) |
1448 | 0 | CPLPushFinderLocation(CPLGetConfigOption("GDAL_DATA", nullptr)); |
1449 | |
|
1450 | 0 | pszResult = CPLFindFile("gdal", pszBasename); |
1451 | |
|
1452 | 0 | if (pszResult != nullptr) |
1453 | 0 | return pszResult; |
1454 | 0 | } |
1455 | | |
1456 | | // For systems like sandboxes that do not allow other checks. |
1457 | 0 | CPLDebug("CPL_CSV", |
1458 | 0 | "Failed to find file in GDALDefaultCSVFilename. " |
1459 | 0 | "Returning original basename: %s", |
1460 | 0 | pszBasename); |
1461 | 0 | CPLStrlcpy(pTLSData->szPath, pszBasename, sizeof(pTLSData->szPath)); |
1462 | 0 | return pTLSData->szPath; |
1463 | 0 | } |
1464 | | |
1465 | | /************************************************************************/ |
1466 | | /* CSVFilename() */ |
1467 | | /* */ |
1468 | | /* Return the full path to a particular CSV file. This will */ |
1469 | | /* eventually be something the application can override. */ |
1470 | | /************************************************************************/ |
1471 | | |
1472 | | CPL_C_START |
1473 | | static const char *(*pfnCSVFilenameHook)(const char *) = nullptr; |
1474 | | CPL_C_END |
1475 | | |
1476 | | const char *CSVFilename(const char *pszBasename) |
1477 | | |
1478 | 0 | { |
1479 | 0 | if (pfnCSVFilenameHook == nullptr) |
1480 | 0 | return GDALDefaultCSVFilename(pszBasename); |
1481 | | |
1482 | 0 | return pfnCSVFilenameHook(pszBasename); |
1483 | 0 | } |
1484 | | |
1485 | | /************************************************************************/ |
1486 | | /* SetCSVFilenameHook() */ |
1487 | | /* */ |
1488 | | /* Applications can use this to set a function that will */ |
1489 | | /* massage CSV filenames. */ |
1490 | | /************************************************************************/ |
1491 | | |
1492 | | /** |
1493 | | * Override CSV file search method. |
1494 | | * |
1495 | | * @param pfnNewHook The pointer to a function which will return the |
1496 | | * full path for a given filename. |
1497 | | * |
1498 | | |
1499 | | This function allows an application to override how GTIFGetDefn()
1500 | | and related functions find the CSV (Comma Separated Value) files
1501 | | they require. The pfnNewHook argument should be a pointer to a function
1502 | | that will take in a CSV filename and return a full path to the file. The
1503 | | returned string should point to an internal static buffer so that the
1504 | | caller doesn't have to free the result.
1505 | | |
1506 | | Example: |
1507 | | |
1508 | | The listgeo utility uses the following override function if the user |
1509 | | specified a CSV file directory with the -t commandline switch (argument |
1510 | | put into CSVDirName). |
1511 | | |
1512 | | \code{.cpp} |
1513 | | |
1514 | | ... |
1515 | | SetCSVFilenameHook( CSVFileOverride ); |
1516 | | ... |
1517 | | |
1518 | | static const char *CSVFileOverride( const char * pszInput ) |
1519 | | |
1520 | | { |
1521 | | static char szPath[1024] = {}; |
1522 | | |
1523 | | sprintf( szPath, "%s/%s", CSVDirName, pszInput ); |
1524 | | |
1525 | | return szPath; |
1526 | | } |
1527 | | \endcode |
1528 | | |
1529 | | */ |
1530 | | |
1531 | | CPL_C_START |
1532 | | void SetCSVFilenameHook(const char *(*pfnNewHook)(const char *)) |
1533 | | |
1534 | 0 | { |
1535 | 0 | pfnCSVFilenameHook = pfnNewHook; |
1536 | 0 | } |
1537 | | |
1538 | | CPL_C_END |