/src/gdal/port/cpl_csv.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: CPL - Common Portability Library |
4 | | * Purpose: CSV (comma separated value) file access. |
5 | | * Author: Frank Warmerdam, warmerdam@pobox.com |
6 | | * |
7 | | ****************************************************************************** |
8 | | * Copyright (c) 1999, Frank Warmerdam |
9 | | * Copyright (c) 2009-2012, Even Rouault <even dot rouault at spatialys.com> |
10 | | * |
11 | | * SPDX-License-Identifier: MIT |
12 | | ****************************************************************************/ |
13 | | |
14 | | #include "cpl_port.h" |
15 | | #include "cpl_csv.h" |
16 | | |
17 | | #include <cstddef> |
18 | | #include <cstdlib> |
19 | | #include <cstring> |
20 | | #if HAVE_FCNTL_H |
21 | | #include <fcntl.h> |
22 | | #endif |
23 | | |
24 | | #include "cpl_conv.h" |
25 | | #include "cpl_error.h" |
26 | | #include "cpl_multiproc.h" |
27 | | #include "gdal_csv.h" |
28 | | |
29 | | #include <algorithm> |
30 | | |
31 | | /* ==================================================================== */ |
32 | | /* The CSVTable is a persistent set of info about an open CSV */ |
33 | | /* table. While it doesn't currently maintain a record index, */ |
34 | | /* or in-memory copy of the table, it could be changed to do so */ |
35 | | /* in the future. */ |
36 | | /* ==================================================================== */ |
/* Per-file state for one opened CSV table. Instances are chained into a
 * singly linked list through psNext. */
typedef struct ctb
{
    // Open handle on the file; closed and reset to nullptr once the whole
    // file has been successfully ingested into memory.
    VSILFILE *fp;
    // Next table in the list, or nullptr for the last one.
    struct ctb *psNext;
    // Owned copy of the name the table was opened with (lookup key).
    char *pszFilename;
    // Field names parsed from the header record (CSL string list).
    char **papszFieldNames;
    // strlen() of each entry of papszFieldNames.
    int *panFieldNamesLength;
    // Owned CSL string list released on deaccess.
    // NOTE(review): presumably the most recently parsed record; it is
    // populated outside the code visible here -- confirm.
    char **papszRecFields;
    // Number of entries in papszFieldNames.
    int nFields;
    // Reset to -1 at the end of ingestion.
    // NOTE(review): the consumer is not visible here -- confirm meaning.
    int iLastLine;
    // Initialised to false when the table is opened.
    bool bNonUniqueKey;

    /* Cache for whole file */
    // Number of valid entries in papszLines.
    int nLineCount;
    // Pointers to the start of each data line inside pszRawData (header
    // line and lines starting with '#' are not included).
    char **papszLines;
    // atoi() of the start of each line in papszLines; only kept when the
    // values are in non-decreasing order, otherwise nullptr.
    int *panLineIndex;
    // Entire file contents, NUL-terminated; line terminators are
    // overwritten with NULs while the lines are indexed.
    char *pszRawData;
} CSVTable;
55 | | |
// Forward declaration: CSVFreeTLS() below needs it before the definition.
// Releases the table named pszFilename from *ppsCSVTableList, or every table
// when pszFilename is nullptr.
static void CSVDeaccessInternal(CSVTable **ppsCSVTableList, bool bCanUseTLS,
                                const char *pszFilename);
58 | | |
59 | | /************************************************************************/ |
60 | | /* CSVFreeTLS() */ |
61 | | /************************************************************************/ |
62 | | static void CSVFreeTLS(void *pData) |
63 | 0 | { |
64 | 0 | CSVDeaccessInternal(static_cast<CSVTable **>(pData), false, nullptr); |
65 | 0 | CPLFree(pData); |
66 | 0 | } |
67 | | |
68 | | /* It would likely be better to share this list between threads, but |
69 | | that will require some rework. */ |
70 | | |
71 | | /************************************************************************/ |
72 | | /* CSVAccess() */ |
73 | | /* */ |
74 | | /* This function will fetch a handle to the requested table. */ |
75 | | /* If not found in the ``open table list'' the table will be */ |
76 | | /* opened and added to the list. Eventually this function may */ |
77 | | /* become public with an abstracted return type so that */ |
78 | | /* applications can set options about the table. For now this */ |
79 | | /* isn't done. */ |
80 | | /************************************************************************/ |
81 | | |
82 | | static CSVTable *CSVAccess(const char *pszFilename) |
83 | | |
84 | 0 | { |
85 | | /* -------------------------------------------------------------------- */ |
86 | | /* Fetch the table, and allocate the thread-local pointer to it */ |
87 | | /* if there isn't already one. */ |
88 | | /* -------------------------------------------------------------------- */ |
89 | 0 | int bMemoryError = FALSE; |
90 | 0 | CSVTable **ppsCSVTableList = |
91 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
92 | 0 | if (bMemoryError) |
93 | 0 | return nullptr; |
94 | 0 | if (ppsCSVTableList == nullptr) |
95 | 0 | { |
96 | 0 | ppsCSVTableList = |
97 | 0 | static_cast<CSVTable **>(VSI_CALLOC_VERBOSE(1, sizeof(CSVTable *))); |
98 | 0 | if (ppsCSVTableList == nullptr) |
99 | 0 | return nullptr; |
100 | 0 | CPLSetTLSWithFreeFunc(CTLS_CSVTABLEPTR, ppsCSVTableList, CSVFreeTLS); |
101 | 0 | } |
102 | | |
103 | | /* -------------------------------------------------------------------- */ |
104 | | /* Is the table already in the list. */ |
105 | | /* -------------------------------------------------------------------- */ |
106 | 0 | for (CSVTable *psTable = *ppsCSVTableList; psTable != nullptr; |
107 | 0 | psTable = psTable->psNext) |
108 | 0 | { |
109 | 0 | if (EQUAL(psTable->pszFilename, pszFilename)) |
110 | 0 | { |
111 | | /* |
112 | | * Eventually we should consider promoting to the front of |
113 | | * the list to accelerate frequently accessed tables. |
114 | | */ |
115 | 0 | return psTable; |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | /* -------------------------------------------------------------------- */ |
120 | | /* If not, try to open it. */ |
121 | | /* -------------------------------------------------------------------- */ |
122 | 0 | VSILFILE *fp = VSIFOpenL(pszFilename, "rb"); |
123 | 0 | if (fp == nullptr) |
124 | 0 | return nullptr; |
125 | | |
126 | | /* -------------------------------------------------------------------- */ |
127 | | /* Create an information structure about this table, and add to */ |
128 | | /* the front of the list. */ |
129 | | /* -------------------------------------------------------------------- */ |
130 | 0 | CSVTable *const psTable = |
131 | 0 | static_cast<CSVTable *>(VSI_CALLOC_VERBOSE(sizeof(CSVTable), 1)); |
132 | 0 | if (psTable == nullptr) |
133 | 0 | { |
134 | 0 | VSIFCloseL(fp); |
135 | 0 | return nullptr; |
136 | 0 | } |
137 | | |
138 | 0 | psTable->fp = fp; |
139 | 0 | psTable->pszFilename = VSI_STRDUP_VERBOSE(pszFilename); |
140 | 0 | if (psTable->pszFilename == nullptr) |
141 | 0 | { |
142 | 0 | VSIFree(psTable); |
143 | 0 | VSIFCloseL(fp); |
144 | 0 | return nullptr; |
145 | 0 | } |
146 | 0 | psTable->bNonUniqueKey = false; // As far as we know now. |
147 | 0 | psTable->psNext = *ppsCSVTableList; |
148 | |
|
149 | 0 | *ppsCSVTableList = psTable; |
150 | | |
151 | | /* -------------------------------------------------------------------- */ |
152 | | /* Read the table header record containing the field names. */ |
153 | | /* -------------------------------------------------------------------- */ |
154 | 0 | psTable->papszFieldNames = CSVReadParseLineL(fp); |
155 | 0 | psTable->nFields = CSLCount(psTable->papszFieldNames); |
156 | 0 | psTable->panFieldNamesLength = |
157 | 0 | static_cast<int *>(CPLMalloc(sizeof(int) * psTable->nFields)); |
158 | 0 | for (int i = 0; |
159 | 0 | i < psTable->nFields && |
160 | | /* null-pointer check to avoid a false positive from CLang S.A. */ |
161 | 0 | psTable->papszFieldNames != nullptr; |
162 | 0 | i++) |
163 | 0 | { |
164 | 0 | psTable->panFieldNamesLength[i] = |
165 | 0 | static_cast<int>(strlen(psTable->papszFieldNames[i])); |
166 | 0 | } |
167 | |
|
168 | 0 | return psTable; |
169 | 0 | } |
170 | | |
171 | | /************************************************************************/ |
172 | | /* CSVDeaccess() */ |
173 | | /************************************************************************/ |
174 | | |
175 | | static void CSVDeaccessInternal(CSVTable **ppsCSVTableList, bool bCanUseTLS, |
176 | | const char *pszFilename) |
177 | | |
178 | 0 | { |
179 | 0 | if (ppsCSVTableList == nullptr) |
180 | 0 | return; |
181 | | |
182 | | /* -------------------------------------------------------------------- */ |
183 | | /* A NULL means deaccess all tables. */ |
184 | | /* -------------------------------------------------------------------- */ |
185 | 0 | if (pszFilename == nullptr) |
186 | 0 | { |
187 | 0 | while (*ppsCSVTableList != nullptr) |
188 | 0 | CSVDeaccessInternal(ppsCSVTableList, bCanUseTLS, |
189 | 0 | (*ppsCSVTableList)->pszFilename); |
190 | |
|
191 | 0 | return; |
192 | 0 | } |
193 | | |
194 | | /* -------------------------------------------------------------------- */ |
195 | | /* Find this table. */ |
196 | | /* -------------------------------------------------------------------- */ |
197 | 0 | CSVTable *psLast = nullptr; |
198 | 0 | CSVTable *psTable = *ppsCSVTableList; |
199 | 0 | for (; psTable != nullptr && !EQUAL(psTable->pszFilename, pszFilename); |
200 | 0 | psTable = psTable->psNext) |
201 | 0 | { |
202 | 0 | psLast = psTable; |
203 | 0 | } |
204 | |
|
205 | 0 | if (psTable == nullptr) |
206 | 0 | { |
207 | 0 | if (bCanUseTLS) |
208 | 0 | CPLDebug("CPL_CSV", "CPLDeaccess( %s ) - no match.", pszFilename); |
209 | 0 | return; |
210 | 0 | } |
211 | | |
212 | | /* -------------------------------------------------------------------- */ |
213 | | /* Remove the link from the list. */ |
214 | | /* -------------------------------------------------------------------- */ |
215 | 0 | if (psLast != nullptr) |
216 | 0 | psLast->psNext = psTable->psNext; |
217 | 0 | else |
218 | 0 | *ppsCSVTableList = psTable->psNext; |
219 | | |
220 | | /* -------------------------------------------------------------------- */ |
221 | | /* Free the table. */ |
222 | | /* -------------------------------------------------------------------- */ |
223 | 0 | if (psTable->fp != nullptr) |
224 | 0 | VSIFCloseL(psTable->fp); |
225 | |
|
226 | 0 | CSLDestroy(psTable->papszFieldNames); |
227 | 0 | CPLFree(psTable->panFieldNamesLength); |
228 | 0 | CSLDestroy(psTable->papszRecFields); |
229 | 0 | CPLFree(psTable->pszFilename); |
230 | 0 | CPLFree(psTable->panLineIndex); |
231 | 0 | CPLFree(psTable->pszRawData); |
232 | 0 | CPLFree(psTable->papszLines); |
233 | |
|
234 | 0 | CPLFree(psTable); |
235 | |
|
236 | 0 | if (bCanUseTLS) |
237 | 0 | CPLReadLine(nullptr); |
238 | 0 | } |
239 | | |
240 | | void CSVDeaccess(const char *pszFilename) |
241 | 0 | { |
242 | | /* -------------------------------------------------------------------- */ |
243 | | /* Fetch the table, and allocate the thread-local pointer to it */ |
244 | | /* if there isn't already one. */ |
245 | | /* -------------------------------------------------------------------- */ |
246 | 0 | int bMemoryError = FALSE; |
247 | 0 | CSVTable **ppsCSVTableList = |
248 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
249 | |
|
250 | 0 | CSVDeaccessInternal(ppsCSVTableList, true, pszFilename); |
251 | 0 | } |
252 | | |
253 | | /************************************************************************/ |
254 | | /* CSVSplitLine() */ |
255 | | /* */ |
256 | | /* Tokenize a CSV line into fields in the form of a string */ |
257 | | /* list. This is used instead of the CPLTokenizeString() */ |
258 | | /* because it provides correct CSV escaping and quoting */ |
259 | | /* semantics. */ |
260 | | /************************************************************************/ |
261 | | |
262 | | static char **CSVSplitLine(const char *pszString, const char *pszDelimiter, |
263 | | bool bKeepLeadingAndClosingQuotes, |
264 | | bool bMergeDelimiter) |
265 | | |
266 | 0 | { |
267 | 0 | CPLStringList aosRetList; |
268 | 0 | if (pszString == nullptr) |
269 | 0 | return static_cast<char **>(CPLCalloc(sizeof(char *), 1)); |
270 | | |
271 | 0 | char *pszToken = static_cast<char *>(CPLCalloc(10, 1)); |
272 | 0 | int nTokenMax = 10; |
273 | 0 | const size_t nDelimiterLength = strlen(pszDelimiter); |
274 | |
|
275 | 0 | const char *pszIter = pszString; |
276 | 0 | while (*pszIter != '\0') |
277 | 0 | { |
278 | 0 | bool bInString = false; |
279 | |
|
280 | 0 | int nTokenLen = 0; |
281 | | |
282 | | // Try to find the next delimiter, marking end of token. |
283 | 0 | do |
284 | 0 | { |
285 | | // End if this is a delimiter skip it and break. |
286 | 0 | if (!bInString && |
287 | 0 | strncmp(pszIter, pszDelimiter, nDelimiterLength) == 0) |
288 | 0 | { |
289 | 0 | pszIter += nDelimiterLength; |
290 | 0 | if (bMergeDelimiter) |
291 | 0 | { |
292 | 0 | while (strncmp(pszIter, pszDelimiter, nDelimiterLength) == |
293 | 0 | 0) |
294 | 0 | pszIter += nDelimiterLength; |
295 | 0 | } |
296 | 0 | break; |
297 | 0 | } |
298 | | |
299 | 0 | if (*pszIter == '"') |
300 | 0 | { |
301 | 0 | if (!bInString && nTokenLen > 0) |
302 | 0 | { |
303 | | // do not treat in a special way double quotes that appear |
304 | | // in the middle of a field (similarly to OpenOffice) |
305 | | // Like in records: 1,50°46'06.6"N 116°42'04.4,foo |
306 | 0 | } |
307 | 0 | else if (!bInString || pszIter[1] != '"') |
308 | 0 | { |
309 | 0 | bInString = !bInString; |
310 | 0 | if (!bKeepLeadingAndClosingQuotes) |
311 | 0 | continue; |
312 | 0 | } |
313 | 0 | else // Doubled quotes in string resolve to one quote. |
314 | 0 | { |
315 | 0 | pszIter++; |
316 | 0 | } |
317 | 0 | } |
318 | | |
319 | 0 | if (nTokenLen >= nTokenMax - 2) |
320 | 0 | { |
321 | 0 | nTokenMax = nTokenMax * 2 + 10; |
322 | 0 | pszToken = static_cast<char *>(CPLRealloc(pszToken, nTokenMax)); |
323 | 0 | } |
324 | |
|
325 | 0 | pszToken[nTokenLen] = *pszIter; |
326 | 0 | nTokenLen++; |
327 | 0 | } while (*(++pszIter) != '\0'); |
328 | | |
329 | 0 | pszToken[nTokenLen] = '\0'; |
330 | 0 | aosRetList.AddString(pszToken); |
331 | | |
332 | | // If the last token is an empty token, then we have to catch |
333 | | // it now, otherwise we won't reenter the loop and it will be lost. |
334 | 0 | if (*pszIter == '\0' && |
335 | 0 | pszIter - pszString >= static_cast<int>(nDelimiterLength) && |
336 | 0 | strncmp(pszIter - nDelimiterLength, pszDelimiter, |
337 | 0 | nDelimiterLength) == 0) |
338 | 0 | { |
339 | 0 | aosRetList.AddString(""); |
340 | 0 | } |
341 | 0 | } |
342 | |
|
343 | 0 | CPLFree(pszToken); |
344 | |
|
345 | 0 | if (aosRetList.Count() == 0) |
346 | 0 | return static_cast<char **>(CPLCalloc(sizeof(char *), 1)); |
347 | 0 | else |
348 | 0 | return aosRetList.StealList(); |
349 | 0 | } |
350 | | |
351 | | /************************************************************************/ |
352 | | /* CSVFindNextLine() */ |
353 | | /* */ |
354 | | /* Find the start of the next line, while at the same time zero */ |
355 | | /* terminating this line. Take into account that there may be */ |
356 | | /* newline indicators within quoted strings, and that quotes */ |
357 | | /* can be escaped with a backslash. */ |
358 | | /************************************************************************/ |
359 | | |
static char *CSVFindNextLine(char *pszThisLine)

{
    /* Terminates the current line in place and returns a pointer to the */
    /* start of the next one, or nullptr when nothing follows. CR/LF     */
    /* inside a double-quoted section do not end the line; a quote       */
    /* preceded by a backslash does not open or close such a section.    */
    char *pszCur = pszThisLine;
    bool bInsideQuotes = false;

    while (*pszCur != '\0')
    {
        const bool bEscaped = pszCur != pszThisLine && pszCur[-1] == '\\';
        if (*pszCur == '\"' && !bEscaped)
            bInsideQuotes = !bInsideQuotes;

        if (!bInsideQuotes && (*pszCur == '\n' || *pszCur == '\r'))
            break;

        ++pszCur;
    }

    // Overwrite the whole run of line terminators with NULs.
    while (*pszCur == '\n' || *pszCur == '\r')
        *pszCur++ = '\0';

    return *pszCur == '\0' ? nullptr : pszCur;
}
383 | | |
384 | | /************************************************************************/ |
385 | | /* CSVIngest() */ |
386 | | /* */ |
387 | | /* Load entire file into memory and setup index if possible. */ |
388 | | /************************************************************************/ |
389 | | |
390 | | // TODO(schwehr): Clean up all the casting in CSVIngest. |
391 | | static void CSVIngest(CSVTable *psTable) |
392 | | |
393 | 0 | { |
394 | 0 | if (psTable->pszRawData != nullptr) |
395 | 0 | return; |
396 | | |
397 | | /* -------------------------------------------------------------------- */ |
398 | | /* Ingest whole file. */ |
399 | | /* -------------------------------------------------------------------- */ |
400 | 0 | if (VSIFSeekL(psTable->fp, 0, SEEK_END) != 0) |
401 | 0 | { |
402 | 0 | CPLError(CE_Failure, CPLE_FileIO, |
403 | 0 | "Failed using seek end and tell to get file length: %s", |
404 | 0 | psTable->pszFilename); |
405 | 0 | return; |
406 | 0 | } |
407 | 0 | const vsi_l_offset nFileLen = VSIFTellL(psTable->fp); |
408 | 0 | if (static_cast<long>(nFileLen) == -1) |
409 | 0 | { |
410 | 0 | CPLError(CE_Failure, CPLE_FileIO, |
411 | 0 | "Failed using seek end and tell to get file length: %s", |
412 | 0 | psTable->pszFilename); |
413 | 0 | return; |
414 | 0 | } |
415 | 0 | VSIRewindL(psTable->fp); |
416 | |
|
417 | 0 | psTable->pszRawData = static_cast<char *>( |
418 | 0 | VSI_MALLOC_VERBOSE(static_cast<size_t>(nFileLen) + 1)); |
419 | 0 | if (psTable->pszRawData == nullptr) |
420 | 0 | return; |
421 | 0 | if (VSIFReadL(psTable->pszRawData, 1, static_cast<size_t>(nFileLen), |
422 | 0 | psTable->fp) != static_cast<size_t>(nFileLen)) |
423 | 0 | { |
424 | 0 | CPLFree(psTable->pszRawData); |
425 | 0 | psTable->pszRawData = nullptr; |
426 | |
|
427 | 0 | CPLError(CE_Failure, CPLE_FileIO, "Read of file %s failed.", |
428 | 0 | psTable->pszFilename); |
429 | 0 | return; |
430 | 0 | } |
431 | | |
432 | 0 | psTable->pszRawData[nFileLen] = '\0'; |
433 | | |
434 | | /* -------------------------------------------------------------------- */ |
435 | | /* Get count of newlines so we can allocate line array. */ |
436 | | /* -------------------------------------------------------------------- */ |
437 | 0 | int nMaxLineCount = 0; |
438 | 0 | for (int i = 0; i < static_cast<int>(nFileLen); i++) |
439 | 0 | { |
440 | 0 | if (psTable->pszRawData[i] == 10) |
441 | 0 | nMaxLineCount++; |
442 | 0 | } |
443 | |
|
444 | 0 | psTable->papszLines = |
445 | 0 | static_cast<char **>(VSI_CALLOC_VERBOSE(sizeof(char *), nMaxLineCount)); |
446 | 0 | if (psTable->papszLines == nullptr) |
447 | 0 | return; |
448 | | |
449 | | /* -------------------------------------------------------------------- */ |
450 | | /* Build a list of record pointers into the raw data buffer */ |
451 | | /* based on line terminators. Zero terminate the line */ |
452 | | /* strings. */ |
453 | | /* -------------------------------------------------------------------- */ |
454 | | /* skip header line */ |
455 | 0 | char *pszThisLine = CSVFindNextLine(psTable->pszRawData); |
456 | |
|
457 | 0 | int iLine = 0; |
458 | 0 | while (pszThisLine != nullptr && iLine < nMaxLineCount) |
459 | 0 | { |
460 | 0 | if (pszThisLine[0] != '#') |
461 | 0 | psTable->papszLines[iLine++] = pszThisLine; |
462 | 0 | pszThisLine = CSVFindNextLine(pszThisLine); |
463 | 0 | } |
464 | |
|
465 | 0 | psTable->nLineCount = iLine; |
466 | | |
467 | | /* -------------------------------------------------------------------- */ |
468 | | /* Allocate and populate index array. Ensure they are in */ |
469 | | /* ascending order so that binary searches can be done on the */ |
470 | | /* array. */ |
471 | | /* -------------------------------------------------------------------- */ |
472 | 0 | psTable->panLineIndex = static_cast<int *>( |
473 | 0 | VSI_MALLOC_VERBOSE(sizeof(int) * psTable->nLineCount)); |
474 | 0 | if (psTable->panLineIndex == nullptr) |
475 | 0 | return; |
476 | | |
477 | 0 | for (int i = 0; i < psTable->nLineCount; i++) |
478 | 0 | { |
479 | 0 | psTable->panLineIndex[i] = atoi(psTable->papszLines[i]); |
480 | |
|
481 | 0 | if (i > 0 && psTable->panLineIndex[i] < psTable->panLineIndex[i - 1]) |
482 | 0 | { |
483 | 0 | CPLFree(psTable->panLineIndex); |
484 | 0 | psTable->panLineIndex = nullptr; |
485 | 0 | break; |
486 | 0 | } |
487 | 0 | } |
488 | |
|
489 | 0 | psTable->iLastLine = -1; |
490 | | |
491 | | /* -------------------------------------------------------------------- */ |
492 | | /* We should never need the file handle against, so close it. */ |
493 | | /* -------------------------------------------------------------------- */ |
494 | 0 | VSIFCloseL(psTable->fp); |
495 | 0 | psTable->fp = nullptr; |
496 | 0 | } |
497 | | |
498 | | static void CSVIngest(const char *pszFilename) |
499 | | |
500 | 0 | { |
501 | 0 | CSVTable *psTable = CSVAccess(pszFilename); |
502 | 0 | if (psTable == nullptr) |
503 | 0 | { |
504 | 0 | CPLError(CE_Failure, CPLE_FileIO, "Failed to open file: %s", |
505 | 0 | pszFilename); |
506 | 0 | return; |
507 | 0 | } |
508 | 0 | CSVIngest(psTable); |
509 | 0 | } |
510 | | |
511 | | /************************************************************************/ |
512 | | /* CSVDetectSeperator() */ |
513 | | /************************************************************************/ |
514 | | |
515 | | /** Detect which field separator is used. |
516 | | * |
517 | | * Currently, it can detect comma, semicolon, space, tabulation or pipe. |
518 | | * In case of ambiguity, starting with GDAL 3.7.1, the separator with the |
519 | | * most occurrences will be selected (and a warning emitted). |
520 | | * If no separator found, comma will be considered as the separator. |
521 | | * |
522 | | * @return ',', ';', ' ', tabulation character or '|'. |
523 | | */ |
524 | | char CSVDetectSeperator(const char *pszLine) |
525 | 0 | { |
526 | 0 | bool bInString = false; |
527 | 0 | int nCountComma = 0; |
528 | 0 | int nCountSemicolon = 0; |
529 | 0 | int nCountTab = 0; |
530 | 0 | int nCountPipe = 0; |
531 | 0 | int nCountSpace = 0; |
532 | |
|
533 | 0 | for (; *pszLine != '\0'; pszLine++) |
534 | 0 | { |
535 | 0 | if (!bInString && *pszLine == ',') |
536 | 0 | { |
537 | 0 | nCountComma++; |
538 | 0 | } |
539 | 0 | else if (!bInString && *pszLine == ';') |
540 | 0 | { |
541 | 0 | nCountSemicolon++; |
542 | 0 | } |
543 | 0 | else if (!bInString && *pszLine == '\t') |
544 | 0 | { |
545 | 0 | nCountTab++; |
546 | 0 | } |
547 | 0 | else if (!bInString && *pszLine == '|') |
548 | 0 | { |
549 | 0 | nCountPipe++; |
550 | 0 | } |
551 | 0 | else if (!bInString && *pszLine == ' ') |
552 | 0 | { |
553 | 0 | nCountSpace++; |
554 | 0 | } |
555 | 0 | else if (*pszLine == '"') |
556 | 0 | { |
557 | 0 | if (!bInString || pszLine[1] != '"') |
558 | 0 | { |
559 | 0 | bInString = !bInString; |
560 | 0 | continue; |
561 | 0 | } |
562 | 0 | else /* doubled quotes in string resolve to one quote */ |
563 | 0 | { |
564 | 0 | pszLine++; |
565 | 0 | } |
566 | 0 | } |
567 | 0 | } |
568 | |
|
569 | 0 | const int nMaxCountExceptSpace = |
570 | 0 | std::max(std::max(nCountComma, nCountSemicolon), |
571 | 0 | std::max(nCountTab, nCountPipe)); |
572 | 0 | char chDelimiter = ','; |
573 | 0 | if (nMaxCountExceptSpace == 0) |
574 | 0 | { |
575 | 0 | if (nCountSpace > 0) |
576 | 0 | chDelimiter = ' '; |
577 | 0 | } |
578 | 0 | else |
579 | 0 | { |
580 | 0 | bool bWarn = false; |
581 | 0 | if (nCountComma == nMaxCountExceptSpace) |
582 | 0 | { |
583 | 0 | chDelimiter = ','; |
584 | 0 | bWarn = (nCountSemicolon > 0 || nCountTab > 0 || nCountPipe > 0); |
585 | 0 | } |
586 | 0 | else if (nCountSemicolon == nMaxCountExceptSpace) |
587 | 0 | { |
588 | 0 | chDelimiter = ';'; |
589 | 0 | bWarn = (nCountComma > 0 || nCountTab > 0 || nCountPipe > 0); |
590 | 0 | } |
591 | 0 | else if (nCountTab == nMaxCountExceptSpace) |
592 | 0 | { |
593 | 0 | chDelimiter = '\t'; |
594 | 0 | bWarn = (nCountComma > 0 || nCountSemicolon > 0 || nCountPipe > 0); |
595 | 0 | } |
596 | 0 | else /* if( nCountPipe == nMaxCountExceptSpace ) */ |
597 | 0 | { |
598 | 0 | chDelimiter = '|'; |
599 | 0 | bWarn = (nCountComma > 0 || nCountSemicolon > 0 || nCountTab > 0); |
600 | 0 | } |
601 | 0 | if (bWarn) |
602 | 0 | { |
603 | 0 | CPLError(CE_Warning, CPLE_AppDefined, |
604 | 0 | "Selecting '%c' as CSV field separator, but " |
605 | 0 | "other candidate separator(s) have been found.", |
606 | 0 | chDelimiter); |
607 | 0 | } |
608 | 0 | } |
609 | |
|
610 | 0 | return chDelimiter; |
611 | 0 | } |
612 | | |
613 | | /************************************************************************/ |
614 | | /* CSVReadParseLineGeneric() */ |
615 | | /* */ |
616 | | /* Read one line, and return split into fields. The return */ |
617 | | /* result is a stringlist, in the sense of the CSL functions. */ |
618 | | /************************************************************************/ |
619 | | |
// Reads one logical CSV record through pfnReadLine and returns it split
// into fields as a CSL string list.
//
// fp / pfnReadLine: opaque handle and the callback used to fetch the next
//     physical line from it; nMaxLineSize is passed through to the callback.
// pszDelimiter: field delimiter (may be several bytes).
// bHonourStrings: when false, double quotes get no special treatment and
//     the line is tokenized with CSLTokenizeStringComplex().
// bKeepLeadingAndClosingQuotes, bMergeDelimiter: forwarded to CSVSplitLine().
// bSkipBOM: skip a leading UTF-8 byte order mark on the first line read.
//
// Returns nullptr when the callback yields no line, when the input ends
// inside an unterminated quoted string (an error is emitted), or when a
// std::exception is caught while accumulating lines.
static char **
CSVReadParseLineGeneric(void *fp, const char *(*pfnReadLine)(void *, size_t),
                        size_t nMaxLineSize, const char *pszDelimiter,
                        bool bHonourStrings, bool bKeepLeadingAndClosingQuotes,
                        bool bMergeDelimiter, bool bSkipBOM)
{
    const char *pszLine = pfnReadLine(fp, nMaxLineSize);
    if (pszLine == nullptr)
        return nullptr;

    if (bSkipBOM)
    {
        // Skip BOM.
        // The && chain stops at the first mismatch, so bytes past the
        // terminating NUL of a short line are never read.
        const GByte *pabyData = reinterpret_cast<const GByte *>(pszLine);
        if (pabyData[0] == 0xEF && pabyData[1] == 0xBB && pabyData[2] == 0xBF)
            pszLine += 3;
    }

    // Special fix to read NdfcFacilities.xls with un-balanced double quotes.
    if (!bHonourStrings)
    {
        return CSLTokenizeStringComplex(pszLine, pszDelimiter, FALSE, TRUE);
    }

    // If there are no quotes, then this is the simple case.
    // Parse, and return tokens.
    if (strchr(pszLine, '\"') == nullptr)
        return CSVSplitLine(pszLine, pszDelimiter, bKeepLeadingAndClosingQuotes,
                            bMergeDelimiter);

    // The record contains quotes: a quoted field may span several physical
    // lines. The scan state below (bInString, i) persists across iterations
    // of the outer loop so each appended line is scanned only once.
    const size_t nDelimiterLength = strlen(pszDelimiter);
    bool bInString = false;           // keep in that scope !
    std::string osWorkLine(pszLine);  // keep in that scope !
    size_t i = 0;                     // keep in that scope !

    try
    {
        while (true)
        {
            // Resume scanning where the previous pass stopped.
            for (; i < osWorkLine.size(); ++i)
            {
                if (osWorkLine[i] == '\"')
                {
                    if (!bInString)
                    {
                        // Only consider " as the start of a quoted string
                        // if it is the first character of the line, or
                        // if it is immediately after the field delimiter.
                        if (i == 0 ||
                            (i >= nDelimiterLength &&
                             osWorkLine.compare(i - nDelimiterLength,
                                                nDelimiterLength, pszDelimiter,
                                                nDelimiterLength) == 0))
                        {
                            bInString = true;
                        }
                    }
                    else if (i + 1 < osWorkLine.size() &&
                             osWorkLine[i + 1] == '"')
                    {
                        // Escaped double quote in a quoted string
                        ++i;
                    }
                    else
                    {
                        // Closing quote of the current quoted string.
                        bInString = false;
                    }
                }
            }

            // Quotes are balanced: the record is complete.
            if (!bInString)
            {
                return CSVSplitLine(osWorkLine.c_str(), pszDelimiter,
                                    bKeepLeadingAndClosingQuotes,
                                    bMergeDelimiter);
            }

            // Still inside a quoted string: the record continues on the
            // next physical line. No more input leaves bInString set and
            // falls through to the error below.
            const char *pszNewLine = pfnReadLine(fp, nMaxLineSize);
            if (pszNewLine == nullptr)
                break;

            // Re-insert the line break consumed by the reader.
            osWorkLine.append("\n");
            osWorkLine.append(pszNewLine);
        }
    }
    catch (const std::exception &e)
    {
        // Reported as an out-of-memory failure, carrying e.what().
        CPLError(CE_Failure, CPLE_OutOfMemory, "%s", e.what());
    }

    if (bInString)
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CSV file has unbalanced number of double-quotes. Corrupted "
                 "data will likely be returned");
    }

    return nullptr;
}
719 | | |
720 | | /************************************************************************/ |
721 | | /* CSVReadParseLine() */ |
722 | | /* */ |
723 | | /* Read one line, and return split into fields. The return */ |
724 | | /* result is a stringlist, in the sense of the CSL functions. */ |
725 | | /* */ |
726 | | /* Deprecated. Replaced by CSVReadParseLineL(). */ |
727 | | /************************************************************************/ |
728 | | |
729 | | char **CSVReadParseLine(FILE *fp) |
730 | 0 | { |
731 | 0 | return CSVReadParseLine2(fp, ','); |
732 | 0 | } |
733 | | |
734 | | static const char *ReadLineClassicalFile(void *fp, size_t /* nMaxLineSize */) |
735 | 0 | { |
736 | 0 | return CPLReadLine(static_cast<FILE *>(fp)); |
737 | 0 | } |
738 | | |
739 | | char **CSVReadParseLine2(FILE *fp, char chDelimiter) |
740 | 0 | { |
741 | 0 | CPLAssert(fp != nullptr); |
742 | 0 | if (fp == nullptr) |
743 | 0 | return nullptr; |
744 | | |
745 | 0 | char szDelimiter[2] = {chDelimiter, 0}; |
746 | 0 | return CSVReadParseLineGeneric(fp, ReadLineClassicalFile, |
747 | 0 | 0, // nMaxLineSize, |
748 | 0 | szDelimiter, |
749 | 0 | true, // bHonourStrings |
750 | 0 | false, // bKeepLeadingAndClosingQuotes |
751 | 0 | false, // bMergeDelimiter |
752 | 0 | true /* bSkipBOM */); |
753 | 0 | } |
754 | | |
755 | | /************************************************************************/ |
756 | | /* CSVReadParseLineL() */ |
757 | | /* */ |
758 | | /* Read one line, and return split into fields. The return */ |
759 | | /* result is a stringlist, in the sense of the CSL functions. */ |
760 | | /* */ |
761 | | /* Replaces CSVReadParseLine(). These functions use the VSI */ |
762 | | /* layer to allow reading from other file containers. */ |
763 | | /************************************************************************/ |
764 | | |
765 | | char **CSVReadParseLineL(VSILFILE *fp) |
766 | 0 | { |
767 | 0 | return CSVReadParseLine2L(fp, ','); |
768 | 0 | } |
769 | | |
770 | | char **CSVReadParseLine2L(VSILFILE *fp, char chDelimiter) |
771 | | |
772 | 0 | { |
773 | 0 | CPLAssert(fp != nullptr); |
774 | 0 | if (fp == nullptr) |
775 | 0 | return nullptr; |
776 | | |
777 | 0 | char szDelimiter[2] = {chDelimiter, 0}; |
778 | 0 | return CSVReadParseLine3L(fp, |
779 | 0 | 0, // nMaxLineSize |
780 | 0 | szDelimiter, |
781 | 0 | true, // bHonourStrings |
782 | 0 | false, // bKeepLeadingAndClosingQuotes |
783 | 0 | false, // bMergeDelimiter |
784 | 0 | true /* bSkipBOM */); |
785 | 0 | } |
786 | | |
787 | | /************************************************************************/ |
788 | | /* ReadLineLargeFile() */ |
789 | | /************************************************************************/ |
790 | | |
791 | | static const char *ReadLineLargeFile(void *fp, size_t nMaxLineSize) |
792 | 0 | { |
793 | 0 | int nBufLength = 0; |
794 | 0 | return CPLReadLine3L(static_cast<VSILFILE *>(fp), |
795 | 0 | nMaxLineSize == 0 ? -1 |
796 | 0 | : static_cast<int>(nMaxLineSize), |
797 | 0 | &nBufLength, nullptr); |
798 | 0 | } |
799 | | |
800 | | /************************************************************************/ |
801 | | /* CSVReadParseLine3L() */ |
802 | | /* */ |
803 | | /* Read one line, and return split into fields. The return */ |
804 | | /* result is a stringlist, in the sense of the CSL functions. */ |
805 | | /************************************************************************/ |
806 | | |
807 | | /** Read one line, and return split into fields. |
808 | | * The return result is a stringlist, in the sense of the CSL functions. |
809 | | * |
810 | | * @param fp File handle. Must not be NULL |
811 | | * @param nMaxLineSize Maximum line size, or 0 for unlimited. |
812 | | * @param pszDelimiter Delimiter sequence for readers (can be multiple bytes) |
813 | | * @param bHonourStrings Should be true, unless double quotes should not be |
814 | | * considered when separating fields. |
815 | | * @param bKeepLeadingAndClosingQuotes Whether the leading and closing double |
816 | | * quote characters should be kept. |
817 | | * @param bMergeDelimiter Whether consecutive delimiters should be considered |
818 | | * as a single one. Should generally be set to false. |
819 | | * @param bSkipBOM Whether leading UTF-8 BOM should be skipped. |
820 | | */ |
821 | | char **CSVReadParseLine3L(VSILFILE *fp, size_t nMaxLineSize, |
822 | | const char *pszDelimiter, bool bHonourStrings, |
823 | | bool bKeepLeadingAndClosingQuotes, |
824 | | bool bMergeDelimiter, bool bSkipBOM) |
825 | | |
826 | 0 | { |
827 | 0 | return CSVReadParseLineGeneric( |
828 | 0 | fp, ReadLineLargeFile, nMaxLineSize, pszDelimiter, bHonourStrings, |
829 | 0 | bKeepLeadingAndClosingQuotes, bMergeDelimiter, bSkipBOM); |
830 | 0 | } |
831 | | |
832 | | /************************************************************************/ |
833 | | /* CSVCompare() */ |
834 | | /* */ |
835 | | /* Compare a field to a search value using a particular */ |
836 | | /* criteria. */ |
837 | | /************************************************************************/ |
838 | | |
839 | | static bool CSVCompare(const char *pszFieldValue, const char *pszTarget, |
840 | | CSVCompareCriteria eCriteria) |
841 | | |
842 | 0 | { |
843 | 0 | if (eCriteria == CC_ExactString) |
844 | 0 | { |
845 | 0 | return (strcmp(pszFieldValue, pszTarget) == 0); |
846 | 0 | } |
847 | 0 | else if (eCriteria == CC_ApproxString) |
848 | 0 | { |
849 | 0 | return EQUAL(pszFieldValue, pszTarget); |
850 | 0 | } |
851 | 0 | else if (eCriteria == CC_Integer) |
852 | 0 | { |
853 | 0 | return (CPLGetValueType(pszFieldValue) == CPL_VALUE_INTEGER && |
854 | 0 | atoi(pszFieldValue) == atoi(pszTarget)); |
855 | 0 | } |
856 | | |
857 | 0 | return false; |
858 | 0 | } |
859 | | |
860 | | /************************************************************************/ |
861 | | /* CSVScanLines() */ |
862 | | /* */ |
863 | | /* Read the file scanline for lines where the key field equals */ |
864 | | /* the indicated value with the suggested comparison criteria. */ |
865 | | /* Return the first matching line split into fields. */ |
866 | | /* */ |
867 | | /* Deprecated. Replaced by CSVScanLinesL(). */ |
868 | | /************************************************************************/ |
869 | | |
870 | | char **CSVScanLines(FILE *fp, int iKeyField, const char *pszValue, |
871 | | CSVCompareCriteria eCriteria) |
872 | | |
873 | 0 | { |
874 | 0 | CPLAssert(pszValue != nullptr); |
875 | 0 | CPLAssert(iKeyField >= 0); |
876 | 0 | CPLAssert(fp != nullptr); |
877 | | |
878 | 0 | bool bSelected = false; |
879 | 0 | const int nTestValue = atoi(pszValue); |
880 | 0 | char **papszFields = nullptr; |
881 | |
|
882 | 0 | while (!bSelected) |
883 | 0 | { |
884 | 0 | papszFields = CSVReadParseLine(fp); |
885 | 0 | if (papszFields == nullptr) |
886 | 0 | return nullptr; |
887 | | |
888 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
889 | 0 | { |
890 | | /* not selected */ |
891 | 0 | } |
892 | 0 | else if (eCriteria == CC_Integer && |
893 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
894 | 0 | { |
895 | 0 | bSelected = true; |
896 | 0 | } |
897 | 0 | else |
898 | 0 | { |
899 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
900 | 0 | } |
901 | |
|
902 | 0 | if (!bSelected) |
903 | 0 | { |
904 | 0 | CSLDestroy(papszFields); |
905 | 0 | papszFields = nullptr; |
906 | 0 | } |
907 | 0 | } |
908 | | |
909 | 0 | return papszFields; |
910 | 0 | } |
911 | | |
912 | | /************************************************************************/ |
913 | | /* CSVScanLinesL() */ |
914 | | /* */ |
915 | | /* Read the file scanline for lines where the key field equals */ |
916 | | /* the indicated value with the suggested comparison criteria. */ |
917 | | /* Return the first matching line split into fields. */ |
918 | | /************************************************************************/ |
919 | | |
920 | | char **CSVScanLinesL(VSILFILE *fp, int iKeyField, const char *pszValue, |
921 | | CSVCompareCriteria eCriteria) |
922 | | |
923 | 0 | { |
924 | 0 | CPLAssert(pszValue != nullptr); |
925 | 0 | CPLAssert(iKeyField >= 0); |
926 | 0 | CPLAssert(fp != nullptr); |
927 | | |
928 | 0 | bool bSelected = false; |
929 | 0 | const int nTestValue = atoi(pszValue); |
930 | 0 | char **papszFields = nullptr; |
931 | |
|
932 | 0 | while (!bSelected) |
933 | 0 | { |
934 | 0 | papszFields = CSVReadParseLineL(fp); |
935 | 0 | if (papszFields == nullptr) |
936 | 0 | return nullptr; |
937 | | |
938 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
939 | 0 | { |
940 | | /* not selected */ |
941 | 0 | } |
942 | 0 | else if (eCriteria == CC_Integer && |
943 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
944 | 0 | { |
945 | 0 | bSelected = true; |
946 | 0 | } |
947 | 0 | else |
948 | 0 | { |
949 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
950 | 0 | } |
951 | |
|
952 | 0 | if (!bSelected) |
953 | 0 | { |
954 | 0 | CSLDestroy(papszFields); |
955 | 0 | papszFields = nullptr; |
956 | 0 | } |
957 | 0 | } |
958 | | |
959 | 0 | return papszFields; |
960 | 0 | } |
961 | | |
962 | | /************************************************************************/ |
963 | | /* CSVScanLinesIndexed() */ |
964 | | /* */ |
965 | | /* Look up an integer key in the in-memory line index using a */ |
966 | | /* binary search. Return the first line with that key, split */ |
967 | | /* into fields, or NULL if the key is not present. */ |
968 | | /************************************************************************/ |
969 | | |
970 | | static char **CSVScanLinesIndexed(CSVTable *psTable, int nKeyValue) |
971 | | |
972 | 0 | { |
973 | 0 | CPLAssert(psTable->panLineIndex != nullptr); |
974 | | |
975 | | /* -------------------------------------------------------------------- */ |
976 | | /* Find target record with binary search. */ |
977 | | /* -------------------------------------------------------------------- */ |
978 | 0 | int iTop = psTable->nLineCount - 1; |
979 | 0 | int iBottom = 0; |
980 | 0 | int iResult = -1; |
981 | |
|
982 | 0 | while (iTop >= iBottom) |
983 | 0 | { |
984 | 0 | const int iMiddle = (iTop + iBottom) / 2; |
985 | 0 | if (psTable->panLineIndex[iMiddle] > nKeyValue) |
986 | 0 | iTop = iMiddle - 1; |
987 | 0 | else if (psTable->panLineIndex[iMiddle] < nKeyValue) |
988 | 0 | iBottom = iMiddle + 1; |
989 | 0 | else |
990 | 0 | { |
991 | 0 | iResult = iMiddle; |
992 | | // if a key is not unique, select the first instance of it. |
993 | 0 | while (iResult > 0 && |
994 | 0 | psTable->panLineIndex[iResult - 1] == nKeyValue) |
995 | 0 | { |
996 | 0 | psTable->bNonUniqueKey = true; |
997 | 0 | iResult--; |
998 | 0 | } |
999 | 0 | break; |
1000 | 0 | } |
1001 | 0 | } |
1002 | |
|
1003 | 0 | if (iResult == -1) |
1004 | 0 | return nullptr; |
1005 | | |
1006 | | /* -------------------------------------------------------------------- */ |
1007 | | /* Parse target line, and update iLastLine indicator. */ |
1008 | | /* -------------------------------------------------------------------- */ |
1009 | 0 | psTable->iLastLine = iResult; |
1010 | |
|
1011 | 0 | return CSVSplitLine(psTable->papszLines[iResult], ",", false, false); |
1012 | 0 | } |
1013 | | |
1014 | | /************************************************************************/ |
1015 | | /* CSVScanLinesIngested() */ |
1016 | | /* */ |
1017 | | /* Read the file scanline for lines where the key field equals */ |
1018 | | /* the indicated value with the suggested comparison criteria. */ |
1019 | | /* Return the first matching line split into fields. */ |
1020 | | /************************************************************************/ |
1021 | | |
1022 | | static char **CSVScanLinesIngested(CSVTable *psTable, int iKeyField, |
1023 | | const char *pszValue, |
1024 | | CSVCompareCriteria eCriteria) |
1025 | | |
1026 | 0 | { |
1027 | 0 | CPLAssert(pszValue != nullptr); |
1028 | 0 | CPLAssert(iKeyField >= 0); |
1029 | | |
1030 | 0 | const int nTestValue = atoi(pszValue); |
1031 | | |
1032 | | /* -------------------------------------------------------------------- */ |
1033 | | /* Short cut for indexed files. */ |
1034 | | /* -------------------------------------------------------------------- */ |
1035 | 0 | if (iKeyField == 0 && eCriteria == CC_Integer && |
1036 | 0 | psTable->panLineIndex != nullptr) |
1037 | 0 | return CSVScanLinesIndexed(psTable, nTestValue); |
1038 | | |
1039 | | /* -------------------------------------------------------------------- */ |
1040 | | /* Scan from in-core lines. */ |
1041 | | /* -------------------------------------------------------------------- */ |
1042 | 0 | char **papszFields = nullptr; |
1043 | 0 | bool bSelected = false; |
1044 | |
|
1045 | 0 | while (!bSelected && psTable->iLastLine + 1 < psTable->nLineCount) |
1046 | 0 | { |
1047 | 0 | psTable->iLastLine++; |
1048 | 0 | papszFields = CSVSplitLine(psTable->papszLines[psTable->iLastLine], ",", |
1049 | 0 | false, false); |
1050 | |
|
1051 | 0 | if (CSLCount(papszFields) < iKeyField + 1) |
1052 | 0 | { |
1053 | | /* not selected */ |
1054 | 0 | } |
1055 | 0 | else if (eCriteria == CC_Integer && |
1056 | 0 | atoi(papszFields[iKeyField]) == nTestValue) |
1057 | 0 | { |
1058 | 0 | bSelected = true; |
1059 | 0 | } |
1060 | 0 | else |
1061 | 0 | { |
1062 | 0 | bSelected = CSVCompare(papszFields[iKeyField], pszValue, eCriteria); |
1063 | 0 | } |
1064 | |
|
1065 | 0 | if (!bSelected) |
1066 | 0 | { |
1067 | 0 | CSLDestroy(papszFields); |
1068 | 0 | papszFields = nullptr; |
1069 | 0 | } |
1070 | 0 | } |
1071 | |
|
1072 | 0 | return papszFields; |
1073 | 0 | } |
1074 | | |
1075 | | /************************************************************************/ |
1076 | | /* CSVRewind() */ |
1077 | | /* */ |
1078 | | /* Rewind a CSV file based on a passed in filename. */ |
1079 | | /* This is aimed at being used with CSVGetNextLine(). */ |
1080 | | /************************************************************************/ |
1081 | | |
1082 | | void CSVRewind(const char *pszFilename) |
1083 | | |
1084 | 0 | { |
1085 | | /* -------------------------------------------------------------------- */ |
1086 | | /* Get access to the table. */ |
1087 | | /* -------------------------------------------------------------------- */ |
1088 | 0 | CPLAssert(pszFilename != nullptr); |
1089 | | |
1090 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1091 | 0 | if (psTable != nullptr) |
1092 | 0 | psTable->iLastLine = -1; |
1093 | 0 | } |
1094 | | |
1095 | | /************************************************************************/ |
1096 | | /* CSVGetNextLine() */ |
1097 | | /* */ |
1098 | | /* Fetch the next line of a CSV file based on a passed in */ |
1099 | | /* filename. Returns NULL at end of file, or if file is not */ |
1100 | | /* really established. */ |
1101 | | /* This ingests the whole file into memory if not already done. */ |
1102 | | /* When reaching end of file, CSVRewind() may be used to read */ |
1103 | | /* again from the beginning. */ |
1104 | | /************************************************************************/ |
1105 | | |
1106 | | char **CSVGetNextLine(const char *pszFilename) |
1107 | | |
1108 | 0 | { |
1109 | | |
1110 | | /* -------------------------------------------------------------------- */ |
1111 | | /* Get access to the table. */ |
1112 | | /* -------------------------------------------------------------------- */ |
1113 | 0 | CPLAssert(pszFilename != nullptr); |
1114 | | |
1115 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1116 | 0 | if (psTable == nullptr) |
1117 | 0 | return nullptr; |
1118 | | |
1119 | 0 | CSVIngest(psTable->pszFilename); |
1120 | | |
1121 | | /* -------------------------------------------------------------------- */ |
1122 | | /* If we use CSVGetNextLine() we can pretty much assume we have */ |
1123 | | /* a non-unique key. */ |
1124 | | /* -------------------------------------------------------------------- */ |
1125 | 0 | psTable->bNonUniqueKey = true; |
1126 | | |
1127 | | /* -------------------------------------------------------------------- */ |
1128 | | /* Do we have a next line available? This only works for */ |
1129 | | /* ingested tables I believe. */ |
1130 | | /* -------------------------------------------------------------------- */ |
1131 | 0 | if (psTable->iLastLine + 1 >= psTable->nLineCount) |
1132 | 0 | return nullptr; |
1133 | | |
1134 | 0 | psTable->iLastLine++; |
1135 | 0 | CSLDestroy(psTable->papszRecFields); |
1136 | 0 | psTable->papszRecFields = CSVSplitLine( |
1137 | 0 | psTable->papszLines[psTable->iLastLine], ",", false, false); |
1138 | |
|
1139 | 0 | return psTable->papszRecFields; |
1140 | 0 | } |
1141 | | |
1142 | | /************************************************************************/ |
1143 | | /* CSVScanFile() */ |
1144 | | /* */ |
1145 | | /* Scan a whole file using criteria similar to above, but also */ |
1146 | | /* taking care of file opening and closing. */ |
1147 | | /************************************************************************/ |
1148 | | |
1149 | | static char **CSVScanFile(CSVTable *const psTable, int iKeyField, |
1150 | | const char *pszValue, CSVCompareCriteria eCriteria) |
1151 | 0 | { |
1152 | 0 | CSVIngest(psTable->pszFilename); |
1153 | | |
1154 | | /* -------------------------------------------------------------------- */ |
1155 | | /* Does the current record match the criteria? If so, just */ |
1156 | | /* return it again. */ |
1157 | | /* -------------------------------------------------------------------- */ |
1158 | 0 | if (iKeyField >= 0 && iKeyField < CSLCount(psTable->papszRecFields) && |
1159 | 0 | CSVCompare(psTable->papszRecFields[iKeyField], pszValue, eCriteria) && |
1160 | 0 | !psTable->bNonUniqueKey) |
1161 | 0 | { |
1162 | 0 | return psTable->papszRecFields; |
1163 | 0 | } |
1164 | | |
1165 | | /* -------------------------------------------------------------------- */ |
1166 | | /* Scan the file from the beginning, replacing the ``current */ |
1167 | | /* record'' in our structure with the one that is found. */ |
1168 | | /* -------------------------------------------------------------------- */ |
1169 | 0 | psTable->iLastLine = -1; |
1170 | 0 | CSLDestroy(psTable->papszRecFields); |
1171 | |
|
1172 | 0 | if (psTable->pszRawData != nullptr) |
1173 | 0 | psTable->papszRecFields = |
1174 | 0 | CSVScanLinesIngested(psTable, iKeyField, pszValue, eCriteria); |
1175 | 0 | else |
1176 | 0 | { |
1177 | 0 | VSIRewindL(psTable->fp); |
1178 | 0 | CPLReadLineL(psTable->fp); /* throw away the header line */ |
1179 | |
|
1180 | 0 | psTable->papszRecFields = |
1181 | 0 | CSVScanLinesL(psTable->fp, iKeyField, pszValue, eCriteria); |
1182 | 0 | } |
1183 | |
|
1184 | 0 | return psTable->papszRecFields; |
1185 | 0 | } |
1186 | | |
1187 | | char **CSVScanFile(const char *pszFilename, int iKeyField, const char *pszValue, |
1188 | | CSVCompareCriteria eCriteria) |
1189 | | |
1190 | 0 | { |
1191 | | /* -------------------------------------------------------------------- */ |
1192 | | /* Get access to the table. */ |
1193 | | /* -------------------------------------------------------------------- */ |
1194 | 0 | CPLAssert(pszFilename != nullptr); |
1195 | | |
1196 | 0 | if (iKeyField < 0) |
1197 | 0 | return nullptr; |
1198 | | |
1199 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1200 | 0 | if (psTable == nullptr) |
1201 | 0 | return nullptr; |
1202 | | |
1203 | 0 | return CSVScanFile(psTable, iKeyField, pszValue, eCriteria); |
1204 | 0 | } |
1205 | | |
1206 | | /************************************************************************/ |
1207 | | /* CSVGetFieldId() */ |
1208 | | /* */ |
1209 | | /* Read the first record of a CSV file (rewinding to be sure), */ |
1210 | | /* and find the field with the indicated name. Returns -1 if */ |
1211 | | /* it fails to find the field name. Comparison is case */ |
1212 | | /* insensitive, but otherwise exact. After this function has */ |
1213 | | /* been called the file pointer will be positioned just after */ |
1214 | | /* the first record. */ |
1215 | | /* */ |
1216 | | /* Deprecated. Replaced by CSVGetFieldIdL(). */ |
1217 | | /************************************************************************/ |
1218 | | |
1219 | | int CSVGetFieldId(FILE *fp, const char *pszFieldName) |
1220 | | |
1221 | 0 | { |
1222 | 0 | CPLAssert(fp != nullptr && pszFieldName != nullptr); |
1223 | | |
1224 | 0 | VSIRewind(fp); |
1225 | |
|
1226 | 0 | char **papszFields = CSVReadParseLine(fp); |
1227 | 0 | for (int i = 0; papszFields != nullptr && papszFields[i] != nullptr; i++) |
1228 | 0 | { |
1229 | 0 | if (EQUAL(papszFields[i], pszFieldName)) |
1230 | 0 | { |
1231 | 0 | CSLDestroy(papszFields); |
1232 | 0 | return i; |
1233 | 0 | } |
1234 | 0 | } |
1235 | | |
1236 | 0 | CSLDestroy(papszFields); |
1237 | |
|
1238 | 0 | return -1; |
1239 | 0 | } |
1240 | | |
1241 | | /************************************************************************/ |
1242 | | /* CSVGetFieldIdL() */ |
1243 | | /* */ |
1244 | | /* Read the first record of a CSV file (rewinding to be sure), */ |
1245 | | /* and find the field with the indicated name. Returns -1 if */ |
1246 | | /* it fails to find the field name. Comparison is case */ |
1247 | | /* insensitive, but otherwise exact. After this function has */ |
1248 | | /* been called the file pointer will be positioned just after */ |
1249 | | /* the first record. */ |
1250 | | /************************************************************************/ |
1251 | | |
1252 | | int CSVGetFieldIdL(VSILFILE *fp, const char *pszFieldName) |
1253 | | |
1254 | 0 | { |
1255 | 0 | CPLAssert(fp != nullptr && pszFieldName != nullptr); |
1256 | | |
1257 | 0 | VSIRewindL(fp); |
1258 | |
|
1259 | 0 | char **papszFields = CSVReadParseLineL(fp); |
1260 | 0 | for (int i = 0; papszFields != nullptr && papszFields[i] != nullptr; i++) |
1261 | 0 | { |
1262 | 0 | if (EQUAL(papszFields[i], pszFieldName)) |
1263 | 0 | { |
1264 | 0 | CSLDestroy(papszFields); |
1265 | 0 | return i; |
1266 | 0 | } |
1267 | 0 | } |
1268 | | |
1269 | 0 | CSLDestroy(papszFields); |
1270 | |
|
1271 | 0 | return -1; |
1272 | 0 | } |
1273 | | |
1274 | | /************************************************************************/ |
1275 | | /* CSVGetFileFieldId() */ |
1276 | | /* */ |
1277 | | /* Same as CSVGetFieldId(), except that we get the file based */ |
1278 | | /* on filename, rather than having an existing handle. */ |
1279 | | /************************************************************************/ |
1280 | | |
1281 | | static int CSVGetFileFieldId(CSVTable *const psTable, const char *pszFieldName) |
1282 | | |
1283 | 0 | { |
1284 | | /* -------------------------------------------------------------------- */ |
1285 | | /* Find the requested field. */ |
1286 | | /* -------------------------------------------------------------------- */ |
1287 | 0 | const int nFieldNameLength = static_cast<int>(strlen(pszFieldName)); |
1288 | 0 | for (int i = 0; psTable->papszFieldNames != nullptr && |
1289 | 0 | psTable->papszFieldNames[i] != nullptr; |
1290 | 0 | i++) |
1291 | 0 | { |
1292 | 0 | if (psTable->panFieldNamesLength[i] == nFieldNameLength && |
1293 | 0 | EQUALN(psTable->papszFieldNames[i], pszFieldName, nFieldNameLength)) |
1294 | 0 | { |
1295 | 0 | return i; |
1296 | 0 | } |
1297 | 0 | } |
1298 | | |
1299 | 0 | return -1; |
1300 | 0 | } |
1301 | | |
1302 | | int CSVGetFileFieldId(const char *pszFilename, const char *pszFieldName) |
1303 | | |
1304 | 0 | { |
1305 | | /* -------------------------------------------------------------------- */ |
1306 | | /* Get access to the table. */ |
1307 | | /* -------------------------------------------------------------------- */ |
1308 | 0 | CPLAssert(pszFilename != nullptr); |
1309 | | |
1310 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1311 | 0 | if (psTable == nullptr) |
1312 | 0 | return -1; |
1313 | 0 | return CSVGetFileFieldId(psTable, pszFieldName); |
1314 | 0 | } |
1315 | | |
1316 | | /************************************************************************/ |
1317 | | /* CSVScanFileByName() */ |
1318 | | /* */ |
1319 | | /* Same as CSVScanFile(), but using a field name instead of a */ |
1320 | | /* field number. */ |
1321 | | /************************************************************************/ |
1322 | | |
1323 | | char **CSVScanFileByName(const char *pszFilename, const char *pszKeyFieldName, |
1324 | | const char *pszValue, CSVCompareCriteria eCriteria) |
1325 | | |
1326 | 0 | { |
1327 | 0 | const int iKeyField = CSVGetFileFieldId(pszFilename, pszKeyFieldName); |
1328 | 0 | if (iKeyField == -1) |
1329 | 0 | return nullptr; |
1330 | | |
1331 | 0 | return CSVScanFile(pszFilename, iKeyField, pszValue, eCriteria); |
1332 | 0 | } |
1333 | | |
1334 | | /************************************************************************/ |
1335 | | /* CSVGetField() */ |
1336 | | /* */ |
1337 | | /* The all-in-one function to fetch a particular field value */ |
1338 | | /* from a CSV file. Note this function will return an empty */ |
1339 | | /* string, rather than NULL if it fails to find the desired */ |
1340 | | /* value for some reason. The caller can't establish that the */ |
1341 | | /* fetch failed. */ |
1342 | | /************************************************************************/ |
1343 | | |
1344 | | const char *CSVGetField(const char *pszFilename, const char *pszKeyFieldName, |
1345 | | const char *pszKeyFieldValue, |
1346 | | CSVCompareCriteria eCriteria, |
1347 | | const char *pszTargetField) |
1348 | | |
1349 | 0 | { |
1350 | | /* -------------------------------------------------------------------- */ |
1351 | | /* Find the table. */ |
1352 | | /* -------------------------------------------------------------------- */ |
1353 | 0 | CSVTable *const psTable = CSVAccess(pszFilename); |
1354 | 0 | if (psTable == nullptr) |
1355 | 0 | return ""; |
1356 | | |
1357 | 0 | const int iKeyField = CSVGetFileFieldId(psTable, pszKeyFieldName); |
1358 | 0 | if (iKeyField == -1) |
1359 | 0 | return ""; |
1360 | | |
1361 | | /* -------------------------------------------------------------------- */ |
1362 | | /* Find the correct record. */ |
1363 | | /* -------------------------------------------------------------------- */ |
1364 | 0 | char **papszRecord = |
1365 | 0 | CSVScanFile(psTable, iKeyField, pszKeyFieldValue, eCriteria); |
1366 | 0 | if (papszRecord == nullptr) |
1367 | 0 | return ""; |
1368 | | |
1369 | | /* -------------------------------------------------------------------- */ |
1370 | | /* Figure out which field we want out of this. */ |
1371 | | /* -------------------------------------------------------------------- */ |
1372 | 0 | const int iTargetField = CSVGetFileFieldId(psTable, pszTargetField); |
1373 | 0 | if (iTargetField < 0) |
1374 | 0 | return ""; |
1375 | | |
1376 | 0 | for (int i = 0; papszRecord[i] != nullptr; ++i) |
1377 | 0 | { |
1378 | 0 | if (i == iTargetField) |
1379 | 0 | return papszRecord[iTargetField]; |
1380 | 0 | } |
1381 | 0 | return ""; |
1382 | 0 | } |
1383 | | |
1384 | | /************************************************************************/ |
1385 | | /* GDALDefaultCSVFilename() */ |
1386 | | /************************************************************************/ |
1387 | | |
1388 | | typedef struct |
1389 | | { |
1390 | | char szPath[512]; |
1391 | | bool bCSVFinderInitialized; |
1392 | | } DefaultCSVFileNameTLS; |
1393 | | |
1394 | | const char *GDALDefaultCSVFilename(const char *pszBasename) |
1395 | | |
1396 | 0 | { |
1397 | | /* -------------------------------------------------------------------- */ |
1398 | | /* Do we already have this file accessed? If so, just return */ |
1399 | | /* the existing path without any further probing. */ |
1400 | | /* -------------------------------------------------------------------- */ |
1401 | 0 | int bMemoryError = FALSE; |
1402 | 0 | CSVTable **ppsCSVTableList = |
1403 | 0 | static_cast<CSVTable **>(CPLGetTLSEx(CTLS_CSVTABLEPTR, &bMemoryError)); |
1404 | 0 | if (ppsCSVTableList != nullptr) |
1405 | 0 | { |
1406 | 0 | const size_t nBasenameLen = strlen(pszBasename); |
1407 | |
|
1408 | 0 | for (const CSVTable *psTable = *ppsCSVTableList; psTable != nullptr; |
1409 | 0 | psTable = psTable->psNext) |
1410 | 0 | { |
1411 | 0 | const size_t nFullLen = strlen(psTable->pszFilename); |
1412 | |
|
1413 | 0 | if (nFullLen > nBasenameLen && |
1414 | 0 | strcmp(psTable->pszFilename + nFullLen - nBasenameLen, |
1415 | 0 | pszBasename) == 0 && |
1416 | 0 | strchr("/\\", |
1417 | 0 | psTable->pszFilename[+nFullLen - nBasenameLen - 1]) != |
1418 | 0 | nullptr) |
1419 | 0 | { |
1420 | 0 | return psTable->pszFilename; |
1421 | 0 | } |
1422 | 0 | } |
1423 | 0 | } |
1424 | | |
1425 | | /* -------------------------------------------------------------------- */ |
1426 | | /* Otherwise we need to look harder for it. */ |
1427 | | /* -------------------------------------------------------------------- */ |
1428 | 0 | DefaultCSVFileNameTLS *pTLSData = static_cast<DefaultCSVFileNameTLS *>( |
1429 | 0 | CPLGetTLSEx(CTLS_CSVDEFAULTFILENAME, &bMemoryError)); |
1430 | 0 | if (pTLSData == nullptr && !bMemoryError) |
1431 | 0 | { |
1432 | 0 | pTLSData = static_cast<DefaultCSVFileNameTLS *>( |
1433 | 0 | VSI_CALLOC_VERBOSE(1, sizeof(DefaultCSVFileNameTLS))); |
1434 | 0 | if (pTLSData) |
1435 | 0 | CPLSetTLS(CTLS_CSVDEFAULTFILENAME, pTLSData, TRUE); |
1436 | 0 | } |
1437 | 0 | if (pTLSData == nullptr) |
1438 | 0 | return "/not_existing_dir/not_existing_path"; |
1439 | | |
1440 | 0 | const char *pszResult = CPLFindFile("gdal", pszBasename); |
1441 | |
|
1442 | 0 | if (pszResult != nullptr) |
1443 | 0 | return pszResult; |
1444 | | |
1445 | 0 | if (!pTLSData->bCSVFinderInitialized) |
1446 | 0 | { |
1447 | 0 | pTLSData->bCSVFinderInitialized = true; |
1448 | |
|
1449 | 0 | if (CPLGetConfigOption("GDAL_DATA", nullptr) != nullptr) |
1450 | 0 | CPLPushFinderLocation(CPLGetConfigOption("GDAL_DATA", nullptr)); |
1451 | |
|
1452 | 0 | pszResult = CPLFindFile("gdal", pszBasename); |
1453 | |
|
1454 | 0 | if (pszResult != nullptr) |
1455 | 0 | return pszResult; |
1456 | 0 | } |
1457 | | |
1458 | | // For systems like sandboxes that do not allow other checks. |
1459 | 0 | CPLDebug("CPL_CSV", |
1460 | 0 | "Failed to find file in GDALDefaultCSVFilename. " |
1461 | 0 | "Returning original basename: %s", |
1462 | 0 | pszBasename); |
1463 | 0 | CPLStrlcpy(pTLSData->szPath, pszBasename, sizeof(pTLSData->szPath)); |
1464 | 0 | return pTLSData->szPath; |
1465 | 0 | } |
1466 | | |
1467 | | /************************************************************************/ |
1468 | | /* CSVFilename() */ |
1469 | | /* */ |
1470 | | /* Return the full path to a particular CSV file. This will */ |
1471 | | /* eventually be something the application can override. */ |
1472 | | /************************************************************************/ |
1473 | | |
1474 | | CPL_C_START |
1475 | | static const char *(*pfnCSVFilenameHook)(const char *) = nullptr; |
1476 | | CPL_C_END |
1477 | | |
1478 | | const char *CSVFilename(const char *pszBasename) |
1479 | | |
1480 | 0 | { |
1481 | 0 | if (pfnCSVFilenameHook == nullptr) |
1482 | 0 | return GDALDefaultCSVFilename(pszBasename); |
1483 | | |
1484 | 0 | return pfnCSVFilenameHook(pszBasename); |
1485 | 0 | } |
1486 | | |
1487 | | /************************************************************************/ |
1488 | | /* SetCSVFilenameHook() */ |
1489 | | /* */ |
1490 | | /* Applications can use this to set a function that will */ |
1491 | | /* massage CSV filenames. */ |
1492 | | /************************************************************************/ |
1493 | | |
1494 | | /** |
1495 | | * Override CSV file search method. |
1496 | | * |
1497 | | * @param pfnNewHook The pointer to a function which will return the |
1498 | | * full path for a given filename. |
1499 | | * |
1500 | | |
1501 | | This function allows an application to override how the GTIFGetDefn() |
1502 | | and related function find the CSV (Comma Separated Value) values |
1503 | | required. The pfnHook argument should be a pointer to a function that |
1504 | | will take in a CSV filename and return a full path to the file. The |
1505 | | returned string should be to an internal static buffer so that the |
1506 | | caller doesn't have to free the result. |
1507 | | |
1508 | | Example: |
1509 | | |
1510 | | The listgeo utility uses the following override function if the user |
1511 | | specified a CSV file directory with the -t commandline switch (argument |
1512 | | put into CSVDirName). |
1513 | | |
1514 | | \code{.cpp} |
1515 | | |
1516 | | ... |
1517 | | SetCSVFilenameHook( CSVFileOverride ); |
1518 | | ... |
1519 | | |
1520 | | static const char *CSVFileOverride( const char * pszInput ) |
1521 | | |
1522 | | { |
1523 | | static char szPath[1024] = {}; |
1524 | | |
1525 | | sprintf( szPath, "%s/%s", CSVDirName, pszInput ); |
1526 | | |
1527 | | return szPath; |
1528 | | } |
1529 | | \endcode |
1530 | | |
1531 | | */ |
1532 | | |
1533 | | CPL_C_START |
1534 | | void SetCSVFilenameHook(const char *(*pfnNewHook)(const char *)) |
1535 | | |
1536 | 0 | { |
1537 | 0 | pfnCSVFilenameHook = pfnNewHook; |
1538 | 0 | } |
1539 | | |
1540 | | CPL_C_END |