/src/postgres/src/backend/commands/copyfromparse.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * copyfromparse.c |
4 | | * Parse CSV/text/binary format for COPY FROM. |
5 | | * |
6 | | * This file contains routines to parse the text, CSV and binary input |
7 | | * formats. The main entry point is NextCopyFrom(), which parses the |
8 | | * next input line and returns it as Datums. |
9 | | * |
10 | | * In text/CSV mode, the parsing happens in multiple stages: |
11 | | * |
12 | | * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf |
13 | | * 1. 2. 3. 4. |
14 | | * |
15 | | * 1. CopyLoadRawBuf() reads raw data from the input file or client, and |
16 | | * places it into 'raw_buf'. |
17 | | * |
18 | | * 2. CopyConvertBuf() calls the encoding conversion function to convert |
19 | | * the data in 'raw_buf' from client to server encoding, placing the |
20 | | * converted result in 'input_buf'. |
21 | | * |
22 | | * 3. CopyReadLine() parses the data in 'input_buf', one line at a time. |
23 | | * It is responsible for finding the next newline marker, taking quote and |
24 | | * escape characters into account according to the COPY options. The line |
25 | | * is copied into 'line_buf', with quotes and escape characters still |
26 | | * intact. |
27 | | * |
28 | | * 4. CopyReadAttributesText/CSV() function takes the input line from |
29 | | * 'line_buf', and splits it into fields, unescaping the data as required. |
30 | | * The fields are stored in 'attribute_buf', and 'raw_fields' array holds |
31 | | * pointers to each field. |
32 | | * |
33 | | * If encoding conversion is not required, a shortcut is taken in step 2 to |
34 | | * avoid copying the data unnecessarily. The 'input_buf' pointer is set to |
35 | | * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data |
36 | | * directly into 'input_buf'. CopyConvertBuf() then merely validates that |
37 | | * the data is valid in the current encoding. |
38 | | * |
39 | | * In binary mode, the pipeline is much simpler. Input is loaded into |
40 | | * 'raw_buf', and encoding conversion is done in the datatype-specific |
41 | | * receive functions, if required. 'input_buf' and 'line_buf' are not used, |
42 | | * but 'attribute_buf' is used as a temporary buffer to hold one attribute's |
43 | | * data when it's passed the receive function. |
44 | | * |
45 | | * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also |
46 | | * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf' |
47 | | * and 'attribute_buf' are expanded on demand, to hold the longest line |
48 | | * encountered so far. |
49 | | * |
50 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
51 | | * Portions Copyright (c) 1994, Regents of the University of California |
52 | | * |
53 | | * |
54 | | * IDENTIFICATION |
55 | | * src/backend/commands/copyfromparse.c |
56 | | * |
57 | | *------------------------------------------------------------------------- |
58 | | */ |
59 | | #include "postgres.h" |
60 | | |
61 | | #include <ctype.h> |
62 | | #include <unistd.h> |
63 | | #include <sys/stat.h> |
64 | | |
65 | | #include "commands/copyapi.h" |
66 | | #include "commands/copyfrom_internal.h" |
67 | | #include "commands/progress.h" |
68 | | #include "executor/executor.h" |
69 | | #include "libpq/libpq.h" |
70 | | #include "libpq/pqformat.h" |
71 | | #include "mb/pg_wchar.h" |
72 | | #include "miscadmin.h" |
73 | | #include "pgstat.h" |
74 | | #include "port/pg_bswap.h" |
75 | | #include "utils/builtins.h" |
76 | | #include "utils/rel.h" |
77 | | |
/* True when character c is one of the octal digits '0' through '7'. */
#define ISOCTAL(c)	((c) >= '0' && (c) <= '7')
/* Numeric value of the digit character c (meaningful when ISOCTAL(c) holds). */
#define OCTVALUE(c)	((c) - '0')
80 | | |
/*
 * Helper macros for the loops that scan input_buf and fill line_buf.  They
 * are macros rather than functions because they perform continue/break on
 * the caller's loop, and to avoid function call overhead in tight COPY loops.
 *
 * The usual "do {...} while(0)" wrapper cannot be used, because it would
 * capture the continue/break.  Each macro is therefore a plain "if"
 * statement, finished with "else ((void) 0)" so that the "if" cannot pair
 * with an "else" in the calling code, and so that the caller's trailing
 * semicolon does not produce an empty statement that compilers warn about.
 * See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
 *
 * The macros expect these names to be in scope where they are expanded:
 * input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr, need_data, result
 * and cstate.
 */

/*
 * When input_buf_ptr + extralen reaches copy_buf_len and EOF has not been
 * hit, rewind input_buf_ptr to prev_raw_ptr, flag that more data is needed,
 * and restart the enclosing loop.  Rewinding keeps the character read at the
 * top of the loop in the buffer even if there is more than one read-ahead.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
{ \
	input_buf_ptr = prev_raw_ptr;	/* undo fetch */ \
	need_data = true; \
	continue; \
} else ((void) 0)

/*
 * When input_buf_ptr + extralen reaches copy_buf_len and EOF has been hit,
 * consume the remainder of the buffer (if extralen is nonzero), set result
 * to true, and break out of the enclosing loop.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
{ \
	if (extralen) \
		input_buf_ptr = copy_buf_len;	/* consume the partial character */ \
	/* backslash just before EOF, treat as data char */ \
	result = true; \
	break; \
} else ((void) 0)

/*
 * Append the already-approved bytes, i.e. those between
 * cstate->input_buf_index and input_buf_ptr, to line_buf and advance
 * input_buf_index past them.  This must be done to be sure there is some
 * room in input_buf.
 */
#define REFILL_LINEBUF \
if (input_buf_ptr > cstate->input_buf_index) \
{ \
	appendBinaryStringInfo(&cstate->line_buf, \
						   cstate->input_buf + cstate->input_buf_index, \
						   input_buf_ptr - cstate->input_buf_index); \
	cstate->input_buf_index = input_buf_ptr; \
} else ((void) 0)
137 | | |
/*
 * The 11 bytes that ReceiveCopyBinaryHeader() requires at the very start of
 * binary COPY input.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = {
	'P', 'G', 'C', 'O', 'P', 'Y', '\n', '\377', '\r', '\n', '\0'
};
140 | | |
141 | | |
142 | | /* non-export function prototypes */ |
143 | | static bool CopyReadLine(CopyFromState cstate, bool is_csv); |
144 | | static bool CopyReadLineText(CopyFromState cstate, bool is_csv); |
145 | | static int CopyReadAttributesText(CopyFromState cstate); |
146 | | static int CopyReadAttributesCSV(CopyFromState cstate); |
147 | | static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, |
148 | | Oid typioparam, int32 typmod, |
149 | | bool *isnull); |
150 | | static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate, |
151 | | ExprContext *econtext, |
152 | | Datum *values, |
153 | | bool *nulls, |
154 | | bool is_csv); |
155 | | static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, |
156 | | char ***fields, |
157 | | int *nfields, |
158 | | bool is_csv); |
159 | | |
160 | | |
161 | | /* Low-level communications functions */ |
162 | | static int CopyGetData(CopyFromState cstate, void *databuf, |
163 | | int minread, int maxread); |
164 | | static inline bool CopyGetInt32(CopyFromState cstate, int32 *val); |
165 | | static inline bool CopyGetInt16(CopyFromState cstate, int16 *val); |
166 | | static void CopyLoadInputBuf(CopyFromState cstate); |
167 | | static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes); |
168 | | |
169 | | void |
170 | | ReceiveCopyBegin(CopyFromState cstate) |
171 | 0 | { |
172 | 0 | StringInfoData buf; |
173 | 0 | int natts = list_length(cstate->attnumlist); |
174 | 0 | int16 format = (cstate->opts.binary ? 1 : 0); |
175 | 0 | int i; |
176 | |
|
177 | 0 | pq_beginmessage(&buf, PqMsg_CopyInResponse); |
178 | 0 | pq_sendbyte(&buf, format); /* overall format */ |
179 | 0 | pq_sendint16(&buf, natts); |
180 | 0 | for (i = 0; i < natts; i++) |
181 | 0 | pq_sendint16(&buf, format); /* per-column formats */ |
182 | 0 | pq_endmessage(&buf); |
183 | 0 | cstate->copy_src = COPY_FRONTEND; |
184 | 0 | cstate->fe_msgbuf = makeStringInfo(); |
185 | | /* We *must* flush here to ensure FE knows it can send. */ |
186 | 0 | pq_flush(); |
187 | 0 | } |
188 | | |
189 | | void |
190 | | ReceiveCopyBinaryHeader(CopyFromState cstate) |
191 | 0 | { |
192 | 0 | char readSig[11]; |
193 | 0 | int32 tmp; |
194 | | |
195 | | /* Signature */ |
196 | 0 | if (CopyReadBinaryData(cstate, readSig, 11) != 11 || |
197 | 0 | memcmp(readSig, BinarySignature, 11) != 0) |
198 | 0 | ereport(ERROR, |
199 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
200 | 0 | errmsg("COPY file signature not recognized"))); |
201 | | /* Flags field */ |
202 | 0 | if (!CopyGetInt32(cstate, &tmp)) |
203 | 0 | ereport(ERROR, |
204 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
205 | 0 | errmsg("invalid COPY file header (missing flags)"))); |
206 | 0 | if ((tmp & (1 << 16)) != 0) |
207 | 0 | ereport(ERROR, |
208 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
209 | 0 | errmsg("invalid COPY file header (WITH OIDS)"))); |
210 | 0 | tmp &= ~(1 << 16); |
211 | 0 | if ((tmp >> 16) != 0) |
212 | 0 | ereport(ERROR, |
213 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
214 | 0 | errmsg("unrecognized critical flags in COPY file header"))); |
215 | | /* Header extension length */ |
216 | 0 | if (!CopyGetInt32(cstate, &tmp) || |
217 | 0 | tmp < 0) |
218 | 0 | ereport(ERROR, |
219 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
220 | 0 | errmsg("invalid COPY file header (missing length)"))); |
221 | | /* Skip extension header, if present */ |
222 | 0 | while (tmp-- > 0) |
223 | 0 | { |
224 | 0 | if (CopyReadBinaryData(cstate, readSig, 1) != 1) |
225 | 0 | ereport(ERROR, |
226 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
227 | 0 | errmsg("invalid COPY file header (wrong length)"))); |
228 | 0 | } |
229 | 0 | } |
230 | | |
231 | | /* |
232 | | * CopyGetData reads data from the source (file or frontend) |
233 | | * |
234 | | * We attempt to read at least minread, and at most maxread, bytes from |
235 | | * the source. The actual number of bytes read is returned; if this is |
236 | | * less than minread, EOF was detected. |
237 | | * |
238 | | * Note: when copying from the frontend, we expect a proper EOF mark per |
239 | | * protocol; if the frontend simply drops the connection, we raise error. |
240 | | * It seems unwise to allow the COPY IN to complete normally in that case. |
241 | | * |
242 | | * NB: no data conversion is applied here. |
243 | | */ |
244 | | static int |
245 | | CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) |
246 | 0 | { |
247 | 0 | int bytesread = 0; |
248 | |
|
249 | 0 | switch (cstate->copy_src) |
250 | 0 | { |
251 | 0 | case COPY_FILE: |
252 | 0 | bytesread = fread(databuf, 1, maxread, cstate->copy_file); |
253 | 0 | if (ferror(cstate->copy_file)) |
254 | 0 | ereport(ERROR, |
255 | 0 | (errcode_for_file_access(), |
256 | 0 | errmsg("could not read from COPY file: %m"))); |
257 | 0 | if (bytesread == 0) |
258 | 0 | cstate->raw_reached_eof = true; |
259 | 0 | break; |
260 | 0 | case COPY_FRONTEND: |
261 | 0 | while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof) |
262 | 0 | { |
263 | 0 | int avail; |
264 | |
|
265 | 0 | while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len) |
266 | 0 | { |
267 | | /* Try to receive another message */ |
268 | 0 | int mtype; |
269 | 0 | int maxmsglen; |
270 | |
|
271 | 0 | readmessage: |
272 | 0 | HOLD_CANCEL_INTERRUPTS(); |
273 | 0 | pq_startmsgread(); |
274 | 0 | mtype = pq_getbyte(); |
275 | 0 | if (mtype == EOF) |
276 | 0 | ereport(ERROR, |
277 | 0 | (errcode(ERRCODE_CONNECTION_FAILURE), |
278 | 0 | errmsg("unexpected EOF on client connection with an open transaction"))); |
279 | | /* Validate message type and set packet size limit */ |
280 | 0 | switch (mtype) |
281 | 0 | { |
282 | 0 | case PqMsg_CopyData: |
283 | 0 | maxmsglen = PQ_LARGE_MESSAGE_LIMIT; |
284 | 0 | break; |
285 | 0 | case PqMsg_CopyDone: |
286 | 0 | case PqMsg_CopyFail: |
287 | 0 | case PqMsg_Flush: |
288 | 0 | case PqMsg_Sync: |
289 | 0 | maxmsglen = PQ_SMALL_MESSAGE_LIMIT; |
290 | 0 | break; |
291 | 0 | default: |
292 | 0 | ereport(ERROR, |
293 | 0 | (errcode(ERRCODE_PROTOCOL_VIOLATION), |
294 | 0 | errmsg("unexpected message type 0x%02X during COPY from stdin", |
295 | 0 | mtype))); |
296 | 0 | maxmsglen = 0; /* keep compiler quiet */ |
297 | 0 | break; |
298 | 0 | } |
299 | | /* Now collect the message body */ |
300 | 0 | if (pq_getmessage(cstate->fe_msgbuf, maxmsglen)) |
301 | 0 | ereport(ERROR, |
302 | 0 | (errcode(ERRCODE_CONNECTION_FAILURE), |
303 | 0 | errmsg("unexpected EOF on client connection with an open transaction"))); |
304 | 0 | RESUME_CANCEL_INTERRUPTS(); |
305 | | /* ... and process it */ |
306 | 0 | switch (mtype) |
307 | 0 | { |
308 | 0 | case PqMsg_CopyData: |
309 | 0 | break; |
310 | 0 | case PqMsg_CopyDone: |
311 | | /* COPY IN correctly terminated by frontend */ |
312 | 0 | cstate->raw_reached_eof = true; |
313 | 0 | return bytesread; |
314 | 0 | case PqMsg_CopyFail: |
315 | 0 | ereport(ERROR, |
316 | 0 | (errcode(ERRCODE_QUERY_CANCELED), |
317 | 0 | errmsg("COPY from stdin failed: %s", |
318 | 0 | pq_getmsgstring(cstate->fe_msgbuf)))); |
319 | 0 | break; |
320 | 0 | case PqMsg_Flush: |
321 | 0 | case PqMsg_Sync: |
322 | | |
323 | | /* |
324 | | * Ignore Flush/Sync for the convenience of client |
325 | | * libraries (such as libpq) that may send those |
326 | | * without noticing that the command they just |
327 | | * sent was COPY. |
328 | | */ |
329 | 0 | goto readmessage; |
330 | 0 | default: |
331 | 0 | Assert(false); /* NOT REACHED */ |
332 | 0 | } |
333 | 0 | } |
334 | 0 | avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor; |
335 | 0 | if (avail > maxread) |
336 | 0 | avail = maxread; |
337 | 0 | pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail); |
338 | 0 | databuf = (void *) ((char *) databuf + avail); |
339 | 0 | maxread -= avail; |
340 | 0 | bytesread += avail; |
341 | 0 | } |
342 | 0 | break; |
343 | 0 | case COPY_CALLBACK: |
344 | 0 | bytesread = cstate->data_source_cb(databuf, minread, maxread); |
345 | 0 | break; |
346 | 0 | } |
347 | | |
348 | 0 | return bytesread; |
349 | 0 | } |
350 | | |
351 | | |
352 | | /* |
353 | | * These functions do apply some data conversion |
354 | | */ |
355 | | |
356 | | /* |
357 | | * CopyGetInt32 reads an int32 that appears in network byte order |
358 | | * |
359 | | * Returns true if OK, false if EOF |
360 | | */ |
361 | | static inline bool |
362 | | CopyGetInt32(CopyFromState cstate, int32 *val) |
363 | 0 | { |
364 | 0 | uint32 buf; |
365 | |
|
366 | 0 | if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf)) |
367 | 0 | { |
368 | 0 | *val = 0; /* suppress compiler warning */ |
369 | 0 | return false; |
370 | 0 | } |
371 | 0 | *val = (int32) pg_ntoh32(buf); |
372 | 0 | return true; |
373 | 0 | } |
374 | | |
375 | | /* |
376 | | * CopyGetInt16 reads an int16 that appears in network byte order |
377 | | */ |
378 | | static inline bool |
379 | | CopyGetInt16(CopyFromState cstate, int16 *val) |
380 | 0 | { |
381 | 0 | uint16 buf; |
382 | |
|
383 | 0 | if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf)) |
384 | 0 | { |
385 | 0 | *val = 0; /* suppress compiler warning */ |
386 | 0 | return false; |
387 | 0 | } |
388 | 0 | *val = (int16) pg_ntoh16(buf); |
389 | 0 | return true; |
390 | 0 | } |
391 | | |
392 | | |
393 | | /* |
394 | | * Perform encoding conversion on data in 'raw_buf', writing the converted |
395 | | * data into 'input_buf'. |
396 | | * |
397 | | * On entry, there must be some data to convert in 'raw_buf'. |
398 | | */ |
399 | | static void |
400 | | CopyConvertBuf(CopyFromState cstate) |
401 | 0 | { |
402 | | /* |
403 | | * If the file and server encoding are the same, no encoding conversion is |
404 | | * required. However, we still need to verify that the input is valid for |
405 | | * the encoding. |
406 | | */ |
407 | 0 | if (!cstate->need_transcoding) |
408 | 0 | { |
409 | | /* |
410 | | * When conversion is not required, input_buf and raw_buf are the |
411 | | * same. raw_buf_len is the total number of bytes in the buffer, and |
412 | | * input_buf_len tracks how many of those bytes have already been |
413 | | * verified. |
414 | | */ |
415 | 0 | int preverifiedlen = cstate->input_buf_len; |
416 | 0 | int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len; |
417 | 0 | int nverified; |
418 | |
|
419 | 0 | if (unverifiedlen == 0) |
420 | 0 | { |
421 | | /* |
422 | | * If no more raw data is coming, report the EOF to the caller. |
423 | | */ |
424 | 0 | if (cstate->raw_reached_eof) |
425 | 0 | cstate->input_reached_eof = true; |
426 | 0 | return; |
427 | 0 | } |
428 | | |
429 | | /* |
430 | | * Verify the new data, including any residual unverified bytes from |
431 | | * previous round. |
432 | | */ |
433 | 0 | nverified = pg_encoding_verifymbstr(cstate->file_encoding, |
434 | 0 | cstate->raw_buf + preverifiedlen, |
435 | 0 | unverifiedlen); |
436 | 0 | if (nverified == 0) |
437 | 0 | { |
438 | | /* |
439 | | * Could not verify anything. |
440 | | * |
441 | | * If there is no more raw input data coming, it means that there |
442 | | * was an incomplete multi-byte sequence at the end. Also, if |
443 | | * there's "enough" input left, we should be able to verify at |
444 | | * least one character, and a failure to do so means that we've |
445 | | * hit an invalid byte sequence. |
446 | | */ |
447 | 0 | if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding)) |
448 | 0 | cstate->input_reached_error = true; |
449 | 0 | return; |
450 | 0 | } |
451 | 0 | cstate->input_buf_len += nverified; |
452 | 0 | } |
453 | 0 | else |
454 | 0 | { |
455 | | /* |
456 | | * Encoding conversion is needed. |
457 | | */ |
458 | 0 | int nbytes; |
459 | 0 | unsigned char *src; |
460 | 0 | int srclen; |
461 | 0 | unsigned char *dst; |
462 | 0 | int dstlen; |
463 | 0 | int convertedlen; |
464 | |
|
465 | 0 | if (RAW_BUF_BYTES(cstate) == 0) |
466 | 0 | { |
467 | | /* |
468 | | * If no more raw data is coming, report the EOF to the caller. |
469 | | */ |
470 | 0 | if (cstate->raw_reached_eof) |
471 | 0 | cstate->input_reached_eof = true; |
472 | 0 | return; |
473 | 0 | } |
474 | | |
475 | | /* |
476 | | * First, copy down any unprocessed data. |
477 | | */ |
478 | 0 | nbytes = INPUT_BUF_BYTES(cstate); |
479 | 0 | if (nbytes > 0 && cstate->input_buf_index > 0) |
480 | 0 | memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index, |
481 | 0 | nbytes); |
482 | 0 | cstate->input_buf_index = 0; |
483 | 0 | cstate->input_buf_len = nbytes; |
484 | 0 | cstate->input_buf[nbytes] = '\0'; |
485 | |
|
486 | 0 | src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; |
487 | 0 | srclen = cstate->raw_buf_len - cstate->raw_buf_index; |
488 | 0 | dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; |
489 | 0 | dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; |
490 | | |
491 | | /* |
492 | | * Do the conversion. This might stop short, if there is an invalid |
493 | | * byte sequence in the input. We'll convert as much as we can in |
494 | | * that case. |
495 | | * |
496 | | * Note: Even if we hit an invalid byte sequence, we don't report the |
497 | | * error until all the valid bytes have been consumed. The input |
498 | | * might contain an end-of-input marker (\.), and we don't want to |
499 | | * report an error if the invalid byte sequence is after the |
500 | | * end-of-input marker. We might unnecessarily convert some data |
501 | | * after the end-of-input marker as long as it's valid for the |
502 | | * encoding, but that's harmless. |
503 | | */ |
504 | 0 | convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc, |
505 | 0 | cstate->file_encoding, |
506 | 0 | GetDatabaseEncoding(), |
507 | 0 | src, srclen, |
508 | 0 | dst, dstlen, |
509 | 0 | true); |
510 | 0 | if (convertedlen == 0) |
511 | 0 | { |
512 | | /* |
513 | | * Could not convert anything. If there is no more raw input data |
514 | | * coming, it means that there was an incomplete multi-byte |
515 | | * sequence at the end. Also, if there is plenty of input left, |
516 | | * we should be able to convert at least one character, so a |
517 | | * failure to do so must mean that we've hit a byte sequence |
518 | | * that's invalid. |
519 | | */ |
520 | 0 | if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH) |
521 | 0 | cstate->input_reached_error = true; |
522 | 0 | return; |
523 | 0 | } |
524 | 0 | cstate->raw_buf_index += convertedlen; |
525 | 0 | cstate->input_buf_len += strlen((char *) dst); |
526 | 0 | } |
527 | 0 | } |
528 | | |
529 | | /* |
530 | | * Report an encoding or conversion error. |
531 | | */ |
532 | | static void |
533 | | CopyConversionError(CopyFromState cstate) |
534 | 0 | { |
535 | 0 | Assert(cstate->raw_buf_len > 0); |
536 | 0 | Assert(cstate->input_reached_error); |
537 | |
|
538 | 0 | if (!cstate->need_transcoding) |
539 | 0 | { |
540 | | /* |
541 | | * Everything up to input_buf_len was successfully verified, and |
542 | | * input_buf_len points to the invalid or incomplete character. |
543 | | */ |
544 | 0 | report_invalid_encoding(cstate->file_encoding, |
545 | 0 | cstate->raw_buf + cstate->input_buf_len, |
546 | 0 | cstate->raw_buf_len - cstate->input_buf_len); |
547 | 0 | } |
548 | 0 | else |
549 | 0 | { |
550 | | /* |
551 | | * raw_buf_index points to the invalid or untranslatable character. We |
552 | | * let the conversion routine report the error, because it can provide |
553 | | * a more specific error message than we could here. An earlier call |
554 | | * to the conversion routine in CopyConvertBuf() detected that there |
555 | | * is an error, now we call the conversion routine again with |
556 | | * noError=false, to have it throw the error. |
557 | | */ |
558 | 0 | unsigned char *src; |
559 | 0 | int srclen; |
560 | 0 | unsigned char *dst; |
561 | 0 | int dstlen; |
562 | |
|
563 | 0 | src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index; |
564 | 0 | srclen = cstate->raw_buf_len - cstate->raw_buf_index; |
565 | 0 | dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len; |
566 | 0 | dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1; |
567 | |
|
568 | 0 | (void) pg_do_encoding_conversion_buf(cstate->conversion_proc, |
569 | 0 | cstate->file_encoding, |
570 | 0 | GetDatabaseEncoding(), |
571 | 0 | src, srclen, |
572 | 0 | dst, dstlen, |
573 | 0 | false); |
574 | | |
575 | | /* |
576 | | * The conversion routine should have reported an error, so this |
577 | | * should not be reached. |
578 | | */ |
579 | 0 | elog(ERROR, "encoding conversion failed without error"); |
580 | 0 | } |
581 | 0 | } |
582 | | |
583 | | /* |
584 | | * Load more data from data source to raw_buf. |
585 | | * |
586 | | * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the |
587 | | * beginning of the buffer, and we load new data after that. |
588 | | */ |
589 | | static void |
590 | | CopyLoadRawBuf(CopyFromState cstate) |
591 | 0 | { |
592 | 0 | int nbytes; |
593 | 0 | int inbytes; |
594 | | |
595 | | /* |
596 | | * In text mode, if encoding conversion is not required, raw_buf and |
597 | | * input_buf point to the same buffer. Their len/index better agree, too. |
598 | | */ |
599 | 0 | if (cstate->raw_buf == cstate->input_buf) |
600 | 0 | { |
601 | 0 | Assert(!cstate->need_transcoding); |
602 | 0 | Assert(cstate->raw_buf_index == cstate->input_buf_index); |
603 | 0 | Assert(cstate->input_buf_len <= cstate->raw_buf_len); |
604 | 0 | } |
605 | | |
606 | | /* |
607 | | * Copy down the unprocessed data if any. |
608 | | */ |
609 | 0 | nbytes = RAW_BUF_BYTES(cstate); |
610 | 0 | if (nbytes > 0 && cstate->raw_buf_index > 0) |
611 | 0 | memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index, |
612 | 0 | nbytes); |
613 | 0 | cstate->raw_buf_len -= cstate->raw_buf_index; |
614 | 0 | cstate->raw_buf_index = 0; |
615 | | |
616 | | /* |
617 | | * If raw_buf and input_buf are in fact the same buffer, adjust the |
618 | | * input_buf variables, too. |
619 | | */ |
620 | 0 | if (cstate->raw_buf == cstate->input_buf) |
621 | 0 | { |
622 | 0 | cstate->input_buf_len -= cstate->input_buf_index; |
623 | 0 | cstate->input_buf_index = 0; |
624 | 0 | } |
625 | | |
626 | | /* Load more data */ |
627 | 0 | inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len, |
628 | 0 | 1, RAW_BUF_SIZE - cstate->raw_buf_len); |
629 | 0 | nbytes += inbytes; |
630 | 0 | cstate->raw_buf[nbytes] = '\0'; |
631 | 0 | cstate->raw_buf_len = nbytes; |
632 | |
|
633 | 0 | cstate->bytes_processed += inbytes; |
634 | 0 | pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed); |
635 | |
|
636 | 0 | if (inbytes == 0) |
637 | 0 | cstate->raw_reached_eof = true; |
638 | 0 | } |
639 | | |
640 | | /* |
641 | | * CopyLoadInputBuf loads some more data into input_buf |
642 | | * |
643 | | * On return, at least one more input character is loaded into |
644 | | * input_buf, or input_reached_eof is set. |
645 | | * |
646 | | * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start |
647 | | * of the buffer and then we load more data after that. |
648 | | */ |
649 | | static void |
650 | | CopyLoadInputBuf(CopyFromState cstate) |
651 | 0 | { |
652 | 0 | int nbytes = INPUT_BUF_BYTES(cstate); |
653 | | |
654 | | /* |
655 | | * The caller has updated input_buf_index to indicate how much of the |
656 | | * input has been consumed and isn't needed anymore. If input_buf is the |
657 | | * same physical area as raw_buf, update raw_buf_index accordingly. |
658 | | */ |
659 | 0 | if (cstate->raw_buf == cstate->input_buf) |
660 | 0 | { |
661 | 0 | Assert(!cstate->need_transcoding); |
662 | 0 | Assert(cstate->input_buf_index >= cstate->raw_buf_index); |
663 | 0 | cstate->raw_buf_index = cstate->input_buf_index; |
664 | 0 | } |
665 | |
|
666 | 0 | for (;;) |
667 | 0 | { |
668 | | /* If we now have some unconverted data, try to convert it */ |
669 | 0 | CopyConvertBuf(cstate); |
670 | | |
671 | | /* If we now have some more input bytes ready, return them */ |
672 | 0 | if (INPUT_BUF_BYTES(cstate) > nbytes) |
673 | 0 | return; |
674 | | |
675 | | /* |
676 | | * If we reached an invalid byte sequence, or we're at an incomplete |
677 | | * multi-byte character but there is no more raw input data, report |
678 | | * conversion error. |
679 | | */ |
680 | 0 | if (cstate->input_reached_error) |
681 | 0 | CopyConversionError(cstate); |
682 | | |
683 | | /* no more input, and everything has been converted */ |
684 | 0 | if (cstate->input_reached_eof) |
685 | 0 | break; |
686 | | |
687 | | /* Try to load more raw data */ |
688 | 0 | Assert(!cstate->raw_reached_eof); |
689 | 0 | CopyLoadRawBuf(cstate); |
690 | 0 | } |
691 | 0 | } |
692 | | |
693 | | /* |
694 | | * CopyReadBinaryData |
695 | | * |
696 | | * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf |
697 | | * and writes them to 'dest'. Returns the number of bytes read (which |
698 | | * would be less than 'nbytes' only if we reach EOF). |
699 | | */ |
700 | | static int |
701 | | CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes) |
702 | 0 | { |
703 | 0 | int copied_bytes = 0; |
704 | |
|
705 | 0 | if (RAW_BUF_BYTES(cstate) >= nbytes) |
706 | 0 | { |
707 | | /* Enough bytes are present in the buffer. */ |
708 | 0 | memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes); |
709 | 0 | cstate->raw_buf_index += nbytes; |
710 | 0 | copied_bytes = nbytes; |
711 | 0 | } |
712 | 0 | else |
713 | 0 | { |
714 | | /* |
715 | | * Not enough bytes in the buffer, so must read from the file. Need |
716 | | * to loop since 'nbytes' could be larger than the buffer size. |
717 | | */ |
718 | 0 | do |
719 | 0 | { |
720 | 0 | int copy_bytes; |
721 | | |
722 | | /* Load more data if buffer is empty. */ |
723 | 0 | if (RAW_BUF_BYTES(cstate) == 0) |
724 | 0 | { |
725 | 0 | CopyLoadRawBuf(cstate); |
726 | 0 | if (cstate->raw_reached_eof) |
727 | 0 | break; /* EOF */ |
728 | 0 | } |
729 | | |
730 | | /* Transfer some bytes. */ |
731 | 0 | copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate)); |
732 | 0 | memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes); |
733 | 0 | cstate->raw_buf_index += copy_bytes; |
734 | 0 | dest += copy_bytes; |
735 | 0 | copied_bytes += copy_bytes; |
736 | 0 | } while (copied_bytes < nbytes); |
737 | 0 | } |
738 | |
|
739 | 0 | return copied_bytes; |
740 | 0 | } |
741 | | |
742 | | /* |
743 | | * This function is exposed for use by extensions that read raw fields in the |
744 | | * next line. See NextCopyFromRawFieldsInternal() for details. |
745 | | */ |
746 | | bool |
747 | | NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields) |
748 | 0 | { |
749 | 0 | return NextCopyFromRawFieldsInternal(cstate, fields, nfields, |
750 | 0 | cstate->opts.csv_mode); |
751 | 0 | } |
752 | | |
753 | | /* |
754 | | * Workhorse for NextCopyFromRawFields(). |
755 | | * |
756 | | * Read raw fields in the next line for COPY FROM in text or csv mode. Return |
757 | | * false if no more lines. |
758 | | * |
759 | | * An internal temporary buffer is returned via 'fields'. It is valid until |
760 | | * the next call of the function. Since the function returns all raw fields |
761 | | * in the input file, 'nfields' could be different from the number of columns |
762 | | * in the relation. |
763 | | * |
764 | | * NOTE: force_not_null option are not applied to the returned fields. |
765 | | * |
766 | | * We use pg_attribute_always_inline to reduce function call overhead |
767 | | * and to help compilers to optimize away the 'is_csv' condition when called |
768 | | * by internal functions such as CopyFromTextLikeOneRow(). |
769 | | */ |
770 | | static pg_attribute_always_inline bool |
771 | | NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv) |
772 | 0 | { |
773 | 0 | int fldct; |
774 | 0 | bool done = false; |
775 | | |
776 | | /* only available for text or csv input */ |
777 | 0 | Assert(!cstate->opts.binary); |
778 | | |
779 | | /* on input check that the header line is correct if needed */ |
780 | 0 | if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE) |
781 | 0 | { |
782 | 0 | ListCell *cur; |
783 | 0 | TupleDesc tupDesc; |
784 | 0 | int lines_to_skip = cstate->opts.header_line; |
785 | | |
786 | | /* If set to "match", one header line is skipped */ |
787 | 0 | if (cstate->opts.header_line == COPY_HEADER_MATCH) |
788 | 0 | lines_to_skip = 1; |
789 | |
|
790 | 0 | tupDesc = RelationGetDescr(cstate->rel); |
791 | |
|
792 | 0 | for (int i = 0; i < lines_to_skip; i++) |
793 | 0 | { |
794 | 0 | cstate->cur_lineno++; |
795 | 0 | if ((done = CopyReadLine(cstate, is_csv))) |
796 | 0 | break; |
797 | 0 | } |
798 | |
|
799 | 0 | if (cstate->opts.header_line == COPY_HEADER_MATCH) |
800 | 0 | { |
801 | 0 | int fldnum; |
802 | |
|
803 | 0 | if (is_csv) |
804 | 0 | fldct = CopyReadAttributesCSV(cstate); |
805 | 0 | else |
806 | 0 | fldct = CopyReadAttributesText(cstate); |
807 | |
|
808 | 0 | if (fldct != list_length(cstate->attnumlist)) |
809 | 0 | ereport(ERROR, |
810 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
811 | 0 | errmsg("wrong number of fields in header line: got %d, expected %d", |
812 | 0 | fldct, list_length(cstate->attnumlist)))); |
813 | | |
814 | 0 | fldnum = 0; |
815 | 0 | foreach(cur, cstate->attnumlist) |
816 | 0 | { |
817 | 0 | int attnum = lfirst_int(cur); |
818 | 0 | char *colName; |
819 | 0 | Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1); |
820 | |
|
821 | 0 | Assert(fldnum < cstate->max_fields); |
822 | |
|
823 | 0 | colName = cstate->raw_fields[fldnum++]; |
824 | 0 | if (colName == NULL) |
825 | 0 | ereport(ERROR, |
826 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
827 | 0 | errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"", |
828 | 0 | fldnum, cstate->opts.null_print, NameStr(attr->attname)))); |
829 | | |
830 | 0 | if (namestrcmp(&attr->attname, colName) != 0) |
831 | 0 | { |
832 | 0 | ereport(ERROR, |
833 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
834 | 0 | errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"", |
835 | 0 | fldnum, colName, NameStr(attr->attname)))); |
836 | 0 | } |
837 | 0 | } |
838 | 0 | } |
839 | | |
840 | 0 | if (done) |
841 | 0 | return false; |
842 | 0 | } |
843 | | |
844 | 0 | cstate->cur_lineno++; |
845 | | |
846 | | /* Actually read the line into memory here */ |
847 | 0 | done = CopyReadLine(cstate, is_csv); |
848 | | |
849 | | /* |
850 | | * EOF at start of line means we're done. If we see EOF after some |
851 | | * characters, we act as though it was newline followed by EOF, ie, |
852 | | * process the line and then exit loop on next iteration. |
853 | | */ |
854 | 0 | if (done && cstate->line_buf.len == 0) |
855 | 0 | return false; |
856 | | |
857 | | /* Parse the line into de-escaped field values */ |
858 | 0 | if (is_csv) |
859 | 0 | fldct = CopyReadAttributesCSV(cstate); |
860 | 0 | else |
861 | 0 | fldct = CopyReadAttributesText(cstate); |
862 | |
|
863 | 0 | *fields = cstate->raw_fields; |
864 | 0 | *nfields = fldct; |
865 | 0 | return true; |
866 | 0 | } |
867 | | |
868 | | /* |
869 | | * Read next tuple from file for COPY FROM. Return false if no more tuples. |
870 | | * |
871 | | * 'econtext' is used to evaluate default expression for each column that is |
872 | | * either not read from the file or is using the DEFAULT option of COPY FROM. |
873 | | * It can be NULL when no default values are used, i.e. when all columns are |
874 | | * read from the file, and DEFAULT option is unset. |
875 | | * |
876 | | * 'values' and 'nulls' arrays must be the same length as columns of the |
877 | | * relation passed to BeginCopyFrom. This function fills the arrays. |
878 | | */ |
879 | | bool |
880 | | NextCopyFrom(CopyFromState cstate, ExprContext *econtext, |
881 | | Datum *values, bool *nulls) |
882 | 0 | { |
883 | 0 | TupleDesc tupDesc; |
884 | 0 | AttrNumber num_phys_attrs, |
885 | 0 | num_defaults = cstate->num_defaults; |
886 | 0 | int i; |
887 | 0 | int *defmap = cstate->defmap; |
888 | 0 | ExprState **defexprs = cstate->defexprs; |
889 | |
|
890 | 0 | tupDesc = RelationGetDescr(cstate->rel); |
891 | 0 | num_phys_attrs = tupDesc->natts; |
892 | | |
893 | | /* Initialize all values for row to NULL */ |
894 | 0 | MemSet(values, 0, num_phys_attrs * sizeof(Datum)); |
895 | 0 | MemSet(nulls, true, num_phys_attrs * sizeof(bool)); |
896 | 0 | MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool)); |
897 | | |
898 | | /* Get one row from source */ |
899 | 0 | if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls)) |
900 | 0 | return false; |
901 | | |
902 | | /* |
903 | | * Now compute and insert any defaults available for the columns not |
904 | | * provided by the input data. Anything not processed here or above will |
905 | | * remain NULL. |
906 | | */ |
907 | 0 | for (i = 0; i < num_defaults; i++) |
908 | 0 | { |
909 | | /* |
910 | | * The caller must supply econtext and have switched into the |
911 | | * per-tuple memory context in it. |
912 | | */ |
913 | 0 | Assert(econtext != NULL); |
914 | 0 | Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory); |
915 | |
|
916 | 0 | values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext, |
917 | 0 | &nulls[defmap[i]]); |
918 | 0 | } |
919 | |
|
920 | 0 | return true; |
921 | 0 | } |
922 | | |
923 | | /* Implementation of the per-row callback for text format */ |
924 | | bool |
925 | | CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, |
926 | | bool *nulls) |
927 | 0 | { |
928 | 0 | return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false); |
929 | 0 | } |
930 | | |
931 | | /* Implementation of the per-row callback for CSV format */ |
932 | | bool |
933 | | CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, |
934 | | bool *nulls) |
935 | 0 | { |
936 | 0 | return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true); |
937 | 0 | } |
938 | | |
939 | | /* |
940 | | * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow(). |
941 | | * |
942 | | * We use pg_attribute_always_inline to reduce function call overhead |
943 | | * and to help compilers to optimize away the 'is_csv' condition. |
944 | | */ |
945 | | static pg_attribute_always_inline bool |
946 | | CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext, |
947 | | Datum *values, bool *nulls, bool is_csv) |
948 | | { |
949 | | TupleDesc tupDesc; |
950 | | AttrNumber attr_count; |
951 | | FmgrInfo *in_functions = cstate->in_functions; |
952 | | Oid *typioparams = cstate->typioparams; |
953 | | ExprState **defexprs = cstate->defexprs; |
954 | | char **field_strings; |
955 | | ListCell *cur; |
956 | | int fldct; |
957 | | int fieldno; |
958 | | char *string; |
959 | | |
960 | | tupDesc = RelationGetDescr(cstate->rel); |
961 | | attr_count = list_length(cstate->attnumlist); |
962 | | |
963 | | /* read raw fields in the next line */ |
964 | | if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv)) |
965 | | return false; |
966 | | |
967 | | /* check for overflowing fields */ |
968 | | if (attr_count > 0 && fldct > attr_count) |
969 | | ereport(ERROR, |
970 | | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
971 | | errmsg("extra data after last expected column"))); |
972 | | |
973 | | fieldno = 0; |
974 | | |
975 | | /* Loop to read the user attributes on the line. */ |
976 | | foreach(cur, cstate->attnumlist) |
977 | | { |
978 | | int attnum = lfirst_int(cur); |
979 | | int m = attnum - 1; |
980 | | Form_pg_attribute att = TupleDescAttr(tupDesc, m); |
981 | | |
982 | | if (fieldno >= fldct) |
983 | | ereport(ERROR, |
984 | | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
985 | | errmsg("missing data for column \"%s\"", |
986 | | NameStr(att->attname)))); |
987 | | string = field_strings[fieldno++]; |
988 | | |
989 | | if (cstate->convert_select_flags && |
990 | | !cstate->convert_select_flags[m]) |
991 | | { |
992 | | /* ignore input field, leaving column as NULL */ |
993 | | continue; |
994 | | } |
995 | | |
996 | | if (is_csv) |
997 | | { |
998 | | if (string == NULL && |
999 | | cstate->opts.force_notnull_flags[m]) |
1000 | | { |
1001 | | /* |
1002 | | * FORCE_NOT_NULL option is set and column is NULL - convert |
1003 | | * it to the NULL string. |
1004 | | */ |
1005 | | string = cstate->opts.null_print; |
1006 | | } |
1007 | | else if (string != NULL && cstate->opts.force_null_flags[m] |
1008 | | && strcmp(string, cstate->opts.null_print) == 0) |
1009 | | { |
1010 | | /* |
1011 | | * FORCE_NULL option is set and column matches the NULL |
1012 | | * string. It must have been quoted, or otherwise the string |
1013 | | * would already have been set to NULL. Convert it to NULL as |
1014 | | * specified. |
1015 | | */ |
1016 | | string = NULL; |
1017 | | } |
1018 | | } |
1019 | | |
1020 | | cstate->cur_attname = NameStr(att->attname); |
1021 | | cstate->cur_attval = string; |
1022 | | |
1023 | | if (string != NULL) |
1024 | | nulls[m] = false; |
1025 | | |
1026 | | if (cstate->defaults[m]) |
1027 | | { |
1028 | | /* We must have switched into the per-tuple memory context */ |
1029 | | Assert(econtext != NULL); |
1030 | | Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory); |
1031 | | |
1032 | | values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]); |
1033 | | } |
1034 | | |
1035 | | /* |
1036 | | * If ON_ERROR is specified with IGNORE, skip rows with soft errors |
1037 | | */ |
1038 | | else if (!InputFunctionCallSafe(&in_functions[m], |
1039 | | string, |
1040 | | typioparams[m], |
1041 | | att->atttypmod, |
1042 | | (Node *) cstate->escontext, |
1043 | | &values[m])) |
1044 | | { |
1045 | | Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP); |
1046 | | |
1047 | | cstate->num_errors++; |
1048 | | |
1049 | | if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE) |
1050 | | { |
1051 | | /* |
1052 | | * Since we emit line number and column info in the below |
1053 | | * notice message, we suppress error context information other |
1054 | | * than the relation name. |
1055 | | */ |
1056 | | Assert(!cstate->relname_only); |
1057 | | cstate->relname_only = true; |
1058 | | |
1059 | | if (cstate->cur_attval) |
1060 | | { |
1061 | | char *attval; |
1062 | | |
1063 | | attval = CopyLimitPrintoutLength(cstate->cur_attval); |
1064 | | ereport(NOTICE, |
1065 | | errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"", |
1066 | | cstate->cur_lineno, |
1067 | | cstate->cur_attname, |
1068 | | attval)); |
1069 | | pfree(attval); |
1070 | | } |
1071 | | else |
1072 | | ereport(NOTICE, |
1073 | | errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input", |
1074 | | cstate->cur_lineno, |
1075 | | cstate->cur_attname)); |
1076 | | |
1077 | | /* reset relname_only */ |
1078 | | cstate->relname_only = false; |
1079 | | } |
1080 | | |
1081 | | return true; |
1082 | | } |
1083 | | |
1084 | | cstate->cur_attname = NULL; |
1085 | | cstate->cur_attval = NULL; |
1086 | | } |
1087 | | |
1088 | | Assert(fieldno == attr_count); |
1089 | | |
1090 | | return true; |
1091 | | } |
1092 | | |
1093 | | /* Implementation of the per-row callback for binary format */ |
1094 | | bool |
1095 | | CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, |
1096 | | bool *nulls) |
1097 | 0 | { |
1098 | 0 | TupleDesc tupDesc; |
1099 | 0 | AttrNumber attr_count; |
1100 | 0 | FmgrInfo *in_functions = cstate->in_functions; |
1101 | 0 | Oid *typioparams = cstate->typioparams; |
1102 | 0 | int16 fld_count; |
1103 | 0 | ListCell *cur; |
1104 | |
|
1105 | 0 | tupDesc = RelationGetDescr(cstate->rel); |
1106 | 0 | attr_count = list_length(cstate->attnumlist); |
1107 | |
|
1108 | 0 | cstate->cur_lineno++; |
1109 | |
|
1110 | 0 | if (!CopyGetInt16(cstate, &fld_count)) |
1111 | 0 | { |
1112 | | /* EOF detected (end of file, or protocol-level EOF) */ |
1113 | 0 | return false; |
1114 | 0 | } |
1115 | | |
1116 | 0 | if (fld_count == -1) |
1117 | 0 | { |
1118 | | /* |
1119 | | * Received EOF marker. Wait for the protocol-level EOF, and complain |
1120 | | * if it doesn't come immediately. In COPY FROM STDIN, this ensures |
1121 | | * that we correctly handle CopyFail, if client chooses to send that |
1122 | | * now. When copying from file, we could ignore the rest of the file |
1123 | | * like in text mode, but we choose to be consistent with the COPY |
1124 | | * FROM STDIN case. |
1125 | | */ |
1126 | 0 | char dummy; |
1127 | |
|
1128 | 0 | if (CopyReadBinaryData(cstate, &dummy, 1) > 0) |
1129 | 0 | ereport(ERROR, |
1130 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1131 | 0 | errmsg("received copy data after EOF marker"))); |
1132 | 0 | return false; |
1133 | 0 | } |
1134 | | |
1135 | 0 | if (fld_count != attr_count) |
1136 | 0 | ereport(ERROR, |
1137 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1138 | 0 | errmsg("row field count is %d, expected %d", |
1139 | 0 | (int) fld_count, attr_count))); |
1140 | | |
1141 | 0 | foreach(cur, cstate->attnumlist) |
1142 | 0 | { |
1143 | 0 | int attnum = lfirst_int(cur); |
1144 | 0 | int m = attnum - 1; |
1145 | 0 | Form_pg_attribute att = TupleDescAttr(tupDesc, m); |
1146 | |
|
1147 | 0 | cstate->cur_attname = NameStr(att->attname); |
1148 | 0 | values[m] = CopyReadBinaryAttribute(cstate, |
1149 | 0 | &in_functions[m], |
1150 | 0 | typioparams[m], |
1151 | 0 | att->atttypmod, |
1152 | 0 | &nulls[m]); |
1153 | 0 | cstate->cur_attname = NULL; |
1154 | 0 | } |
1155 | |
|
1156 | 0 | return true; |
1157 | 0 | } |
1158 | | |
1159 | | /* |
1160 | | * Read the next input line and stash it in line_buf. |
1161 | | * |
1162 | | * Result is true if read was terminated by EOF, false if terminated |
1163 | | * by newline. The terminating newline or EOF marker is not included |
1164 | | * in the final value of line_buf. |
1165 | | */ |
1166 | | static bool |
1167 | | CopyReadLine(CopyFromState cstate, bool is_csv) |
1168 | 0 | { |
1169 | 0 | bool result; |
1170 | |
|
1171 | 0 | resetStringInfo(&cstate->line_buf); |
1172 | 0 | cstate->line_buf_valid = false; |
1173 | | |
1174 | | /* Parse data and transfer into line_buf */ |
1175 | 0 | result = CopyReadLineText(cstate, is_csv); |
1176 | |
|
1177 | 0 | if (result) |
1178 | 0 | { |
1179 | | /* |
1180 | | * Reached EOF. In protocol version 3, we should ignore anything |
1181 | | * after \. up to the protocol end of copy data. (XXX maybe better |
1182 | | * not to treat \. as special?) |
1183 | | */ |
1184 | 0 | if (cstate->copy_src == COPY_FRONTEND) |
1185 | 0 | { |
1186 | 0 | int inbytes; |
1187 | |
|
1188 | 0 | do |
1189 | 0 | { |
1190 | 0 | inbytes = CopyGetData(cstate, cstate->input_buf, |
1191 | 0 | 1, INPUT_BUF_SIZE); |
1192 | 0 | } while (inbytes > 0); |
1193 | 0 | cstate->input_buf_index = 0; |
1194 | 0 | cstate->input_buf_len = 0; |
1195 | 0 | cstate->raw_buf_index = 0; |
1196 | 0 | cstate->raw_buf_len = 0; |
1197 | 0 | } |
1198 | 0 | } |
1199 | 0 | else |
1200 | 0 | { |
1201 | | /* |
1202 | | * If we didn't hit EOF, then we must have transferred the EOL marker |
1203 | | * to line_buf along with the data. Get rid of it. |
1204 | | */ |
1205 | 0 | switch (cstate->eol_type) |
1206 | 0 | { |
1207 | 0 | case EOL_NL: |
1208 | 0 | Assert(cstate->line_buf.len >= 1); |
1209 | 0 | Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); |
1210 | 0 | cstate->line_buf.len--; |
1211 | 0 | cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
1212 | 0 | break; |
1213 | 0 | case EOL_CR: |
1214 | 0 | Assert(cstate->line_buf.len >= 1); |
1215 | 0 | Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r'); |
1216 | 0 | cstate->line_buf.len--; |
1217 | 0 | cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
1218 | 0 | break; |
1219 | 0 | case EOL_CRNL: |
1220 | 0 | Assert(cstate->line_buf.len >= 2); |
1221 | 0 | Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r'); |
1222 | 0 | Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n'); |
1223 | 0 | cstate->line_buf.len -= 2; |
1224 | 0 | cstate->line_buf.data[cstate->line_buf.len] = '\0'; |
1225 | 0 | break; |
1226 | 0 | case EOL_UNKNOWN: |
1227 | | /* shouldn't get here */ |
1228 | 0 | Assert(false); |
1229 | 0 | break; |
1230 | 0 | } |
1231 | 0 | } |
1232 | | |
1233 | | /* Now it's safe to use the buffer in error messages */ |
1234 | 0 | cstate->line_buf_valid = true; |
1235 | |
|
1236 | 0 | return result; |
1237 | 0 | } |
1238 | | |
/*
 * CopyReadLineText - inner loop of CopyReadLine for text and CSV modes
 *
 * Scans input_buf for the end of the next line and transfers the line,
 * including its end-of-line marker, into line_buf.  'is_csv' selects CSV
 * quote/escape tracking; otherwise backslash sequences and the \.
 * end-of-copy marker are recognized.
 *
 * Returns true when the scan was ended by EOF (input exhausted, or the
 * end-of-copy marker was seen), false when a line terminator was found.
 *
 * NOTE(review): REFILL_LINEBUF and the IF_NEED_REFILL_* macros are defined
 * outside this function and appear to operate on its local variables
 * (input_buf_ptr, copy_buf_len, prev_raw_ptr, need_data, hit_eof, result);
 * confirm against the macro definitions before renaming or reordering
 * anything here.
 */
static bool
CopyReadLineText(CopyFromState cstate, bool is_csv)
{
	char	   *copy_input_buf;
	int			input_buf_ptr;
	int			copy_buf_len;
	bool		need_data = false;
	bool		hit_eof = false;
	bool		result = false;

	/* CSV variables */
	bool		in_quote = false,
				last_was_esc = false;
	char		quotec = '\0';
	char		escapec = '\0';

	if (is_csv)
	{
		quotec = cstate->opts.quote[0];
		escapec = cstate->opts.escape[0];
		/* ignore special escape processing if it's the same as quotec */
		if (quotec == escapec)
			escapec = '\0';
	}

	/*
	 * The objective of this loop is to transfer the entire next input line
	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
	 * \n) and the end-of-copy marker (\.).
	 *
	 * In CSV mode, \r and \n inside a quoted field are just part of the data
	 * value and are put in line_buf.  We keep just enough state to know if we
	 * are currently in a quoted field or not.
	 *
	 * The input has already been converted to the database encoding.  All
	 * supported server encodings have the property that all bytes in a
	 * multi-byte sequence have the high bit set, so a multibyte character
	 * cannot contain any newline or escape characters embedded in the
	 * multibyte sequence.  Therefore, we can process the input byte-by-byte,
	 * regardless of the encoding.
	 *
	 * For speed, we try to move data from input_buf to line_buf in chunks
	 * rather than one character at a time.  input_buf_ptr points to the next
	 * character to examine; any characters from input_buf_index to
	 * input_buf_ptr have been determined to be part of the line, but not yet
	 * transferred to line_buf.
	 *
	 * For a little extra speed within the loop, we copy input_buf and
	 * input_buf_len into local variables.
	 */
	copy_input_buf = cstate->input_buf;
	input_buf_ptr = cstate->input_buf_index;
	copy_buf_len = cstate->input_buf_len;

	for (;;)
	{
		int			prev_raw_ptr;
		char		c;

		/*
		 * Load more data if needed.
		 *
		 * TODO: We could just force four bytes of read-ahead and avoid the
		 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
		 * unsafe with the old v2 COPY protocol, but we don't support that
		 * anymore.
		 */
		if (input_buf_ptr >= copy_buf_len || need_data)
		{
			/* Save what has been scanned so far before the buffer changes */
			REFILL_LINEBUF;

			CopyLoadInputBuf(cstate);
			/* update our local variables */
			hit_eof = cstate->input_reached_eof;
			input_buf_ptr = cstate->input_buf_index;
			copy_buf_len = cstate->input_buf_len;

			/*
			 * If we are completely out of data, break out of the loop,
			 * reporting EOF.
			 */
			if (INPUT_BUF_BYTES(cstate) <= 0)
			{
				result = true;
				break;
			}
			need_data = false;
		}

		/*
		 * OK to fetch a character.  prev_raw_ptr remembers where this
		 * character started; it is used below to tell whether anything
		 * precedes an end-of-copy marker on its line.
		 */
		prev_raw_ptr = input_buf_ptr;
		c = copy_input_buf[input_buf_ptr++];

		if (is_csv)
		{
			/*
			 * If character is '\r', we may need to look ahead below.  Force
			 * fetch of the next character if we don't already have it.  We
			 * need to do this before changing CSV state, in case '\r' is also
			 * the quote or escape character.
			 */
			if (c == '\r')
			{
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			}

			/*
			 * Dealing with quotes and escapes here is mildly tricky. If the
			 * quote char is also the escape char, there's no problem - we
			 * just use the char as a toggle. If they are different, we need
			 * to ensure that we only take account of an escape inside a
			 * quoted field and immediately preceding a quote char, and not
			 * the second in an escape-escape sequence.
			 */
			if (in_quote && c == escapec)
				last_was_esc = !last_was_esc;
			if (c == quotec && !last_was_esc)
				in_quote = !in_quote;
			if (c != escapec)
				last_was_esc = false;

			/*
			 * Updating the line count for embedded CR and/or LF chars is
			 * necessarily a little fragile - this test is probably about the
			 * best we can do.  (XXX it's arguable whether we should do this
			 * at all --- is cur_lineno a physical or logical count?)
			 */
			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
				cstate->cur_lineno++;
		}

		/* Process \r (in CSV mode only when outside a quoted field) */
		if (c == '\r' && (!is_csv || !in_quote))
		{
			/* Check for \r\n on first line, _and_ handle \r\n. */
			if (cstate->eol_type == EOL_UNKNOWN ||
				cstate->eol_type == EOL_CRNL)
			{
				/*
				 * If need more data, go back to loop top to load it.
				 *
				 * Note that if we are at EOF, c will wind up as '\0' because
				 * of the guaranteed pad of input_buf.
				 */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);

				/* get next char */
				c = copy_input_buf[input_buf_ptr];

				if (c == '\n')
				{
					input_buf_ptr++;	/* eat newline */
					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
				}
				else
				{
					/* found \r, but no \n */
					if (cstate->eol_type == EOL_CRNL)
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 !is_csv ?
								 errmsg("literal carriage return found in data") :
								 errmsg("unquoted carriage return found in data"),
								 !is_csv ?
								 errhint("Use \"\\r\" to represent carriage return.") :
								 errhint("Use quoted CSV field to represent carriage return.")));

					/*
					 * if we got here, it is the first line and we didn't find
					 * \n, so don't consume the peeked character
					 */
					cstate->eol_type = EOL_CR;
				}
			}
			else if (cstate->eol_type == EOL_NL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal carriage return found in data") :
						 errmsg("unquoted carriage return found in data"),
						 !is_csv ?
						 errhint("Use \"\\r\" to represent carriage return.") :
						 errhint("Use quoted CSV field to represent carriage return.")));
			/* If reach here, we have found the line terminator */
			break;
		}

		/* Process \n (in CSV mode only when outside a quoted field) */
		if (c == '\n' && (!is_csv || !in_quote))
		{
			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
				ereport(ERROR,
						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
						 !is_csv ?
						 errmsg("literal newline found in data") :
						 errmsg("unquoted newline found in data"),
						 !is_csv ?
						 errhint("Use \"\\n\" to represent newline.") :
						 errhint("Use quoted CSV field to represent newline.")));
			cstate->eol_type = EOL_NL;	/* in case not set yet */
			/* If reach here, we have found the line terminator */
			break;
		}

		/*
		 * Process backslash, except in CSV mode where backslash is a normal
		 * character.
		 */
		if (c == '\\' && !is_csv)
		{
			char		c2;

			/*
			 * We need the character after the backslash: loop back for more
			 * data if it is not buffered yet, or stop if the input has ended.
			 */
			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
			IF_NEED_REFILL_AND_EOF_BREAK(0);

			/* -----
			 * get next character
			 * Note: we do not change c so if it isn't \., we can fall
			 * through and continue processing.
			 * -----
			 */
			c2 = copy_input_buf[input_buf_ptr];

			if (c2 == '.')
			{
				input_buf_ptr++;	/* consume the '.' */
				if (cstate->eol_type == EOL_CRNL)
				{
					/*
					 * With \r\n line endings the marker must be followed by
					 * \r here; the \n is checked further down.
					 */
					/* Get the next character */
					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
					/* if hit_eof, c2 will become '\0' */
					c2 = copy_input_buf[input_buf_ptr++];

					if (c2 == '\n')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker does not match previous newline style")));
					else if (c2 != '\r')
						ereport(ERROR,
								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
								 errmsg("end-of-copy marker is not alone on its line")));
				}

				/* Get the next character */
				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
				/* if hit_eof, c2 will become '\0' */
				c2 = copy_input_buf[input_buf_ptr++];

				if (c2 != '\r' && c2 != '\n')
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/* The terminator must match the newline style seen so far */
				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
					(cstate->eol_type == EOL_CR && c2 != '\r'))
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker does not match previous newline style")));

				/*
				 * If there is any data on this line before the \., complain.
				 */
				if (cstate->line_buf.len > 0 ||
					prev_raw_ptr > cstate->input_buf_index)
					ereport(ERROR,
							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
							 errmsg("end-of-copy marker is not alone on its line")));

				/*
				 * Discard the \. and newline, then report EOF.
				 */
				cstate->input_buf_index = input_buf_ptr;
				result = true;	/* report EOF */
				break;
			}
			else
			{
				/*
				 * If we are here, it means we found a backslash followed by
				 * something other than a period.  In non-CSV mode, anything
				 * after a backslash is special, so we skip over that second
				 * character too.  If we didn't do that \\. would be
				 * considered an end-of-copy marker, while in non-CSV mode it
				 * is a literal backslash followed by a period.
				 */
				input_buf_ptr++;
			}
		}
	}							/* end of outer loop */

	/*
	 * Transfer any still-uncopied data to line_buf.
	 */
	REFILL_LINEBUF;

	return result;
}
1540 | | |
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0..9; any other character is lower-cased and
 * measured from 'a', so 'a'/'A' yields 10.  The input is not validated
 * here.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char uc = (unsigned char) hex;

	return isdigit(uc) ? hex - '0' : pg_ascii_tolower(uc) - 'a' + 10;
}
1552 | | |
1553 | | /* |
1554 | | * Parse the current line into separate attributes (fields), |
1555 | | * performing de-escaping as needed. |
1556 | | * |
1557 | | * The input is in line_buf. We use attribute_buf to hold the result |
1558 | | * strings. cstate->raw_fields[k] is set to point to the k'th attribute |
1559 | | * string, or NULL when the input matches the null marker string. |
1560 | | * This array is expanded as necessary. |
1561 | | * |
1562 | | * (Note that the caller cannot check for nulls since the returned |
1563 | | * string would be the post-de-escaping equivalent, which may look |
1564 | | * the same as some valid data string.) |
1565 | | * |
1566 | | * delim is the column delimiter string (must be just one byte for now). |
1567 | | * null_print is the null marker string. Note that this is compared to |
1568 | | * the pre-de-escaped input string. |
1569 | | * |
1570 | | * The return value is the number of fields actually read. |
1571 | | */ |
1572 | | static int |
1573 | | CopyReadAttributesText(CopyFromState cstate) |
1574 | 0 | { |
1575 | 0 | char delimc = cstate->opts.delim[0]; |
1576 | 0 | int fieldno; |
1577 | 0 | char *output_ptr; |
1578 | 0 | char *cur_ptr; |
1579 | 0 | char *line_end_ptr; |
1580 | | |
1581 | | /* |
1582 | | * We need a special case for zero-column tables: check that the input |
1583 | | * line is empty, and return. |
1584 | | */ |
1585 | 0 | if (cstate->max_fields <= 0) |
1586 | 0 | { |
1587 | 0 | if (cstate->line_buf.len != 0) |
1588 | 0 | ereport(ERROR, |
1589 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1590 | 0 | errmsg("extra data after last expected column"))); |
1591 | 0 | return 0; |
1592 | 0 | } |
1593 | | |
1594 | 0 | resetStringInfo(&cstate->attribute_buf); |
1595 | | |
1596 | | /* |
1597 | | * The de-escaped attributes will certainly not be longer than the input |
1598 | | * data line, so we can just force attribute_buf to be large enough and |
1599 | | * then transfer data without any checks for enough space. We need to do |
1600 | | * it this way because enlarging attribute_buf mid-stream would invalidate |
1601 | | * pointers already stored into cstate->raw_fields[]. |
1602 | | */ |
1603 | 0 | if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) |
1604 | 0 | enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); |
1605 | 0 | output_ptr = cstate->attribute_buf.data; |
1606 | | |
1607 | | /* set pointer variables for loop */ |
1608 | 0 | cur_ptr = cstate->line_buf.data; |
1609 | 0 | line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; |
1610 | | |
1611 | | /* Outer loop iterates over fields */ |
1612 | 0 | fieldno = 0; |
1613 | 0 | for (;;) |
1614 | 0 | { |
1615 | 0 | bool found_delim = false; |
1616 | 0 | char *start_ptr; |
1617 | 0 | char *end_ptr; |
1618 | 0 | int input_len; |
1619 | 0 | bool saw_non_ascii = false; |
1620 | | |
1621 | | /* Make sure there is enough space for the next value */ |
1622 | 0 | if (fieldno >= cstate->max_fields) |
1623 | 0 | { |
1624 | 0 | cstate->max_fields *= 2; |
1625 | 0 | cstate->raw_fields = |
1626 | 0 | repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *)); |
1627 | 0 | } |
1628 | | |
1629 | | /* Remember start of field on both input and output sides */ |
1630 | 0 | start_ptr = cur_ptr; |
1631 | 0 | cstate->raw_fields[fieldno] = output_ptr; |
1632 | | |
1633 | | /* |
1634 | | * Scan data for field. |
1635 | | * |
1636 | | * Note that in this loop, we are scanning to locate the end of field |
1637 | | * and also speculatively performing de-escaping. Once we find the |
1638 | | * end-of-field, we can match the raw field contents against the null |
1639 | | * marker string. Only after that comparison fails do we know that |
1640 | | * de-escaping is actually the right thing to do; therefore we *must |
1641 | | * not* throw any syntax errors before we've done the null-marker |
1642 | | * check. |
1643 | | */ |
1644 | 0 | for (;;) |
1645 | 0 | { |
1646 | 0 | char c; |
1647 | |
|
1648 | 0 | end_ptr = cur_ptr; |
1649 | 0 | if (cur_ptr >= line_end_ptr) |
1650 | 0 | break; |
1651 | 0 | c = *cur_ptr++; |
1652 | 0 | if (c == delimc) |
1653 | 0 | { |
1654 | 0 | found_delim = true; |
1655 | 0 | break; |
1656 | 0 | } |
1657 | 0 | if (c == '\\') |
1658 | 0 | { |
1659 | 0 | if (cur_ptr >= line_end_ptr) |
1660 | 0 | break; |
1661 | 0 | c = *cur_ptr++; |
1662 | 0 | switch (c) |
1663 | 0 | { |
1664 | 0 | case '0': |
1665 | 0 | case '1': |
1666 | 0 | case '2': |
1667 | 0 | case '3': |
1668 | 0 | case '4': |
1669 | 0 | case '5': |
1670 | 0 | case '6': |
1671 | 0 | case '7': |
1672 | 0 | { |
1673 | | /* handle \013 */ |
1674 | 0 | int val; |
1675 | |
|
1676 | 0 | val = OCTVALUE(c); |
1677 | 0 | if (cur_ptr < line_end_ptr) |
1678 | 0 | { |
1679 | 0 | c = *cur_ptr; |
1680 | 0 | if (ISOCTAL(c)) |
1681 | 0 | { |
1682 | 0 | cur_ptr++; |
1683 | 0 | val = (val << 3) + OCTVALUE(c); |
1684 | 0 | if (cur_ptr < line_end_ptr) |
1685 | 0 | { |
1686 | 0 | c = *cur_ptr; |
1687 | 0 | if (ISOCTAL(c)) |
1688 | 0 | { |
1689 | 0 | cur_ptr++; |
1690 | 0 | val = (val << 3) + OCTVALUE(c); |
1691 | 0 | } |
1692 | 0 | } |
1693 | 0 | } |
1694 | 0 | } |
1695 | 0 | c = val & 0377; |
1696 | 0 | if (c == '\0' || IS_HIGHBIT_SET(c)) |
1697 | 0 | saw_non_ascii = true; |
1698 | 0 | } |
1699 | 0 | break; |
1700 | 0 | case 'x': |
1701 | | /* Handle \x3F */ |
1702 | 0 | if (cur_ptr < line_end_ptr) |
1703 | 0 | { |
1704 | 0 | char hexchar = *cur_ptr; |
1705 | |
|
1706 | 0 | if (isxdigit((unsigned char) hexchar)) |
1707 | 0 | { |
1708 | 0 | int val = GetDecimalFromHex(hexchar); |
1709 | |
|
1710 | 0 | cur_ptr++; |
1711 | 0 | if (cur_ptr < line_end_ptr) |
1712 | 0 | { |
1713 | 0 | hexchar = *cur_ptr; |
1714 | 0 | if (isxdigit((unsigned char) hexchar)) |
1715 | 0 | { |
1716 | 0 | cur_ptr++; |
1717 | 0 | val = (val << 4) + GetDecimalFromHex(hexchar); |
1718 | 0 | } |
1719 | 0 | } |
1720 | 0 | c = val & 0xff; |
1721 | 0 | if (c == '\0' || IS_HIGHBIT_SET(c)) |
1722 | 0 | saw_non_ascii = true; |
1723 | 0 | } |
1724 | 0 | } |
1725 | 0 | break; |
1726 | 0 | case 'b': |
1727 | 0 | c = '\b'; |
1728 | 0 | break; |
1729 | 0 | case 'f': |
1730 | 0 | c = '\f'; |
1731 | 0 | break; |
1732 | 0 | case 'n': |
1733 | 0 | c = '\n'; |
1734 | 0 | break; |
1735 | 0 | case 'r': |
1736 | 0 | c = '\r'; |
1737 | 0 | break; |
1738 | 0 | case 't': |
1739 | 0 | c = '\t'; |
1740 | 0 | break; |
1741 | 0 | case 'v': |
1742 | 0 | c = '\v'; |
1743 | 0 | break; |
1744 | | |
1745 | | /* |
1746 | | * in all other cases, take the char after '\' |
1747 | | * literally |
1748 | | */ |
1749 | 0 | } |
1750 | 0 | } |
1751 | | |
1752 | | /* Add c to output string */ |
1753 | 0 | *output_ptr++ = c; |
1754 | 0 | } |
1755 | | |
1756 | | /* Check whether raw input matched null marker */ |
1757 | 0 | input_len = end_ptr - start_ptr; |
1758 | 0 | if (input_len == cstate->opts.null_print_len && |
1759 | 0 | strncmp(start_ptr, cstate->opts.null_print, input_len) == 0) |
1760 | 0 | cstate->raw_fields[fieldno] = NULL; |
1761 | | /* Check whether raw input matched default marker */ |
1762 | 0 | else if (fieldno < list_length(cstate->attnumlist) && |
1763 | 0 | cstate->opts.default_print && |
1764 | 0 | input_len == cstate->opts.default_print_len && |
1765 | 0 | strncmp(start_ptr, cstate->opts.default_print, input_len) == 0) |
1766 | 0 | { |
1767 | | /* fieldno is 0-indexed and attnum is 1-indexed */ |
1768 | 0 | int m = list_nth_int(cstate->attnumlist, fieldno) - 1; |
1769 | |
|
1770 | 0 | if (cstate->defexprs[m] != NULL) |
1771 | 0 | { |
1772 | | /* defaults contain entries for all physical attributes */ |
1773 | 0 | cstate->defaults[m] = true; |
1774 | 0 | } |
1775 | 0 | else |
1776 | 0 | { |
1777 | 0 | TupleDesc tupDesc = RelationGetDescr(cstate->rel); |
1778 | 0 | Form_pg_attribute att = TupleDescAttr(tupDesc, m); |
1779 | |
|
1780 | 0 | ereport(ERROR, |
1781 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1782 | 0 | errmsg("unexpected default marker in COPY data"), |
1783 | 0 | errdetail("Column \"%s\" has no default value.", |
1784 | 0 | NameStr(att->attname)))); |
1785 | 0 | } |
1786 | 0 | } |
1787 | 0 | else |
1788 | 0 | { |
1789 | | /* |
1790 | | * At this point we know the field is supposed to contain data. |
1791 | | * |
1792 | | * If we de-escaped any non-7-bit-ASCII chars, make sure the |
1793 | | * resulting string is valid data for the db encoding. |
1794 | | */ |
1795 | 0 | if (saw_non_ascii) |
1796 | 0 | { |
1797 | 0 | char *fld = cstate->raw_fields[fieldno]; |
1798 | |
|
1799 | 0 | pg_verifymbstr(fld, output_ptr - fld, false); |
1800 | 0 | } |
1801 | 0 | } |
1802 | | |
1803 | | /* Terminate attribute value in output area */ |
1804 | 0 | *output_ptr++ = '\0'; |
1805 | |
|
1806 | 0 | fieldno++; |
1807 | | /* Done if we hit EOL instead of a delim */ |
1808 | 0 | if (!found_delim) |
1809 | 0 | break; |
1810 | 0 | } |
1811 | | |
1812 | | /* Clean up state of attribute_buf */ |
1813 | 0 | output_ptr--; |
1814 | 0 | Assert(*output_ptr == '\0'); |
1815 | 0 | cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data); |
1816 | |
|
1817 | 0 | return fieldno; |
1818 | 0 | } |
1819 | | |
1820 | | /* |
1821 | | * Parse the current line into separate attributes (fields), |
1822 | | * performing de-escaping as needed. This has exactly the same API as |
1823 | | * CopyReadAttributesText, except we parse the fields according to |
1824 | | * "standard" (i.e. common) CSV usage. |
1825 | | */ |
1826 | | static int |
1827 | | CopyReadAttributesCSV(CopyFromState cstate) |
1828 | 0 | { |
1829 | 0 | char delimc = cstate->opts.delim[0]; |
1830 | 0 | char quotec = cstate->opts.quote[0]; |
1831 | 0 | char escapec = cstate->opts.escape[0]; |
1832 | 0 | int fieldno; |
1833 | 0 | char *output_ptr; |
1834 | 0 | char *cur_ptr; |
1835 | 0 | char *line_end_ptr; |
1836 | | |
1837 | | /* |
1838 | | * We need a special case for zero-column tables: check that the input |
1839 | | * line is empty, and return. |
1840 | | */ |
1841 | 0 | if (cstate->max_fields <= 0) |
1842 | 0 | { |
1843 | 0 | if (cstate->line_buf.len != 0) |
1844 | 0 | ereport(ERROR, |
1845 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1846 | 0 | errmsg("extra data after last expected column"))); |
1847 | 0 | return 0; |
1848 | 0 | } |
1849 | | |
1850 | 0 | resetStringInfo(&cstate->attribute_buf); |
1851 | | |
1852 | | /* |
1853 | | * The de-escaped attributes will certainly not be longer than the input |
1854 | | * data line, so we can just force attribute_buf to be large enough and |
1855 | | * then transfer data without any checks for enough space. We need to do |
1856 | | * it this way because enlarging attribute_buf mid-stream would invalidate |
1857 | | * pointers already stored into cstate->raw_fields[]. |
1858 | | */ |
1859 | 0 | if (cstate->attribute_buf.maxlen <= cstate->line_buf.len) |
1860 | 0 | enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len); |
1861 | 0 | output_ptr = cstate->attribute_buf.data; |
1862 | | |
1863 | | /* set pointer variables for loop */ |
1864 | 0 | cur_ptr = cstate->line_buf.data; |
1865 | 0 | line_end_ptr = cstate->line_buf.data + cstate->line_buf.len; |
1866 | | |
1867 | | /* Outer loop iterates over fields */ |
1868 | 0 | fieldno = 0; |
1869 | 0 | for (;;) |
1870 | 0 | { |
1871 | 0 | bool found_delim = false; |
1872 | 0 | bool saw_quote = false; |
1873 | 0 | char *start_ptr; |
1874 | 0 | char *end_ptr; |
1875 | 0 | int input_len; |
1876 | | |
1877 | | /* Make sure there is enough space for the next value */ |
1878 | 0 | if (fieldno >= cstate->max_fields) |
1879 | 0 | { |
1880 | 0 | cstate->max_fields *= 2; |
1881 | 0 | cstate->raw_fields = |
1882 | 0 | repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *)); |
1883 | 0 | } |
1884 | | |
1885 | | /* Remember start of field on both input and output sides */ |
1886 | 0 | start_ptr = cur_ptr; |
1887 | 0 | cstate->raw_fields[fieldno] = output_ptr; |
1888 | | |
1889 | | /* |
1890 | | * Scan data for field, |
1891 | | * |
1892 | | * The loop starts in "not quote" mode and then toggles between that |
1893 | | * and "in quote" mode. The loop exits normally if it is in "not |
1894 | | * quote" mode and a delimiter or line end is seen. |
1895 | | */ |
1896 | 0 | for (;;) |
1897 | 0 | { |
1898 | 0 | char c; |
1899 | | |
1900 | | /* Not in quote */ |
1901 | 0 | for (;;) |
1902 | 0 | { |
1903 | 0 | end_ptr = cur_ptr; |
1904 | 0 | if (cur_ptr >= line_end_ptr) |
1905 | 0 | goto endfield; |
1906 | 0 | c = *cur_ptr++; |
1907 | | /* unquoted field delimiter */ |
1908 | 0 | if (c == delimc) |
1909 | 0 | { |
1910 | 0 | found_delim = true; |
1911 | 0 | goto endfield; |
1912 | 0 | } |
1913 | | /* start of quoted field (or part of field) */ |
1914 | 0 | if (c == quotec) |
1915 | 0 | { |
1916 | 0 | saw_quote = true; |
1917 | 0 | break; |
1918 | 0 | } |
1919 | | /* Add c to output string */ |
1920 | 0 | *output_ptr++ = c; |
1921 | 0 | } |
1922 | | |
1923 | | /* In quote */ |
1924 | 0 | for (;;) |
1925 | 0 | { |
1926 | 0 | end_ptr = cur_ptr; |
1927 | 0 | if (cur_ptr >= line_end_ptr) |
1928 | 0 | ereport(ERROR, |
1929 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1930 | 0 | errmsg("unterminated CSV quoted field"))); |
1931 | | |
1932 | 0 | c = *cur_ptr++; |
1933 | | |
1934 | | /* escape within a quoted field */ |
1935 | 0 | if (c == escapec) |
1936 | 0 | { |
1937 | | /* |
1938 | | * peek at the next char if available, and escape it if it |
1939 | | * is an escape char or a quote char |
1940 | | */ |
1941 | 0 | if (cur_ptr < line_end_ptr) |
1942 | 0 | { |
1943 | 0 | char nextc = *cur_ptr; |
1944 | |
|
1945 | 0 | if (nextc == escapec || nextc == quotec) |
1946 | 0 | { |
1947 | 0 | *output_ptr++ = nextc; |
1948 | 0 | cur_ptr++; |
1949 | 0 | continue; |
1950 | 0 | } |
1951 | 0 | } |
1952 | 0 | } |
1953 | | |
1954 | | /* |
1955 | | * end of quoted field. Must do this test after testing for |
1956 | | * escape in case quote char and escape char are the same |
1957 | | * (which is the common case). |
1958 | | */ |
1959 | 0 | if (c == quotec) |
1960 | 0 | break; |
1961 | | |
1962 | | /* Add c to output string */ |
1963 | 0 | *output_ptr++ = c; |
1964 | 0 | } |
1965 | 0 | } |
1966 | 0 | endfield: |
1967 | | |
1968 | | /* Terminate attribute value in output area */ |
1969 | 0 | *output_ptr++ = '\0'; |
1970 | | |
1971 | | /* Check whether raw input matched null marker */ |
1972 | 0 | input_len = end_ptr - start_ptr; |
1973 | 0 | if (!saw_quote && input_len == cstate->opts.null_print_len && |
1974 | 0 | strncmp(start_ptr, cstate->opts.null_print, input_len) == 0) |
1975 | 0 | cstate->raw_fields[fieldno] = NULL; |
1976 | | /* Check whether raw input matched default marker */ |
1977 | 0 | else if (fieldno < list_length(cstate->attnumlist) && |
1978 | 0 | cstate->opts.default_print && |
1979 | 0 | input_len == cstate->opts.default_print_len && |
1980 | 0 | strncmp(start_ptr, cstate->opts.default_print, input_len) == 0) |
1981 | 0 | { |
1982 | | /* fieldno is 0-index and attnum is 1-index */ |
1983 | 0 | int m = list_nth_int(cstate->attnumlist, fieldno) - 1; |
1984 | |
|
1985 | 0 | if (cstate->defexprs[m] != NULL) |
1986 | 0 | { |
1987 | | /* defaults contain entries for all physical attributes */ |
1988 | 0 | cstate->defaults[m] = true; |
1989 | 0 | } |
1990 | 0 | else |
1991 | 0 | { |
1992 | 0 | TupleDesc tupDesc = RelationGetDescr(cstate->rel); |
1993 | 0 | Form_pg_attribute att = TupleDescAttr(tupDesc, m); |
1994 | |
|
1995 | 0 | ereport(ERROR, |
1996 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
1997 | 0 | errmsg("unexpected default marker in COPY data"), |
1998 | 0 | errdetail("Column \"%s\" has no default value.", |
1999 | 0 | NameStr(att->attname)))); |
2000 | 0 | } |
2001 | 0 | } |
2002 | | |
2003 | 0 | fieldno++; |
2004 | | /* Done if we hit EOL instead of a delim */ |
2005 | 0 | if (!found_delim) |
2006 | 0 | break; |
2007 | 0 | } |
2008 | | |
2009 | | /* Clean up state of attribute_buf */ |
2010 | 0 | output_ptr--; |
2011 | 0 | Assert(*output_ptr == '\0'); |
2012 | 0 | cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data); |
2013 | |
|
2014 | 0 | return fieldno; |
2015 | 0 | } |
2016 | | |
2017 | | |
2018 | | /* |
2019 | | * Read a binary attribute |
2020 | | */ |
2021 | | static Datum |
2022 | | CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, |
2023 | | Oid typioparam, int32 typmod, |
2024 | | bool *isnull) |
2025 | 0 | { |
2026 | 0 | int32 fld_size; |
2027 | 0 | Datum result; |
2028 | |
|
2029 | 0 | if (!CopyGetInt32(cstate, &fld_size)) |
2030 | 0 | ereport(ERROR, |
2031 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
2032 | 0 | errmsg("unexpected EOF in COPY data"))); |
2033 | 0 | if (fld_size == -1) |
2034 | 0 | { |
2035 | 0 | *isnull = true; |
2036 | 0 | return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod); |
2037 | 0 | } |
2038 | 0 | if (fld_size < 0) |
2039 | 0 | ereport(ERROR, |
2040 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
2041 | 0 | errmsg("invalid field size"))); |
2042 | | |
2043 | | /* reset attribute_buf to empty, and load raw data in it */ |
2044 | 0 | resetStringInfo(&cstate->attribute_buf); |
2045 | |
|
2046 | 0 | enlargeStringInfo(&cstate->attribute_buf, fld_size); |
2047 | 0 | if (CopyReadBinaryData(cstate, cstate->attribute_buf.data, |
2048 | 0 | fld_size) != fld_size) |
2049 | 0 | ereport(ERROR, |
2050 | 0 | (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), |
2051 | 0 | errmsg("unexpected EOF in COPY data"))); |
2052 | | |
2053 | 0 | cstate->attribute_buf.len = fld_size; |
2054 | 0 | cstate->attribute_buf.data[fld_size] = '\0'; |
2055 | | |
2056 | | /* Call the column type's binary input converter */ |
2057 | 0 | result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf, |
2058 | 0 | typioparam, typmod); |
2059 | | |
2060 | | /* Trouble if it didn't eat the whole buffer */ |
2061 | 0 | if (cstate->attribute_buf.cursor != cstate->attribute_buf.len) |
2062 | 0 | ereport(ERROR, |
2063 | 0 | (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), |
2064 | 0 | errmsg("incorrect binary data format"))); |
2065 | | |
2066 | 0 | *isnull = false; |
2067 | 0 | return result; |
2068 | 0 | } |