Coverage Report

Created: 2025-10-09 06:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/postgres/src/backend/commands/copyfromparse.c
Line
Count
Source
1
/*-------------------------------------------------------------------------
2
 *
3
 * copyfromparse.c
4
 *    Parse CSV/text/binary format for COPY FROM.
5
 *
6
 * This file contains routines to parse the text, CSV and binary input
7
 * formats.  The main entry point is NextCopyFrom(), which parses the
8
 * next input line and returns it as Datums.
9
 *
10
 * In text/CSV mode, the parsing happens in multiple stages:
11
 *
12
 * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13
 *                1.          2.            3.           4.
14
 *
15
 * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16
 *    places it into 'raw_buf'.
17
 *
18
 * 2. CopyConvertBuf() calls the encoding conversion function to convert
19
 *    the data in 'raw_buf' from client to server encoding, placing the
20
 *    converted result in 'input_buf'.
21
 *
22
 * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23
 *    It is responsible for finding the next newline marker, taking quote and
24
 *    escape characters into account according to the COPY options.  The line
25
 *    is copied into 'line_buf', with quotes and escape characters still
26
 *    intact.
27
 *
28
 * 4. CopyReadAttributesText/CSV() function takes the input line from
29
 *    'line_buf', and splits it into fields, unescaping the data as required.
30
 *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31
 *    pointers to each field.
32
 *
33
 * If encoding conversion is not required, a shortcut is taken in step 2 to
34
 * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
35
 * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36
 * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
37
 * the data is valid in the current encoding.
38
 *
39
 * In binary mode, the pipeline is much simpler.  Input is loaded into
40
 * 'raw_buf', and encoding conversion is done in the datatype-specific
41
 * receive functions, if required.  'input_buf' and 'line_buf' are not used,
42
 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43
 * data when it's passed the receive function.
44
 *
45
 * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
46
 * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
47
 * and 'attribute_buf' are expanded on demand, to hold the longest line
48
 * encountered so far.
49
 *
50
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
51
 * Portions Copyright (c) 1994, Regents of the University of California
52
 *
53
 *
54
 * IDENTIFICATION
55
 *    src/backend/commands/copyfromparse.c
56
 *
57
 *-------------------------------------------------------------------------
58
 */
59
#include "postgres.h"
60
61
#include <ctype.h>
62
#include <unistd.h>
63
#include <sys/stat.h>
64
65
#include "commands/copyapi.h"
66
#include "commands/copyfrom_internal.h"
67
#include "commands/progress.h"
68
#include "executor/executor.h"
69
#include "libpq/libpq.h"
70
#include "libpq/pqformat.h"
71
#include "mb/pg_wchar.h"
72
#include "miscadmin.h"
73
#include "pgstat.h"
74
#include "port/pg_bswap.h"
75
#include "utils/builtins.h"
76
#include "utils/rel.h"
77
78
0
/* True if 'c' is an octal digit character ('0'..'7'); evaluates 'c' twice. */
#define ISOCTAL(c) ((c) >= '0' && (c) <= '7')
79
0
/* Distance of 'c' from '0'; for an octal digit character this is its numeric value. */
#define OCTVALUE(c) ((c) - '0')
80
81
/*
82
 * These macros centralize code used to process line_buf and input_buf buffers.
83
 * They are macros because they often do continue/break control and to avoid
84
 * function call overhead in tight COPY loops.
85
 *
86
 * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87
 * prevent the continue/break processing from working.  We end the "if (1)"
88
 * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89
 * any "else" in the calling code, and to avoid any compiler warnings about
90
 * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
91
 */
92
93
/*
 * If fewer than (extralen + 1) bytes remain at input_buf_ptr and EOF has not
 * been hit, rewind input_buf_ptr to prev_raw_ptr (so the character fetched
 * at the top of the loop stays in the buffer, however far we have read
 * ahead), flag that more data is needed, and restart the enclosing loop.
 *
 * Relies on these variables at the point of use: input_buf_ptr,
 * copy_buf_len, hit_eof, prev_raw_ptr and need_data.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
    if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
    { \
        need_data = true; \
        input_buf_ptr = prev_raw_ptr;   /* undo fetch */ \
        continue; \
    } \
} else ((void) 0)
107
108
/*
 * If fewer than (extralen + 1) bytes remain at input_buf_ptr and EOF has
 * already been hit, consume whatever is left of the buffer (only when
 * extralen is nonzero), set result = true, and break out of the enclosing
 * loop.
 *
 * Relies on these variables at the point of use: input_buf_ptr,
 * copy_buf_len, hit_eof and result.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
    if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
    { \
        if (extralen) \
        { \
            /* consume the partial character */ \
            input_buf_ptr = copy_buf_len; \
        } \
        /* backslash just before EOF, treat as data char */ \
        result = true; \
        break; \
    } \
} else ((void) 0)
121
122
/*
 * Append the already-scanned part of input_buf (from input_buf_index up to,
 * but not including, input_buf_ptr) to line_buf and advance input_buf_index
 * past it.  This must be done before refilling, to be sure there is some
 * room in input_buf.
 *
 * Relies on 'cstate' and 'input_buf_ptr' at the point of use.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
    if (input_buf_ptr > cstate->input_buf_index) \
    { \
        appendBinaryStringInfo(&cstate->line_buf, \
                               cstate->input_buf + cstate->input_buf_index, \
                               input_buf_ptr - cstate->input_buf_index); \
        cstate->input_buf_index = input_buf_ptr; \
    } \
} else ((void) 0)
137
138
/* NOTE: there's a copy of this in copyto.c */
139
/*
 * 11-byte signature expected at the start of binary-format input;
 * ReceiveCopyBinaryHeader() compares the first 11 input bytes against it
 * with memcmp().  The array is sized to exactly 11, so the string literal's
 * implicit terminator is not stored; the explicit "\0" is the last byte of
 * the signature itself.
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
140
141
142
/* non-export function prototypes */
143
static bool CopyReadLine(CopyFromState cstate, bool is_csv);
144
static bool CopyReadLineText(CopyFromState cstate, bool is_csv);
145
static int  CopyReadAttributesText(CopyFromState cstate);
146
static int  CopyReadAttributesCSV(CopyFromState cstate);
147
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
148
                   Oid typioparam, int32 typmod,
149
                   bool *isnull);
150
static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
151
                                ExprContext *econtext,
152
                                Datum *values,
153
                                bool *nulls,
154
                                bool is_csv);
155
static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
156
                                   char ***fields,
157
                                   int *nfields,
158
                                   bool is_csv);
159
160
161
/* Low-level communications functions */
162
static int  CopyGetData(CopyFromState cstate, void *databuf,
163
            int minread, int maxread);
164
static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
165
static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
166
static void CopyLoadInputBuf(CopyFromState cstate);
167
static int  CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
168
169
void
170
ReceiveCopyBegin(CopyFromState cstate)
171
0
{
172
0
  StringInfoData buf;
173
0
  int     natts = list_length(cstate->attnumlist);
174
0
  int16   format = (cstate->opts.binary ? 1 : 0);
175
0
  int     i;
176
177
0
  pq_beginmessage(&buf, PqMsg_CopyInResponse);
178
0
  pq_sendbyte(&buf, format);  /* overall format */
179
0
  pq_sendint16(&buf, natts);
180
0
  for (i = 0; i < natts; i++)
181
0
    pq_sendint16(&buf, format); /* per-column formats */
182
0
  pq_endmessage(&buf);
183
0
  cstate->copy_src = COPY_FRONTEND;
184
0
  cstate->fe_msgbuf = makeStringInfo();
185
  /* We *must* flush here to ensure FE knows it can send. */
186
0
  pq_flush();
187
0
}
188
189
void
190
ReceiveCopyBinaryHeader(CopyFromState cstate)
191
0
{
192
0
  char    readSig[11];
193
0
  int32   tmp;
194
195
  /* Signature */
196
0
  if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
197
0
    memcmp(readSig, BinarySignature, 11) != 0)
198
0
    ereport(ERROR,
199
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
200
0
         errmsg("COPY file signature not recognized")));
201
  /* Flags field */
202
0
  if (!CopyGetInt32(cstate, &tmp))
203
0
    ereport(ERROR,
204
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
205
0
         errmsg("invalid COPY file header (missing flags)")));
206
0
  if ((tmp & (1 << 16)) != 0)
207
0
    ereport(ERROR,
208
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209
0
         errmsg("invalid COPY file header (WITH OIDS)")));
210
0
  tmp &= ~(1 << 16);
211
0
  if ((tmp >> 16) != 0)
212
0
    ereport(ERROR,
213
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
214
0
         errmsg("unrecognized critical flags in COPY file header")));
215
  /* Header extension length */
216
0
  if (!CopyGetInt32(cstate, &tmp) ||
217
0
    tmp < 0)
218
0
    ereport(ERROR,
219
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
220
0
         errmsg("invalid COPY file header (missing length)")));
221
  /* Skip extension header, if present */
222
0
  while (tmp-- > 0)
223
0
  {
224
0
    if (CopyReadBinaryData(cstate, readSig, 1) != 1)
225
0
      ereport(ERROR,
226
0
          (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
227
0
           errmsg("invalid COPY file header (wrong length)")));
228
0
  }
229
0
}
230
231
/*
232
 * CopyGetData reads data from the source (file or frontend)
233
 *
234
 * We attempt to read at least minread, and at most maxread, bytes from
235
 * the source.  The actual number of bytes read is returned; if this is
236
 * less than minread, EOF was detected.
237
 *
238
 * Note: when copying from the frontend, we expect a proper EOF mark per
239
 * protocol; if the frontend simply drops the connection, we raise error.
240
 * It seems unwise to allow the COPY IN to complete normally in that case.
241
 *
242
 * NB: no data conversion is applied here.
243
 */
244
static int
245
CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
246
0
{
247
0
  int     bytesread = 0;
248
249
0
  switch (cstate->copy_src)
250
0
  {
251
0
    case COPY_FILE:
252
0
      bytesread = fread(databuf, 1, maxread, cstate->copy_file);
253
0
      if (ferror(cstate->copy_file))
254
0
        ereport(ERROR,
255
0
            (errcode_for_file_access(),
256
0
             errmsg("could not read from COPY file: %m")));
257
0
      if (bytesread == 0)
258
0
        cstate->raw_reached_eof = true;
259
0
      break;
260
0
    case COPY_FRONTEND:
261
0
      while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
262
0
      {
263
0
        int     avail;
264
265
0
        while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
266
0
        {
267
          /* Try to receive another message */
268
0
          int     mtype;
269
0
          int     maxmsglen;
270
271
0
      readmessage:
272
0
          HOLD_CANCEL_INTERRUPTS();
273
0
          pq_startmsgread();
274
0
          mtype = pq_getbyte();
275
0
          if (mtype == EOF)
276
0
            ereport(ERROR,
277
0
                (errcode(ERRCODE_CONNECTION_FAILURE),
278
0
                 errmsg("unexpected EOF on client connection with an open transaction")));
279
          /* Validate message type and set packet size limit */
280
0
          switch (mtype)
281
0
          {
282
0
            case PqMsg_CopyData:
283
0
              maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
284
0
              break;
285
0
            case PqMsg_CopyDone:
286
0
            case PqMsg_CopyFail:
287
0
            case PqMsg_Flush:
288
0
            case PqMsg_Sync:
289
0
              maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
290
0
              break;
291
0
            default:
292
0
              ereport(ERROR,
293
0
                  (errcode(ERRCODE_PROTOCOL_VIOLATION),
294
0
                   errmsg("unexpected message type 0x%02X during COPY from stdin",
295
0
                      mtype)));
296
0
              maxmsglen = 0;  /* keep compiler quiet */
297
0
              break;
298
0
          }
299
          /* Now collect the message body */
300
0
          if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
301
0
            ereport(ERROR,
302
0
                (errcode(ERRCODE_CONNECTION_FAILURE),
303
0
                 errmsg("unexpected EOF on client connection with an open transaction")));
304
0
          RESUME_CANCEL_INTERRUPTS();
305
          /* ... and process it */
306
0
          switch (mtype)
307
0
          {
308
0
            case PqMsg_CopyData:
309
0
              break;
310
0
            case PqMsg_CopyDone:
311
              /* COPY IN correctly terminated by frontend */
312
0
              cstate->raw_reached_eof = true;
313
0
              return bytesread;
314
0
            case PqMsg_CopyFail:
315
0
              ereport(ERROR,
316
0
                  (errcode(ERRCODE_QUERY_CANCELED),
317
0
                   errmsg("COPY from stdin failed: %s",
318
0
                      pq_getmsgstring(cstate->fe_msgbuf))));
319
0
              break;
320
0
            case PqMsg_Flush:
321
0
            case PqMsg_Sync:
322
323
              /*
324
               * Ignore Flush/Sync for the convenience of client
325
               * libraries (such as libpq) that may send those
326
               * without noticing that the command they just
327
               * sent was COPY.
328
               */
329
0
              goto readmessage;
330
0
            default:
331
0
              Assert(false);  /* NOT REACHED */
332
0
          }
333
0
        }
334
0
        avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
335
0
        if (avail > maxread)
336
0
          avail = maxread;
337
0
        pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
338
0
        databuf = (void *) ((char *) databuf + avail);
339
0
        maxread -= avail;
340
0
        bytesread += avail;
341
0
      }
342
0
      break;
343
0
    case COPY_CALLBACK:
344
0
      bytesread = cstate->data_source_cb(databuf, minread, maxread);
345
0
      break;
346
0
  }
347
348
0
  return bytesread;
349
0
}
350
351
352
/*
353
 * These functions do apply some data conversion
354
 */
355
356
/*
357
 * CopyGetInt32 reads an int32 that appears in network byte order
358
 *
359
 * Returns true if OK, false if EOF
360
 */
361
static inline bool
362
CopyGetInt32(CopyFromState cstate, int32 *val)
363
0
{
364
0
  uint32    buf;
365
366
0
  if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
367
0
  {
368
0
    *val = 0;       /* suppress compiler warning */
369
0
    return false;
370
0
  }
371
0
  *val = (int32) pg_ntoh32(buf);
372
0
  return true;
373
0
}
374
375
/*
376
 * CopyGetInt16 reads an int16 that appears in network byte order
377
 */
378
static inline bool
379
CopyGetInt16(CopyFromState cstate, int16 *val)
380
0
{
381
0
  uint16    buf;
382
383
0
  if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
384
0
  {
385
0
    *val = 0;       /* suppress compiler warning */
386
0
    return false;
387
0
  }
388
0
  *val = (int16) pg_ntoh16(buf);
389
0
  return true;
390
0
}
391
392
393
/*
394
 * Perform encoding conversion on data in 'raw_buf', writing the converted
395
 * data into 'input_buf'.
396
 *
397
 * On entry, there must be some data to convert in 'raw_buf'.
398
 */
399
static void
400
CopyConvertBuf(CopyFromState cstate)
401
0
{
402
  /*
403
   * If the file and server encoding are the same, no encoding conversion is
404
   * required.  However, we still need to verify that the input is valid for
405
   * the encoding.
406
   */
407
0
  if (!cstate->need_transcoding)
408
0
  {
409
    /*
410
     * When conversion is not required, input_buf and raw_buf are the
411
     * same.  raw_buf_len is the total number of bytes in the buffer, and
412
     * input_buf_len tracks how many of those bytes have already been
413
     * verified.
414
     */
415
0
    int     preverifiedlen = cstate->input_buf_len;
416
0
    int     unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
417
0
    int     nverified;
418
419
0
    if (unverifiedlen == 0)
420
0
    {
421
      /*
422
       * If no more raw data is coming, report the EOF to the caller.
423
       */
424
0
      if (cstate->raw_reached_eof)
425
0
        cstate->input_reached_eof = true;
426
0
      return;
427
0
    }
428
429
    /*
430
     * Verify the new data, including any residual unverified bytes from
431
     * previous round.
432
     */
433
0
    nverified = pg_encoding_verifymbstr(cstate->file_encoding,
434
0
                      cstate->raw_buf + preverifiedlen,
435
0
                      unverifiedlen);
436
0
    if (nverified == 0)
437
0
    {
438
      /*
439
       * Could not verify anything.
440
       *
441
       * If there is no more raw input data coming, it means that there
442
       * was an incomplete multi-byte sequence at the end.  Also, if
443
       * there's "enough" input left, we should be able to verify at
444
       * least one character, and a failure to do so means that we've
445
       * hit an invalid byte sequence.
446
       */
447
0
      if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
448
0
        cstate->input_reached_error = true;
449
0
      return;
450
0
    }
451
0
    cstate->input_buf_len += nverified;
452
0
  }
453
0
  else
454
0
  {
455
    /*
456
     * Encoding conversion is needed.
457
     */
458
0
    int     nbytes;
459
0
    unsigned char *src;
460
0
    int     srclen;
461
0
    unsigned char *dst;
462
0
    int     dstlen;
463
0
    int     convertedlen;
464
465
0
    if (RAW_BUF_BYTES(cstate) == 0)
466
0
    {
467
      /*
468
       * If no more raw data is coming, report the EOF to the caller.
469
       */
470
0
      if (cstate->raw_reached_eof)
471
0
        cstate->input_reached_eof = true;
472
0
      return;
473
0
    }
474
475
    /*
476
     * First, copy down any unprocessed data.
477
     */
478
0
    nbytes = INPUT_BUF_BYTES(cstate);
479
0
    if (nbytes > 0 && cstate->input_buf_index > 0)
480
0
      memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
481
0
          nbytes);
482
0
    cstate->input_buf_index = 0;
483
0
    cstate->input_buf_len = nbytes;
484
0
    cstate->input_buf[nbytes] = '\0';
485
486
0
    src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
487
0
    srclen = cstate->raw_buf_len - cstate->raw_buf_index;
488
0
    dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
489
0
    dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
490
491
    /*
492
     * Do the conversion.  This might stop short, if there is an invalid
493
     * byte sequence in the input.  We'll convert as much as we can in
494
     * that case.
495
     *
496
     * Note: Even if we hit an invalid byte sequence, we don't report the
497
     * error until all the valid bytes have been consumed.  The input
498
     * might contain an end-of-input marker (\.), and we don't want to
499
     * report an error if the invalid byte sequence is after the
500
     * end-of-input marker.  We might unnecessarily convert some data
501
     * after the end-of-input marker as long as it's valid for the
502
     * encoding, but that's harmless.
503
     */
504
0
    convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
505
0
                           cstate->file_encoding,
506
0
                           GetDatabaseEncoding(),
507
0
                           src, srclen,
508
0
                           dst, dstlen,
509
0
                           true);
510
0
    if (convertedlen == 0)
511
0
    {
512
      /*
513
       * Could not convert anything.  If there is no more raw input data
514
       * coming, it means that there was an incomplete multi-byte
515
       * sequence at the end.  Also, if there is plenty of input left,
516
       * we should be able to convert at least one character, so a
517
       * failure to do so must mean that we've hit a byte sequence
518
       * that's invalid.
519
       */
520
0
      if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
521
0
        cstate->input_reached_error = true;
522
0
      return;
523
0
    }
524
0
    cstate->raw_buf_index += convertedlen;
525
0
    cstate->input_buf_len += strlen((char *) dst);
526
0
  }
527
0
}
528
529
/*
530
 * Report an encoding or conversion error.
531
 */
532
static void
533
CopyConversionError(CopyFromState cstate)
534
0
{
535
0
  Assert(cstate->raw_buf_len > 0);
536
0
  Assert(cstate->input_reached_error);
537
538
0
  if (!cstate->need_transcoding)
539
0
  {
540
    /*
541
     * Everything up to input_buf_len was successfully verified, and
542
     * input_buf_len points to the invalid or incomplete character.
543
     */
544
0
    report_invalid_encoding(cstate->file_encoding,
545
0
                cstate->raw_buf + cstate->input_buf_len,
546
0
                cstate->raw_buf_len - cstate->input_buf_len);
547
0
  }
548
0
  else
549
0
  {
550
    /*
551
     * raw_buf_index points to the invalid or untranslatable character. We
552
     * let the conversion routine report the error, because it can provide
553
     * a more specific error message than we could here.  An earlier call
554
     * to the conversion routine in CopyConvertBuf() detected that there
555
     * is an error, now we call the conversion routine again with
556
     * noError=false, to have it throw the error.
557
     */
558
0
    unsigned char *src;
559
0
    int     srclen;
560
0
    unsigned char *dst;
561
0
    int     dstlen;
562
563
0
    src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
564
0
    srclen = cstate->raw_buf_len - cstate->raw_buf_index;
565
0
    dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
566
0
    dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
567
568
0
    (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
569
0
                       cstate->file_encoding,
570
0
                       GetDatabaseEncoding(),
571
0
                       src, srclen,
572
0
                       dst, dstlen,
573
0
                       false);
574
575
    /*
576
     * The conversion routine should have reported an error, so this
577
     * should not be reached.
578
     */
579
0
    elog(ERROR, "encoding conversion failed without error");
580
0
  }
581
0
}
582
583
/*
584
 * Load more data from data source to raw_buf.
585
 *
586
 * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
587
 * beginning of the buffer, and we load new data after that.
588
 */
589
static void
590
CopyLoadRawBuf(CopyFromState cstate)
591
0
{
592
0
  int     nbytes;
593
0
  int     inbytes;
594
595
  /*
596
   * In text mode, if encoding conversion is not required, raw_buf and
597
   * input_buf point to the same buffer.  Their len/index better agree, too.
598
   */
599
0
  if (cstate->raw_buf == cstate->input_buf)
600
0
  {
601
0
    Assert(!cstate->need_transcoding);
602
0
    Assert(cstate->raw_buf_index == cstate->input_buf_index);
603
0
    Assert(cstate->input_buf_len <= cstate->raw_buf_len);
604
0
  }
605
606
  /*
607
   * Copy down the unprocessed data if any.
608
   */
609
0
  nbytes = RAW_BUF_BYTES(cstate);
610
0
  if (nbytes > 0 && cstate->raw_buf_index > 0)
611
0
    memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
612
0
        nbytes);
613
0
  cstate->raw_buf_len -= cstate->raw_buf_index;
614
0
  cstate->raw_buf_index = 0;
615
616
  /*
617
   * If raw_buf and input_buf are in fact the same buffer, adjust the
618
   * input_buf variables, too.
619
   */
620
0
  if (cstate->raw_buf == cstate->input_buf)
621
0
  {
622
0
    cstate->input_buf_len -= cstate->input_buf_index;
623
0
    cstate->input_buf_index = 0;
624
0
  }
625
626
  /* Load more data */
627
0
  inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
628
0
              1, RAW_BUF_SIZE - cstate->raw_buf_len);
629
0
  nbytes += inbytes;
630
0
  cstate->raw_buf[nbytes] = '\0';
631
0
  cstate->raw_buf_len = nbytes;
632
633
0
  cstate->bytes_processed += inbytes;
634
0
  pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
635
636
0
  if (inbytes == 0)
637
0
    cstate->raw_reached_eof = true;
638
0
}
639
640
/*
641
 * CopyLoadInputBuf loads some more data into input_buf
642
 *
643
 * On return, at least one more input character is loaded into
644
 * input_buf, or input_reached_eof is set.
645
 *
646
 * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
647
 * of the buffer and then we load more data after that.
648
 */
649
static void
650
CopyLoadInputBuf(CopyFromState cstate)
651
0
{
652
0
  int     nbytes = INPUT_BUF_BYTES(cstate);
653
654
  /*
655
   * The caller has updated input_buf_index to indicate how much of the
656
   * input has been consumed and isn't needed anymore.  If input_buf is the
657
   * same physical area as raw_buf, update raw_buf_index accordingly.
658
   */
659
0
  if (cstate->raw_buf == cstate->input_buf)
660
0
  {
661
0
    Assert(!cstate->need_transcoding);
662
0
    Assert(cstate->input_buf_index >= cstate->raw_buf_index);
663
0
    cstate->raw_buf_index = cstate->input_buf_index;
664
0
  }
665
666
0
  for (;;)
667
0
  {
668
    /* If we now have some unconverted data, try to convert it */
669
0
    CopyConvertBuf(cstate);
670
671
    /* If we now have some more input bytes ready, return them */
672
0
    if (INPUT_BUF_BYTES(cstate) > nbytes)
673
0
      return;
674
675
    /*
676
     * If we reached an invalid byte sequence, or we're at an incomplete
677
     * multi-byte character but there is no more raw input data, report
678
     * conversion error.
679
     */
680
0
    if (cstate->input_reached_error)
681
0
      CopyConversionError(cstate);
682
683
    /* no more input, and everything has been converted */
684
0
    if (cstate->input_reached_eof)
685
0
      break;
686
687
    /* Try to load more raw data */
688
0
    Assert(!cstate->raw_reached_eof);
689
0
    CopyLoadRawBuf(cstate);
690
0
  }
691
0
}
692
693
/*
694
 * CopyReadBinaryData
695
 *
696
 * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
697
 * and writes them to 'dest'.  Returns the number of bytes read (which
698
 * would be less than 'nbytes' only if we reach EOF).
699
 */
700
static int
701
CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
702
0
{
703
0
  int     copied_bytes = 0;
704
705
0
  if (RAW_BUF_BYTES(cstate) >= nbytes)
706
0
  {
707
    /* Enough bytes are present in the buffer. */
708
0
    memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
709
0
    cstate->raw_buf_index += nbytes;
710
0
    copied_bytes = nbytes;
711
0
  }
712
0
  else
713
0
  {
714
    /*
715
     * Not enough bytes in the buffer, so must read from the file.  Need
716
     * to loop since 'nbytes' could be larger than the buffer size.
717
     */
718
0
    do
719
0
    {
720
0
      int     copy_bytes;
721
722
      /* Load more data if buffer is empty. */
723
0
      if (RAW_BUF_BYTES(cstate) == 0)
724
0
      {
725
0
        CopyLoadRawBuf(cstate);
726
0
        if (cstate->raw_reached_eof)
727
0
          break;   /* EOF */
728
0
      }
729
730
      /* Transfer some bytes. */
731
0
      copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
732
0
      memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
733
0
      cstate->raw_buf_index += copy_bytes;
734
0
      dest += copy_bytes;
735
0
      copied_bytes += copy_bytes;
736
0
    } while (copied_bytes < nbytes);
737
0
  }
738
739
0
  return copied_bytes;
740
0
}
741
742
/*
743
 * This function is exposed for use by extensions that read raw fields in the
744
 * next line. See NextCopyFromRawFieldsInternal() for details.
745
 */
746
bool
747
NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
748
0
{
749
0
  return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
750
0
                     cstate->opts.csv_mode);
751
0
}
752
753
/*
754
 * Workhorse for NextCopyFromRawFields().
755
 *
756
 * Read raw fields in the next line for COPY FROM in text or csv mode. Return
757
 * false if no more lines.
758
 *
759
 * An internal temporary buffer is returned via 'fields'. It is valid until
760
 * the next call of the function. Since the function returns all raw fields
761
 * in the input file, 'nfields' could be different from the number of columns
762
 * in the relation.
763
 *
764
 * NOTE: force_not_null option are not applied to the returned fields.
765
 *
766
 * We use pg_attribute_always_inline to reduce function call overhead
767
 * and to help compilers to optimize away the 'is_csv' condition when called
768
 * by internal functions such as CopyFromTextLikeOneRow().
769
 */
770
static pg_attribute_always_inline bool
771
NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
772
0
{
773
0
  int     fldct;
774
0
  bool    done = false;
775
776
  /* only available for text or csv input */
777
0
  Assert(!cstate->opts.binary);
778
779
  /* on input check that the header line is correct if needed */
780
0
  if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
781
0
  {
782
0
    ListCell   *cur;
783
0
    TupleDesc tupDesc;
784
0
    int     lines_to_skip = cstate->opts.header_line;
785
786
    /* If set to "match", one header line is skipped */
787
0
    if (cstate->opts.header_line == COPY_HEADER_MATCH)
788
0
      lines_to_skip = 1;
789
790
0
    tupDesc = RelationGetDescr(cstate->rel);
791
792
0
    for (int i = 0; i < lines_to_skip; i++)
793
0
    {
794
0
      cstate->cur_lineno++;
795
0
      if ((done = CopyReadLine(cstate, is_csv)))
796
0
        break;
797
0
    }
798
799
0
    if (cstate->opts.header_line == COPY_HEADER_MATCH)
800
0
    {
801
0
      int     fldnum;
802
803
0
      if (is_csv)
804
0
        fldct = CopyReadAttributesCSV(cstate);
805
0
      else
806
0
        fldct = CopyReadAttributesText(cstate);
807
808
0
      if (fldct != list_length(cstate->attnumlist))
809
0
        ereport(ERROR,
810
0
            (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
811
0
             errmsg("wrong number of fields in header line: got %d, expected %d",
812
0
                fldct, list_length(cstate->attnumlist))));
813
814
0
      fldnum = 0;
815
0
      foreach(cur, cstate->attnumlist)
816
0
      {
817
0
        int     attnum = lfirst_int(cur);
818
0
        char     *colName;
819
0
        Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
820
821
0
        Assert(fldnum < cstate->max_fields);
822
823
0
        colName = cstate->raw_fields[fldnum++];
824
0
        if (colName == NULL)
825
0
          ereport(ERROR,
826
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
827
0
               errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
828
0
                  fldnum, cstate->opts.null_print, NameStr(attr->attname))));
829
830
0
        if (namestrcmp(&attr->attname, colName) != 0)
831
0
        {
832
0
          ereport(ERROR,
833
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
834
0
               errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
835
0
                  fldnum, colName, NameStr(attr->attname))));
836
0
        }
837
0
      }
838
0
    }
839
840
0
    if (done)
841
0
      return false;
842
0
  }
843
844
0
  cstate->cur_lineno++;
845
846
  /* Actually read the line into memory here */
847
0
  done = CopyReadLine(cstate, is_csv);
848
849
  /*
850
   * EOF at start of line means we're done.  If we see EOF after some
851
   * characters, we act as though it was newline followed by EOF, ie,
852
   * process the line and then exit loop on next iteration.
853
   */
854
0
  if (done && cstate->line_buf.len == 0)
855
0
    return false;
856
857
  /* Parse the line into de-escaped field values */
858
0
  if (is_csv)
859
0
    fldct = CopyReadAttributesCSV(cstate);
860
0
  else
861
0
    fldct = CopyReadAttributesText(cstate);
862
863
0
  *fields = cstate->raw_fields;
864
0
  *nfields = fldct;
865
0
  return true;
866
0
}
867
868
/*
869
 * Read next tuple from file for COPY FROM. Return false if no more tuples.
870
 *
871
 * 'econtext' is used to evaluate default expression for each column that is
872
 * either not read from the file or is using the DEFAULT option of COPY FROM.
873
 * It can be NULL when no default values are used, i.e. when all columns are
874
 * read from the file, and DEFAULT option is unset.
875
 *
876
 * 'values' and 'nulls' arrays must be the same length as columns of the
877
 * relation passed to BeginCopyFrom. This function fills the arrays.
878
 */
879
bool
880
NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
881
       Datum *values, bool *nulls)
882
0
{
883
0
  TupleDesc tupDesc;
884
0
  AttrNumber  num_phys_attrs,
885
0
        num_defaults = cstate->num_defaults;
886
0
  int     i;
887
0
  int      *defmap = cstate->defmap;
888
0
  ExprState **defexprs = cstate->defexprs;
889
890
0
  tupDesc = RelationGetDescr(cstate->rel);
891
0
  num_phys_attrs = tupDesc->natts;
892
893
  /* Initialize all values for row to NULL */
894
0
  MemSet(values, 0, num_phys_attrs * sizeof(Datum));
895
0
  MemSet(nulls, true, num_phys_attrs * sizeof(bool));
896
0
  MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
897
898
  /* Get one row from source */
899
0
  if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
900
0
    return false;
901
902
  /*
903
   * Now compute and insert any defaults available for the columns not
904
   * provided by the input data.  Anything not processed here or above will
905
   * remain NULL.
906
   */
907
0
  for (i = 0; i < num_defaults; i++)
908
0
  {
909
    /*
910
     * The caller must supply econtext and have switched into the
911
     * per-tuple memory context in it.
912
     */
913
0
    Assert(econtext != NULL);
914
0
    Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
915
916
0
    values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
917
0
                     &nulls[defmap[i]]);
918
0
  }
919
920
0
  return true;
921
0
}
922
923
/* Implementation of the per-row callback for text format */
924
bool
925
CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
926
           bool *nulls)
927
0
{
928
0
  return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
929
0
}
930
931
/* Implementation of the per-row callback for CSV format */
932
bool
933
CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
934
          bool *nulls)
935
0
{
936
0
  return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
937
0
}
938
939
/*
940
 * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
941
 *
942
 * We use pg_attribute_always_inline to reduce function call overhead
943
 * and to help compilers to optimize away the 'is_csv' condition.
944
 */
945
static pg_attribute_always_inline bool
946
CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
947
             Datum *values, bool *nulls, bool is_csv)
948
{
949
  TupleDesc tupDesc;
950
  AttrNumber  attr_count;
951
  FmgrInfo   *in_functions = cstate->in_functions;
952
  Oid      *typioparams = cstate->typioparams;
953
  ExprState **defexprs = cstate->defexprs;
954
  char    **field_strings;
955
  ListCell   *cur;
956
  int     fldct;
957
  int     fieldno;
958
  char     *string;
959
960
  tupDesc = RelationGetDescr(cstate->rel);
961
  attr_count = list_length(cstate->attnumlist);
962
963
  /* read raw fields in the next line */
964
  if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
965
    return false;
966
967
  /* check for overflowing fields */
968
  if (attr_count > 0 && fldct > attr_count)
969
    ereport(ERROR,
970
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
971
         errmsg("extra data after last expected column")));
972
973
  fieldno = 0;
974
975
  /* Loop to read the user attributes on the line. */
976
  foreach(cur, cstate->attnumlist)
977
  {
978
    int     attnum = lfirst_int(cur);
979
    int     m = attnum - 1;
980
    Form_pg_attribute att = TupleDescAttr(tupDesc, m);
981
982
    if (fieldno >= fldct)
983
      ereport(ERROR,
984
          (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
985
           errmsg("missing data for column \"%s\"",
986
              NameStr(att->attname))));
987
    string = field_strings[fieldno++];
988
989
    if (cstate->convert_select_flags &&
990
      !cstate->convert_select_flags[m])
991
    {
992
      /* ignore input field, leaving column as NULL */
993
      continue;
994
    }
995
996
    if (is_csv)
997
    {
998
      if (string == NULL &&
999
        cstate->opts.force_notnull_flags[m])
1000
      {
1001
        /*
1002
         * FORCE_NOT_NULL option is set and column is NULL - convert
1003
         * it to the NULL string.
1004
         */
1005
        string = cstate->opts.null_print;
1006
      }
1007
      else if (string != NULL && cstate->opts.force_null_flags[m]
1008
           && strcmp(string, cstate->opts.null_print) == 0)
1009
      {
1010
        /*
1011
         * FORCE_NULL option is set and column matches the NULL
1012
         * string. It must have been quoted, or otherwise the string
1013
         * would already have been set to NULL. Convert it to NULL as
1014
         * specified.
1015
         */
1016
        string = NULL;
1017
      }
1018
    }
1019
1020
    cstate->cur_attname = NameStr(att->attname);
1021
    cstate->cur_attval = string;
1022
1023
    if (string != NULL)
1024
      nulls[m] = false;
1025
1026
    if (cstate->defaults[m])
1027
    {
1028
      /* We must have switched into the per-tuple memory context */
1029
      Assert(econtext != NULL);
1030
      Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1031
1032
      values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1033
    }
1034
1035
    /*
1036
     * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1037
     */
1038
    else if (!InputFunctionCallSafe(&in_functions[m],
1039
                    string,
1040
                    typioparams[m],
1041
                    att->atttypmod,
1042
                    (Node *) cstate->escontext,
1043
                    &values[m]))
1044
    {
1045
      Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1046
1047
      cstate->num_errors++;
1048
1049
      if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1050
      {
1051
        /*
1052
         * Since we emit line number and column info in the below
1053
         * notice message, we suppress error context information other
1054
         * than the relation name.
1055
         */
1056
        Assert(!cstate->relname_only);
1057
        cstate->relname_only = true;
1058
1059
        if (cstate->cur_attval)
1060
        {
1061
          char     *attval;
1062
1063
          attval = CopyLimitPrintoutLength(cstate->cur_attval);
1064
          ereport(NOTICE,
1065
              errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1066
                   cstate->cur_lineno,
1067
                   cstate->cur_attname,
1068
                   attval));
1069
          pfree(attval);
1070
        }
1071
        else
1072
          ereport(NOTICE,
1073
              errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1074
                   cstate->cur_lineno,
1075
                   cstate->cur_attname));
1076
1077
        /* reset relname_only */
1078
        cstate->relname_only = false;
1079
      }
1080
1081
      return true;
1082
    }
1083
1084
    cstate->cur_attname = NULL;
1085
    cstate->cur_attval = NULL;
1086
  }
1087
1088
  Assert(fieldno == attr_count);
1089
1090
  return true;
1091
}
1092
1093
/* Implementation of the per-row callback for binary format */
1094
bool
1095
CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1096
           bool *nulls)
1097
0
{
1098
0
  TupleDesc tupDesc;
1099
0
  AttrNumber  attr_count;
1100
0
  FmgrInfo   *in_functions = cstate->in_functions;
1101
0
  Oid      *typioparams = cstate->typioparams;
1102
0
  int16   fld_count;
1103
0
  ListCell   *cur;
1104
1105
0
  tupDesc = RelationGetDescr(cstate->rel);
1106
0
  attr_count = list_length(cstate->attnumlist);
1107
1108
0
  cstate->cur_lineno++;
1109
1110
0
  if (!CopyGetInt16(cstate, &fld_count))
1111
0
  {
1112
    /* EOF detected (end of file, or protocol-level EOF) */
1113
0
    return false;
1114
0
  }
1115
1116
0
  if (fld_count == -1)
1117
0
  {
1118
    /*
1119
     * Received EOF marker.  Wait for the protocol-level EOF, and complain
1120
     * if it doesn't come immediately.  In COPY FROM STDIN, this ensures
1121
     * that we correctly handle CopyFail, if client chooses to send that
1122
     * now.  When copying from file, we could ignore the rest of the file
1123
     * like in text mode, but we choose to be consistent with the COPY
1124
     * FROM STDIN case.
1125
     */
1126
0
    char    dummy;
1127
1128
0
    if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1129
0
      ereport(ERROR,
1130
0
          (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1131
0
           errmsg("received copy data after EOF marker")));
1132
0
    return false;
1133
0
  }
1134
1135
0
  if (fld_count != attr_count)
1136
0
    ereport(ERROR,
1137
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1138
0
         errmsg("row field count is %d, expected %d",
1139
0
            (int) fld_count, attr_count)));
1140
1141
0
  foreach(cur, cstate->attnumlist)
1142
0
  {
1143
0
    int     attnum = lfirst_int(cur);
1144
0
    int     m = attnum - 1;
1145
0
    Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1146
1147
0
    cstate->cur_attname = NameStr(att->attname);
1148
0
    values[m] = CopyReadBinaryAttribute(cstate,
1149
0
                      &in_functions[m],
1150
0
                      typioparams[m],
1151
0
                      att->atttypmod,
1152
0
                      &nulls[m]);
1153
0
    cstate->cur_attname = NULL;
1154
0
  }
1155
1156
0
  return true;
1157
0
}
1158
1159
/*
1160
 * Read the next input line and stash it in line_buf.
1161
 *
1162
 * Result is true if read was terminated by EOF, false if terminated
1163
 * by newline.  The terminating newline or EOF marker is not included
1164
 * in the final value of line_buf.
1165
 */
1166
static bool
1167
CopyReadLine(CopyFromState cstate, bool is_csv)
1168
0
{
1169
0
  bool    result;
1170
1171
0
  resetStringInfo(&cstate->line_buf);
1172
0
  cstate->line_buf_valid = false;
1173
1174
  /* Parse data and transfer into line_buf */
1175
0
  result = CopyReadLineText(cstate, is_csv);
1176
1177
0
  if (result)
1178
0
  {
1179
    /*
1180
     * Reached EOF.  In protocol version 3, we should ignore anything
1181
     * after \. up to the protocol end of copy data.  (XXX maybe better
1182
     * not to treat \. as special?)
1183
     */
1184
0
    if (cstate->copy_src == COPY_FRONTEND)
1185
0
    {
1186
0
      int     inbytes;
1187
1188
0
      do
1189
0
      {
1190
0
        inbytes = CopyGetData(cstate, cstate->input_buf,
1191
0
                    1, INPUT_BUF_SIZE);
1192
0
      } while (inbytes > 0);
1193
0
      cstate->input_buf_index = 0;
1194
0
      cstate->input_buf_len = 0;
1195
0
      cstate->raw_buf_index = 0;
1196
0
      cstate->raw_buf_len = 0;
1197
0
    }
1198
0
  }
1199
0
  else
1200
0
  {
1201
    /*
1202
     * If we didn't hit EOF, then we must have transferred the EOL marker
1203
     * to line_buf along with the data.  Get rid of it.
1204
     */
1205
0
    switch (cstate->eol_type)
1206
0
    {
1207
0
      case EOL_NL:
1208
0
        Assert(cstate->line_buf.len >= 1);
1209
0
        Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1210
0
        cstate->line_buf.len--;
1211
0
        cstate->line_buf.data[cstate->line_buf.len] = '\0';
1212
0
        break;
1213
0
      case EOL_CR:
1214
0
        Assert(cstate->line_buf.len >= 1);
1215
0
        Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1216
0
        cstate->line_buf.len--;
1217
0
        cstate->line_buf.data[cstate->line_buf.len] = '\0';
1218
0
        break;
1219
0
      case EOL_CRNL:
1220
0
        Assert(cstate->line_buf.len >= 2);
1221
0
        Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1222
0
        Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1223
0
        cstate->line_buf.len -= 2;
1224
0
        cstate->line_buf.data[cstate->line_buf.len] = '\0';
1225
0
        break;
1226
0
      case EOL_UNKNOWN:
1227
        /* shouldn't get here */
1228
0
        Assert(false);
1229
0
        break;
1230
0
    }
1231
0
  }
1232
1233
  /* Now it's safe to use the buffer in error messages */
1234
0
  cstate->line_buf_valid = true;
1235
1236
0
  return result;
1237
0
}
1238
1239
/*
1240
 * CopyReadLineText - inner loop of CopyReadLine for text and CSV modes
1241
 */
1242
static bool
1243
CopyReadLineText(CopyFromState cstate, bool is_csv)
1244
0
{
1245
0
  char     *copy_input_buf;
1246
0
  int     input_buf_ptr;
1247
0
  int     copy_buf_len;
1248
0
  bool    need_data = false;
1249
0
  bool    hit_eof = false;
1250
0
  bool    result = false;
1251
1252
  /* CSV variables */
1253
0
  bool    in_quote = false,
1254
0
        last_was_esc = false;
1255
0
  char    quotec = '\0';
1256
0
  char    escapec = '\0';
1257
1258
0
  if (is_csv)
1259
0
  {
1260
0
    quotec = cstate->opts.quote[0];
1261
0
    escapec = cstate->opts.escape[0];
1262
    /* ignore special escape processing if it's the same as quotec */
1263
0
    if (quotec == escapec)
1264
0
      escapec = '\0';
1265
0
  }
1266
1267
  /*
1268
   * The objective of this loop is to transfer the entire next input line
1269
   * into line_buf.  Hence, we only care for detecting newlines (\r and/or
1270
   * \n) and the end-of-copy marker (\.).
1271
   *
1272
   * In CSV mode, \r and \n inside a quoted field are just part of the data
1273
   * value and are put in line_buf.  We keep just enough state to know if we
1274
   * are currently in a quoted field or not.
1275
   *
1276
   * The input has already been converted to the database encoding.  All
1277
   * supported server encodings have the property that all bytes in a
1278
   * multi-byte sequence have the high bit set, so a multibyte character
1279
   * cannot contain any newline or escape characters embedded in the
1280
   * multibyte sequence.  Therefore, we can process the input byte-by-byte,
1281
   * regardless of the encoding.
1282
   *
1283
   * For speed, we try to move data from input_buf to line_buf in chunks
1284
   * rather than one character at a time.  input_buf_ptr points to the next
1285
   * character to examine; any characters from input_buf_index to
1286
   * input_buf_ptr have been determined to be part of the line, but not yet
1287
   * transferred to line_buf.
1288
   *
1289
   * For a little extra speed within the loop, we copy input_buf and
1290
   * input_buf_len into local variables.
1291
   */
1292
0
  copy_input_buf = cstate->input_buf;
1293
0
  input_buf_ptr = cstate->input_buf_index;
1294
0
  copy_buf_len = cstate->input_buf_len;
1295
1296
0
  for (;;)
1297
0
  {
1298
0
    int     prev_raw_ptr;
1299
0
    char    c;
1300
1301
    /*
1302
     * Load more data if needed.
1303
     *
1304
     * TODO: We could just force four bytes of read-ahead and avoid the
1305
     * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE().  That was
1306
     * unsafe with the old v2 COPY protocol, but we don't support that
1307
     * anymore.
1308
     */
1309
0
    if (input_buf_ptr >= copy_buf_len || need_data)
1310
0
    {
1311
0
      REFILL_LINEBUF;
1312
1313
0
      CopyLoadInputBuf(cstate);
1314
      /* update our local variables */
1315
0
      hit_eof = cstate->input_reached_eof;
1316
0
      input_buf_ptr = cstate->input_buf_index;
1317
0
      copy_buf_len = cstate->input_buf_len;
1318
1319
      /*
1320
       * If we are completely out of data, break out of the loop,
1321
       * reporting EOF.
1322
       */
1323
0
      if (INPUT_BUF_BYTES(cstate) <= 0)
1324
0
      {
1325
0
        result = true;
1326
0
        break;
1327
0
      }
1328
0
      need_data = false;
1329
0
    }
1330
1331
    /* OK to fetch a character */
1332
0
    prev_raw_ptr = input_buf_ptr;
1333
0
    c = copy_input_buf[input_buf_ptr++];
1334
1335
0
    if (is_csv)
1336
0
    {
1337
      /*
1338
       * If character is '\r', we may need to look ahead below.  Force
1339
       * fetch of the next character if we don't already have it.  We
1340
       * need to do this before changing CSV state, in case '\r' is also
1341
       * the quote or escape character.
1342
       */
1343
0
      if (c == '\r')
1344
0
      {
1345
0
        IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1346
0
      }
1347
1348
      /*
1349
       * Dealing with quotes and escapes here is mildly tricky. If the
1350
       * quote char is also the escape char, there's no problem - we
1351
       * just use the char as a toggle. If they are different, we need
1352
       * to ensure that we only take account of an escape inside a
1353
       * quoted field and immediately preceding a quote char, and not
1354
       * the second in an escape-escape sequence.
1355
       */
1356
0
      if (in_quote && c == escapec)
1357
0
        last_was_esc = !last_was_esc;
1358
0
      if (c == quotec && !last_was_esc)
1359
0
        in_quote = !in_quote;
1360
0
      if (c != escapec)
1361
0
        last_was_esc = false;
1362
1363
      /*
1364
       * Updating the line count for embedded CR and/or LF chars is
1365
       * necessarily a little fragile - this test is probably about the
1366
       * best we can do.  (XXX it's arguable whether we should do this
1367
       * at all --- is cur_lineno a physical or logical count?)
1368
       */
1369
0
      if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1370
0
        cstate->cur_lineno++;
1371
0
    }
1372
1373
    /* Process \r */
1374
0
    if (c == '\r' && (!is_csv || !in_quote))
1375
0
    {
1376
      /* Check for \r\n on first line, _and_ handle \r\n. */
1377
0
      if (cstate->eol_type == EOL_UNKNOWN ||
1378
0
        cstate->eol_type == EOL_CRNL)
1379
0
      {
1380
        /*
1381
         * If need more data, go back to loop top to load it.
1382
         *
1383
         * Note that if we are at EOF, c will wind up as '\0' because
1384
         * of the guaranteed pad of input_buf.
1385
         */
1386
0
        IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1387
1388
        /* get next char */
1389
0
        c = copy_input_buf[input_buf_ptr];
1390
1391
0
        if (c == '\n')
1392
0
        {
1393
0
          input_buf_ptr++;  /* eat newline */
1394
0
          cstate->eol_type = EOL_CRNL;  /* in case not set yet */
1395
0
        }
1396
0
        else
1397
0
        {
1398
          /* found \r, but no \n */
1399
0
          if (cstate->eol_type == EOL_CRNL)
1400
0
            ereport(ERROR,
1401
0
                (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1402
0
                 !is_csv ?
1403
0
                 errmsg("literal carriage return found in data") :
1404
0
                 errmsg("unquoted carriage return found in data"),
1405
0
                 !is_csv ?
1406
0
                 errhint("Use \"\\r\" to represent carriage return.") :
1407
0
                 errhint("Use quoted CSV field to represent carriage return.")));
1408
1409
          /*
1410
           * if we got here, it is the first line and we didn't find
1411
           * \n, so don't consume the peeked character
1412
           */
1413
0
          cstate->eol_type = EOL_CR;
1414
0
        }
1415
0
      }
1416
0
      else if (cstate->eol_type == EOL_NL)
1417
0
        ereport(ERROR,
1418
0
            (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1419
0
             !is_csv ?
1420
0
             errmsg("literal carriage return found in data") :
1421
0
             errmsg("unquoted carriage return found in data"),
1422
0
             !is_csv ?
1423
0
             errhint("Use \"\\r\" to represent carriage return.") :
1424
0
             errhint("Use quoted CSV field to represent carriage return.")));
1425
      /* If we reach here, we have found the line terminator */
1426
0
      break;
1427
0
    }
1428
1429
    /* Process \n */
1430
0
    if (c == '\n' && (!is_csv || !in_quote))
1431
0
    {
1432
0
      if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1433
0
        ereport(ERROR,
1434
0
            (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1435
0
             !is_csv ?
1436
0
             errmsg("literal newline found in data") :
1437
0
             errmsg("unquoted newline found in data"),
1438
0
             !is_csv ?
1439
0
             errhint("Use \"\\n\" to represent newline.") :
1440
0
             errhint("Use quoted CSV field to represent newline.")));
1441
0
      cstate->eol_type = EOL_NL;  /* in case not set yet */
1442
      /* If we reach here, we have found the line terminator */
1443
0
      break;
1444
0
    }
1445
1446
    /*
1447
     * Process backslash, except in CSV mode where backslash is a normal
1448
     * character.
1449
     */
1450
0
    if (c == '\\' && !is_csv)
1451
0
    {
1452
0
      char    c2;
1453
1454
0
      IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1455
0
      IF_NEED_REFILL_AND_EOF_BREAK(0);
1456
1457
      /* -----
1458
       * get next character
1459
       * Note: we do not change c so if it isn't \., we can fall
1460
       * through and continue processing.
1461
       * -----
1462
       */
1463
0
      c2 = copy_input_buf[input_buf_ptr];
1464
1465
0
      if (c2 == '.')
1466
0
      {
1467
0
        input_buf_ptr++;  /* consume the '.' */
1468
0
        if (cstate->eol_type == EOL_CRNL)
1469
0
        {
1470
          /* Get the next character */
1471
0
          IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1472
          /* if hit_eof, c2 will become '\0' */
1473
0
          c2 = copy_input_buf[input_buf_ptr++];
1474
1475
0
          if (c2 == '\n')
1476
0
            ereport(ERROR,
1477
0
                (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1478
0
                 errmsg("end-of-copy marker does not match previous newline style")));
1479
0
          else if (c2 != '\r')
1480
0
            ereport(ERROR,
1481
0
                (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1482
0
                 errmsg("end-of-copy marker is not alone on its line")));
1483
0
        }
1484
1485
        /* Get the next character */
1486
0
        IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1487
        /* if hit_eof, c2 will become '\0' */
1488
0
        c2 = copy_input_buf[input_buf_ptr++];
1489
1490
0
        if (c2 != '\r' && c2 != '\n')
1491
0
          ereport(ERROR,
1492
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1493
0
               errmsg("end-of-copy marker is not alone on its line")));
1494
1495
0
        if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1496
0
          (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1497
0
          (cstate->eol_type == EOL_CR && c2 != '\r'))
1498
0
          ereport(ERROR,
1499
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1500
0
               errmsg("end-of-copy marker does not match previous newline style")));
1501
1502
        /*
1503
         * If there is any data on this line before the \., complain.
1504
         */
1505
0
        if (cstate->line_buf.len > 0 ||
1506
0
          prev_raw_ptr > cstate->input_buf_index)
1507
0
          ereport(ERROR,
1508
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1509
0
               errmsg("end-of-copy marker is not alone on its line")));
1510
1511
        /*
1512
         * Discard the \. and newline, then report EOF.
1513
         */
1514
0
        cstate->input_buf_index = input_buf_ptr;
1515
0
        result = true; /* report EOF */
1516
0
        break;
1517
0
      }
1518
0
      else
1519
0
      {
1520
        /*
1521
         * If we are here, it means we found a backslash followed by
1522
         * something other than a period.  In non-CSV mode, anything
1523
         * after a backslash is special, so we skip over that second
1524
         * character too.  If we didn't do that \\. would be
1525
         * considered an end-of-copy marker, while in non-CSV mode it is a
1526
         * literal backslash followed by a period.
1527
         */
1528
0
        input_buf_ptr++;
1529
0
      }
1530
0
    }
1531
0
  }             /* end of outer loop */
1532
1533
  /*
1534
   * Transfer any still-uncopied data to line_buf.
1535
   */
1536
0
  REFILL_LINEBUF;
1537
1538
0
  return result;
1539
0
}
1540
1541
/*
1542
 *  Return decimal value for a hexadecimal digit
1543
 */
1544
static int
1545
GetDecimalFromHex(char hex)
1546
0
{
1547
0
  if (isdigit((unsigned char) hex))
1548
0
    return hex - '0';
1549
0
  else
1550
0
    return pg_ascii_tolower((unsigned char) hex) - 'a' + 10;
1551
0
}
1552
1553
/*
1554
 * Parse the current line into separate attributes (fields),
1555
 * performing de-escaping as needed.
1556
 *
1557
 * The input is in line_buf.  We use attribute_buf to hold the result
1558
 * strings.  cstate->raw_fields[k] is set to point to the k'th attribute
1559
 * string, or NULL when the input matches the null marker string.
1560
 * This array is expanded as necessary.
1561
 *
1562
 * (Note that the caller cannot check for nulls since the returned
1563
 * string would be the post-de-escaping equivalent, which may look
1564
 * the same as some valid data string.)
1565
 *
1566
 * delim is the column delimiter string (must be just one byte for now).
1567
 * null_print is the null marker string.  Note that this is compared to
1568
 * the pre-de-escaped input string.
1569
 *
1570
 * The return value is the number of fields actually read.
1571
 */
1572
static int
1573
CopyReadAttributesText(CopyFromState cstate)
1574
0
{
1575
0
  char    delimc = cstate->opts.delim[0];
1576
0
  int     fieldno;
1577
0
  char     *output_ptr;
1578
0
  char     *cur_ptr;
1579
0
  char     *line_end_ptr;
1580
1581
  /*
1582
   * We need a special case for zero-column tables: check that the input
1583
   * line is empty, and return.
1584
   */
1585
0
  if (cstate->max_fields <= 0)
1586
0
  {
1587
0
    if (cstate->line_buf.len != 0)
1588
0
      ereport(ERROR,
1589
0
          (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1590
0
           errmsg("extra data after last expected column")));
1591
0
    return 0;
1592
0
  }
1593
1594
0
  resetStringInfo(&cstate->attribute_buf);
1595
1596
  /*
1597
   * The de-escaped attributes will certainly not be longer than the input
1598
   * data line, so we can just force attribute_buf to be large enough and
1599
   * then transfer data without any checks for enough space.  We need to do
1600
   * it this way because enlarging attribute_buf mid-stream would invalidate
1601
   * pointers already stored into cstate->raw_fields[].
1602
   */
1603
0
  if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1604
0
    enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1605
0
  output_ptr = cstate->attribute_buf.data;
1606
1607
  /* set pointer variables for loop */
1608
0
  cur_ptr = cstate->line_buf.data;
1609
0
  line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1610
1611
  /* Outer loop iterates over fields */
1612
0
  fieldno = 0;
1613
0
  for (;;)
1614
0
  {
1615
0
    bool    found_delim = false;
1616
0
    char     *start_ptr;
1617
0
    char     *end_ptr;
1618
0
    int     input_len;
1619
0
    bool    saw_non_ascii = false;
1620
1621
    /* Make sure there is enough space for the next value */
1622
0
    if (fieldno >= cstate->max_fields)
1623
0
    {
1624
0
      cstate->max_fields *= 2;
1625
0
      cstate->raw_fields =
1626
0
        repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1627
0
    }
1628
1629
    /* Remember start of field on both input and output sides */
1630
0
    start_ptr = cur_ptr;
1631
0
    cstate->raw_fields[fieldno] = output_ptr;
1632
1633
    /*
1634
     * Scan data for field.
1635
     *
1636
     * Note that in this loop, we are scanning to locate the end of field
1637
     * and also speculatively performing de-escaping.  Once we find the
1638
     * end-of-field, we can match the raw field contents against the null
1639
     * marker string.  Only after that comparison fails do we know that
1640
     * de-escaping is actually the right thing to do; therefore we *must
1641
     * not* throw any syntax errors before we've done the null-marker
1642
     * check.
1643
     */
1644
0
    for (;;)
1645
0
    {
1646
0
      char    c;
1647
1648
0
      end_ptr = cur_ptr;
1649
0
      if (cur_ptr >= line_end_ptr)
1650
0
        break;
1651
0
      c = *cur_ptr++;
1652
0
      if (c == delimc)
1653
0
      {
1654
0
        found_delim = true;
1655
0
        break;
1656
0
      }
1657
0
      if (c == '\\')
1658
0
      {
1659
0
        if (cur_ptr >= line_end_ptr)
1660
0
          break;
1661
0
        c = *cur_ptr++;
1662
0
        switch (c)
1663
0
        {
1664
0
          case '0':
1665
0
          case '1':
1666
0
          case '2':
1667
0
          case '3':
1668
0
          case '4':
1669
0
          case '5':
1670
0
          case '6':
1671
0
          case '7':
1672
0
            {
1673
              /* handle \013 */
1674
0
              int     val;
1675
1676
0
              val = OCTVALUE(c);
1677
0
              if (cur_ptr < line_end_ptr)
1678
0
              {
1679
0
                c = *cur_ptr;
1680
0
                if (ISOCTAL(c))
1681
0
                {
1682
0
                  cur_ptr++;
1683
0
                  val = (val << 3) + OCTVALUE(c);
1684
0
                  if (cur_ptr < line_end_ptr)
1685
0
                  {
1686
0
                    c = *cur_ptr;
1687
0
                    if (ISOCTAL(c))
1688
0
                    {
1689
0
                      cur_ptr++;
1690
0
                      val = (val << 3) + OCTVALUE(c);
1691
0
                    }
1692
0
                  }
1693
0
                }
1694
0
              }
1695
0
              c = val & 0377;
1696
0
              if (c == '\0' || IS_HIGHBIT_SET(c))
1697
0
                saw_non_ascii = true;
1698
0
            }
1699
0
            break;
1700
0
          case 'x':
1701
            /* Handle \x3F */
1702
0
            if (cur_ptr < line_end_ptr)
1703
0
            {
1704
0
              char    hexchar = *cur_ptr;
1705
1706
0
              if (isxdigit((unsigned char) hexchar))
1707
0
              {
1708
0
                int     val = GetDecimalFromHex(hexchar);
1709
1710
0
                cur_ptr++;
1711
0
                if (cur_ptr < line_end_ptr)
1712
0
                {
1713
0
                  hexchar = *cur_ptr;
1714
0
                  if (isxdigit((unsigned char) hexchar))
1715
0
                  {
1716
0
                    cur_ptr++;
1717
0
                    val = (val << 4) + GetDecimalFromHex(hexchar);
1718
0
                  }
1719
0
                }
1720
0
                c = val & 0xff;
1721
0
                if (c == '\0' || IS_HIGHBIT_SET(c))
1722
0
                  saw_non_ascii = true;
1723
0
              }
1724
0
            }
1725
0
            break;
1726
0
          case 'b':
1727
0
            c = '\b';
1728
0
            break;
1729
0
          case 'f':
1730
0
            c = '\f';
1731
0
            break;
1732
0
          case 'n':
1733
0
            c = '\n';
1734
0
            break;
1735
0
          case 'r':
1736
0
            c = '\r';
1737
0
            break;
1738
0
          case 't':
1739
0
            c = '\t';
1740
0
            break;
1741
0
          case 'v':
1742
0
            c = '\v';
1743
0
            break;
1744
1745
            /*
1746
             * in all other cases, take the char after '\'
1747
             * literally
1748
             */
1749
0
        }
1750
0
      }
1751
1752
      /* Add c to output string */
1753
0
      *output_ptr++ = c;
1754
0
    }
1755
1756
    /* Check whether raw input matched null marker */
1757
0
    input_len = end_ptr - start_ptr;
1758
0
    if (input_len == cstate->opts.null_print_len &&
1759
0
      strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1760
0
      cstate->raw_fields[fieldno] = NULL;
1761
    /* Check whether raw input matched default marker */
1762
0
    else if (fieldno < list_length(cstate->attnumlist) &&
1763
0
         cstate->opts.default_print &&
1764
0
         input_len == cstate->opts.default_print_len &&
1765
0
         strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1766
0
    {
1767
      /* fieldno is 0-indexed and attnum is 1-indexed */
1768
0
      int     m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1769
1770
0
      if (cstate->defexprs[m] != NULL)
1771
0
      {
1772
        /* defaults contain entries for all physical attributes */
1773
0
        cstate->defaults[m] = true;
1774
0
      }
1775
0
      else
1776
0
      {
1777
0
        TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1778
0
        Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1779
1780
0
        ereport(ERROR,
1781
0
            (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1782
0
             errmsg("unexpected default marker in COPY data"),
1783
0
             errdetail("Column \"%s\" has no default value.",
1784
0
                   NameStr(att->attname))));
1785
0
      }
1786
0
    }
1787
0
    else
1788
0
    {
1789
      /*
1790
       * At this point we know the field is supposed to contain data.
1791
       *
1792
       * If we de-escaped any non-7-bit-ASCII chars, make sure the
1793
       * resulting string is valid data for the db encoding.
1794
       */
1795
0
      if (saw_non_ascii)
1796
0
      {
1797
0
        char     *fld = cstate->raw_fields[fieldno];
1798
1799
0
        pg_verifymbstr(fld, output_ptr - fld, false);
1800
0
      }
1801
0
    }
1802
1803
    /* Terminate attribute value in output area */
1804
0
    *output_ptr++ = '\0';
1805
1806
0
    fieldno++;
1807
    /* Done if we hit EOL instead of a delim */
1808
0
    if (!found_delim)
1809
0
      break;
1810
0
  }
1811
1812
  /* Clean up state of attribute_buf */
1813
0
  output_ptr--;
1814
0
  Assert(*output_ptr == '\0');
1815
0
  cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1816
1817
0
  return fieldno;
1818
0
}
1819
1820
/*
1821
 * Parse the current line into separate attributes (fields),
1822
 * performing de-escaping as needed.  This has exactly the same API as
1823
 * CopyReadAttributesText, except we parse the fields according to
1824
 * "standard" (i.e. common) CSV usage.
1825
 */
1826
static int
1827
CopyReadAttributesCSV(CopyFromState cstate)
1828
0
{
1829
0
  char    delimc = cstate->opts.delim[0];
1830
0
  char    quotec = cstate->opts.quote[0];
1831
0
  char    escapec = cstate->opts.escape[0];
1832
0
  int     fieldno;
1833
0
  char     *output_ptr;
1834
0
  char     *cur_ptr;
1835
0
  char     *line_end_ptr;
1836
1837
  /*
1838
   * We need a special case for zero-column tables: check that the input
1839
   * line is empty, and return.
1840
   */
1841
0
  if (cstate->max_fields <= 0)
1842
0
  {
1843
0
    if (cstate->line_buf.len != 0)
1844
0
      ereport(ERROR,
1845
0
          (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1846
0
           errmsg("extra data after last expected column")));
1847
0
    return 0;
1848
0
  }
1849
1850
0
  resetStringInfo(&cstate->attribute_buf);
1851
1852
  /*
1853
   * The de-escaped attributes will certainly not be longer than the input
1854
   * data line, so we can just force attribute_buf to be large enough and
1855
   * then transfer data without any checks for enough space.  We need to do
1856
   * it this way because enlarging attribute_buf mid-stream would invalidate
1857
   * pointers already stored into cstate->raw_fields[].
1858
   */
1859
0
  if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1860
0
    enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1861
0
  output_ptr = cstate->attribute_buf.data;
1862
1863
  /* set pointer variables for loop */
1864
0
  cur_ptr = cstate->line_buf.data;
1865
0
  line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1866
1867
  /* Outer loop iterates over fields */
1868
0
  fieldno = 0;
1869
0
  for (;;)
1870
0
  {
1871
0
    bool    found_delim = false;
1872
0
    bool    saw_quote = false;
1873
0
    char     *start_ptr;
1874
0
    char     *end_ptr;
1875
0
    int     input_len;
1876
1877
    /* Make sure there is enough space for the next value */
1878
0
    if (fieldno >= cstate->max_fields)
1879
0
    {
1880
0
      cstate->max_fields *= 2;
1881
0
      cstate->raw_fields =
1882
0
        repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1883
0
    }
1884
1885
    /* Remember start of field on both input and output sides */
1886
0
    start_ptr = cur_ptr;
1887
0
    cstate->raw_fields[fieldno] = output_ptr;
1888
1889
    /*
1890
     * Scan data for field,
1891
     *
1892
     * The loop starts in "not quote" mode and then toggles between that
1893
     * and "in quote" mode. The loop exits normally if it is in "not
1894
     * quote" mode and a delimiter or line end is seen.
1895
     */
1896
0
    for (;;)
1897
0
    {
1898
0
      char    c;
1899
1900
      /* Not in quote */
1901
0
      for (;;)
1902
0
      {
1903
0
        end_ptr = cur_ptr;
1904
0
        if (cur_ptr >= line_end_ptr)
1905
0
          goto endfield;
1906
0
        c = *cur_ptr++;
1907
        /* unquoted field delimiter */
1908
0
        if (c == delimc)
1909
0
        {
1910
0
          found_delim = true;
1911
0
          goto endfield;
1912
0
        }
1913
        /* start of quoted field (or part of field) */
1914
0
        if (c == quotec)
1915
0
        {
1916
0
          saw_quote = true;
1917
0
          break;
1918
0
        }
1919
        /* Add c to output string */
1920
0
        *output_ptr++ = c;
1921
0
      }
1922
1923
      /* In quote */
1924
0
      for (;;)
1925
0
      {
1926
0
        end_ptr = cur_ptr;
1927
0
        if (cur_ptr >= line_end_ptr)
1928
0
          ereport(ERROR,
1929
0
              (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1930
0
               errmsg("unterminated CSV quoted field")));
1931
1932
0
        c = *cur_ptr++;
1933
1934
        /* escape within a quoted field */
1935
0
        if (c == escapec)
1936
0
        {
1937
          /*
1938
           * peek at the next char if available, and escape it if it
1939
           * is an escape char or a quote char
1940
           */
1941
0
          if (cur_ptr < line_end_ptr)
1942
0
          {
1943
0
            char    nextc = *cur_ptr;
1944
1945
0
            if (nextc == escapec || nextc == quotec)
1946
0
            {
1947
0
              *output_ptr++ = nextc;
1948
0
              cur_ptr++;
1949
0
              continue;
1950
0
            }
1951
0
          }
1952
0
        }
1953
1954
        /*
1955
         * end of quoted field. Must do this test after testing for
1956
         * escape in case quote char and escape char are the same
1957
         * (which is the common case).
1958
         */
1959
0
        if (c == quotec)
1960
0
          break;
1961
1962
        /* Add c to output string */
1963
0
        *output_ptr++ = c;
1964
0
      }
1965
0
    }
1966
0
endfield:
1967
1968
    /* Terminate attribute value in output area */
1969
0
    *output_ptr++ = '\0';
1970
1971
    /* Check whether raw input matched null marker */
1972
0
    input_len = end_ptr - start_ptr;
1973
0
    if (!saw_quote && input_len == cstate->opts.null_print_len &&
1974
0
      strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1975
0
      cstate->raw_fields[fieldno] = NULL;
1976
    /* Check whether raw input matched default marker */
1977
0
    else if (fieldno < list_length(cstate->attnumlist) &&
1978
0
         cstate->opts.default_print &&
1979
0
         input_len == cstate->opts.default_print_len &&
1980
0
         strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1981
0
    {
1982
      /* fieldno is 0-index and attnum is 1-index */
1983
0
      int     m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1984
1985
0
      if (cstate->defexprs[m] != NULL)
1986
0
      {
1987
        /* defaults contain entries for all physical attributes */
1988
0
        cstate->defaults[m] = true;
1989
0
      }
1990
0
      else
1991
0
      {
1992
0
        TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1993
0
        Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1994
1995
0
        ereport(ERROR,
1996
0
            (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1997
0
             errmsg("unexpected default marker in COPY data"),
1998
0
             errdetail("Column \"%s\" has no default value.",
1999
0
                   NameStr(att->attname))));
2000
0
      }
2001
0
    }
2002
2003
0
    fieldno++;
2004
    /* Done if we hit EOL instead of a delim */
2005
0
    if (!found_delim)
2006
0
      break;
2007
0
  }
2008
2009
  /* Clean up state of attribute_buf */
2010
0
  output_ptr--;
2011
0
  Assert(*output_ptr == '\0');
2012
0
  cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2013
2014
0
  return fieldno;
2015
0
}
2016
2017
2018
/*
2019
 * Read a binary attribute
2020
 */
2021
static Datum
2022
CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2023
            Oid typioparam, int32 typmod,
2024
            bool *isnull)
2025
0
{
2026
0
  int32   fld_size;
2027
0
  Datum   result;
2028
2029
0
  if (!CopyGetInt32(cstate, &fld_size))
2030
0
    ereport(ERROR,
2031
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2032
0
         errmsg("unexpected EOF in COPY data")));
2033
0
  if (fld_size == -1)
2034
0
  {
2035
0
    *isnull = true;
2036
0
    return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2037
0
  }
2038
0
  if (fld_size < 0)
2039
0
    ereport(ERROR,
2040
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2041
0
         errmsg("invalid field size")));
2042
2043
  /* reset attribute_buf to empty, and load raw data in it */
2044
0
  resetStringInfo(&cstate->attribute_buf);
2045
2046
0
  enlargeStringInfo(&cstate->attribute_buf, fld_size);
2047
0
  if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2048
0
               fld_size) != fld_size)
2049
0
    ereport(ERROR,
2050
0
        (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2051
0
         errmsg("unexpected EOF in COPY data")));
2052
2053
0
  cstate->attribute_buf.len = fld_size;
2054
0
  cstate->attribute_buf.data[fld_size] = '\0';
2055
2056
  /* Call the column type's binary input converter */
2057
0
  result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2058
0
                 typioparam, typmod);
2059
2060
  /* Trouble if it didn't eat the whole buffer */
2061
0
  if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2062
0
    ereport(ERROR,
2063
0
        (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2064
0
         errmsg("incorrect binary data format")));
2065
2066
0
  *isnull = false;
2067
0
  return result;
2068
0
}