Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * copyfromparse.c
4 : : * Parse CSV/text/binary format for COPY FROM.
5 : : *
6 : : * This file contains routines to parse the text, CSV and binary input
7 : : * formats. The main entry point is NextCopyFrom(), which parses the
8 : : * next input line and returns it as Datums.
9 : : *
10 : : * In text/CSV mode, the parsing happens in multiple stages:
11 : : *
12 : : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : : * 1. 2. 3. 4.
14 : : *
15 : : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : : * places it into 'raw_buf'.
17 : : *
18 : : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : : * the data in 'raw_buf' from client to server encoding, placing the
20 : : * converted result in 'input_buf'.
21 : : *
22 : : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : : * It is responsible for finding the next newline marker, taking quote and
24 : : * escape characters into account according to the COPY options. The line
25 : : * is copied into 'line_buf', with quotes and escape characters still
26 : : * intact.
27 : : *
28 : : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : : * pointers to each field.
32 : : *
33 : : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : : * the data is valid in the current encoding.
38 : : *
39 : : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : : * data when it's passed to the receive function.
44 : : *
45 : : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : : * encountered so far.
49 : : *
50 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : : * Portions Copyright (c) 1994, Regents of the University of California
52 : : *
53 : : *
54 : : * IDENTIFICATION
55 : : * src/backend/commands/copyfromparse.c
56 : : *
57 : : *-------------------------------------------------------------------------
58 : : */
59 : : #include "postgres.h"
60 : :
61 : : #include <ctype.h>
62 : : #include <unistd.h>
63 : : #include <sys/stat.h>
64 : :
65 : : #include "commands/copyapi.h"
66 : : #include "commands/copyfrom_internal.h"
67 : : #include "commands/progress.h"
68 : : #include "executor/executor.h"
69 : : #include "libpq/libpq.h"
70 : : #include "libpq/pqformat.h"
71 : : #include "mb/pg_wchar.h"
72 : : #include "miscadmin.h"
73 : : #include "pgstat.h"
74 : : #include "port/pg_bswap.h"
75 : : #include "port/simd.h"
76 : : #include "utils/builtins.h"
77 : : #include "utils/rel.h"
78 : : #include "utils/wait_event.h"
79 : :
/*
 * Octal-digit helpers.  ISOCTAL(c) is true when c is one of the characters
 * '0' through '7'; OCTVALUE(c) turns such a digit character into its
 * numeric value by subtracting '0'.  ISOCTAL expands its argument twice,
 * so do not pass an expression with side effects.
 */
80 : : #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
81 : : #define OCTVALUE(c) ((c) - '0')
82 : :
83 : : /*
84 : : * These macros centralize code used to process line_buf and input_buf buffers.
85 : : * They are macros because they often do continue/break control and to avoid
86 : : * function call overhead in tight COPY loops.
87 : : *
88 : : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
89 : : * prevent the continue/break processing from working. We end the "if (1)"
90 : : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
91 : : * any "else" in the calling code, and to avoid any compiler warnings about
92 : : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
93 : : */
94 : :
95 : : /*
96 : : * This keeps the character read at the top of the loop in the buffer
97 : : * even if there is more than one read-ahead.
 *
 * The macro declares nothing itself: it must be expanded inside a loop
 * where input_buf_ptr, copy_buf_len, hit_eof, prev_raw_ptr and need_data
 * are all in scope.  When input_buf_ptr + extralen reaches or passes
 * copy_buf_len and EOF has not been hit, it rewinds input_buf_ptr to
 * prev_raw_ptr, sets need_data, and issues "continue" on the enclosing
 * loop.  Otherwise it does nothing.
98 : : */
99 : : #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
100 : : if (1) \
101 : : { \
102 : : if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
103 : : { \
104 : : input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
105 : : need_data = true; \
106 : : continue; \
107 : : } \
108 : : } else ((void) 0)
109 : :
110 : : /* This consumes the remainder of the buffer and breaks */
/*
 * Acts only when hit_eof is set and input_buf_ptr + extralen reaches or
 * passes copy_buf_len.  In that case, if extralen is nonzero,
 * input_buf_ptr is first moved to copy_buf_len; then result is set to
 * true and "break" is issued on the enclosing loop or switch.  Like its
 * sibling macro, it relies on input_buf_ptr, copy_buf_len, hit_eof and
 * result being in scope at the expansion site.
 */
111 : : #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
112 : : if (1) \
113 : : { \
114 : : if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
115 : : { \
116 : : if (extralen) \
117 : : input_buf_ptr = copy_buf_len; /* consume the partial character */ \
118 : : /* backslash just before EOF, treat as data char */ \
119 : : result = true; \
120 : : break; \
121 : : } \
122 : : } else ((void) 0)
123 : :
124 : : /*
125 : : * Transfer any approved data to line_buf; must do this to be sure
126 : : * there is some room in input_buf.
 *
 * Appends the bytes of cstate->input_buf lying between
 * cstate->input_buf_index and input_buf_ptr to cstate->line_buf, then
 * advances cstate->input_buf_index to input_buf_ptr.  It is a no-op when
 * input_buf_ptr has not moved past input_buf_index.  Requires cstate and
 * input_buf_ptr to be in scope at the expansion site.
127 : : */
128 : : #define REFILL_LINEBUF \
129 : : if (1) \
130 : : { \
131 : : if (input_buf_ptr > cstate->input_buf_index) \
132 : : { \
133 : : appendBinaryStringInfo(&cstate->line_buf, \
134 : : cstate->input_buf + cstate->input_buf_index, \
135 : : input_buf_ptr - cstate->input_buf_index); \
136 : : cstate->input_buf_index = input_buf_ptr; \
137 : : } \
138 : : } else ((void) 0)
139 : :
140 : : /* NOTE: there's a copy of this in copyto.c */
/*
 * Signature expected at the start of binary COPY input;
 * ReceiveCopyBinaryHeader() compares all 11 bytes with memcmp().  The
 * literal spells out exactly 11 characters (including the explicit
 * trailing \0), so the array holds no additional implicit terminator.
 */
141 : : static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
142 : :
143 : :
144 : : /* non-export function prototypes */
/*
 * The pg_attribute_always_inline functions below take an 'is_csv' flag;
 * per the comment on NextCopyFromRawFieldsInternal(), forcing inlining
 * reduces call overhead and lets the compiler optimize away the 'is_csv'
 * test when the caller passes a constant.
 */
145 : : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
146 : : static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
147 : : bool is_csv);
148 : : static int CopyReadAttributesText(CopyFromState cstate);
149 : : static int CopyReadAttributesCSV(CopyFromState cstate);
150 : : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
151 : : Oid typioparam, int32 typmod,
152 : : bool *isnull);
153 : : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
154 : : ExprContext *econtext,
155 : : Datum *values,
156 : : bool *nulls,
157 : : bool is_csv);
158 : : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
159 : : char ***fields,
160 : : int *nfields,
161 : : bool is_csv);
162 : :
163 : :
164 : : /* Low-level communications functions */
/*
 * CopyGetData() fetches raw bytes from the COPY source; the CopyGetInt*
 * helpers and CopyReadBinaryData() read through raw_buf, and
 * CopyLoadInputBuf() fills input_buf for the text/CSV line parser.
 */
165 : : static int CopyGetData(CopyFromState cstate, void *databuf,
166 : : int minread, int maxread);
167 : : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
168 : : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
169 : : static void CopyLoadInputBuf(CopyFromState cstate);
170 : : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
171 : :
172 : : void
1938 heikki.linnakangas@i 173 :CBC 542 : ReceiveCopyBegin(CopyFromState cstate)
174 : : {
175 : : StringInfoData buf;
1837 176 : 542 : int natts = list_length(cstate->attnumlist);
177 : 542 : int16 format = (cstate->opts.binary ? 1 : 0);
178 : : int i;
179 : :
936 nathan@postgresql.or 180 : 542 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
1837 heikki.linnakangas@i 181 : 542 : pq_sendbyte(&buf, format); /* overall format */
182 : 542 : pq_sendint16(&buf, natts);
183 [ + + ]: 1946 : for (i = 0; i < natts; i++)
184 : 1404 : pq_sendint16(&buf, format); /* per-column formats */
185 : 542 : pq_endmessage(&buf);
186 : 542 : cstate->copy_src = COPY_FRONTEND;
187 : 542 : cstate->fe_msgbuf = makeStringInfo();
188 : : /* We *must* flush here to ensure FE knows it can send. */
1938 189 : 542 : pq_flush();
190 : 542 : }
191 : :
192 : : void
193 : 7 : ReceiveCopyBinaryHeader(CopyFromState cstate)
194 : : {
195 : : char readSig[11];
196 : : int32 tmp;
197 : :
198 : : /* Signature */
199 [ + - ]: 7 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
200 [ - + ]: 7 : memcmp(readSig, BinarySignature, 11) != 0)
1938 heikki.linnakangas@i 201 [ # # ]:UBC 0 : ereport(ERROR,
202 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
203 : : errmsg("COPY file signature not recognized")));
204 : : /* Flags field */
1938 heikki.linnakangas@i 205 [ - + ]:CBC 7 : if (!CopyGetInt32(cstate, &tmp))
1938 heikki.linnakangas@i 206 [ # # ]:UBC 0 : ereport(ERROR,
207 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
208 : : errmsg("invalid COPY file header (missing flags)")));
1938 heikki.linnakangas@i 209 [ - + ]:CBC 7 : if ((tmp & (1 << 16)) != 0)
1938 heikki.linnakangas@i 210 [ # # ]:UBC 0 : ereport(ERROR,
211 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
212 : : errmsg("invalid COPY file header (WITH OIDS)")));
1938 heikki.linnakangas@i 213 :CBC 7 : tmp &= ~(1 << 16);
214 [ - + ]: 7 : if ((tmp >> 16) != 0)
1938 heikki.linnakangas@i 215 [ # # ]:UBC 0 : ereport(ERROR,
216 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
217 : : errmsg("unrecognized critical flags in COPY file header")));
218 : : /* Header extension length */
1938 heikki.linnakangas@i 219 [ + - ]:CBC 7 : if (!CopyGetInt32(cstate, &tmp) ||
220 [ - + ]: 7 : tmp < 0)
1938 heikki.linnakangas@i 221 [ # # ]:UBC 0 : ereport(ERROR,
222 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
223 : : errmsg("invalid COPY file header (missing length)")));
224 : : /* Skip extension header, if present */
1938 heikki.linnakangas@i 225 [ - + ]:CBC 7 : while (tmp-- > 0)
226 : : {
1938 heikki.linnakangas@i 227 [ # # ]:UBC 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
228 [ # # ]: 0 : ereport(ERROR,
229 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
230 : : errmsg("invalid COPY file header (wrong length)")));
231 : : }
1938 heikki.linnakangas@i 232 :CBC 7 : }
233 : :
234 : : /*
235 : : * CopyGetData reads data from the source (file or frontend)
236 : : *
237 : : * We attempt to read at least minread, and at most maxread, bytes from
238 : : * the source. The actual number of bytes read is returned; if this is
239 : : * less than minread, EOF was detected.
240 : : *
241 : : * Note: when copying from the frontend, we expect a proper EOF mark per
242 : : * protocol; if the frontend simply drops the connection, we raise error.
243 : : * It seems unwise to allow the COPY IN to complete normally in that case.
244 : : *
245 : : * NB: no data conversion is applied here.
246 : : */
247 : : static int
248 : 217089 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
249 : : {
250 : 217089 : int bytesread = 0;
251 : :
252 [ + + + - ]: 217089 : switch (cstate->copy_src)
253 : : {
254 : 566 : case COPY_FILE:
/*
 * File source: a single fread() of up to maxread bytes; minread is
 * not consulted on this path.  Reading zero bytes marks raw EOF.
 */
40 michael@paquier.xyz 255 :GNC 566 : pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
1938 heikki.linnakangas@i 256 :CBC 566 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
40 michael@paquier.xyz 257 :GNC 566 : pgstat_report_wait_end();
1938 heikki.linnakangas@i 258 [ - + ]:CBC 566 : if (ferror(cstate->copy_file))
1938 heikki.linnakangas@i 259 [ # # ]:UBC 0 : ereport(ERROR,
260 : : (errcode_for_file_access(),
261 : : errmsg("could not read from COPY file: %m")));
1938 heikki.linnakangas@i 262 [ + + ]:CBC 566 : if (bytesread == 0)
1809 263 : 226 : cstate->raw_reached_eof = true;
1938 264 : 566 : break;
1837 265 : 201501 : case COPY_FRONTEND:
/*
 * Frontend source: keep pulling protocol messages until at least
 * minread bytes have been gathered, the caller's buffer space is
 * used up, or end of data has been signalled.
 */
1809 266 [ + - + + : 402188 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
+ + ]
267 : : {
268 : : int avail;
269 : :
/* The buffered message is fully consumed: fetch another one. */
1938 270 [ + + ]: 401807 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
271 : : {
272 : : /* Try to receive another message */
273 : : int mtype;
274 : : int maxmsglen;
275 : :
/*
 * Also reached by "goto" after a Flush/Sync message, which reads
 * the next message without re-testing the loop condition above.
 */
276 : 201120 : readmessage:
/*
 * NOTE(review): cancel interrupts appear to be held from here
 * until the whole message body has been collected -- confirm
 * against the definition of HOLD_CANCEL_INTERRUPTS().
 */
277 : 201120 : HOLD_CANCEL_INTERRUPTS();
278 : 201120 : pq_startmsgread();
279 : 201120 : mtype = pq_getbyte();
280 [ - + ]: 201120 : if (mtype == EOF)
1938 heikki.linnakangas@i 281 [ # # ]:UBC 0 : ereport(ERROR,
282 : : (errcode(ERRCODE_CONNECTION_FAILURE),
283 : : errmsg("unexpected EOF on client connection with an open transaction")));
284 : : /* Validate message type and set packet size limit */
1782 tgl@sss.pgh.pa.us 285 [ + + + ]:CBC 201120 : switch (mtype)
286 : : {
936 nathan@postgresql.or 287 : 200687 : case PqMsg_CopyData:
1782 tgl@sss.pgh.pa.us 288 : 200687 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
289 : 200687 : break;
936 nathan@postgresql.or 290 : 431 : case PqMsg_CopyDone:
291 : : case PqMsg_CopyFail:
292 : : case PqMsg_Flush:
293 : : case PqMsg_Sync:
1782 tgl@sss.pgh.pa.us 294 : 431 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
295 : 431 : break;
296 : 2 : default:
297 [ + - ]: 2 : ereport(ERROR,
298 : : (errcode(ERRCODE_PROTOCOL_VIOLATION),
299 : : errmsg("unexpected message type 0x%02X during COPY from stdin",
300 : : mtype)));
301 : : maxmsglen = 0; /* keep compiler quiet */
302 : : break;
303 : : }
304 : : /* Now collect the message body */
305 [ - + ]: 201118 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
1938 heikki.linnakangas@i 306 [ # # ]:UBC 0 : ereport(ERROR,
307 : : (errcode(ERRCODE_CONNECTION_FAILURE),
308 : : errmsg("unexpected EOF on client connection with an open transaction")));
1938 heikki.linnakangas@i 309 [ - + ]:CBC 201118 : RESUME_CANCEL_INTERRUPTS();
310 : : /* ... and process it */
311 [ + + - - : 201118 : switch (mtype)
- ]
312 : : {
936 nathan@postgresql.or 313 : 200687 : case PqMsg_CopyData:
1938 heikki.linnakangas@i 314 : 200687 : break;
936 nathan@postgresql.or 315 : 431 : case PqMsg_CopyDone:
316 : : /* COPY IN correctly terminated by frontend */
/*
 * Returns straight from the function; the count may be short
 * of minread, which callers interpret as EOF.
 */
1809 heikki.linnakangas@i 317 : 431 : cstate->raw_reached_eof = true;
1938 318 : 431 : return bytesread;
936 nathan@postgresql.or 319 :UBC 0 : case PqMsg_CopyFail:
1938 heikki.linnakangas@i 320 [ # # ]: 0 : ereport(ERROR,
321 : : (errcode(ERRCODE_QUERY_CANCELED),
322 : : errmsg("COPY from stdin failed: %s",
323 : : pq_getmsgstring(cstate->fe_msgbuf))));
324 : : break;
936 nathan@postgresql.or 325 : 0 : case PqMsg_Flush:
326 : : case PqMsg_Sync:
327 : :
328 : : /*
329 : : * Ignore Flush/Sync for the convenience of client
330 : : * libraries (such as libpq) that may send those
331 : : * without noticing that the command they just
332 : : * sent was COPY.
333 : : */
1938 heikki.linnakangas@i 334 : 0 : goto readmessage;
335 : 0 : default:
1782 tgl@sss.pgh.pa.us 336 : 0 : Assert(false); /* NOT REACHED */
337 : : }
338 : : }
/* Copy out no more than the caller's remaining buffer space. */
1938 heikki.linnakangas@i 339 :CBC 200687 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
340 [ - + ]: 200687 : if (avail > maxread)
1938 heikki.linnakangas@i 341 :UBC 0 : avail = maxread;
1938 heikki.linnakangas@i 342 :CBC 200687 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
114 peter@eisentraut.org 343 :GNC 200687 : databuf = (char *) databuf + avail;
1938 heikki.linnakangas@i 344 :CBC 200687 : maxread -= avail;
345 : 200687 : bytesread += avail;
346 : : }
347 : 201068 : break;
348 : 15022 : case COPY_CALLBACK:
/* Callback source: the callback receives the same minread/maxread limits. */
349 : 15022 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
350 : 15022 : break;
351 : : }
352 : :
353 : 216656 : return bytesread;
354 : : }
355 : :
356 : :
357 : : /*
358 : : * These functions do apply some data conversion
359 : : */
360 : :
361 : : /*
362 : : * CopyGetInt32 reads an int32 that appears in network byte order
363 : : *
364 : : * Returns true if OK, false if EOF
365 : : */
366 : : static inline bool
367 : 93 : CopyGetInt32(CopyFromState cstate, int32 *val)
368 : : {
369 : : uint32 buf;
370 : :
371 [ - + ]: 93 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
372 : : {
1938 heikki.linnakangas@i 373 :UBC 0 : *val = 0; /* suppress compiler warning */
374 : 0 : return false;
375 : : }
1938 heikki.linnakangas@i 376 :CBC 93 : *val = (int32) pg_ntoh32(buf);
377 : 93 : return true;
378 : : }
379 : :
380 : : /*
381 : : * CopyGetInt16 reads an int16 that appears in network byte order
382 : : */
383 : : static inline bool
384 : 21 : CopyGetInt16(CopyFromState cstate, int16 *val)
385 : : {
386 : : uint16 buf;
387 : :
388 [ - + ]: 21 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
389 : : {
1938 heikki.linnakangas@i 390 :UBC 0 : *val = 0; /* suppress compiler warning */
391 : 0 : return false;
392 : : }
1938 heikki.linnakangas@i 393 :CBC 21 : *val = (int16) pg_ntoh16(buf);
394 : 21 : return true;
395 : : }
396 : :
397 : :
398 : : /*
399 : : * Perform encoding conversion on data in 'raw_buf', writing the converted
400 : : * data into 'input_buf'.
401 : : *
402 : : * On entry, there must be some data to convert in 'raw_buf'.
403 : : */
404 : : static void
1809 405 : 433541 : CopyConvertBuf(CopyFromState cstate)
406 : : {
407 : : /*
408 : : * If the file and server encoding are the same, no encoding conversion is
409 : : * required. However, we still need to verify that the input is valid for
410 : : * the encoding.
411 : : */
412 [ + + ]: 433541 : if (!cstate->need_transcoding)
413 : : {
414 : : /*
415 : : * When conversion is not required, input_buf and raw_buf are the
416 : : * same. raw_buf_len is the total number of bytes in the buffer, and
417 : : * input_buf_len tracks how many of those bytes have already been
418 : : * verified.
419 : : */
420 : 433475 : int preverifiedlen = cstate->input_buf_len;
421 : 433475 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
422 : : int nverified;
423 : :
424 [ + + ]: 433475 : if (unverifiedlen == 0)
425 : : {
426 : : /*
427 : : * If no more raw data is coming, report the EOF to the caller.
428 : : */
429 [ + + ]: 217658 : if (cstate->raw_reached_eof)
430 : 1045 : cstate->input_reached_eof = true;
431 : 217658 : return;
432 : : }
433 : :
434 : : /*
435 : : * Verify the new data, including any residual unverified bytes from
436 : : * previous round.
437 : : */
438 : 215817 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
439 : 215817 : cstate->raw_buf + preverifiedlen,
440 : : unverifiedlen);
441 [ - + ]: 215817 : if (nverified == 0)
442 : : {
443 : : /*
444 : : * Could not verify anything.
445 : : *
446 : : * If there is no more raw input data coming, it means that there
447 : : * was an incomplete multi-byte sequence at the end. Also, if
448 : : * there's "enough" input left, we should be able to verify at
449 : : * least one character, and a failure to do so means that we've
450 : : * hit an invalid byte sequence.
451 : : */
1386 heikki.linnakangas@i 452 [ # # # # ]:UBC 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
1809 453 : 0 : cstate->input_reached_error = true;
454 : 0 : return;
455 : : }
1809 heikki.linnakangas@i 456 :CBC 215817 : cstate->input_buf_len += nverified;
457 : : }
458 : : else
459 : : {
460 : : /*
461 : : * Encoding conversion is needed.
462 : : */
463 : : int nbytes;
464 : : unsigned char *src;
465 : : int srclen;
466 : : unsigned char *dst;
467 : : int dstlen;
468 : : int convertedlen;
469 : :
470 [ + + ]: 66 : if (RAW_BUF_BYTES(cstate) == 0)
471 : : {
472 : : /*
473 : : * If no more raw data is coming, report the EOF to the caller.
474 : : */
475 [ + + ]: 42 : if (cstate->raw_reached_eof)
476 : 12 : cstate->input_reached_eof = true;
477 : 42 : return;
478 : : }
479 : :
480 : : /*
481 : : * First, copy down any unprocessed data.
482 : : */
483 : 24 : nbytes = INPUT_BUF_BYTES(cstate);
484 [ - + - - ]: 24 : if (nbytes > 0 && cstate->input_buf_index > 0)
1809 heikki.linnakangas@i 485 :UBC 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
486 : : nbytes);
1809 heikki.linnakangas@i 487 :CBC 24 : cstate->input_buf_index = 0;
488 : 24 : cstate->input_buf_len = nbytes;
489 : 24 : cstate->input_buf[nbytes] = '\0';
490 : :
491 : 24 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
492 : 24 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
493 : 24 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
494 : 24 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
495 : :
496 : : /*
497 : : * Do the conversion. This might stop short, if there is an invalid
498 : : * byte sequence in the input. We'll convert as much as we can in
499 : : * that case.
500 : : *
501 : : * Note: Even if we hit an invalid byte sequence, we don't report the
502 : : * error until all the valid bytes have been consumed. The input
503 : : * might contain an end-of-input marker (\.), and we don't want to
504 : : * report an error if the invalid byte sequence is after the
505 : : * end-of-input marker. We might unnecessarily convert some data
506 : : * after the end-of-input marker as long as it's valid for the
507 : : * encoding, but that's harmless.
508 : : */
509 : 24 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
510 : : cstate->file_encoding,
511 : : GetDatabaseEncoding(),
512 : : src, srclen,
513 : : dst, dstlen,
514 : : true);
515 [ + + ]: 24 : if (convertedlen == 0)
516 : : {
517 : : /*
518 : : * Could not convert anything. If there is no more raw input data
519 : : * coming, it means that there was an incomplete multi-byte
520 : : * sequence at the end. Also, if there is plenty of input left,
521 : : * we should be able to convert at least one character, so a
522 : : * failure to do so must mean that we've hit a byte sequence
523 : : * that's invalid.
524 : : */
525 [ + + - + ]: 12 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
526 : 6 : cstate->input_reached_error = true;
527 : 12 : return;
528 : : }
529 : 12 : cstate->raw_buf_index += convertedlen;
530 : 12 : cstate->input_buf_len += strlen((char *) dst);
531 : : }
532 : : }
533 : :
534 : : /*
535 : : * Report an encoding or conversion error.
536 : : */
537 : : static void
538 : 6 : CopyConversionError(CopyFromState cstate)
539 : : {
540 [ - + ]: 6 : Assert(cstate->raw_buf_len > 0);
541 [ - + ]: 6 : Assert(cstate->input_reached_error);
542 : :
543 [ - + ]: 6 : if (!cstate->need_transcoding)
544 : : {
545 : : /*
546 : : * Everything up to input_buf_len was successfully verified, and
547 : : * input_buf_len points to the invalid or incomplete character.
548 : : */
1809 heikki.linnakangas@i 549 :UBC 0 : report_invalid_encoding(cstate->file_encoding,
550 : 0 : cstate->raw_buf + cstate->input_buf_len,
551 : 0 : cstate->raw_buf_len - cstate->input_buf_len);
552 : : }
553 : : else
554 : : {
555 : : /*
556 : : * raw_buf_index points to the invalid or untranslatable character. We
557 : : * let the conversion routine report the error, because it can provide
558 : : * a more specific error message than we could here. An earlier call
559 : : * to the conversion routine in CopyConvertBuf() detected that there
560 : : * is an error, now we call the conversion routine again with
561 : : * noError=false, to have it throw the error.
562 : : */
563 : : unsigned char *src;
564 : : int srclen;
565 : : unsigned char *dst;
566 : : int dstlen;
567 : :
1809 heikki.linnakangas@i 568 :CBC 6 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
569 : 6 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
570 : 6 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
571 : 6 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
572 : :
573 : 6 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
574 : : cstate->file_encoding,
575 : : GetDatabaseEncoding(),
576 : : src, srclen,
577 : : dst, dstlen,
578 : : false);
579 : :
580 : : /*
581 : : * The conversion routine should have reported an error, so this
582 : : * should not be reached.
583 : : */
1809 heikki.linnakangas@i 584 [ # # ]:UBC 0 : elog(ERROR, "encoding conversion failed without error");
585 : : }
586 : : }
587 : :
588 : : /*
589 : : * Load more data from data source to raw_buf.
590 : : *
591 : : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
592 : : * beginning of the buffer, and we load new data after that.
593 : : */
594 : : static void
1938 heikki.linnakangas@i 595 :CBC 216667 : CopyLoadRawBuf(CopyFromState cstate)
596 : : {
597 : : int nbytes;
598 : : int inbytes;
599 : :
600 : : /*
601 : : * In text mode, if encoding conversion is not required, raw_buf and
602 : : * input_buf point to the same buffer. Their len/index better agree, too.
603 : : */
1809 604 [ + + ]: 216667 : if (cstate->raw_buf == cstate->input_buf)
605 : : {
606 [ - + ]: 216613 : Assert(!cstate->need_transcoding);
607 [ - + ]: 216613 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
608 [ - + ]: 216613 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
609 : : }
610 : :
611 : : /*
612 : : * Copy down the unprocessed data if any.
613 : : */
614 : 216667 : nbytes = RAW_BUF_BYTES(cstate);
615 [ + + + + ]: 216667 : if (nbytes > 0 && cstate->raw_buf_index > 0)
1938 heikki.linnakangas@i 616 :GBC 542 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
617 : : nbytes);
1809 heikki.linnakangas@i 618 :CBC 216667 : cstate->raw_buf_len -= cstate->raw_buf_index;
619 : 216667 : cstate->raw_buf_index = 0;
620 : :
621 : : /*
622 : : * If raw_buf and input_buf are in fact the same buffer, adjust the
623 : : * input_buf variables, too.
624 : : */
625 [ + + ]: 216667 : if (cstate->raw_buf == cstate->input_buf)
626 : : {
627 : 216613 : cstate->input_buf_len -= cstate->input_buf_index;
628 : 216613 : cstate->input_buf_index = 0;
629 : : }
630 : :
631 : : /* Load more data */
632 : 216667 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
633 : 216667 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
1938 634 : 216665 : nbytes += inbytes;
635 : 216665 : cstate->raw_buf[nbytes] = '\0';
636 : 216665 : cstate->raw_buf_len = nbytes;
637 : :
1865 638 : 216665 : cstate->bytes_processed += inbytes;
1894 tomas.vondra@postgre 639 : 216665 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
640 : :
1809 heikki.linnakangas@i 641 [ + + ]: 216665 : if (inbytes == 0)
642 : 818 : cstate->raw_reached_eof = true;
643 : 216665 : }
644 : :
645 : : /*
646 : : * CopyLoadInputBuf loads some more data into input_buf
647 : : *
648 : : * On return, at least one more input character is loaded into
649 : : * input_buf, or input_reached_eof is set.
650 : : *
651 : : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
652 : : * of the buffer and then we load more data after that.
653 : : */
654 : : static void
655 : 216894 : CopyLoadInputBuf(CopyFromState cstate)
656 : : {
657 : 216894 : int nbytes = INPUT_BUF_BYTES(cstate);
658 : :
659 : : /*
660 : : * The caller has updated input_buf_index to indicate how much of the
661 : : * input has been consumed and isn't needed anymore. If input_buf is the
662 : : * same physical area as raw_buf, update raw_buf_index accordingly.
663 : : */
664 [ + + ]: 216894 : if (cstate->raw_buf == cstate->input_buf)
665 : : {
666 [ - + ]: 216864 : Assert(!cstate->need_transcoding);
667 [ - + ]: 216864 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
668 : 216864 : cstate->raw_buf_index = cstate->input_buf_index;
669 : : }
670 : :
671 : : for (;;)
672 : : {
673 : : /* If we now have some unconverted data, try to convert it */
674 : 433541 : CopyConvertBuf(cstate);
675 : :
676 : : /* If we now have some more input bytes ready, return them */
677 [ + + ]: 433541 : if (INPUT_BUF_BYTES(cstate) > nbytes)
678 : 215829 : return;
679 : :
680 : : /*
681 : : * If we reached an invalid byte sequence, or we're at an incomplete
682 : : * multi-byte character but there is no more raw input data, report
683 : : * conversion error.
684 : : */
685 [ + + ]: 217712 : if (cstate->input_reached_error)
686 : 6 : CopyConversionError(cstate);
687 : :
688 : : /* no more input, and everything has been converted */
689 [ + + ]: 217706 : if (cstate->input_reached_eof)
690 : 1057 : break;
691 : :
692 : : /* Try to load more raw data */
693 [ - + ]: 216649 : Assert(!cstate->raw_reached_eof);
694 : 216649 : CopyLoadRawBuf(cstate);
695 : : }
696 : : }
697 : :
698 : : /*
699 : : * CopyReadBinaryData
700 : : *
701 : : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
702 : : * and writes them to 'dest'. Returns the number of bytes read (which
703 : : * would be less than 'nbytes' only if we reach EOF).
704 : : */
705 : : static int
1938 706 : 191 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
707 : : {
708 : 191 : int copied_bytes = 0;
709 : :
710 [ + + ]: 191 : if (RAW_BUF_BYTES(cstate) >= nbytes)
711 : : {
712 : : /* Enough bytes are present in the buffer. */
713 : 173 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
714 : 173 : cstate->raw_buf_index += nbytes;
715 : 173 : copied_bytes = nbytes;
716 : : }
717 : : else
718 : : {
719 : : /*
720 : : * Not enough bytes in the buffer, so must read from the file. Need
721 : : * to loop since 'nbytes' could be larger than the buffer size.
722 : : */
723 : : do
724 : : {
725 : : int copy_bytes;
726 : :
727 : : /* Load more data if buffer is empty. */
728 [ + - ]: 18 : if (RAW_BUF_BYTES(cstate) == 0)
729 : : {
1809 730 : 18 : CopyLoadRawBuf(cstate);
731 [ + + ]: 18 : if (cstate->raw_reached_eof)
1938 732 : 6 : break; /* EOF */
733 : : }
734 : :
735 : : /* Transfer some bytes. */
736 : 12 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
737 : 12 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
738 : 12 : cstate->raw_buf_index += copy_bytes;
739 : 12 : dest += copy_bytes;
740 : 12 : copied_bytes += copy_bytes;
741 [ - + ]: 12 : } while (copied_bytes < nbytes);
742 : : }
743 : :
744 : 191 : return copied_bytes;
745 : : }
746 : :
747 : : /*
748 : : * This function is exposed for use by extensions that read raw fields in the
749 : : * next line. See NextCopyFromRawFieldsInternal() for details.
750 : : */
751 : : bool
380 msawada@postgresql.o 752 :UBC 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
753 : : {
754 : 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
755 : 0 : cstate->opts.csv_mode);
756 : : }
757 : :
758 : : /*
759 : : * Workhorse for NextCopyFromRawFields().
760 : : *
761 : : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
762 : : * false if no more lines.
763 : : *
764 : : * An internal temporary buffer is returned via 'fields'. It is valid until
765 : : * the next call of the function. Since the function returns all raw fields
766 : : * in the input file, 'nfields' could be different from the number of columns
767 : : * in the relation.
768 : : *
769 : : * NOTE: the force_not_null option is not applied to the returned fields.
770 : : *
771 : : * We use pg_attribute_always_inline to reduce function call overhead
772 : : * and to help compilers to optimize away the 'is_csv' condition when called
773 : : * by internal functions such as CopyFromTextLikeOneRow().
774 : : */
775 : : static pg_attribute_always_inline bool
380 msawada@postgresql.o 776 :CBC 734761 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
777 : : {
778 : : int fldct;
255 fujii@postgresql.org 779 :GNC 734761 : bool done = false;
780 : :
781 : : /* only available for text or csv input */
1938 heikki.linnakangas@i 782 [ - + ]:CBC 734761 : Assert(!cstate->opts.binary);
783 : :
784 : : /* on input check that the header line is correct if needed */
255 fujii@postgresql.org 785 [ + + + + ]:GNC 734761 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
786 : : {
787 : : ListCell *cur;
788 : : TupleDesc tupDesc;
789 : 74 : int lines_to_skip = cstate->opts.header_line;
790 : :
791 : : /* If set to "match", one header line is skipped */
792 [ + + ]: 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
793 : 38 : lines_to_skip = 1;
794 : :
1446 peter@eisentraut.org 795 :CBC 74 : tupDesc = RelationGetDescr(cstate->rel);
796 : :
255 fujii@postgresql.org 797 [ + + ]:GNC 173 : for (int i = 0; i < lines_to_skip; i++)
798 : : {
799 : 103 : cstate->cur_lineno++;
800 [ + + ]: 103 : if ((done = CopyReadLine(cstate, is_csv)))
801 : 4 : break;
802 : : }
803 : :
1446 peter@eisentraut.org 804 [ + + ]:CBC 74 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
805 : : {
806 : : int fldnum;
807 : :
380 msawada@postgresql.o 808 [ + + ]: 38 : if (is_csv)
760 michael@paquier.xyz 809 : 5 : fldct = CopyReadAttributesCSV(cstate);
810 : : else
811 : 33 : fldct = CopyReadAttributesText(cstate);
812 : :
1446 peter@eisentraut.org 813 [ + + ]: 38 : if (fldct != list_length(cstate->attnumlist))
814 [ + - ]: 12 : ereport(ERROR,
815 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
816 : : errmsg("wrong number of fields in header line: got %d, expected %d",
817 : : fldct, list_length(cstate->attnumlist))));
818 : :
819 : 26 : fldnum = 0;
820 [ + - + + : 79 : foreach(cur, cstate->attnumlist)
+ + ]
821 : : {
822 : 63 : int attnum = lfirst_int(cur);
823 : : char *colName;
824 : 63 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
825 : :
1361 michael@paquier.xyz 826 [ - + ]: 63 : Assert(fldnum < cstate->max_fields);
827 : :
828 : 63 : colName = cstate->raw_fields[fldnum++];
1446 peter@eisentraut.org 829 [ + + ]: 63 : if (colName == NULL)
830 [ + - ]: 3 : ereport(ERROR,
831 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
832 : : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
833 : : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
834 : :
1403 tgl@sss.pgh.pa.us 835 [ + + ]: 60 : if (namestrcmp(&attr->attname, colName) != 0)
836 : : {
1446 peter@eisentraut.org 837 [ + - ]: 7 : ereport(ERROR,
838 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
839 : : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
840 : : fldnum, colName, NameStr(attr->attname))));
841 : : }
842 : : }
843 : : }
844 : :
845 [ + + ]: 52 : if (done)
1446 peter@eisentraut.org 846 :GBC 4 : return false;
847 : : }
848 : :
1938 heikki.linnakangas@i 849 :CBC 734735 : cstate->cur_lineno++;
850 : :
851 : : /* Actually read the line into memory here */
380 msawada@postgresql.o 852 : 734735 : done = CopyReadLine(cstate, is_csv);
853 : :
854 : : /*
855 : : * EOF at start of line means we're done. If we see EOF after some
856 : : * characters, we act as though it was newline followed by EOF, ie,
857 : : * process the line and then exit loop on next iteration.
858 : : */
1938 heikki.linnakangas@i 859 [ + + + - ]: 734721 : if (done && cstate->line_buf.len == 0)
860 : 830 : return false;
861 : :
862 : : /* Parse the line into de-escaped field values */
380 msawada@postgresql.o 863 [ + + ]: 733891 : if (is_csv)
760 michael@paquier.xyz 864 : 252 : fldct = CopyReadAttributesCSV(cstate);
865 : : else
866 : 733639 : fldct = CopyReadAttributesText(cstate);
867 : :
1938 heikki.linnakangas@i 868 : 733885 : *fields = cstate->raw_fields;
869 : 733885 : *nfields = fldct;
870 : 733885 : return true;
871 : : }
872 : :
873 : : /*
874 : : * Read next tuple from file for COPY FROM. Return false if no more tuples.
875 : : *
876 : : * 'econtext' is used to evaluate default expression for each column that is
877 : : * either not read from the file or is using the DEFAULT option of COPY FROM.
878 : : * It can be NULL when no default values are used, i.e. when all columns are
879 : : * read from the file, and DEFAULT option is unset.
880 : : *
881 : : * 'values' and 'nulls' arrays must be the same length as columns of the
882 : : * relation passed to BeginCopyFrom. This function fills the arrays.
883 : : */
884 : : bool
885 : 734782 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
886 : : Datum *values, bool *nulls)
887 : : {
888 : : TupleDesc tupDesc;
889 : : AttrNumber num_phys_attrs,
890 : 734782 : num_defaults = cstate->num_defaults;
891 : : int i;
892 : 734782 : int *defmap = cstate->defmap;
893 : 734782 : ExprState **defexprs = cstate->defexprs;
894 : :
895 : 734782 : tupDesc = RelationGetDescr(cstate->rel);
896 : 734782 : num_phys_attrs = tupDesc->natts;
897 : :
898 : : /* Initialize all values for row to NULL */
899 [ + - + - : 3171672 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
+ - + - +
+ ]
900 [ + - + + : 734782 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
- + - - -
- ]
962 drowley@postgresql.o 901 [ + - + + : 806836 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
+ - + - +
+ ]
902 : :
903 : : /* Get one row from source */
380 msawada@postgresql.o 904 [ + + ]: 734782 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
905 : 840 : return false;
906 : :
907 : : /*
908 : : * Now compute and insert any defaults available for the columns not
909 : : * provided by the input data. Anything not processed here or above will
910 : : * remain NULL.
911 : : */
912 [ + + ]: 764112 : for (i = 0; i < num_defaults; i++)
913 : : {
914 : : /*
915 : : * The caller must supply econtext and have switched into the
916 : : * per-tuple memory context in it.
917 : : */
918 [ - + ]: 30265 : Assert(econtext != NULL);
919 [ - + ]: 30265 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
920 : :
921 : 30265 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
922 : 30265 : &nulls[defmap[i]]);
923 : : }
924 : :
925 : 733847 : return true;
926 : : }
927 : :
928 : : /* Implementation of the per-row callback for text format */
929 : : bool
930 : 734384 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
931 : : bool *nulls)
932 : : {
933 : 734384 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
934 : : }
935 : :
936 : : /* Implementation of the per-row callback for CSV format */
937 : : bool
938 : 377 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
939 : : bool *nulls)
940 : : {
941 : 377 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
942 : : }
943 : :
944 : : /*
945 : : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
946 : : *
947 : : * We use pg_attribute_always_inline to reduce function call overhead
948 : : * and to help compilers to optimize away the 'is_csv' condition.
949 : : */
950 : : static pg_attribute_always_inline bool
951 : 734761 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
952 : : Datum *values, bool *nulls, bool is_csv)
953 : : {
954 : : TupleDesc tupDesc;
955 : : AttrNumber attr_count;
956 : 734761 : FmgrInfo *in_functions = cstate->in_functions;
957 : 734761 : Oid *typioparams = cstate->typioparams;
958 : 734761 : ExprState **defexprs = cstate->defexprs;
959 : : char **field_strings;
960 : : ListCell *cur;
961 : : int fldct;
962 : : int fieldno;
963 : : char *string;
12 peter@eisentraut.org 964 :GNC 734761 : bool current_row_erroneous = false;
965 : :
380 msawada@postgresql.o 966 :CBC 734761 : tupDesc = RelationGetDescr(cstate->rel);
967 : 734761 : attr_count = list_length(cstate->attnumlist);
968 : :
969 : : /* read raw fields in the next line */
970 [ + + ]: 734761 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
971 : 834 : return false;
972 : :
973 : : /* check for overflowing fields */
974 [ + + + + ]: 733885 : if (attr_count > 0 && fldct > attr_count)
975 [ + - ]: 12 : ereport(ERROR,
976 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
977 : : errmsg("extra data after last expected column")));
978 : :
979 : 733873 : fieldno = 0;
980 : :
981 : : /* Loop to read the user attributes on the line. */
982 [ + + + + : 3100243 : foreach(cur, cstate->attnumlist)
+ + ]
983 : : {
984 : 2366474 : int attnum = lfirst_int(cur);
985 : 2366474 : int m = attnum - 1;
986 : 2366474 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
987 : :
988 [ + + ]: 2366474 : if (fieldno >= fldct)
1938 heikki.linnakangas@i 989 [ + - ]: 12 : ereport(ERROR,
990 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
991 : : errmsg("missing data for column \"%s\"",
992 : : NameStr(att->attname))));
380 msawada@postgresql.o 993 : 2366462 : string = field_strings[fieldno++];
994 : :
995 [ + + ]: 2366462 : if (cstate->convert_select_flags &&
996 [ + + ]: 10 : !cstate->convert_select_flags[m])
997 : : {
998 : : /* ignore input field, leaving column as NULL */
999 : 5 : continue;
1000 : : }
1001 : :
1002 [ + + ]: 2366457 : if (is_csv)
1003 : : {
1004 [ + + ]: 503 : if (string == NULL &&
1005 [ + + ]: 22 : cstate->opts.force_notnull_flags[m])
1006 : : {
1007 : : /*
1008 : : * FORCE_NOT_NULL option is set and column is NULL - convert
1009 : : * it to the NULL string.
1010 : : */
1011 : 14 : string = cstate->opts.null_print;
1012 : : }
1013 [ + + + + ]: 489 : else if (string != NULL && cstate->opts.force_null_flags[m]
1014 [ + + ]: 25 : && strcmp(string, cstate->opts.null_print) == 0)
1015 : : {
1016 : : /*
1017 : : * FORCE_NULL option is set and column matches the NULL
1018 : : * string. It must have been quoted, or otherwise the string
1019 : : * would already have been set to NULL. Convert it to NULL as
1020 : : * specified.
1021 : : */
1022 : 13 : string = NULL;
1023 : : }
1024 : : }
1025 : :
1026 : 2366457 : cstate->cur_attname = NameStr(att->attname);
1027 : 2366457 : cstate->cur_attval = string;
1028 : :
1029 [ + + ]: 2366457 : if (string != NULL)
1030 : 2364032 : nulls[m] = false;
1031 : :
1032 [ + + ]: 2366457 : if (cstate->defaults[m])
1033 : : {
1034 : : /* We must have switched into the per-tuple memory context */
1035 [ - + ]: 30 : Assert(econtext != NULL);
1036 [ - + ]: 30 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1037 : :
1038 : 30 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1039 : : }
1040 : :
1041 : : /*
1042 : : * If ON_ERROR is specified, handle the different options
1043 : : */
1044 [ + + ]: 2366408 : else if (!InputFunctionCallSafe(&in_functions[m],
1045 : : string,
1046 : 2366427 : typioparams[m],
1047 : : att->atttypmod,
1048 : 2366427 : (Node *) cstate->escontext,
1049 : 2366427 : &values[m]))
1050 : : {
1051 [ - + ]: 85 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1052 : :
12 peter@eisentraut.org 1053 [ + + ]:GNC 85 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1054 : 64 : cstate->num_errors++;
1055 [ + - ]: 21 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1056 : : {
1057 : : /*
1058 : : * Reset error state so the subsequent InputFunctionCallSafe
1059 : : * call (for domain constraint check) can properly report
1060 : : * whether it succeeded or failed.
1061 : : */
1062 : 21 : cstate->escontext->error_occurred = false;
1063 : :
1064 [ - + ]: 21 : Assert(cstate->domain_with_constraint != NULL);
1065 : :
1066 : : /*
1067 : : * For constrained domains, we need an additional
1068 : : * InputFunctionCallSafe() to ensure that an error is thrown
1069 : : * if the domain constraint rejects null values.
1070 : : */
1071 [ + + + + ]: 36 : if (!cstate->domain_with_constraint[m] ||
1072 : 15 : InputFunctionCallSafe(&in_functions[m],
1073 : : NULL,
1074 : 15 : typioparams[m],
1075 : : att->atttypmod,
1076 : 15 : (Node *) cstate->escontext,
1077 : 15 : &values[m]))
1078 : : {
1079 : 12 : nulls[m] = true;
1080 : 12 : values[m] = (Datum) 0;
1081 : : }
1082 : : else
1083 [ + - ]: 9 : ereport(ERROR,
1084 : : errcode(ERRCODE_NOT_NULL_VIOLATION),
1085 : : errmsg("domain %s does not allow null values",
1086 : : format_type_be(typioparams[m])),
1087 : : errdetail("ON_ERROR SET_NULL cannot be applied because column \"%s\" (domain %s) does not accept null values.",
1088 : : cstate->cur_attname,
1089 : : format_type_be(typioparams[m])),
1090 : : errdatatype(typioparams[m]));
1091 : :
1092 : : /*
1093 : : * We count only the number of rows (not fields) where
1094 : : * ON_ERROR SET_NULL was applied.
1095 : : */
1096 [ + + ]: 12 : if (!current_row_erroneous)
1097 : : {
1098 : 9 : current_row_erroneous = true;
1099 : 9 : cstate->num_errors++;
1100 : : }
1101 : : }
1102 : :
380 msawada@postgresql.o 1103 [ + + ]:CBC 76 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1104 : : {
1105 : : /*
1106 : : * Since we emit line number and column info in the below
1107 : : * notice message, we suppress error context information other
1108 : : * than the relation name.
1109 : : */
1110 [ - + ]: 33 : Assert(!cstate->relname_only);
1111 : 33 : cstate->relname_only = true;
1112 : :
1113 [ + + ]: 33 : if (cstate->cur_attval)
1114 : : {
1115 : : char *attval;
1116 : :
1117 : 30 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1118 : :
12 peter@eisentraut.org 1119 [ + + ]:GNC 30 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1120 [ + - ]: 18 : ereport(NOTICE,
1121 : : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1122 : : cstate->cur_lineno,
1123 : : cstate->cur_attname,
1124 : : attval));
1125 [ + - ]: 12 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1126 [ + - ]: 12 : ereport(NOTICE,
1127 : : errmsg("setting to null due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1128 : : cstate->cur_lineno,
1129 : : cstate->cur_attname,
1130 : : attval));
380 msawada@postgresql.o 1131 :CBC 30 : pfree(attval);
1132 : : }
1133 : : else
1134 : : {
12 peter@eisentraut.org 1135 [ + - ]:GNC 3 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1136 [ + - ]: 3 : ereport(NOTICE,
1137 : : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1138 : : cstate->cur_lineno,
1139 : : cstate->cur_attname));
1140 : : }
1141 : : /* reset relname_only */
380 msawada@postgresql.o 1142 :CBC 33 : cstate->relname_only = false;
1143 : : }
1144 : :
12 peter@eisentraut.org 1145 [ + + ]:GNC 76 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1146 : 64 : return true;
1147 [ + - ]: 12 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1148 : 12 : continue;
1149 : : }
1150 : :
380 msawada@postgresql.o 1151 :CBC 2366353 : cstate->cur_attname = NULL;
1152 : 2366353 : cstate->cur_attval = NULL;
1153 : : }
1154 : :
1155 [ - + ]: 733769 : Assert(fieldno == attr_count);
1156 : :
1157 : 733769 : return true;
1158 : : }
1159 : :
1160 : : /* Implementation of the per-row callback for binary format */
1161 : : bool
1162 : 21 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1163 : : bool *nulls)
1164 : : {
1165 : : TupleDesc tupDesc;
1166 : : AttrNumber attr_count;
1167 : 21 : FmgrInfo *in_functions = cstate->in_functions;
1168 : 21 : Oid *typioparams = cstate->typioparams;
1169 : : int16 fld_count;
1170 : : ListCell *cur;
1171 : :
1172 : 21 : tupDesc = RelationGetDescr(cstate->rel);
1173 : 21 : attr_count = list_length(cstate->attnumlist);
1174 : :
1175 : 21 : cstate->cur_lineno++;
1176 : :
1177 [ - + ]: 21 : if (!CopyGetInt16(cstate, &fld_count))
1178 : : {
1179 : : /* EOF detected (end of file, or protocol-level EOF) */
380 msawada@postgresql.o 1180 :UBC 0 : return false;
1181 : : }
1182 : :
380 msawada@postgresql.o 1183 [ + + ]:CBC 21 : if (fld_count == -1)
1184 : : {
1185 : : /*
1186 : : * Received EOF marker. Wait for the protocol-level EOF, and complain
1187 : : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1188 : : * that we correctly handle CopyFail, if client chooses to send that
1189 : : * now. When copying from file, we could ignore the rest of the file
1190 : : * like in text mode, but we choose to be consistent with the COPY
1191 : : * FROM STDIN case.
1192 : : */
1193 : : char dummy;
1194 : :
1195 [ - + ]: 6 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
380 msawada@postgresql.o 1196 [ # # ]:UBC 0 : ereport(ERROR,
1197 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1198 : : errmsg("received copy data after EOF marker")));
380 msawada@postgresql.o 1199 :CBC 6 : return false;
1200 : : }
1201 : :
1202 [ - + ]: 15 : if (fld_count != attr_count)
380 msawada@postgresql.o 1203 [ # # ]:UBC 0 : ereport(ERROR,
1204 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1205 : : errmsg("row field count is %d, expected %d",
1206 : : fld_count, attr_count)));
1207 : :
380 msawada@postgresql.o 1208 [ + - + + :CBC 93 : foreach(cur, cstate->attnumlist)
+ + ]
1209 : : {
1210 : 79 : int attnum = lfirst_int(cur);
1211 : 79 : int m = attnum - 1;
1212 : 79 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1213 : :
1214 : 79 : cstate->cur_attname = NameStr(att->attname);
1215 : 157 : values[m] = CopyReadBinaryAttribute(cstate,
1216 : 79 : &in_functions[m],
1217 : 79 : typioparams[m],
1218 : : att->atttypmod,
1219 : : &nulls[m]);
1220 : 78 : cstate->cur_attname = NULL;
1221 : : }
1222 : :
1938 heikki.linnakangas@i 1223 : 14 : return true;
1224 : : }
1225 : :
1226 : : /*
1227 : : * Read the next input line and stash it in line_buf.
1228 : : *
1229 : : * Result is true if read was terminated by EOF, false if terminated
1230 : : * by newline. The terminating newline or EOF marker is not included
1231 : : * in the final value of line_buf.
1232 : : */
1233 : : static bool
380 msawada@postgresql.o 1234 : 734838 : CopyReadLine(CopyFromState cstate, bool is_csv)
1235 : : {
1236 : : bool result;
1237 : :
1938 heikki.linnakangas@i 1238 : 734838 : resetStringInfo(&cstate->line_buf);
1809 1239 : 734838 : cstate->line_buf_valid = false;
1240 : :
1241 : : /*
1242 : : * Parse data and transfer into line_buf.
1243 : : *
1244 : : * Because this is performance critical, we inline CopyReadLineText() and
1245 : : * pass the boolean parameters as constants to allow the compiler to emit
1246 : : * specialized code with fewer branches.
1247 : : */
23 nathan@postgresql.or 1248 [ + + ]:GNC 734838 : if (is_csv)
1249 : 439 : result = CopyReadLineText(cstate, true);
1250 : : else
1251 : 734399 : result = CopyReadLineText(cstate, false);
1252 : :
1938 heikki.linnakangas@i 1253 [ + + ]:CBC 734824 : if (result)
1254 : : {
1255 : : /*
1256 : : * Reached EOF. In protocol version 3, we should ignore anything
1257 : : * after \. up to the protocol end of copy data. (XXX maybe better
1258 : : * not to treat \. as special?)
1259 : : */
1837 1260 [ + + ]: 834 : if (cstate->copy_src == COPY_FRONTEND)
1261 : : {
1262 : : int inbytes;
1263 : :
1264 : : do
1265 : : {
1809 1266 : 422 : inbytes = CopyGetData(cstate, cstate->input_buf,
1267 : : 1, INPUT_BUF_SIZE);
1268 [ - + ]: 422 : } while (inbytes > 0);
1269 : 422 : cstate->input_buf_index = 0;
1270 : 422 : cstate->input_buf_len = 0;
1271 : 422 : cstate->raw_buf_index = 0;
1272 : 422 : cstate->raw_buf_len = 0;
1273 : : }
1274 : : }
1275 : : else
1276 : : {
1277 : : /*
1278 : : * If we didn't hit EOF, then we must have transferred the EOL marker
1279 : : * to line_buf along with the data. Get rid of it.
1280 : : */
1938 1281 [ + - - - : 733990 : switch (cstate->eol_type)
- ]
1282 : : {
1283 : 733990 : case EOL_NL:
1284 [ - + ]: 733990 : Assert(cstate->line_buf.len >= 1);
1285 [ - + ]: 733990 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1286 : 733990 : cstate->line_buf.len--;
1287 : 733990 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1288 : 733990 : break;
1938 heikki.linnakangas@i 1289 :UBC 0 : case EOL_CR:
1290 [ # # ]: 0 : Assert(cstate->line_buf.len >= 1);
1291 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1292 : 0 : cstate->line_buf.len--;
1293 : 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1294 : 0 : break;
1295 : 0 : case EOL_CRNL:
1296 [ # # ]: 0 : Assert(cstate->line_buf.len >= 2);
1297 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1298 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1299 : 0 : cstate->line_buf.len -= 2;
1300 : 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1301 : 0 : break;
1302 : 0 : case EOL_UNKNOWN:
1303 : : /* shouldn't get here */
1304 : 0 : Assert(false);
1305 : : break;
1306 : : }
1307 : : }
1308 : :
1309 : : /* Now it's safe to use the buffer in error messages */
1809 heikki.linnakangas@i 1310 :CBC 734824 : cstate->line_buf_valid = true;
1311 : :
1938 1312 : 734824 : return result;
1313 : : }
1314 : :
1315 : : #ifndef USE_NO_SIMD
1316 : : /*
1317 : : * Helper function for CopyReadLineText() that uses SIMD instructions to scan
1318 : : * the input buffer for special characters. This can be much faster.
1319 : : *
1320 : : * Note that we disable SIMD for the remainder of the COPY FROM command upon
1321 : : * encountering a special character (except for end-of-line characters) or a
1322 : : * short line. This is perhaps too conservative, but it should help avoid
1323 : : * regressions. It could probably be made more lenient in the future via
1324 : : * fine-tuned heuristics.
1325 : : */
1326 : : static bool
2 nathan@postgresql.or 1327 :GNC 309440 : CopyReadLineTextSIMDHelper(CopyFromState cstate, bool is_csv,
1328 : : bool *hit_eof_p, int *input_buf_ptr_p)
1329 : : {
1330 : : char *copy_input_buf;
1331 : : int input_buf_ptr;
1332 : : int copy_buf_len;
1333 : : bool unique_esc_char; /* for csv, do quote/esc chars differ? */
1334 : 309440 : bool first = true;
1335 : 309440 : bool result = false;
1336 : 309440 : const Vector8 nl_vec = vector8_broadcast('\n');
1337 : 309440 : const Vector8 cr_vec = vector8_broadcast('\r');
1338 : : Vector8 bs_or_quote_vec; /* '\' for text, quote for csv */
1339 : : Vector8 esc_vec; /* only for csv */
1340 : :
1341 [ + + ]: 309440 : if (is_csv)
1342 : : {
1343 : 309 : char quote = cstate->opts.quote[0];
1344 : 309 : char esc = cstate->opts.escape[0];
1345 : :
1346 : 309 : bs_or_quote_vec = vector8_broadcast(quote);
1347 : 309 : esc_vec = vector8_broadcast(esc);
1348 : 309 : unique_esc_char = (quote != esc);
1349 : : }
1350 : : else
1351 : : {
1352 : 309131 : bs_or_quote_vec = vector8_broadcast('\\');
1353 : 309131 : unique_esc_char = false;
1354 : : }
1355 : :
1356 : : /*
1357 : : * For a little extra speed within the loop, we copy some state members
1358 : : * into local variables. Note that we need to use a separate local
1359 : : * variable for input_buf_ptr so that the REFILL_LINEBUF macro works. We
1360 : : * copy its value into the input_buf_ptr_p argument before returning.
1361 : : */
1362 : 309440 : copy_input_buf = cstate->input_buf;
1363 : 309440 : input_buf_ptr = cstate->input_buf_index;
1364 : 309440 : copy_buf_len = cstate->input_buf_len;
1365 : :
1366 : : /*
1367 : : * See the corresponding loop in CopyReadLineText() for more information
1368 : : * about the purpose of this loop. This one does the same thing using
1369 : : * SIMD instructions, although we are quick to bail out to the scalar path
1370 : : * if we encounter a special character.
1371 : : */
1372 : : for (;;)
1373 : 310228 : {
1374 : : Vector8 chunk;
1375 : : Vector8 match;
1376 : :
1377 : : /* Load more data if needed. */
1378 [ + + ]: 619668 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1379 : : {
1380 [ + + ]: 216426 : REFILL_LINEBUF;
1381 : :
1382 : 216426 : CopyLoadInputBuf(cstate);
1383 : : /* update our local variables */
1384 : 216418 : *hit_eof_p = cstate->input_reached_eof;
1385 : 216418 : input_buf_ptr = cstate->input_buf_index;
1386 : 216418 : copy_buf_len = cstate->input_buf_len;
1387 : :
1388 : : /*
1389 : : * If we are completely out of data, break out of the loop,
1390 : : * reporting EOF.
1391 : : */
1392 [ + + ]: 216418 : if (INPUT_BUF_BYTES(cstate) <= 0)
1393 : : {
1394 : 505 : result = true;
1395 : 505 : break;
1396 : : }
1397 : : }
1398 : :
1399 : : /*
1400 : : * If we still don't have enough data for the SIMD path, fall back to
1401 : : * the scalar code. Note that this doesn't necessarily mean we
1402 : : * encountered a short line, so we leave cstate->simd_enabled set to
1403 : : * true.
1404 : : */
1405 [ + + ]: 619155 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1406 : 215149 : break;
1407 : :
1408 : : /*
1409 : : * If we made it here, we have at least enough data to fit in a
1410 : : * Vector8, so we can use SIMD instructions to scan for special
1411 : : * characters.
1412 : : */
1413 : 404006 : vector8_load(&chunk, (const uint8 *) ©_input_buf[input_buf_ptr]);
1414 : :
1415 : : /*
1416 : : * Check for \n, \r, \\ (for text), quotes (for csv), and escapes (for
1417 : : * csv, if different from quotes).
1418 : : */
1419 : 404006 : match = vector8_eq(chunk, nl_vec);
1420 : 404006 : match = vector8_or(match, vector8_eq(chunk, cr_vec));
1421 : 404006 : match = vector8_or(match, vector8_eq(chunk, bs_or_quote_vec));
1422 [ + + ]: 404006 : if (unique_esc_char)
1423 : 19 : match = vector8_or(match, vector8_eq(chunk, esc_vec));
1424 : :
1425 : : /*
1426 : : * If we found a special character, advance to it and hand off to the
1427 : : * scalar path. Except for end-of-line characters, we also disable
1428 : : * SIMD processing for the remainder of the COPY FROM command.
1429 : : */
1430 [ + + ]: 404006 : if (vector8_is_highbit_set(match))
1431 : : {
1432 : : uint32 mask;
1433 : : char c;
1434 : :
1435 : 93778 : mask = vector8_highbit_mask(match);
1436 : 93778 : input_buf_ptr += pg_rightmost_one_pos32(mask);
1437 : :
1438 : : /*
1439 : : * Don't disable SIMD if we found \n or \r, else we'd stop using
1440 : : * SIMD instructions after the first line. As an exception, we do
1441 : : * disable it if this is the first vector we processed, as that
1442 : : * means the line is too short for SIMD.
1443 : : */
1444 : 93778 : c = copy_input_buf[input_buf_ptr];
1445 [ + + + + : 93778 : if (first || (c != '\n' && c != '\r'))
+ - ]
1446 : 314 : cstate->simd_enabled = false;
1447 : :
1448 : 93778 : break;
1449 : : }
1450 : :
1451 : : /* That chunk was clear of special characters, so we can skip it. */
1452 : 310228 : input_buf_ptr += sizeof(Vector8);
1453 : 310228 : first = false;
1454 : : }
1455 : :
1456 : 309432 : *input_buf_ptr_p = input_buf_ptr;
1457 : 309432 : return result;
1458 : : }
1459 : : #endif /* ! USE_NO_SIMD */
1460 : :
1461 : : /*
1462 : : * CopyReadLineText - inner loop of CopyReadLine for text mode
1463 : : */
1464 : : static pg_attribute_always_inline bool
380 msawada@postgresql.o 1465 :CBC 734838 : CopyReadLineText(CopyFromState cstate, bool is_csv)
1466 : : {
1467 : : char *copy_input_buf;
1468 : : int input_buf_ptr;
1469 : : int copy_buf_len;
1938 heikki.linnakangas@i 1470 : 734838 : bool need_data = false;
1471 : 734838 : bool hit_eof = false;
1472 : 734838 : bool result = false;
1473 : :
1474 : : /* CSV variables */
1475 : 734838 : bool in_quote = false,
1476 : 734838 : last_was_esc = false;
1477 : 734838 : char quotec = '\0';
1478 : 734838 : char escapec = '\0';
1479 : :
380 msawada@postgresql.o 1480 [ + + ]: 734838 : if (is_csv)
1481 : : {
1938 heikki.linnakangas@i 1482 : 439 : quotec = cstate->opts.quote[0];
1483 : 439 : escapec = cstate->opts.escape[0];
1484 : : /* ignore special escape processing if it's the same as quotec */
1485 [ + + ]: 439 : if (quotec == escapec)
1486 : 342 : escapec = '\0';
1487 : : }
1488 : :
1489 : : /*
1490 : : * The objective of this loop is to transfer the entire next input line
1491 : : * into line_buf. Hence, we only care for detecting newlines (\r and/or
1492 : : * \n) and the end-of-copy marker (\.).
1493 : : *
1494 : : * In CSV mode, \r and \n inside a quoted field are just part of the data
1495 : : * value and are put in line_buf. We keep just enough state to know if we
1496 : : * are currently in a quoted field or not.
1497 : : *
1498 : : * The input has already been converted to the database encoding. All
1499 : : * supported server encodings have the property that all bytes in a
1500 : : * multi-byte sequence have the high bit set, so a multibyte character
1501 : : * cannot contain any newline or escape characters embedded in the
1502 : : * multibyte sequence. Therefore, we can process the input byte-by-byte,
1503 : : * regardless of the encoding.
1504 : : *
1505 : : * For speed, we try to move data from input_buf to line_buf in chunks
1506 : : * rather than one character at a time. input_buf_ptr points to the next
1507 : : * character to examine; any characters from input_buf_index to
1508 : : * input_buf_ptr have been determined to be part of the line, but not yet
1509 : : * transferred to line_buf.
1510 : : *
1511 : : * For a little extra speed within the loop, we copy some state
1512 : : * information into local variables. input_buf_ptr could be changed in
1513 : : * the SIMD path, so we must set that one before it. The others are set
1514 : : * afterwards.
1515 : : */
1809 1516 : 734838 : input_buf_ptr = cstate->input_buf_index;
1517 : :
1518 : : /*
1519 : : * We first try to use SIMD for the task described above, falling back to
1520 : : * the scalar path (i.e., the loop below) if needed.
1521 : : */
1522 : : #ifndef USE_NO_SIMD
2 nathan@postgresql.or 1523 [ + + ]:GNC 734838 : if (cstate->simd_enabled)
1524 : : {
1525 : : /*
1526 : : * Using temporary variables seems to encourage the compiler to keep
1527 : : * them in a register, which is beneficial for performance.
1528 : : */
1529 : 309440 : bool tmp_hit_eof = false;
1530 : 309440 : int tmp_input_buf_ptr = 0; /* silence compiler warning */
1531 : :
1532 : 309440 : result = CopyReadLineTextSIMDHelper(cstate, is_csv, &tmp_hit_eof,
1533 : : &tmp_input_buf_ptr);
1534 : 309432 : hit_eof = tmp_hit_eof;
1535 : 309432 : input_buf_ptr = tmp_input_buf_ptr;
1536 : :
1537 [ + + ]: 309432 : if (result)
1538 : : {
1539 : : /* Transfer any still-uncopied data to line_buf. */
1540 [ - + ]: 505 : REFILL_LINEBUF;
1541 : :
1542 : 505 : return result;
1543 : : }
1544 : : }
1545 : : #endif /* ! USE_NO_SIMD */
1546 : :
1547 : 734325 : copy_input_buf = cstate->input_buf;
1809 heikki.linnakangas@i 1548 :CBC 734325 : copy_buf_len = cstate->input_buf_len;
1549 : :
1550 : : for (;;)
1938 1551 : 7653754 : {
1552 : : int prev_raw_ptr;
1553 : : char c;
1554 : :
1555 : : /*
1556 : : * Load more data if needed.
1557 : : *
1558 : : * TODO: We could just force four bytes of read-ahead and avoid the
1559 : : * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1560 : : * unsafe with the old v2 COPY protocol, but we don't support that
1561 : : * anymore.
1562 : : */
1809 1563 [ + + - + ]: 8388079 : if (input_buf_ptr >= copy_buf_len || need_data)
1564 : : {
1938 1565 [ + + ]: 468 : REFILL_LINEBUF;
1566 : :
1809 1567 : 468 : CopyLoadInputBuf(cstate);
1568 : : /* update our local variables */
1569 : 468 : hit_eof = cstate->input_reached_eof;
1570 : 468 : input_buf_ptr = cstate->input_buf_index;
1571 : 468 : copy_buf_len = cstate->input_buf_len;
1572 : :
1573 : : /*
1574 : : * If we are completely out of data, break out of the loop,
1575 : : * reporting EOF.
1576 : : */
1577 [ + + ]: 468 : if (INPUT_BUF_BYTES(cstate) <= 0)
1578 : : {
1938 1579 : 284 : result = true;
1580 : 284 : break;
1581 : : }
1582 : 184 : need_data = false;
1583 : : }
1584 : :
1585 : : /* OK to fetch a character */
1809 1586 : 8387795 : prev_raw_ptr = input_buf_ptr;
1587 : 8387795 : c = copy_input_buf[input_buf_ptr++];
1588 : :
380 msawada@postgresql.o 1589 [ + + ]: 8387795 : if (is_csv)
1590 : : {
1591 : : /*
1592 : : * If character is '\r', we may need to look ahead below. Force
1593 : : * fetch of the next character if we don't already have it. We
1594 : : * need to do this before changing CSV state, in case '\r' is also
1595 : : * the quote or escape character.
1596 : : */
531 tgl@sss.pgh.pa.us 1597 [ + + ]: 2163 : if (c == '\r')
1598 : : {
1938 heikki.linnakangas@i 1599 [ - + - - ]: 18 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1600 : : }
1601 : :
1602 : : /*
1603 : : * Dealing with quotes and escapes here is mildly tricky. If the
1604 : : * quote char is also the escape char, there's no problem - we
1605 : : * just use the char as a toggle. If they are different, we need
1606 : : * to ensure that we only take account of an escape inside a
1607 : : * quoted field and immediately preceding a quote char, and not
1608 : : * the second in an escape-escape sequence.
1609 : : */
1610 [ + + + + ]: 2163 : if (in_quote && c == escapec)
1611 : 24 : last_was_esc = !last_was_esc;
1612 [ + + + - ]: 2163 : if (c == quotec && !last_was_esc)
1613 : 260 : in_quote = !in_quote;
1614 [ + + ]: 2163 : if (c != escapec)
1615 : 2136 : last_was_esc = false;
1616 : :
1617 : : /*
1618 : : * Updating the line count for embedded CR and/or LF chars is
1619 : : * necessarily a little fragile - this test is probably about the
1620 : : * best we can do. (XXX it's arguable whether we should do this
1621 : : * at all --- is cur_lineno a physical or logical count?)
1622 : : */
1623 [ + + + + : 2163 : if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
+ + ]
1624 : 18 : cstate->cur_lineno++;
1625 : : }
1626 : :
1627 : : /* Process \r */
380 msawada@postgresql.o 1628 [ + + + - : 8387795 : if (c == '\r' && (!is_csv || !in_quote))
- + ]
1629 : : {
1630 : : /* Check for \r\n on first line, _and_ handle \r\n. */
1938 heikki.linnakangas@i 1631 [ # # ]:UBC 0 : if (cstate->eol_type == EOL_UNKNOWN ||
1632 [ # # ]: 0 : cstate->eol_type == EOL_CRNL)
1633 : : {
1634 : : /*
1635 : : * If need more data, go back to loop top to load it.
1636 : : *
1637 : : * Note that if we are at EOF, c will wind up as '\0' because
1638 : : * of the guaranteed pad of input_buf.
1639 : : */
1640 [ # # # # ]: 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1641 : :
1642 : : /* get next char */
1809 1643 : 0 : c = copy_input_buf[input_buf_ptr];
1644 : :
1938 1645 [ # # ]: 0 : if (c == '\n')
1646 : : {
1809 1647 : 0 : input_buf_ptr++; /* eat newline */
1938 1648 : 0 : cstate->eol_type = EOL_CRNL; /* in case not set yet */
1649 : : }
1650 : : else
1651 : : {
1652 : : /* found \r, but no \n */
1653 [ # # ]: 0 : if (cstate->eol_type == EOL_CRNL)
1654 [ # # # # : 0 : ereport(ERROR,
# # ]
1655 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1656 : : !is_csv ?
1657 : : errmsg("literal carriage return found in data") :
1658 : : errmsg("unquoted carriage return found in data"),
1659 : : !is_csv ?
1660 : : errhint("Use \"\\r\" to represent carriage return.") :
1661 : : errhint("Use quoted CSV field to represent carriage return.")));
1662 : :
1663 : : /*
1664 : : * if we got here, it is the first line and we didn't find
1665 : : * \n, so don't consume the peeked character
1666 : : */
1667 : 0 : cstate->eol_type = EOL_CR;
1668 : : }
1669 : : }
1670 [ # # ]: 0 : else if (cstate->eol_type == EOL_NL)
1671 [ # # # # : 0 : ereport(ERROR,
# # ]
1672 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1673 : : !is_csv ?
1674 : : errmsg("literal carriage return found in data") :
1675 : : errmsg("unquoted carriage return found in data"),
1676 : : !is_csv ?
1677 : : errhint("Use \"\\r\" to represent carriage return.") :
1678 : : errhint("Use quoted CSV field to represent carriage return.")));
1679 : : /* If reach here, we have found the line terminator */
1680 : 0 : break;
1681 : : }
1682 : :
1683 : : /* Process \n */
380 msawada@postgresql.o 1684 [ + + + + :CBC 8387795 : if (c == '\n' && (!is_csv || !in_quote))
+ + ]
1685 : : {
1938 heikki.linnakangas@i 1686 [ + - - + ]: 733990 : if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1938 heikki.linnakangas@i 1687 [ # # # # :UBC 0 : ereport(ERROR,
# # ]
1688 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1689 : : !is_csv ?
1690 : : errmsg("literal newline found in data") :
1691 : : errmsg("unquoted newline found in data"),
1692 : : !is_csv ?
1693 : : errhint("Use \"\\n\" to represent newline.") :
1694 : : errhint("Use quoted CSV field to represent newline.")));
1938 heikki.linnakangas@i 1695 :CBC 733990 : cstate->eol_type = EOL_NL; /* in case not set yet */
1696 : : /* If reach here, we have found the line terminator */
1697 : 733990 : break;
1698 : : }
1699 : :
1700 : : /*
1701 : : * Process backslash, except in CSV mode where backslash is a normal
1702 : : * character.
1703 : : */
380 msawada@postgresql.o 1704 [ + + + + ]: 7653805 : if (c == '\\' && !is_csv)
1705 : : {
1706 : : char c2;
1707 : :
1938 heikki.linnakangas@i 1708 [ - + - - ]: 4051 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1709 [ - + - - ]: 4051 : IF_NEED_REFILL_AND_EOF_BREAK(0);
1710 : :
1711 : : /* -----
1712 : : * get next character
1713 : : * Note: we do not change c so if it isn't \., we can fall
1714 : : * through and continue processing.
1715 : : * -----
1716 : : */
1809 1717 : 4051 : c2 = copy_input_buf[input_buf_ptr];
1718 : :
1938 1719 [ + + ]: 4051 : if (c2 == '.')
1720 : : {
1809 1721 : 51 : input_buf_ptr++; /* consume the '.' */
1938 1722 [ - + ]: 51 : if (cstate->eol_type == EOL_CRNL)
1723 : : {
1724 : : /* Get the next character */
1938 heikki.linnakangas@i 1725 [ # # # # ]:UBC 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1726 : : /* if hit_eof, c2 will become '\0' */
1809 1727 : 0 : c2 = copy_input_buf[input_buf_ptr++];
1728 : :
1938 1729 [ # # ]: 0 : if (c2 == '\n')
531 tgl@sss.pgh.pa.us 1730 [ # # ]: 0 : ereport(ERROR,
1731 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1732 : : errmsg("end-of-copy marker does not match previous newline style")));
1938 heikki.linnakangas@i 1733 [ # # ]: 0 : else if (c2 != '\r')
531 tgl@sss.pgh.pa.us 1734 [ # # ]: 0 : ereport(ERROR,
1735 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1736 : : errmsg("end-of-copy marker is not alone on its line")));
1737 : : }
1738 : :
1739 : : /* Get the next character */
1938 heikki.linnakangas@i 1740 [ - + - - ]:CBC 51 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1741 : : /* if hit_eof, c2 will become '\0' */
1809 1742 : 51 : c2 = copy_input_buf[input_buf_ptr++];
1743 : :
1938 1744 [ + - + + ]: 51 : if (c2 != '\r' && c2 != '\n')
531 tgl@sss.pgh.pa.us 1745 [ + - ]: 3 : ereport(ERROR,
1746 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1747 : : errmsg("end-of-copy marker is not alone on its line")));
1748 : :
1938 heikki.linnakangas@i 1749 [ + + + - ]: 48 : if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1750 [ - + - - ]: 48 : (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1751 [ - + - - ]: 48 : (cstate->eol_type == EOL_CR && c2 != '\r'))
1938 heikki.linnakangas@i 1752 [ # # ]:UBC 0 : ereport(ERROR,
1753 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1754 : : errmsg("end-of-copy marker does not match previous newline style")));
1755 : :
1756 : : /*
1757 : : * If there is any data on this line before the \., complain.
1758 : : */
530 tgl@sss.pgh.pa.us 1759 [ + - ]:CBC 48 : if (cstate->line_buf.len > 0 ||
1760 [ + + ]: 48 : prev_raw_ptr > cstate->input_buf_index)
1761 [ + - ]: 3 : ereport(ERROR,
1762 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1763 : : errmsg("end-of-copy marker is not alone on its line")));
1764 : :
1765 : : /*
1766 : : * Discard the \. and newline, then report EOF.
1767 : : */
1809 heikki.linnakangas@i 1768 : 45 : cstate->input_buf_index = input_buf_ptr;
1938 1769 : 45 : result = true; /* report EOF */
1770 : 45 : break;
1771 : : }
1772 : : else
1773 : : {
1774 : : /*
1775 : : * If we are here, it means we found a backslash followed by
1776 : : * something other than a period. In non-CSV mode, anything
1777 : : * after a backslash is special, so we skip over that second
1778 : : * character too. If we didn't do that \\. would be
1779 : : * considered an eof-of copy, while in non-CSV mode it is a
1780 : : * literal backslash followed by a period.
1781 : : */
1809 1782 : 4000 : input_buf_ptr++;
1783 : : }
1784 : : }
1785 : : } /* end of outer loop */
1786 : :
1787 : : /*
1788 : : * Transfer any still-uncopied data to line_buf.
1789 : : */
1938 1790 [ + + ]: 734319 : REFILL_LINEBUF;
1791 : :
1792 : 734319 : return result;
1793 : : }
1794 : :
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0..9; any other input is lower-cased with
 * pg_ascii_tolower() and measured as an offset from 'a', so 'a'/'A' yield 10.
 * No validation is done here: the caller in this file tests isxdigit()
 * before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char ch = (unsigned char) hex;

	if (isdigit(ch))
		return hex - '0';

	return pg_ascii_tolower(ch) - 'a' + 10;
}
1806 : :
1807 : : /*
1808 : : * Parse the current line into separate attributes (fields),
1809 : : * performing de-escaping as needed.
1810 : : *
1811 : : * The input is in line_buf. We use attribute_buf to hold the result
1812 : : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1813 : : * string, or NULL when the input matches the null marker string.
1814 : : * This array is expanded as necessary.
1815 : : *
1816 : : * (Note that the caller cannot check for nulls since the returned
1817 : : * string would be the post-de-escaping equivalent, which may look
1818 : : * the same as some valid data string.)
1819 : : *
1820 : : * delim is the column delimiter string (must be just one byte for now).
1821 : : * null_print is the null marker string. Note that this is compared to
1822 : : * the pre-de-escaped input string.
1823 : : *
1824 : : * The return value is the number of fields actually read.
1825 : : */
1826 : : static int
1938 heikki.linnakangas@i 1827 :CBC 733672 : CopyReadAttributesText(CopyFromState cstate)
1828 : : {
1829 : 733672 : char delimc = cstate->opts.delim[0];
1830 : : int fieldno;
1831 : : char *output_ptr;
1832 : : char *cur_ptr;
1833 : : char *line_end_ptr;
1834 : :
1835 : : /*
1836 : : * We need a special case for zero-column tables: check that the input
1837 : : * line is empty, and return.
1838 : : */
1839 [ + + ]: 733672 : if (cstate->max_fields <= 0)
1840 : : {
1841 [ - + ]: 4 : if (cstate->line_buf.len != 0)
1938 heikki.linnakangas@i 1842 [ # # ]:UBC 0 : ereport(ERROR,
1843 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1844 : : errmsg("extra data after last expected column")));
1938 heikki.linnakangas@i 1845 :CBC 4 : return 0;
1846 : : }
1847 : :
1848 : 733668 : resetStringInfo(&cstate->attribute_buf);
1849 : :
1850 : : /*
1851 : : * The de-escaped attributes will certainly not be longer than the input
1852 : : * data line, so we can just force attribute_buf to be large enough and
1853 : : * then transfer data without any checks for enough space. We need to do
1854 : : * it this way because enlarging attribute_buf mid-stream would invalidate
1855 : : * pointers already stored into cstate->raw_fields[].
1856 : : */
1857 [ + + ]: 733668 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1858 : 4 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1859 : 733668 : output_ptr = cstate->attribute_buf.data;
1860 : :
1861 : : /* set pointer variables for loop */
1862 : 733668 : cur_ptr = cstate->line_buf.data;
1863 : 733668 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1864 : :
1865 : : /* Outer loop iterates over fields */
1866 : 733668 : fieldno = 0;
1867 : : for (;;)
1868 : 1632505 : {
1869 : 2366173 : bool found_delim = false;
1870 : : char *start_ptr;
1871 : : char *end_ptr;
1872 : : int input_len;
1873 : 2366173 : bool saw_non_ascii = false;
1874 : :
1875 : : /* Make sure there is enough space for the next value */
1876 [ + + ]: 2366173 : if (fieldno >= cstate->max_fields)
1877 : : {
1878 : 21 : cstate->max_fields *= 2;
1879 : 21 : cstate->raw_fields =
1880 : 21 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1881 : : }
1882 : :
1883 : : /* Remember start of field on both input and output sides */
1884 : 2366173 : start_ptr = cur_ptr;
1885 : 2366173 : cstate->raw_fields[fieldno] = output_ptr;
1886 : :
1887 : : /*
1888 : : * Scan data for field.
1889 : : *
1890 : : * Note that in this loop, we are scanning to locate the end of field
1891 : : * and also speculatively performing de-escaping. Once we find the
1892 : : * end-of-field, we can match the raw field contents against the null
1893 : : * marker string. Only after that comparison fails do we know that
1894 : : * de-escaping is actually the right thing to do; therefore we *must
1895 : : * not* throw any syntax errors before we've done the null-marker
1896 : : * check.
1897 : : */
1898 : : for (;;)
1899 : 11625589 : {
1900 : : char c;
1901 : :
1902 : 13991762 : end_ptr = cur_ptr;
1903 [ + + ]: 13991762 : if (cur_ptr >= line_end_ptr)
1904 : 733665 : break;
1905 : 13258097 : c = *cur_ptr++;
1906 [ + + ]: 13258097 : if (c == delimc)
1907 : : {
1908 : 1632508 : found_delim = true;
1909 : 1632508 : break;
1910 : : }
1911 [ + + ]: 11625589 : if (c == '\\')
1912 : : {
1913 [ - + ]: 4000 : if (cur_ptr >= line_end_ptr)
1938 heikki.linnakangas@i 1914 :UBC 0 : break;
1938 heikki.linnakangas@i 1915 :CBC 4000 : c = *cur_ptr++;
1916 [ + + - - : 4000 : switch (c)
+ - - -
+ ]
1917 : : {
1918 : 6 : case '0':
1919 : : case '1':
1920 : : case '2':
1921 : : case '3':
1922 : : case '4':
1923 : : case '5':
1924 : : case '6':
1925 : : case '7':
1926 : : {
1927 : : /* handle \013 */
1928 : : int val;
1929 : :
1930 : 6 : val = OCTVALUE(c);
1931 [ + + ]: 6 : if (cur_ptr < line_end_ptr)
1932 : : {
1933 : 3 : c = *cur_ptr;
1934 [ - + - - ]: 3 : if (ISOCTAL(c))
1935 : : {
1938 heikki.linnakangas@i 1936 :UBC 0 : cur_ptr++;
1937 : 0 : val = (val << 3) + OCTVALUE(c);
1938 [ # # ]: 0 : if (cur_ptr < line_end_ptr)
1939 : : {
1940 : 0 : c = *cur_ptr;
1941 [ # # # # ]: 0 : if (ISOCTAL(c))
1942 : : {
1943 : 0 : cur_ptr++;
1944 : 0 : val = (val << 3) + OCTVALUE(c);
1945 : : }
1946 : : }
1947 : : }
1948 : : }
1938 heikki.linnakangas@i 1949 :CBC 6 : c = val & 0377;
1950 [ - + - - ]: 6 : if (c == '\0' || IS_HIGHBIT_SET(c))
1951 : 6 : saw_non_ascii = true;
1952 : : }
1953 : 6 : break;
1954 : 6 : case 'x':
1955 : : /* Handle \x3F */
1956 [ + + ]: 6 : if (cur_ptr < line_end_ptr)
1957 : : {
1958 : 3 : char hexchar = *cur_ptr;
1959 : :
1960 [ - + ]: 3 : if (isxdigit((unsigned char) hexchar))
1961 : : {
1938 heikki.linnakangas@i 1962 :UBC 0 : int val = GetDecimalFromHex(hexchar);
1963 : :
1964 : 0 : cur_ptr++;
1965 [ # # ]: 0 : if (cur_ptr < line_end_ptr)
1966 : : {
1967 : 0 : hexchar = *cur_ptr;
1968 [ # # ]: 0 : if (isxdigit((unsigned char) hexchar))
1969 : : {
1970 : 0 : cur_ptr++;
1971 : 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1972 : : }
1973 : : }
1974 : 0 : c = val & 0xff;
1975 [ # # # # ]: 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1976 : 0 : saw_non_ascii = true;
1977 : : }
1978 : : }
1938 heikki.linnakangas@i 1979 :CBC 6 : break;
1938 heikki.linnakangas@i 1980 :UBC 0 : case 'b':
1981 : 0 : c = '\b';
1982 : 0 : break;
1983 : 0 : case 'f':
1984 : 0 : c = '\f';
1985 : 0 : break;
1938 heikki.linnakangas@i 1986 :CBC 1525 : case 'n':
1987 : 1525 : c = '\n';
1988 : 1525 : break;
1938 heikki.linnakangas@i 1989 :UBC 0 : case 'r':
1990 : 0 : c = '\r';
1991 : 0 : break;
1992 : 0 : case 't':
1993 : 0 : c = '\t';
1994 : 0 : break;
1995 : 0 : case 'v':
1996 : 0 : c = '\v';
1997 : 0 : break;
1998 : :
1999 : : /*
2000 : : * in all other cases, take the char after '\'
2001 : : * literally
2002 : : */
2003 : : }
2004 : : }
2005 : :
2006 : : /* Add c to output string */
1938 heikki.linnakangas@i 2007 :CBC 11625589 : *output_ptr++ = c;
2008 : : }
2009 : :
2010 : : /* Check whether raw input matched null marker */
2011 : 2366173 : input_len = end_ptr - start_ptr;
2012 [ + + ]: 2366173 : if (input_len == cstate->opts.null_print_len &&
2013 [ + + ]: 125736 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2014 : 2407 : cstate->raw_fields[fieldno] = NULL;
2015 : : /* Check whether raw input matched default marker */
1096 andrew@dunslane.net 2016 [ + + ]: 2363766 : else if (fieldno < list_length(cstate->attnumlist) &&
2017 [ + + ]: 2363742 : cstate->opts.default_print &&
1098 2018 [ + + ]: 57 : input_len == cstate->opts.default_print_len &&
2019 [ + - ]: 15 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2020 : 12 : {
2021 : : /* fieldno is 0-indexed and attnum is 1-indexed */
2022 : 15 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2023 : :
2024 [ + + ]: 15 : if (cstate->defexprs[m] != NULL)
2025 : : {
2026 : : /* defaults contain entries for all physical attributes */
2027 : 12 : cstate->defaults[m] = true;
2028 : : }
2029 : : else
2030 : : {
2031 : 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2032 : 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2033 : :
2034 [ + - ]: 3 : ereport(ERROR,
2035 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2036 : : errmsg("unexpected default marker in COPY data"),
2037 : : errdetail("Column \"%s\" has no default value.",
2038 : : NameStr(att->attname))));
2039 : : }
2040 : : }
2041 : : else
2042 : : {
2043 : : /*
2044 : : * At this point we know the field is supposed to contain data.
2045 : : *
2046 : : * If we de-escaped any non-7-bit-ASCII chars, make sure the
2047 : : * resulting string is valid data for the db encoding.
2048 : : */
1938 heikki.linnakangas@i 2049 [ - + ]: 2363751 : if (saw_non_ascii)
2050 : : {
1938 heikki.linnakangas@i 2051 :UBC 0 : char *fld = cstate->raw_fields[fieldno];
2052 : :
2053 : 0 : pg_verifymbstr(fld, output_ptr - fld, false);
2054 : : }
2055 : : }
2056 : :
2057 : : /* Terminate attribute value in output area */
1938 heikki.linnakangas@i 2058 :CBC 2366170 : *output_ptr++ = '\0';
2059 : :
2060 : 2366170 : fieldno++;
2061 : : /* Done if we hit EOL instead of a delim */
2062 [ + + ]: 2366170 : if (!found_delim)
2063 : 733665 : break;
2064 : : }
2065 : :
2066 : : /* Clean up state of attribute_buf */
2067 : 733665 : output_ptr--;
2068 [ - + ]: 733665 : Assert(*output_ptr == '\0');
2069 : 733665 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2070 : :
2071 : 733665 : return fieldno;
2072 : : }
2073 : :
2074 : : /*
2075 : : * Parse the current line into separate attributes (fields),
2076 : : * performing de-escaping as needed. This has exactly the same API as
2077 : : * CopyReadAttributesText, except we parse the fields according to
2078 : : * "standard" (i.e. common) CSV usage.
2079 : : */
2080 : : static int
2081 : 257 : CopyReadAttributesCSV(CopyFromState cstate)
2082 : : {
2083 : 257 : char delimc = cstate->opts.delim[0];
2084 : 257 : char quotec = cstate->opts.quote[0];
2085 : 257 : char escapec = cstate->opts.escape[0];
2086 : : int fieldno;
2087 : : char *output_ptr;
2088 : : char *cur_ptr;
2089 : : char *line_end_ptr;
2090 : :
2091 : : /*
2092 : : * We need a special case for zero-column tables: check that the input
2093 : : * line is empty, and return.
2094 : : */
2095 [ - + ]: 257 : if (cstate->max_fields <= 0)
2096 : : {
1938 heikki.linnakangas@i 2097 [ # # ]:UBC 0 : if (cstate->line_buf.len != 0)
2098 [ # # ]: 0 : ereport(ERROR,
2099 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2100 : : errmsg("extra data after last expected column")));
2101 : 0 : return 0;
2102 : : }
2103 : :
1938 heikki.linnakangas@i 2104 :CBC 257 : resetStringInfo(&cstate->attribute_buf);
2105 : :
2106 : : /*
2107 : : * The de-escaped attributes will certainly not be longer than the input
2108 : : * data line, so we can just force attribute_buf to be large enough and
2109 : : * then transfer data without any checks for enough space. We need to do
2110 : : * it this way because enlarging attribute_buf mid-stream would invalidate
2111 : : * pointers already stored into cstate->raw_fields[].
2112 : : */
2113 [ - + ]: 257 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1938 heikki.linnakangas@i 2114 :UBC 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1938 heikki.linnakangas@i 2115 :CBC 257 : output_ptr = cstate->attribute_buf.data;
2116 : :
2117 : : /* set pointer variables for loop */
2118 : 257 : cur_ptr = cstate->line_buf.data;
2119 : 257 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
2120 : :
2121 : : /* Outer loop iterates over fields */
2122 : 257 : fieldno = 0;
2123 : : for (;;)
2124 : 267 : {
2125 : 524 : bool found_delim = false;
2126 : 524 : bool saw_quote = false;
2127 : : char *start_ptr;
2128 : : char *end_ptr;
2129 : : int input_len;
2130 : :
2131 : : /* Make sure there is enough space for the next value */
2132 [ - + ]: 524 : if (fieldno >= cstate->max_fields)
2133 : : {
1938 heikki.linnakangas@i 2134 :UBC 0 : cstate->max_fields *= 2;
2135 : 0 : cstate->raw_fields =
2136 : 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
2137 : : }
2138 : :
2139 : : /* Remember start of field on both input and output sides */
1938 heikki.linnakangas@i 2140 :CBC 524 : start_ptr = cur_ptr;
2141 : 524 : cstate->raw_fields[fieldno] = output_ptr;
2142 : :
2143 : : /*
2144 : : * Scan data for field,
2145 : : *
2146 : : * The loop starts in "not quote" mode and then toggles between that
2147 : : * and "in quote" mode. The loop exits normally if it is in "not
2148 : : * quote" mode and a delimiter or line end is seen.
2149 : : */
2150 : : for (;;)
2151 : 114 : {
2152 : : char c;
2153 : :
2154 : : /* Not in quote */
2155 : : for (;;)
2156 : : {
2157 : 1666 : end_ptr = cur_ptr;
2158 [ + + ]: 1666 : if (cur_ptr >= line_end_ptr)
2159 : 254 : goto endfield;
2160 : 1412 : c = *cur_ptr++;
2161 : : /* unquoted field delimiter */
2162 [ + + ]: 1412 : if (c == delimc)
2163 : : {
2164 : 270 : found_delim = true;
2165 : 270 : goto endfield;
2166 : : }
2167 : : /* start of quoted field (or part of field) */
2168 [ + + ]: 1142 : if (c == quotec)
2169 : : {
2170 : 114 : saw_quote = true;
2171 : 114 : break;
2172 : : }
2173 : : /* Add c to output string */
2174 : 1028 : *output_ptr++ = c;
2175 : : }
2176 : :
2177 : : /* In quote */
2178 : : for (;;)
2179 : : {
2180 : 710 : end_ptr = cur_ptr;
2181 [ - + ]: 710 : if (cur_ptr >= line_end_ptr)
1938 heikki.linnakangas@i 2182 [ # # ]:UBC 0 : ereport(ERROR,
2183 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2184 : : errmsg("unterminated CSV quoted field")));
2185 : :
1938 heikki.linnakangas@i 2186 :CBC 710 : c = *cur_ptr++;
2187 : :
2188 : : /* escape within a quoted field */
2189 [ + + ]: 710 : if (c == escapec)
2190 : : {
2191 : : /*
2192 : : * peek at the next char if available, and escape it if it
2193 : : * is an escape char or a quote char
2194 : : */
2195 [ + + ]: 62 : if (cur_ptr < line_end_ptr)
2196 : : {
2197 : 36 : char nextc = *cur_ptr;
2198 : :
2199 [ + + - + ]: 36 : if (nextc == escapec || nextc == quotec)
2200 : : {
2201 : 12 : *output_ptr++ = nextc;
2202 : 12 : cur_ptr++;
2203 : 12 : continue;
2204 : : }
2205 : : }
2206 : : }
2207 : :
2208 : : /*
2209 : : * end of quoted field. Must do this test after testing for
2210 : : * escape in case quote char and escape char are the same
2211 : : * (which is the common case).
2212 : : */
2213 [ + + ]: 698 : if (c == quotec)
2214 : 114 : break;
2215 : :
2216 : : /* Add c to output string */
2217 : 584 : *output_ptr++ = c;
2218 : : }
2219 : : }
2220 : 524 : endfield:
2221 : :
2222 : : /* Terminate attribute value in output area */
2223 : 524 : *output_ptr++ = '\0';
2224 : :
2225 : : /* Check whether raw input matched null marker */
2226 : 524 : input_len = end_ptr - start_ptr;
2227 [ + + + + ]: 524 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
2228 [ + - ]: 22 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2229 : 22 : cstate->raw_fields[fieldno] = NULL;
2230 : : /* Check whether raw input matched default marker */
1096 andrew@dunslane.net 2231 [ + - ]: 502 : else if (fieldno < list_length(cstate->attnumlist) &&
2232 [ + + ]: 502 : cstate->opts.default_print &&
1098 2233 [ + + ]: 75 : input_len == cstate->opts.default_print_len &&
2234 [ + - ]: 21 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2235 : : {
2236 : : /* fieldno is 0-index and attnum is 1-index */
2237 : 21 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2238 : :
2239 [ + + ]: 21 : if (cstate->defexprs[m] != NULL)
2240 : : {
2241 : : /* defaults contain entries for all physical attributes */
2242 : 18 : cstate->defaults[m] = true;
2243 : : }
2244 : : else
2245 : : {
2246 : 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2247 : 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2248 : :
2249 [ + - ]: 3 : ereport(ERROR,
2250 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2251 : : errmsg("unexpected default marker in COPY data"),
2252 : : errdetail("Column \"%s\" has no default value.",
2253 : : NameStr(att->attname))));
2254 : : }
2255 : : }
2256 : :
1938 heikki.linnakangas@i 2257 : 521 : fieldno++;
2258 : : /* Done if we hit EOL instead of a delim */
2259 [ + + ]: 521 : if (!found_delim)
2260 : 254 : break;
2261 : : }
2262 : :
2263 : : /* Clean up state of attribute_buf */
2264 : 254 : output_ptr--;
2265 [ - + ]: 254 : Assert(*output_ptr == '\0');
2266 : 254 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2267 : :
2268 : 254 : return fieldno;
2269 : : }
2270 : :
2271 : :
2272 : : /*
2273 : : * Read a binary attribute
2274 : : */
2275 : : static Datum
2276 : 79 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2277 : : Oid typioparam, int32 typmod,
2278 : : bool *isnull)
2279 : : {
2280 : : int32 fld_size;
2281 : : Datum result;
2282 : :
2283 [ - + ]: 79 : if (!CopyGetInt32(cstate, &fld_size))
1938 heikki.linnakangas@i 2284 [ # # ]:UBC 0 : ereport(ERROR,
2285 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2286 : : errmsg("unexpected EOF in COPY data")));
1938 heikki.linnakangas@i 2287 [ + + ]:CBC 79 : if (fld_size == -1)
2288 : : {
2289 : 15 : *isnull = true;
2290 : 15 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2291 : : }
2292 [ - + ]: 64 : if (fld_size < 0)
1938 heikki.linnakangas@i 2293 [ # # ]:UBC 0 : ereport(ERROR,
2294 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2295 : : errmsg("invalid field size")));
2296 : :
2297 : : /* reset attribute_buf to empty, and load raw data in it */
1938 heikki.linnakangas@i 2298 :CBC 64 : resetStringInfo(&cstate->attribute_buf);
2299 : :
2300 : 64 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2301 : 64 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2302 [ - + ]: 64 : fld_size) != fld_size)
1938 heikki.linnakangas@i 2303 [ # # ]:UBC 0 : ereport(ERROR,
2304 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2305 : : errmsg("unexpected EOF in COPY data")));
2306 : :
1938 heikki.linnakangas@i 2307 :CBC 64 : cstate->attribute_buf.len = fld_size;
2308 : 64 : cstate->attribute_buf.data[fld_size] = '\0';
2309 : :
2310 : : /* Call the column type's binary input converter */
2311 : 64 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2312 : : typioparam, typmod);
2313 : :
2314 : : /* Trouble if it didn't eat the whole buffer */
2315 [ + + ]: 64 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2316 [ + - ]: 1 : ereport(ERROR,
2317 : : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2318 : : errmsg("incorrect binary data format")));
2319 : :
2320 : 63 : *isnull = false;
2321 : 63 : return result;
2322 : : }
|