Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * copyfromparse.c
4 : : * Parse CSV/text/binary format for COPY FROM.
5 : : *
6 : : * This file contains routines to parse the text, CSV and binary input
7 : : * formats. The main entry point is NextCopyFrom(), which parses the
8 : : * next input line and returns it as Datums.
9 : : *
10 : : * In text/CSV mode, the parsing happens in multiple stages:
11 : : *
12 : : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : : * 1. 2. 3. 4.
14 : : *
15 : : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : : * places it into 'raw_buf'.
17 : : *
18 : : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : : * the data in 'raw_buf' from client to server encoding, placing the
20 : : * converted result in 'input_buf'.
21 : : *
22 : : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : : * It is responsible for finding the next newline marker, taking quote and
24 : : * escape characters into account according to the COPY options. The line
25 : : * is copied into 'line_buf', with quotes and escape characters still
26 : : * intact.
27 : : *
28 : : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : : * pointers to each field.
32 : : *
33 : : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : : * the data is valid in the current encoding.
38 : : *
39 : : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : : * data when it's passed to the receive function.
44 : : *
45 : : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : : * encountered so far.
49 : : *
50 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 : : * Portions Copyright (c) 1994, Regents of the University of California
52 : : *
53 : : *
54 : : * IDENTIFICATION
55 : : * src/backend/commands/copyfromparse.c
56 : : *
57 : : *-------------------------------------------------------------------------
58 : : */
59 : : #include "postgres.h"
60 : :
61 : : #include <ctype.h>
62 : : #include <unistd.h>
63 : : #include <sys/stat.h>
64 : :
65 : : #include "commands/copyapi.h"
66 : : #include "commands/copyfrom_internal.h"
67 : : #include "commands/progress.h"
68 : : #include "executor/executor.h"
69 : : #include "libpq/libpq.h"
70 : : #include "libpq/pqformat.h"
71 : : #include "mb/pg_wchar.h"
72 : : #include "miscadmin.h"
73 : : #include "pgstat.h"
74 : : #include "port/pg_bitutils.h"
75 : : #include "port/pg_bswap.h"
76 : : #include "port/simd.h"
77 : : #include "utils/builtins.h"
78 : : #include "utils/rel.h"
79 : : #include "utils/wait_event.h"
80 : :
/*
 * Octal-digit helpers.
 *
 * ISOCTAL(c) is nonzero when c is one of the characters '0'..'7'.
 * OCTVALUE(c) yields the numeric value of an octal digit character.
 * ISOCTAL evaluates its argument more than once, so avoid arguments with
 * side effects.
 */
#define ISOCTAL(c) (!(((c) < '0') || ((c) > '7')))
#define OCTVALUE(c) (-('0' - (c)))
83 : :
84 : : /*
85 : : * These macros centralize code used to process line_buf and input_buf buffers.
86 : : * They are macros because they often do continue/break control and to avoid
87 : : * function call overhead in tight COPY loops.
88 : : *
89 : : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
90 : : * prevent the continue/break processing from working. We end the "if (1)"
91 : : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
92 : : * any "else" in the calling code, and to avoid any compiler warnings about
93 : : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
94 : : */
95 : :
/*
 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
 *
 * This keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * When the position 'extralen' bytes past input_buf_ptr is at or beyond
 * copy_buf_len and hit_eof is not set, the macro rewinds input_buf_ptr to
 * prev_raw_ptr, sets need_data, and executes "continue".
 *
 * The expansion site must be inside a loop and must have input_buf_ptr,
 * copy_buf_len, hit_eof, prev_raw_ptr and need_data in scope; none of them
 * is declared here.  (Presumably the enclosing loop reloads the buffer when
 * it sees need_data -- confirm at the call sites.)
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
	{ \
		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
110 : :
/*
 * IF_NEED_REFILL_AND_EOF_BREAK(extralen)
 *
 * This consumes the remainder of the buffer and breaks.
 *
 * When the position 'extralen' bytes past input_buf_ptr is at or beyond
 * copy_buf_len and hit_eof is set, the macro sets result to true and
 * executes "break".  If 'extralen' is nonzero it first advances
 * input_buf_ptr to copy_buf_len, so the trailing partial sequence counts as
 * consumed.
 *
 * The expansion site must be inside a loop (or other construct where
 * "break" is meaningful) and must have input_buf_ptr, copy_buf_len, hit_eof
 * and result in scope; none of them is declared here.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
	{ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
		/* backslash just before EOF, treat as data char */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
124 : :
/*
 * REFILL_LINEBUF
 *
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * Appends the bytes of cstate->input_buf between cstate->input_buf_index
 * and the local input_buf_ptr to cstate->line_buf, then advances
 * cstate->input_buf_index to input_buf_ptr.  Does nothing when
 * input_buf_ptr has not moved past input_buf_index.
 *
 * The expansion site must have 'cstate' and 'input_buf_ptr' in scope.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (input_buf_ptr > cstate->input_buf_index) \
	{ \
		appendBinaryStringInfo(&cstate->line_buf, \
							   cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
140 : :
/*
 * Signature bytes expected at the start of binary-format COPY input;
 * ReceiveCopyBinaryHeader() compares the first 11 input bytes against it
 * with memcmp().
 *
 * The literal spells out all 11 bytes, including the final explicit \0, so
 * the array has no room for the implicit string terminator.  Treat it as a
 * fixed-length byte array, not as a C string.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
143 : :
144 : :
145 : : /* non-export function prototypes */
146 : : static bool CopyReadLine(CopyFromState cstate, bool is_csv);
147 : : static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
148 : : bool is_csv);
149 : : static int CopyReadAttributesText(CopyFromState cstate);
150 : : static int CopyReadAttributesCSV(CopyFromState cstate);
151 : : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
152 : : Oid typioparam, int32 typmod,
153 : : bool *isnull);
154 : : static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate,
155 : : ExprContext *econtext,
156 : : Datum *values,
157 : : bool *nulls,
158 : : bool is_csv);
159 : : static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate,
160 : : char ***fields,
161 : : int *nfields,
162 : : bool is_csv);
163 : :
164 : :
165 : : /* Low-level communications functions */
166 : : static int CopyGetData(CopyFromState cstate, void *databuf,
167 : : int minread, int maxread);
168 : : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
169 : : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
170 : : static void CopyLoadInputBuf(CopyFromState cstate);
171 : : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
172 : :
173 : : void
1989 heikki.linnakangas@i 174 :CBC 682 : ReceiveCopyBegin(CopyFromState cstate)
175 : : {
176 : : StringInfoData buf;
1888 177 : 682 : int natts = list_length(cstate->attnumlist);
50 andrew@dunslane.net 178 :GNC 682 : int16 format = (cstate->opts.format == COPY_FORMAT_BINARY ? 1 : 0);
179 : : int i;
180 : :
987 nathan@postgresql.or 181 :CBC 682 : pq_beginmessage(&buf, PqMsg_CopyInResponse);
1888 heikki.linnakangas@i 182 : 682 : pq_sendbyte(&buf, format); /* overall format */
183 : 682 : pq_sendint16(&buf, natts);
184 [ + + ]: 2447 : for (i = 0; i < natts; i++)
185 : 1765 : pq_sendint16(&buf, format); /* per-column formats */
186 : 682 : pq_endmessage(&buf);
187 : 682 : cstate->copy_src = COPY_FRONTEND;
188 : 682 : cstate->fe_msgbuf = makeStringInfo();
189 : : /* We *must* flush here to ensure FE knows it can send. */
1989 190 : 682 : pq_flush();
191 : 682 : }
192 : :
193 : : void
194 : 8 : ReceiveCopyBinaryHeader(CopyFromState cstate)
195 : : {
196 : : char readSig[11];
197 : : int32 tmp;
198 : :
199 : : /* Signature */
200 [ + - ]: 8 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
201 [ - + ]: 8 : memcmp(readSig, BinarySignature, 11) != 0)
1989 heikki.linnakangas@i 202 [ # # ]:UBC 0 : ereport(ERROR,
203 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
204 : : errmsg("COPY file signature not recognized")));
205 : : /* Flags field */
1989 heikki.linnakangas@i 206 [ - + ]:CBC 8 : if (!CopyGetInt32(cstate, &tmp))
1989 heikki.linnakangas@i 207 [ # # ]:UBC 0 : ereport(ERROR,
208 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 : : errmsg("invalid COPY file header (missing flags)")));
1989 heikki.linnakangas@i 210 [ - + ]:CBC 8 : if ((tmp & (1 << 16)) != 0)
1989 heikki.linnakangas@i 211 [ # # ]:UBC 0 : ereport(ERROR,
212 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
213 : : errmsg("invalid COPY file header (WITH OIDS)")));
1989 heikki.linnakangas@i 214 :CBC 8 : tmp &= ~(1 << 16);
215 [ - + ]: 8 : if ((tmp >> 16) != 0)
1989 heikki.linnakangas@i 216 [ # # ]:UBC 0 : ereport(ERROR,
217 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
218 : : errmsg("unrecognized critical flags in COPY file header")));
219 : : /* Header extension length */
1989 heikki.linnakangas@i 220 [ + - ]:CBC 8 : if (!CopyGetInt32(cstate, &tmp) ||
221 [ - + ]: 8 : tmp < 0)
1989 heikki.linnakangas@i 222 [ # # ]:UBC 0 : ereport(ERROR,
223 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
224 : : errmsg("invalid COPY file header (missing length)")));
225 : : /* Skip extension header, if present */
1989 heikki.linnakangas@i 226 [ - + ]:CBC 8 : while (tmp-- > 0)
227 : : {
1989 heikki.linnakangas@i 228 [ # # ]:UBC 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
229 [ # # ]: 0 : ereport(ERROR,
230 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
231 : : errmsg("invalid COPY file header (wrong length)")));
232 : : }
1989 heikki.linnakangas@i 233 :CBC 8 : }
234 : :
/*
 * CopyGetData reads data from the source (file or frontend)
 *
 * We attempt to read at least minread, and at most maxread, bytes from
 * the source.  The actual number of bytes read is returned; if this is
 * less than minread, EOF was detected.
 *
 * Note: when copying from the frontend, we expect a proper EOF mark per
 * protocol; if the frontend simply drops the connection, we raise error.
 * It seems unwise to allow the COPY IN to complete normally in that case.
 *
 * NB: no data conversion is applied here.
 *
 * Behavior by cstate->copy_src:
 *	COPY_FILE		one fread() of up to maxread bytes from copy_file;
 *					minread is not consulted.  A read returning zero bytes
 *					sets raw_reached_eof.
 *	COPY_FRONTEND	bytes are taken from fe_msgbuf, receiving further
 *					protocol messages as needed, until minread bytes have
 *					been gathered, maxread is exhausted, or CopyDone
 *					arrives.
 *	COPY_CALLBACK	the request is handed unchanged to data_source_cb.
 *
 * 'databuf' must have room for maxread bytes.
 */
static int
CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
{
	int			bytesread = 0;

	switch (cstate->copy_src)
	{
		case COPY_FILE:
			pgstat_report_wait_start(WAIT_EVENT_COPY_FROM_READ);
			bytesread = fread(databuf, 1, maxread, cstate->copy_file);
			pgstat_report_wait_end();
			/* a short count alone is not an error; ask the stream */
			if (ferror(cstate->copy_file))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read from COPY file: %m")));
			if (bytesread == 0)
				cstate->raw_reached_eof = true;
			break;
		case COPY_FRONTEND:
			while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
			{
				int			avail;

				/* Current message fully consumed?  Then fetch another. */
				while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
				{
					/* Try to receive another message */
					int			mtype;
					int			maxmsglen;

			readmessage:
					HOLD_CANCEL_INTERRUPTS();
					pq_startmsgread();
					mtype = pq_getbyte();
					if (mtype == EOF)
						ereport(ERROR,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("unexpected EOF on client connection with an open transaction")));
					/* Validate message type and set packet size limit */
					switch (mtype)
					{
						case PqMsg_CopyData:
							maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
							break;
						case PqMsg_CopyDone:
						case PqMsg_CopyFail:
						case PqMsg_Flush:
						case PqMsg_Sync:
							maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
							break;
						default:
							ereport(ERROR,
									(errcode(ERRCODE_PROTOCOL_VIOLATION),
									 errmsg("unexpected message type 0x%02X during COPY from stdin",
											mtype)));
							maxmsglen = 0;	/* keep compiler quiet */
							break;
					}
					/* Now collect the message body */
					if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
						ereport(ERROR,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("unexpected EOF on client connection with an open transaction")));
					RESUME_CANCEL_INTERRUPTS();
					/* ... and process it */
					switch (mtype)
					{
						case PqMsg_CopyData:
							/* payload is now in fe_msgbuf; copied out below */
							break;
						case PqMsg_CopyDone:
							/* COPY IN correctly terminated by frontend */
							cstate->raw_reached_eof = true;
							/* may be fewer than minread bytes */
							return bytesread;
						case PqMsg_CopyFail:
							ereport(ERROR,
									(errcode(ERRCODE_QUERY_CANCELED),
									 errmsg("COPY from stdin failed: %s",
											pq_getmsgstring(cstate->fe_msgbuf))));
							break;
						case PqMsg_Flush:
						case PqMsg_Sync:

							/*
							 * Ignore Flush/Sync for the convenience of client
							 * libraries (such as libpq) that may send those
							 * without noticing that the command they just
							 * sent was COPY.
							 */
							goto readmessage;
						default:
							Assert(false);	/* NOT REACHED */
					}
				}
				/* Hand over as much of the current message as still fits */
				avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
				if (avail > maxread)
					avail = maxread;
				pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
				/* advance the destination and shrink the remaining budget */
				databuf = (char *) databuf + avail;
				maxread -= avail;
				bytesread += avail;
			}
			break;
		case COPY_CALLBACK:
			bytesread = cstate->data_source_cb(databuf, minread, maxread);
			break;
	}

	return bytesread;
}
356 : :
357 : :
358 : : /*
359 : : * These functions do apply some data conversion
360 : : */
361 : :
362 : : /*
363 : : * CopyGetInt32 reads an int32 that appears in network byte order
364 : : *
365 : : * Returns true if OK, false if EOF
366 : : */
367 : : static inline bool
368 : 116 : CopyGetInt32(CopyFromState cstate, int32 *val)
369 : : {
370 : : uint32 buf;
371 : :
372 [ - + ]: 116 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
373 : : {
1989 heikki.linnakangas@i 374 :UBC 0 : *val = 0; /* suppress compiler warning */
375 : 0 : return false;
376 : : }
1989 heikki.linnakangas@i 377 :CBC 116 : *val = (int32) pg_ntoh32(buf);
378 : 116 : return true;
379 : : }
380 : :
381 : : /*
382 : : * CopyGetInt16 reads an int16 that appears in network byte order
383 : : */
384 : : static inline bool
385 : 25 : CopyGetInt16(CopyFromState cstate, int16 *val)
386 : : {
387 : : uint16 buf;
388 : :
389 [ - + ]: 25 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
390 : : {
1989 heikki.linnakangas@i 391 :UBC 0 : *val = 0; /* suppress compiler warning */
392 : 0 : return false;
393 : : }
1989 heikki.linnakangas@i 394 :CBC 25 : *val = (int16) pg_ntoh16(buf);
395 : 25 : return true;
396 : : }
397 : :
398 : :
399 : : /*
400 : : * Perform encoding conversion on data in 'raw_buf', writing the converted
401 : : * data into 'input_buf'.
402 : : *
403 : : * On entry, there must be some data to convert in 'raw_buf'.
404 : : */
405 : : static void
1860 406 : 434428 : CopyConvertBuf(CopyFromState cstate)
407 : : {
408 : : /*
409 : : * If the file and server encoding are the same, no encoding conversion is
410 : : * required. However, we still need to verify that the input is valid for
411 : : * the encoding.
412 : : */
413 [ + + ]: 434428 : if (!cstate->need_transcoding)
414 : : {
415 : : /*
416 : : * When conversion is not required, input_buf and raw_buf are the
417 : : * same. raw_buf_len is the total number of bytes in the buffer, and
418 : : * input_buf_len tracks how many of those bytes have already been
419 : : * verified.
420 : : */
421 : 434340 : int preverifiedlen = cstate->input_buf_len;
422 : 434340 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
423 : : int nverified;
424 : :
425 [ + + ]: 434340 : if (unverifiedlen == 0)
426 : : {
427 : : /*
428 : : * If no more raw data is coming, report the EOF to the caller.
429 : : */
430 [ + + ]: 218276 : if (cstate->raw_reached_eof)
431 : 1262 : cstate->input_reached_eof = true;
432 : 218276 : return;
433 : : }
434 : :
435 : : /*
436 : : * Verify the new data, including any residual unverified bytes from
437 : : * previous round.
438 : : */
439 : 216064 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
440 : 216064 : cstate->raw_buf + preverifiedlen,
441 : : unverifiedlen);
442 [ - + ]: 216064 : if (nverified == 0)
443 : : {
444 : : /*
445 : : * Could not verify anything.
446 : : *
447 : : * If there is no more raw input data coming, it means that there
448 : : * was an incomplete multi-byte sequence at the end. Also, if
449 : : * there's "enough" input left, we should be able to verify at
450 : : * least one character, and a failure to do so means that we've
451 : : * hit an invalid byte sequence.
452 : : */
1437 heikki.linnakangas@i 453 [ # # # # ]:UBC 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
1860 454 : 0 : cstate->input_reached_error = true;
455 : 0 : return;
456 : : }
1860 heikki.linnakangas@i 457 :CBC 216064 : cstate->input_buf_len += nverified;
458 : : }
459 : : else
460 : : {
461 : : /*
462 : : * Encoding conversion is needed.
463 : : */
464 : : int nbytes;
465 : : unsigned char *src;
466 : : int srclen;
467 : : unsigned char *dst;
468 : : int dstlen;
469 : : int convertedlen;
470 : :
471 [ + + ]: 88 : if (RAW_BUF_BYTES(cstate) == 0)
472 : : {
473 : : /*
474 : : * If no more raw data is coming, report the EOF to the caller.
475 : : */
476 [ + + ]: 56 : if (cstate->raw_reached_eof)
477 : 16 : cstate->input_reached_eof = true;
478 : 56 : return;
479 : : }
480 : :
481 : : /*
482 : : * First, copy down any unprocessed data.
483 : : */
484 : 32 : nbytes = INPUT_BUF_BYTES(cstate);
485 [ - + - - ]: 32 : if (nbytes > 0 && cstate->input_buf_index > 0)
1860 heikki.linnakangas@i 486 :UBC 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
487 : : nbytes);
1860 heikki.linnakangas@i 488 :CBC 32 : cstate->input_buf_index = 0;
489 : 32 : cstate->input_buf_len = nbytes;
490 : 32 : cstate->input_buf[nbytes] = '\0';
491 : :
492 : 32 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
493 : 32 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
494 : 32 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
495 : 32 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
496 : :
497 : : /*
498 : : * Do the conversion. This might stop short, if there is an invalid
499 : : * byte sequence in the input. We'll convert as much as we can in
500 : : * that case.
501 : : *
502 : : * Note: Even if we hit an invalid byte sequence, we don't report the
503 : : * error until all the valid bytes have been consumed. The input
504 : : * might contain an end-of-input marker (\.), and we don't want to
505 : : * report an error if the invalid byte sequence is after the
506 : : * end-of-input marker. We might unnecessarily convert some data
507 : : * after the end-of-input marker as long as it's valid for the
508 : : * encoding, but that's harmless.
509 : : */
510 : 32 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
511 : : cstate->file_encoding,
512 : : GetDatabaseEncoding(),
513 : : src, srclen,
514 : : dst, dstlen,
515 : : true);
516 [ + + ]: 32 : if (convertedlen == 0)
517 : : {
518 : : /*
519 : : * Could not convert anything. If there is no more raw input data
520 : : * coming, it means that there was an incomplete multi-byte
521 : : * sequence at the end. Also, if there is plenty of input left,
522 : : * we should be able to convert at least one character, so a
523 : : * failure to do so must mean that we've hit a byte sequence
524 : : * that's invalid.
525 : : */
526 [ + + - + ]: 16 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
527 : 8 : cstate->input_reached_error = true;
528 : 16 : return;
529 : : }
530 : 16 : cstate->raw_buf_index += convertedlen;
531 : 16 : cstate->input_buf_len += strlen((char *) dst);
532 : : }
533 : : }
534 : :
535 : : /*
536 : : * Report an encoding or conversion error.
537 : : */
538 : : static void
539 : 8 : CopyConversionError(CopyFromState cstate)
540 : : {
541 [ - + ]: 8 : Assert(cstate->raw_buf_len > 0);
542 [ - + ]: 8 : Assert(cstate->input_reached_error);
543 : :
544 [ - + ]: 8 : if (!cstate->need_transcoding)
545 : : {
546 : : /*
547 : : * Everything up to input_buf_len was successfully verified, and
548 : : * input_buf_len points to the invalid or incomplete character.
549 : : */
1860 heikki.linnakangas@i 550 :UBC 0 : report_invalid_encoding(cstate->file_encoding,
551 : 0 : cstate->raw_buf + cstate->input_buf_len,
552 : 0 : cstate->raw_buf_len - cstate->input_buf_len);
553 : : }
554 : : else
555 : : {
556 : : /*
557 : : * raw_buf_index points to the invalid or untranslatable character. We
558 : : * let the conversion routine report the error, because it can provide
559 : : * a more specific error message than we could here. An earlier call
560 : : * to the conversion routine in CopyConvertBuf() detected that there
561 : : * is an error, now we call the conversion routine again with
562 : : * noError=false, to have it throw the error.
563 : : */
564 : : unsigned char *src;
565 : : int srclen;
566 : : unsigned char *dst;
567 : : int dstlen;
568 : :
1860 heikki.linnakangas@i 569 :CBC 8 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
570 : 8 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
571 : 8 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
572 : 8 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
573 : :
574 : 8 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
575 : : cstate->file_encoding,
576 : : GetDatabaseEncoding(),
577 : : src, srclen,
578 : : dst, dstlen,
579 : : false);
580 : :
581 : : /*
582 : : * The conversion routine should have reported an error, so this
583 : : * should not be reached.
584 : : */
1860 heikki.linnakangas@i 585 [ # # ]:UBC 0 : elog(ERROR, "encoding conversion failed without error");
586 : : }
587 : : }
588 : :
589 : : /*
590 : : * Load more data from data source to raw_buf.
591 : : *
592 : : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
593 : : * beginning of the buffer, and we load new data after that.
594 : : */
595 : : static void
1989 heikki.linnakangas@i 596 :CBC 217082 : CopyLoadRawBuf(CopyFromState cstate)
597 : : {
598 : : int nbytes;
599 : : int inbytes;
600 : :
601 : : /*
602 : : * In text mode, if encoding conversion is not required, raw_buf and
603 : : * input_buf point to the same buffer. Their len/index better agree, too.
604 : : */
1860 605 [ + + ]: 217082 : if (cstate->raw_buf == cstate->input_buf)
606 : : {
607 [ - + ]: 217014 : Assert(!cstate->need_transcoding);
608 [ - + ]: 217014 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
609 [ - + ]: 217014 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
610 : : }
611 : :
612 : : /*
613 : : * Copy down the unprocessed data if any.
614 : : */
615 : 217082 : nbytes = RAW_BUF_BYTES(cstate);
616 [ + + + + ]: 217082 : if (nbytes > 0 && cstate->raw_buf_index > 0)
1989 heikki.linnakangas@i 617 :GBC 618 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
618 : : nbytes);
1860 heikki.linnakangas@i 619 :CBC 217082 : cstate->raw_buf_len -= cstate->raw_buf_index;
620 : 217082 : cstate->raw_buf_index = 0;
621 : :
622 : : /*
623 : : * If raw_buf and input_buf are in fact the same buffer, adjust the
624 : : * input_buf variables, too.
625 : : */
626 [ + + ]: 217082 : if (cstate->raw_buf == cstate->input_buf)
627 : : {
628 : 217014 : cstate->input_buf_len -= cstate->input_buf_index;
629 : 217014 : cstate->input_buf_index = 0;
630 : : }
631 : :
632 : : /* Load more data */
633 : 217082 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
634 : 217082 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
1989 635 : 217080 : nbytes += inbytes;
636 : 217080 : cstate->raw_buf[nbytes] = '\0';
637 : 217080 : cstate->raw_buf_len = nbytes;
638 : :
1916 639 : 217080 : cstate->bytes_processed += inbytes;
1945 tomas.vondra@postgre 640 : 217080 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
641 : :
1860 heikki.linnakangas@i 642 [ + + ]: 217080 : if (inbytes == 0)
643 : 979 : cstate->raw_reached_eof = true;
644 : 217080 : }
645 : :
646 : : /*
647 : : * CopyLoadInputBuf loads some more data into input_buf
648 : : *
649 : : * On return, at least one more input character is loaded into
650 : : * input_buf, or input_reached_eof is set.
651 : : *
652 : : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
653 : : * of the buffer and then we load more data after that.
654 : : */
655 : : static void
656 : 217368 : CopyLoadInputBuf(CopyFromState cstate)
657 : : {
658 : 217368 : int nbytes = INPUT_BUF_BYTES(cstate);
659 : :
660 : : /*
661 : : * The caller has updated input_buf_index to indicate how much of the
662 : : * input has been consumed and isn't needed anymore. If input_buf is the
663 : : * same physical area as raw_buf, update raw_buf_index accordingly.
664 : : */
665 [ + + ]: 217368 : if (cstate->raw_buf == cstate->input_buf)
666 : : {
667 [ - + ]: 217328 : Assert(!cstate->need_transcoding);
668 [ - + ]: 217328 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
669 : 217328 : cstate->raw_buf_index = cstate->input_buf_index;
670 : : }
671 : :
672 : : for (;;)
673 : : {
674 : : /* If we now have some unconverted data, try to convert it */
675 : 434428 : CopyConvertBuf(cstate);
676 : :
677 : : /* If we now have some more input bytes ready, return them */
678 [ + + ]: 434428 : if (INPUT_BUF_BYTES(cstate) > nbytes)
679 : 216080 : return;
680 : :
681 : : /*
682 : : * If we reached an invalid byte sequence, or we're at an incomplete
683 : : * multi-byte character but there is no more raw input data, report
684 : : * conversion error.
685 : : */
686 [ + + ]: 218348 : if (cstate->input_reached_error)
687 : 8 : CopyConversionError(cstate);
688 : :
689 : : /* no more input, and everything has been converted */
690 [ + + ]: 218340 : if (cstate->input_reached_eof)
691 : 1278 : break;
692 : :
693 : : /* Try to load more raw data */
694 [ - + ]: 217062 : Assert(!cstate->raw_reached_eof);
695 : 217062 : CopyLoadRawBuf(cstate);
696 : : }
697 : : }
698 : :
699 : : /*
700 : : * CopyReadBinaryData
701 : : *
702 : : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
703 : : * and writes them to 'dest'. Returns the number of bytes read (which
704 : : * would be less than 'nbytes' only if we reach EOF).
705 : : */
706 : : static int
1989 707 : 236 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
708 : : {
709 : 236 : int copied_bytes = 0;
710 : :
711 [ + + ]: 236 : if (RAW_BUF_BYTES(cstate) >= nbytes)
712 : : {
713 : : /* Enough bytes are present in the buffer. */
714 : 216 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
715 : 216 : cstate->raw_buf_index += nbytes;
716 : 216 : copied_bytes = nbytes;
717 : : }
718 : : else
719 : : {
720 : : /*
721 : : * Not enough bytes in the buffer, so must read from the file. Need
722 : : * to loop since 'nbytes' could be larger than the buffer size.
723 : : */
724 : : do
725 : : {
726 : : int copy_bytes;
727 : :
728 : : /* Load more data if buffer is empty. */
729 [ + - ]: 20 : if (RAW_BUF_BYTES(cstate) == 0)
730 : : {
1860 731 : 20 : CopyLoadRawBuf(cstate);
732 [ + + ]: 20 : if (cstate->raw_reached_eof)
1989 733 : 7 : break; /* EOF */
734 : : }
735 : :
736 : : /* Transfer some bytes. */
737 : 13 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
738 : 13 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
739 : 13 : cstate->raw_buf_index += copy_bytes;
740 : 13 : dest += copy_bytes;
741 : 13 : copied_bytes += copy_bytes;
742 [ - + ]: 13 : } while (copied_bytes < nbytes);
743 : : }
744 : :
745 : 236 : return copied_bytes;
746 : : }
747 : :
748 : : /*
749 : : * This function is exposed for use by extensions that read raw fields in the
750 : : * next line. See NextCopyFromRawFieldsInternal() for details.
751 : : */
752 : : bool
431 msawada@postgresql.o 753 :UBC 0 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
754 : : {
755 : 0 : return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
50 andrew@dunslane.net 756 :UNC 0 : cstate->opts.format == COPY_FORMAT_CSV);
757 : : }
758 : :
759 : : /*
760 : : * Workhorse for NextCopyFromRawFields().
761 : : *
762 : : * Read raw fields in the next line for COPY FROM in text or csv mode. Return
763 : : * false if no more lines.
764 : : *
765 : : * An internal temporary buffer is returned via 'fields'. It is valid until
766 : : * the next call of the function. Since the function returns all raw fields
767 : : * in the input file, 'nfields' could be different from the number of columns
768 : : * in the relation.
769 : : *
770 : : * NOTE: the force_not_null option is not applied to the returned fields.
771 : : *
772 : : * We use pg_attribute_always_inline to reduce function call overhead
773 : : * and to help compilers to optimize away the 'is_csv' condition when called
774 : : * by internal functions such as CopyFromTextLikeOneRow().
775 : : */
776 : : static pg_attribute_always_inline bool
431 msawada@postgresql.o 777 :CBC 859722 : NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
778 : : {
779 : : int fldct;
306 fujii@postgresql.org 780 :GNC 859722 : bool done = false;
781 : :
782 : : /* only available for text or csv input */
50 andrew@dunslane.net 783 [ + + - + ]: 859722 : Assert(cstate->opts.format == COPY_FORMAT_TEXT ||
784 : : cstate->opts.format == COPY_FORMAT_CSV);
785 : :
786 : : /* on input check that the header line is correct if needed */
306 fujii@postgresql.org 787 [ + + + + ]: 859722 : if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
788 : : {
789 : : ListCell *cur;
790 : : TupleDesc tupDesc;
791 : 93 : int lines_to_skip = cstate->opts.header_line;
792 : :
793 : : /* If set to "match", one header line is skipped */
794 [ + + ]: 93 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
795 : 50 : lines_to_skip = 1;
796 : :
1497 peter@eisentraut.org 797 :CBC 93 : tupDesc = RelationGetDescr(cstate->rel);
798 : :
306 fujii@postgresql.org 799 [ + + ]:GNC 218 : for (int i = 0; i < lines_to_skip; i++)
800 : : {
801 : 130 : cstate->cur_lineno++;
802 [ + + ]: 130 : if ((done = CopyReadLine(cstate, is_csv)))
803 : 5 : break;
804 : : }
805 : :
1497 peter@eisentraut.org 806 [ + + ]:CBC 93 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
807 : : {
808 : : int fldnum;
809 : :
431 msawada@postgresql.o 810 [ + + ]: 50 : if (is_csv)
811 michael@paquier.xyz 811 : 6 : fldct = CopyReadAttributesCSV(cstate);
812 : : else
813 : 44 : fldct = CopyReadAttributesText(cstate);
814 : :
1497 peter@eisentraut.org 815 [ + + ]: 50 : if (fldct != list_length(cstate->attnumlist))
816 [ + - ]: 16 : ereport(ERROR,
817 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
818 : : errmsg("wrong number of fields in header line: got %d, expected %d",
819 : : fldct, list_length(cstate->attnumlist))));
820 : :
821 : 34 : fldnum = 0;
822 [ + - + + : 104 : foreach(cur, cstate->attnumlist)
+ + ]
823 : : {
824 : 83 : int attnum = lfirst_int(cur);
825 : : char *colName;
826 : 83 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
827 : :
1412 michael@paquier.xyz 828 [ - + ]: 83 : Assert(fldnum < cstate->max_fields);
829 : :
830 : 83 : colName = cstate->raw_fields[fldnum++];
1497 peter@eisentraut.org 831 [ + + ]: 83 : if (colName == NULL)
832 [ + - ]: 4 : ereport(ERROR,
833 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
834 : : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
835 : : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
836 : :
1454 tgl@sss.pgh.pa.us 837 [ + + ]: 79 : if (namestrcmp(&attr->attname, colName) != 0)
838 : : {
1497 peter@eisentraut.org 839 [ + - ]: 9 : ereport(ERROR,
840 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
841 : : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
842 : : fldnum, colName, NameStr(attr->attname))));
843 : : }
844 : : }
845 : : }
846 : :
847 [ + + ]: 64 : if (done)
1497 peter@eisentraut.org 848 :GBC 5 : return false;
849 : : }
850 : :
1989 heikki.linnakangas@i 851 :CBC 859688 : cstate->cur_lineno++;
852 : :
853 : : /* Actually read the line into memory here */
431 msawada@postgresql.o 854 : 859688 : done = CopyReadLine(cstate, is_csv);
855 : :
856 : : /*
857 : : * EOF at start of line means we're done. If we see EOF after some
858 : : * characters, we act as though it was newline followed by EOF, ie,
859 : : * process the line and then exit loop on next iteration.
860 : : */
1989 heikki.linnakangas@i 861 [ + + + - ]: 859670 : if (done && cstate->line_buf.len == 0)
862 : 985 : return false;
863 : :
864 : : /* Parse the line into de-escaped field values */
431 msawada@postgresql.o 865 [ + + ]: 858685 : if (is_csv)
811 michael@paquier.xyz 866 : 312 : fldct = CopyReadAttributesCSV(cstate);
867 : : else
868 : 858373 : fldct = CopyReadAttributesText(cstate);
869 : :
1989 heikki.linnakangas@i 870 : 858677 : *fields = cstate->raw_fields;
871 : 858677 : *nfields = fldct;
872 : 858677 : return true;
873 : : }
874 : :
875 : : /*
876 : : * Read next tuple from file for COPY FROM. Return false if no more tuples.
877 : : *
878 : : * 'econtext' is used to evaluate default expression for each column that is
879 : : * either not read from the file or is using the DEFAULT option of COPY FROM.
880 : : * It can be NULL when no default values are used, i.e. when all columns are
881 : : * read from the file, and DEFAULT option is unset.
882 : : *
883 : : * 'values' and 'nulls' arrays must be the same length as columns of the
884 : : * relation passed to BeginCopyFrom. This function fills the arrays.
885 : : */
886 : : bool
887 : 859747 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
888 : : Datum *values, bool *nulls)
889 : : {
890 : : TupleDesc tupDesc;
891 : : AttrNumber num_phys_attrs,
892 : 859747 : num_defaults = cstate->num_defaults;
893 : : int i;
894 : 859747 : int *defmap = cstate->defmap;
895 : 859747 : ExprState **defexprs = cstate->defexprs;
896 : :
897 : 859747 : tupDesc = RelationGetDescr(cstate->rel);
898 : 859747 : num_phys_attrs = tupDesc->natts;
899 : :
900 : : /* Initialize all values for row to NULL */
901 [ + - + - : 3773196 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
+ - + - +
+ ]
902 [ + - + + : 859747 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
- + - - -
- ]
1013 drowley@postgresql.o 903 [ + - + + : 955819 : MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
+ - + - +
+ ]
904 : :
905 : : /* Get one row from source */
431 msawada@postgresql.o 906 [ + + ]: 859747 : if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
907 : 997 : return false;
908 : :
909 : : /*
910 : : * Now compute and insert any defaults available for the columns not
911 : : * provided by the input data. Anything not processed here or above will
912 : : * remain NULL.
913 : : */
914 [ + + ]: 898970 : for (i = 0; i < num_defaults; i++)
915 : : {
916 : : /*
917 : : * The caller must supply econtext and have switched into the
918 : : * per-tuple memory context in it.
919 : : */
920 [ - + ]: 40345 : Assert(econtext != NULL);
921 [ - + ]: 40345 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
922 : :
923 : 40345 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
924 : 40345 : &nulls[defmap[i]]);
925 : : }
926 : :
927 : 858625 : return true;
928 : : }
929 : :
930 : : /* Implementation of the per-row callback for text format */
931 : : bool
932 : 859252 : CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
933 : : bool *nulls)
934 : : {
935 : 859252 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
936 : : }
937 : :
938 : : /* Implementation of the per-row callback for CSV format */
939 : : bool
940 : 470 : CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
941 : : bool *nulls)
942 : : {
943 : 470 : return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
944 : : }
945 : :
946 : : /*
947 : : * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
948 : : *
949 : : * We use pg_attribute_always_inline to reduce function call overhead
950 : : * and to help compilers to optimize away the 'is_csv' condition.
951 : : */
952 : : static pg_attribute_always_inline bool
953 : 859722 : CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext,
954 : : Datum *values, bool *nulls, bool is_csv)
955 : : {
956 : : TupleDesc tupDesc;
957 : : AttrNumber attr_count;
958 : 859722 : FmgrInfo *in_functions = cstate->in_functions;
959 : 859722 : Oid *typioparams = cstate->typioparams;
960 : 859722 : ExprState **defexprs = cstate->defexprs;
961 : : char **field_strings;
962 : : ListCell *cur;
963 : : int fldct;
964 : : int fieldno;
965 : : char *string;
63 peter@eisentraut.org 966 :GNC 859722 : bool current_row_erroneous = false;
967 : :
431 msawada@postgresql.o 968 :CBC 859722 : tupDesc = RelationGetDescr(cstate->rel);
969 : 859722 : attr_count = list_length(cstate->attnumlist);
970 : :
971 : : /* read raw fields in the next line */
972 [ + + ]: 859722 : if (!NextCopyFromRawFieldsInternal(cstate, &field_strings, &fldct, is_csv))
973 : 990 : return false;
974 : :
975 : : /* check for overflowing fields */
976 [ + + + + ]: 858677 : if (attr_count > 0 && fldct > attr_count)
977 [ + - ]: 16 : ereport(ERROR,
978 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
979 : : errmsg("extra data after last expected column")));
980 : :
981 : 858661 : fieldno = 0;
982 : :
983 : : /* Loop to read the user attributes on the line. */
984 [ + + + + : 3680756 : foreach(cur, cstate->attnumlist)
+ + ]
985 : : {
986 : 2822230 : int attnum = lfirst_int(cur);
987 : 2822230 : int m = attnum - 1;
988 : 2822230 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
989 : :
990 [ + + ]: 2822230 : if (fieldno >= fldct)
1989 heikki.linnakangas@i 991 [ + - ]: 16 : ereport(ERROR,
992 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
993 : : errmsg("missing data for column \"%s\"",
994 : : NameStr(att->attname))));
431 msawada@postgresql.o 995 : 2822214 : string = field_strings[fieldno++];
996 : :
997 [ + + ]: 2822214 : if (cstate->convert_select_flags &&
998 [ + + ]: 10 : !cstate->convert_select_flags[m])
999 : : {
1000 : : /* ignore input field, leaving column as NULL */
1001 : 5 : continue;
1002 : : }
1003 : :
1004 [ + + ]: 2822209 : if (is_csv)
1005 : : {
1006 [ + + ]: 619 : if (string == NULL &&
1007 [ + + ]: 27 : cstate->opts.force_notnull_flags[m])
1008 : : {
1009 : : /*
1010 : : * FORCE_NOT_NULL option is set and column is NULL - convert
1011 : : * it to the NULL string.
1012 : : */
1013 : 18 : string = cstate->opts.null_print;
1014 : : }
1015 [ + + + + ]: 601 : else if (string != NULL && cstate->opts.force_null_flags[m]
1016 [ + + ]: 32 : && strcmp(string, cstate->opts.null_print) == 0)
1017 : : {
1018 : : /*
1019 : : * FORCE_NULL option is set and column matches the NULL
1020 : : * string. It must have been quoted, or otherwise the string
1021 : : * would already have been set to NULL. Convert it to NULL as
1022 : : * specified.
1023 : : */
1024 : 17 : string = NULL;
1025 : : }
1026 : : }
1027 : :
1028 : 2822209 : cstate->cur_attname = NameStr(att->attname);
1029 : 2822209 : cstate->cur_attval = string;
1030 : :
1031 [ + + ]: 2822209 : if (string != NULL)
1032 : 2819457 : nulls[m] = false;
1033 : :
1034 [ + + ]: 2822209 : if (cstate->defaults[m])
1035 : : {
1036 : : /* We must have switched into the per-tuple memory context */
1037 [ - + ]: 38 : Assert(econtext != NULL);
1038 [ - + ]: 38 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
1039 : :
1040 : 38 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1041 : : }
1042 : :
1043 : : /*
1044 : : * If ON_ERROR is specified, handle the different options
1045 : : */
1046 [ + + ]: 2822146 : else if (!InputFunctionCallSafe(&in_functions[m],
1047 : : string,
1048 : 2822171 : typioparams[m],
1049 : : att->atttypmod,
1050 : 2822171 : (Node *) cstate->escontext,
1051 : 2822171 : &values[m]))
1052 : : {
1053 [ - + ]: 112 : Assert(cstate->opts.on_error != COPY_ON_ERROR_STOP);
1054 : :
63 peter@eisentraut.org 1055 [ + + ]:GNC 112 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1056 : 82 : cstate->num_errors++;
1057 [ + - ]: 30 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1058 : : {
1059 : : /*
1060 : : * Reset error state so the subsequent InputFunctionCallSafe
1061 : : * call (for domain constraint check) can properly report
1062 : : * whether it succeeded or failed.
1063 : : */
1064 : 30 : cstate->escontext->error_occurred = false;
1065 : :
1066 [ - + ]: 30 : Assert(cstate->domain_with_constraint != NULL);
1067 : :
1068 : : /*
1069 : : * For constrained domains, we need an additional
1070 : : * InputFunctionCallSafe() to ensure that an error is thrown
1071 : : * if the domain constraint rejects null values.
1072 : : */
1073 [ + + + + ]: 50 : if (!cstate->domain_with_constraint[m] ||
1074 : 20 : InputFunctionCallSafe(&in_functions[m],
1075 : : NULL,
1076 : 20 : typioparams[m],
1077 : : att->atttypmod,
1078 : 20 : (Node *) cstate->escontext,
1079 : 20 : &values[m]))
1080 : : {
1081 : 18 : nulls[m] = true;
1082 : 18 : values[m] = (Datum) 0;
1083 : : }
1084 : : else
1085 [ + - ]: 12 : ereport(ERROR,
1086 : : errcode(ERRCODE_NOT_NULL_VIOLATION),
1087 : : errmsg("domain %s does not allow null values",
1088 : : format_type_be(typioparams[m])),
1089 : : errdetail("ON_ERROR SET_NULL cannot be applied because column \"%s\" (domain %s) does not accept null values.",
1090 : : cstate->cur_attname,
1091 : : format_type_be(typioparams[m])),
1092 : : errdatatype(typioparams[m]));
1093 : :
1094 : : /*
1095 : : * We count only the number of rows (not fields) where
1096 : : * ON_ERROR SET_NULL was applied.
1097 : : */
1098 [ + + ]: 18 : if (!current_row_erroneous)
1099 : : {
1100 : 14 : current_row_erroneous = true;
1101 : 14 : cstate->num_errors++;
1102 : : }
1103 : : }
1104 : :
431 msawada@postgresql.o 1105 [ + + ]:CBC 100 : if (cstate->opts.log_verbosity == COPY_LOG_VERBOSITY_VERBOSE)
1106 : : {
1107 : : /*
1108 : : * Since we emit line number and column info in the below
1109 : : * notice message, we suppress error context information other
1110 : : * than the relation name.
1111 : : */
1112 [ - + ]: 44 : Assert(!cstate->relname_only);
1113 : 44 : cstate->relname_only = true;
1114 : :
1115 [ + + ]: 44 : if (cstate->cur_attval)
1116 : : {
1117 : : char *attval;
1118 : :
1119 : 40 : attval = CopyLimitPrintoutLength(cstate->cur_attval);
1120 : :
63 peter@eisentraut.org 1121 [ + + ]:GNC 40 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1122 [ + - ]: 24 : ereport(NOTICE,
1123 : : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1124 : : cstate->cur_lineno,
1125 : : cstate->cur_attname,
1126 : : attval));
1127 [ + - ]: 16 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1128 [ + - ]: 16 : ereport(NOTICE,
1129 : : errmsg("setting to null due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1130 : : cstate->cur_lineno,
1131 : : cstate->cur_attname,
1132 : : attval));
431 msawada@postgresql.o 1133 :CBC 40 : pfree(attval);
1134 : : }
1135 : : else
1136 : : {
63 peter@eisentraut.org 1137 [ + - ]:GNC 4 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1138 [ + - ]: 4 : ereport(NOTICE,
1139 : : errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1140 : : cstate->cur_lineno,
1141 : : cstate->cur_attname));
1142 : : }
1143 : : /* reset relname_only */
431 msawada@postgresql.o 1144 :CBC 44 : cstate->relname_only = false;
1145 : : }
1146 : :
63 peter@eisentraut.org 1147 [ + + ]:GNC 100 : if (cstate->opts.on_error == COPY_ON_ERROR_IGNORE)
1148 : 82 : return true;
1149 [ + - ]: 18 : else if (cstate->opts.on_error == COPY_ON_ERROR_SET_NULL)
1150 : 18 : continue;
1151 : : }
1152 : :
431 msawada@postgresql.o 1153 :CBC 2822072 : cstate->cur_attname = NULL;
1154 : 2822072 : cstate->cur_attval = NULL;
1155 : : }
1156 : :
1157 [ - + ]: 858526 : Assert(fieldno == attr_count);
1158 : :
1159 : 858526 : return true;
1160 : : }
1161 : :
1162 : : /* Implementation of the per-row callback for binary format */
1163 : : bool
1164 : 25 : CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values,
1165 : : bool *nulls)
1166 : : {
1167 : : TupleDesc tupDesc;
1168 : : AttrNumber attr_count;
1169 : 25 : FmgrInfo *in_functions = cstate->in_functions;
1170 : 25 : Oid *typioparams = cstate->typioparams;
1171 : : int16 fld_count;
1172 : : ListCell *cur;
1173 : :
1174 : 25 : tupDesc = RelationGetDescr(cstate->rel);
1175 : 25 : attr_count = list_length(cstate->attnumlist);
1176 : :
1177 : 25 : cstate->cur_lineno++;
1178 : :
1179 [ - + ]: 25 : if (!CopyGetInt16(cstate, &fld_count))
1180 : : {
1181 : : /* EOF detected (end of file, or protocol-level EOF) */
431 msawada@postgresql.o 1182 :UBC 0 : return false;
1183 : : }
1184 : :
431 msawada@postgresql.o 1185 [ + + ]:CBC 25 : if (fld_count == -1)
1186 : : {
1187 : : /*
1188 : : * Received EOF marker. Wait for the protocol-level EOF, and complain
1189 : : * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1190 : : * that we correctly handle CopyFail, if client chooses to send that
1191 : : * now. When copying from file, we could ignore the rest of the file
1192 : : * like in text mode, but we choose to be consistent with the COPY
1193 : : * FROM STDIN case.
1194 : : */
1195 : : char dummy;
1196 : :
1197 [ - + ]: 7 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
431 msawada@postgresql.o 1198 [ # # ]:UBC 0 : ereport(ERROR,
1199 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1200 : : errmsg("received copy data after EOF marker")));
431 msawada@postgresql.o 1201 :CBC 7 : return false;
1202 : : }
1203 : :
1204 [ - + ]: 18 : if (fld_count != attr_count)
431 msawada@postgresql.o 1205 [ # # ]:UBC 0 : ereport(ERROR,
1206 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1207 : : errmsg("row field count is %d, expected %d",
1208 : : fld_count, attr_count)));
1209 : :
431 msawada@postgresql.o 1210 [ + - + + :CBC 117 : foreach(cur, cstate->attnumlist)
+ + ]
1211 : : {
1212 : 100 : int attnum = lfirst_int(cur);
1213 : 100 : int m = attnum - 1;
1214 : 100 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1215 : :
1216 : 100 : cstate->cur_attname = NameStr(att->attname);
1217 : 199 : values[m] = CopyReadBinaryAttribute(cstate,
1218 : 100 : &in_functions[m],
1219 : 100 : typioparams[m],
1220 : : att->atttypmod,
1221 : : &nulls[m]);
1222 : 99 : cstate->cur_attname = NULL;
1223 : : }
1224 : :
1989 heikki.linnakangas@i 1225 : 17 : return true;
1226 : : }
1227 : :
1228 : : /*
1229 : : * Read the next input line and stash it in line_buf.
1230 : : *
1231 : : * Result is true if read was terminated by EOF, false if terminated
1232 : : * by newline. The terminating newline or EOF marker is not included
1233 : : * in the final value of line_buf.
1234 : : */
1235 : : static bool
431 msawada@postgresql.o 1236 : 859818 : CopyReadLine(CopyFromState cstate, bool is_csv)
1237 : : {
1238 : : bool result;
1239 : :
1989 heikki.linnakangas@i 1240 : 859818 : resetStringInfo(&cstate->line_buf);
1860 1241 : 859818 : cstate->line_buf_valid = false;
1242 : :
1243 : : /*
1244 : : * Parse data and transfer into line_buf.
1245 : : *
1246 : : * Because this is performance critical, we inline CopyReadLineText() and
1247 : : * pass the boolean parameters as constants to allow the compiler to emit
1248 : : * specialized code with fewer branches.
1249 : : */
74 nathan@postgresql.or 1250 [ + + ]:GNC 859818 : if (is_csv)
1251 : 546 : result = CopyReadLineText(cstate, true);
1252 : : else
1253 : 859272 : result = CopyReadLineText(cstate, false);
1254 : :
1989 heikki.linnakangas@i 1255 [ + + ]:CBC 859800 : if (result)
1256 : : {
1257 : : /*
1258 : : * Reached EOF. In protocol version 3, we should ignore anything
1259 : : * after \. up to the protocol end of copy data. (XXX maybe better
1260 : : * not to treat \. as special?)
1261 : : */
1888 1262 [ + + ]: 990 : if (cstate->copy_src == COPY_FRONTEND)
1263 : : {
1264 : : int inbytes;
1265 : :
1266 : : do
1267 : : {
1860 1268 : 524 : inbytes = CopyGetData(cstate, cstate->input_buf,
1269 : : 1, INPUT_BUF_SIZE);
1270 [ - + ]: 524 : } while (inbytes > 0);
1271 : 524 : cstate->input_buf_index = 0;
1272 : 524 : cstate->input_buf_len = 0;
1273 : 524 : cstate->raw_buf_index = 0;
1274 : 524 : cstate->raw_buf_len = 0;
1275 : : }
1276 : : }
1277 : : else
1278 : : {
1279 : : /*
1280 : : * If we didn't hit EOF, then we must have transferred the EOL marker
1281 : : * to line_buf along with the data. Get rid of it.
1282 : : */
1989 1283 [ + - - - : 858810 : switch (cstate->eol_type)
- ]
1284 : : {
1285 : 858810 : case EOL_NL:
1286 [ - + ]: 858810 : Assert(cstate->line_buf.len >= 1);
1287 [ - + ]: 858810 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1288 : 858810 : cstate->line_buf.len--;
1289 : 858810 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1290 : 858810 : break;
1989 heikki.linnakangas@i 1291 :UBC 0 : case EOL_CR:
1292 [ # # ]: 0 : Assert(cstate->line_buf.len >= 1);
1293 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1294 : 0 : cstate->line_buf.len--;
1295 : 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1296 : 0 : break;
1297 : 0 : case EOL_CRNL:
1298 [ # # ]: 0 : Assert(cstate->line_buf.len >= 2);
1299 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1300 [ # # ]: 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1301 : 0 : cstate->line_buf.len -= 2;
1302 : 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1303 : 0 : break;
1304 : 0 : case EOL_UNKNOWN:
1305 : : /* shouldn't get here */
1306 : 0 : Assert(false);
1307 : : break;
1308 : : }
1309 : : }
1310 : :
1311 : : /* Now it's safe to use the buffer in error messages */
1860 heikki.linnakangas@i 1312 :CBC 859800 : cstate->line_buf_valid = true;
1313 : :
1989 1314 : 859800 : return result;
1315 : : }
1316 : :
1317 : : #ifndef USE_NO_SIMD
1318 : : /*
1319 : : * Helper function for CopyReadLineText() that uses SIMD instructions to scan
1320 : : * the input buffer for special characters. This can be much faster.
1321 : : *
1322 : : * Note that we disable SIMD for the remainder of the COPY FROM command upon
1323 : : * encountering a special character (except for end-of-line characters) or a
1324 : : * short line. This is perhaps too conservative, but it should help avoid
1325 : : * regressions. It could probably be made more lenient in the future via
1326 : : * fine-tuned heuristics.
1327 : : */
1328 : : static bool
53 nathan@postgresql.or 1329 :GNC 337277 : CopyReadLineTextSIMDHelper(CopyFromState cstate, bool is_csv,
1330 : : bool *hit_eof_p, int *input_buf_ptr_p)
1331 : : {
1332 : : char *copy_input_buf;
1333 : : int input_buf_ptr;
1334 : : int copy_buf_len;
1335 : : bool unique_esc_char; /* for csv, do quote/esc chars differ? */
1336 : 337277 : bool first = true;
1337 : 337277 : bool result = false;
1338 : 337277 : const Vector8 nl_vec = vector8_broadcast('\n');
1339 : 337277 : const Vector8 cr_vec = vector8_broadcast('\r');
1340 : : Vector8 bs_or_quote_vec; /* '\' for text, quote for csv */
1341 : : Vector8 esc_vec; /* only for csv */
1342 : :
1343 [ + + ]: 337277 : if (is_csv)
1344 : : {
1345 : 392 : char quote = cstate->opts.quote[0];
1346 : 392 : char esc = cstate->opts.escape[0];
1347 : :
1348 : 392 : bs_or_quote_vec = vector8_broadcast(quote);
1349 : 392 : esc_vec = vector8_broadcast(esc);
1350 : 392 : unique_esc_char = (quote != esc);
1351 : : }
1352 : : else
1353 : : {
1354 : 336885 : bs_or_quote_vec = vector8_broadcast('\\');
1355 : 336885 : unique_esc_char = false;
1356 : : }
1357 : :
1358 : : /*
1359 : : * For a little extra speed within the loop, we copy some state members
1360 : : * into local variables. Note that we need to use a separate local
1361 : : * variable for input_buf_ptr so that the REFILL_LINEBUF macro works. We
1362 : : * copy its value into the input_buf_ptr_p argument before returning.
1363 : : */
1364 : 337277 : copy_input_buf = cstate->input_buf;
1365 : 337277 : input_buf_ptr = cstate->input_buf_index;
1366 : 337277 : copy_buf_len = cstate->input_buf_len;
1367 : :
1368 : : /*
1369 : : * See the corresponding loop in CopyReadLineText() for more information
1370 : : * about the purpose of this loop. This one does the same thing using
1371 : : * SIMD instructions, although we are quick to bail out to the scalar path
1372 : : * if we encounter a special character.
1373 : : */
1374 : : for (;;)
1375 : 402538 : {
1376 : : Vector8 chunk;
1377 : : Vector8 match;
1378 : :
1379 : : /* Load more data if needed. */
1380 [ + + ]: 739815 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1381 : : {
1382 [ + + ]: 216810 : REFILL_LINEBUF;
1383 : :
1384 : 216810 : CopyLoadInputBuf(cstate);
1385 : : /* update our local variables */
1386 : 216800 : *hit_eof_p = cstate->input_reached_eof;
1387 : 216800 : input_buf_ptr = cstate->input_buf_index;
1388 : 216800 : copy_buf_len = cstate->input_buf_len;
1389 : :
1390 : : /*
1391 : : * If we are completely out of data, break out of the loop,
1392 : : * reporting EOF.
1393 : : */
1394 [ + + ]: 216800 : if (INPUT_BUF_BYTES(cstate) <= 0)
1395 : : {
1396 : 590 : result = true;
1397 : 590 : break;
1398 : : }
1399 : : }
1400 : :
1401 : : /*
1402 : : * If we still don't have enough data for the SIMD path, fall back to
1403 : : * the scalar code. Note that this doesn't necessarily mean we
1404 : : * encountered a short line, so we leave cstate->simd_enabled set to
1405 : : * true.
1406 : : */
1407 [ + + ]: 739215 : if (copy_buf_len - input_buf_ptr < sizeof(Vector8))
1408 : 215322 : break;
1409 : :
1410 : : /*
1411 : : * If we made it here, we have at least enough data to fit in a
1412 : : * Vector8, so we can use SIMD instructions to scan for special
1413 : : * characters.
1414 : : */
1415 : 523893 : vector8_load(&chunk, (const uint8 *) ©_input_buf[input_buf_ptr]);
1416 : :
1417 : : /*
1418 : : * Check for \n, \r, \\ (for text), quotes (for csv), and escapes (for
1419 : : * csv, if different from quotes).
1420 : : */
1421 : 523893 : match = vector8_eq(chunk, nl_vec);
1422 : 523893 : match = vector8_or(match, vector8_eq(chunk, cr_vec));
1423 : 523893 : match = vector8_or(match, vector8_eq(chunk, bs_or_quote_vec));
1424 [ + + ]: 523893 : if (unique_esc_char)
1425 : 21 : match = vector8_or(match, vector8_eq(chunk, esc_vec));
1426 : :
1427 : : /*
1428 : : * If we found a special character, advance to it and hand off to the
1429 : : * scalar path. Except for end-of-line characters, we also disable
1430 : : * SIMD processing for the remainder of the COPY FROM command.
1431 : : */
1432 [ + + ]: 523893 : if (vector8_is_highbit_set(match))
1433 : : {
1434 : : uint32 mask;
1435 : : char c;
1436 : :
1437 : 121355 : mask = vector8_highbit_mask(match);
1438 : 121355 : input_buf_ptr += pg_rightmost_one_pos32(mask);
1439 : :
1440 : : /*
1441 : : * Don't disable SIMD if we found \n or \r, else we'd stop using
1442 : : * SIMD instructions after the first line. As an exception, we do
1443 : : * disable it if this is the first vector we processed, as that
1444 : : * means the line is too short for SIMD.
1445 : : */
1446 : 121355 : c = copy_input_buf[input_buf_ptr];
1447 [ + + + + : 121355 : if (first || (c != '\n' && c != '\r'))
+ - ]
1448 : 390 : cstate->simd_enabled = false;
1449 : :
1450 : 121355 : break;
1451 : : }
1452 : :
1453 : : /* That chunk was clear of special characters, so we can skip it. */
1454 : 402538 : input_buf_ptr += sizeof(Vector8);
1455 : 402538 : first = false;
1456 : : }
1457 : :
1458 : 337267 : *input_buf_ptr_p = input_buf_ptr;
1459 : 337267 : return result;
1460 : : }
1461 : : #endif /* ! USE_NO_SIMD */
1462 : :
1463 : : /*
1464 : : * CopyReadLineText - inner loop of CopyReadLine for text mode
1465 : : */
1466 : : static pg_attribute_always_inline bool
431 msawada@postgresql.o 1467 :CBC 859818 : CopyReadLineText(CopyFromState cstate, bool is_csv)
1468 : : {
1469 : : char *copy_input_buf;
1470 : : int input_buf_ptr;
1471 : : int copy_buf_len;
1989 heikki.linnakangas@i 1472 : 859818 : bool need_data = false;
1473 : 859818 : bool hit_eof = false;
1474 : 859818 : bool result = false;
1475 : :
1476 : : /* CSV variables */
1477 : 859818 : bool in_quote = false,
1478 : 859818 : last_was_esc = false;
1479 : 859818 : char quotec = '\0';
1480 : 859818 : char escapec = '\0';
1481 : :
431 msawada@postgresql.o 1482 [ + + ]: 859818 : if (is_csv)
1483 : : {
1989 heikki.linnakangas@i 1484 : 546 : quotec = cstate->opts.quote[0];
1485 : 546 : escapec = cstate->opts.escape[0];
1486 : : /* ignore special escape processing if it's the same as quotec */
1487 [ + + ]: 546 : if (quotec == escapec)
1488 : 438 : escapec = '\0';
1489 : : }
1490 : :
1491 : : /*
1492 : : * The objective of this loop is to transfer the entire next input line
1493 : : * into line_buf. Hence, we only care for detecting newlines (\r and/or
1494 : : * \n) and the end-of-copy marker (\.).
1495 : : *
1496 : : * In CSV mode, \r and \n inside a quoted field are just part of the data
1497 : : * value and are put in line_buf. We keep just enough state to know if we
1498 : : * are currently in a quoted field or not.
1499 : : *
1500 : : * The input has already been converted to the database encoding. All
1501 : : * supported server encodings have the property that all bytes in a
1502 : : * multi-byte sequence have the high bit set, so a multibyte character
1503 : : * cannot contain any newline or escape characters embedded in the
1504 : : * multibyte sequence. Therefore, we can process the input byte-by-byte,
1505 : : * regardless of the encoding.
1506 : : *
1507 : : * For speed, we try to move data from input_buf to line_buf in chunks
1508 : : * rather than one character at a time. input_buf_ptr points to the next
1509 : : * character to examine; any characters from input_buf_index to
1510 : : * input_buf_ptr have been determined to be part of the line, but not yet
1511 : : * transferred to line_buf.
1512 : : *
1513 : : * For a little extra speed within the loop, we copy some state
1514 : : * information into local variables. input_buf_ptr could be changed in
1515 : : * the SIMD path, so we must set that one before it. The others are set
1516 : : * afterwards.
1517 : : */
1860 1518 : 859818 : input_buf_ptr = cstate->input_buf_index;
1519 : :
1520 : : /*
1521 : : * We first try to use SIMD for the task described above, falling back to
1522 : : * the scalar path (i.e., the loop below) if needed.
1523 : : */
1524 : : #ifndef USE_NO_SIMD
53 nathan@postgresql.or 1525 [ + + ]:GNC 859818 : if (cstate->simd_enabled)
1526 : : {
1527 : : /*
1528 : : * Using temporary variables seems to encourage the compiler to keep
1529 : : * them in a register, which is beneficial for performance.
1530 : : */
1531 : 337277 : bool tmp_hit_eof = false;
1532 : 337277 : int tmp_input_buf_ptr = 0; /* silence compiler warning */
1533 : :
1534 : 337277 : result = CopyReadLineTextSIMDHelper(cstate, is_csv, &tmp_hit_eof,
1535 : : &tmp_input_buf_ptr);
1536 : 337267 : hit_eof = tmp_hit_eof;
1537 : 337267 : input_buf_ptr = tmp_input_buf_ptr;
1538 : :
1539 [ + + ]: 337267 : if (result)
1540 : : {
1541 : : /* Transfer any still-uncopied data to line_buf. */
1542 [ - + ]: 590 : REFILL_LINEBUF;
1543 : :
1544 : 590 : return result;
1545 : : }
1546 : : }
1547 : : #endif /* ! USE_NO_SIMD */
1548 : :
1549 : 859218 : copy_input_buf = cstate->input_buf;
1860 heikki.linnakangas@i 1550 :CBC 859218 : copy_buf_len = cstate->input_buf_len;
1551 : :
1552 : : for (;;)
1989 1553 : 9201696 : {
1554 : : int prev_raw_ptr;
1555 : : char c;
1556 : :
1557 : : /*
1558 : : * Load more data if needed.
1559 : : *
1560 : : * TODO: We could just force four bytes of read-ahead and avoid the
1561 : : * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1562 : : * unsafe with the old v2 COPY protocol, but we don't support that
1563 : : * anymore.
1564 : : */
1860 1565 [ + + - + ]: 10060914 : if (input_buf_ptr >= copy_buf_len || need_data)
1566 : : {
1989 1567 [ + + ]: 558 : REFILL_LINEBUF;
1568 : :
1860 1569 : 558 : CopyLoadInputBuf(cstate);
1570 : : /* update our local variables */
1571 : 558 : hit_eof = cstate->input_reached_eof;
1572 : 558 : input_buf_ptr = cstate->input_buf_index;
1573 : 558 : copy_buf_len = cstate->input_buf_len;
1574 : :
1575 : : /*
1576 : : * If we are completely out of data, break out of the loop,
1577 : : * reporting EOF.
1578 : : */
1579 [ + + ]: 558 : if (INPUT_BUF_BYTES(cstate) <= 0)
1580 : : {
1989 1581 : 353 : result = true;
1582 : 353 : break;
1583 : : }
1584 : 205 : need_data = false;
1585 : : }
1586 : :
1587 : : /* OK to fetch a character */
1860 1588 : 10060561 : prev_raw_ptr = input_buf_ptr;
1589 : 10060561 : c = copy_input_buf[input_buf_ptr++];
1590 : :
431 msawada@postgresql.o 1591 [ + + ]: 10060561 : if (is_csv)
1592 : : {
1593 : : /*
1594 : : * If character is '\r', we may need to look ahead below. Force
1595 : : * fetch of the next character if we don't already have it. We
1596 : : * need to do this before changing CSV state, in case '\r' is also
1597 : : * the quote or escape character.
1598 : : */
582 tgl@sss.pgh.pa.us 1599 [ + + ]: 2615 : if (c == '\r')
1600 : : {
1989 heikki.linnakangas@i 1601 [ - + - - ]: 24 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1602 : : }
1603 : :
1604 : : /*
1605 : : * Dealing with quotes and escapes here is mildly tricky. If the
1606 : : * quote char is also the escape char, there's no problem - we
1607 : : * just use the char as a toggle. If they are different, we need
1608 : : * to ensure that we only take account of an escape inside a
1609 : : * quoted field and immediately preceding a quote char, and not
1610 : : * the second in an escape-escape sequence.
1611 : : */
1612 [ + + + + ]: 2615 : if (in_quote && c == escapec)
1613 : 32 : last_was_esc = !last_was_esc;
1614 [ + + + - ]: 2615 : if (c == quotec && !last_was_esc)
1615 : 308 : in_quote = !in_quote;
1616 [ + + ]: 2615 : if (c != escapec)
1617 : 2579 : last_was_esc = false;
1618 : :
1619 : : /*
1620 : : * Updating the line count for embedded CR and/or LF chars is
1621 : : * necessarily a little fragile - this test is probably about the
1622 : : * best we can do. (XXX it's arguable whether we should do this
1623 : : * at all --- is cur_lineno a physical or logical count?)
1624 : : */
1625 [ + + + + : 2615 : if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
+ + ]
1626 : 24 : cstate->cur_lineno++;
1627 : : }
1628 : :
1629 : : /* Process \r */
431 msawada@postgresql.o 1630 [ + + + - : 10060561 : if (c == '\r' && (!is_csv || !in_quote))
- + ]
1631 : : {
1632 : : /* Check for \r\n on first line, _and_ handle \r\n. */
1989 heikki.linnakangas@i 1633 [ # # ]:UBC 0 : if (cstate->eol_type == EOL_UNKNOWN ||
1634 [ # # ]: 0 : cstate->eol_type == EOL_CRNL)
1635 : : {
1636 : : /*
1637 : : * If need more data, go back to loop top to load it.
1638 : : *
1639 : : * Note that if we are at EOF, c will wind up as '\0' because
1640 : : * of the guaranteed pad of input_buf.
1641 : : */
1642 [ # # # # ]: 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1643 : :
1644 : : /* get next char */
1860 1645 : 0 : c = copy_input_buf[input_buf_ptr];
1646 : :
1989 1647 [ # # ]: 0 : if (c == '\n')
1648 : : {
1860 1649 : 0 : input_buf_ptr++; /* eat newline */
1989 1650 : 0 : cstate->eol_type = EOL_CRNL; /* in case not set yet */
1651 : : }
1652 : : else
1653 : : {
1654 : : /* found \r, but no \n */
1655 [ # # ]: 0 : if (cstate->eol_type == EOL_CRNL)
1656 [ # # # # : 0 : ereport(ERROR,
# # ]
1657 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1658 : : !is_csv ?
1659 : : errmsg("literal carriage return found in data") :
1660 : : errmsg("unquoted carriage return found in data"),
1661 : : !is_csv ?
1662 : : errhint("Use \"\\r\" to represent carriage return.") :
1663 : : errhint("Use quoted CSV field to represent carriage return.")));
1664 : :
1665 : : /*
1666 : : * if we got here, it is the first line and we didn't find
1667 : : * \n, so don't consume the peeked character
1668 : : */
1669 : 0 : cstate->eol_type = EOL_CR;
1670 : : }
1671 : : }
1672 [ # # ]: 0 : else if (cstate->eol_type == EOL_NL)
1673 [ # # # # : 0 : ereport(ERROR,
# # ]
1674 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1675 : : !is_csv ?
1676 : : errmsg("literal carriage return found in data") :
1677 : : errmsg("unquoted carriage return found in data"),
1678 : : !is_csv ?
1679 : : errhint("Use \"\\r\" to represent carriage return.") :
1680 : : errhint("Use quoted CSV field to represent carriage return.")));
1681 : : /* If reach here, we have found the line terminator */
1682 : 0 : break;
1683 : : }
1684 : :
1685 : : /* Process \n */
431 msawada@postgresql.o 1686 [ + + + + :CBC 10060561 : if (c == '\n' && (!is_csv || !in_quote))
+ + ]
1687 : : {
1989 heikki.linnakangas@i 1688 [ + - - + ]: 858810 : if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1989 heikki.linnakangas@i 1689 [ # # # # :UBC 0 : ereport(ERROR,
# # ]
1690 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1691 : : !is_csv ?
1692 : : errmsg("literal newline found in data") :
1693 : : errmsg("unquoted newline found in data"),
1694 : : !is_csv ?
1695 : : errhint("Use \"\\n\" to represent newline.") :
1696 : : errhint("Use quoted CSV field to represent newline.")));
1989 heikki.linnakangas@i 1697 :CBC 858810 : cstate->eol_type = EOL_NL; /* in case not set yet */
1698 : : /* If reach here, we have found the line terminator */
1699 : 858810 : break;
1700 : : }
1701 : :
1702 : : /*
1703 : : * Process backslash, except in CSV mode where backslash is a normal
1704 : : * character.
1705 : : */
431 msawada@postgresql.o 1706 [ + + + + ]: 9201751 : if (c == '\\' && !is_csv)
1707 : : {
1708 : : char c2;
1709 : :
1989 heikki.linnakangas@i 1710 [ - + - - ]: 4904 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1711 [ - + - - ]: 4904 : IF_NEED_REFILL_AND_EOF_BREAK(0);
1712 : :
1713 : : /* -----
1714 : : * get next character
1715 : : * Note: we do not change c so if it isn't \., we can fall
1716 : : * through and continue processing.
1717 : : * -----
1718 : : */
1860 1719 : 4904 : c2 = copy_input_buf[input_buf_ptr];
1720 : :
1989 1721 [ + + ]: 4904 : if (c2 == '.')
1722 : : {
1860 1723 : 55 : input_buf_ptr++; /* consume the '.' */
1989 1724 [ - + ]: 55 : if (cstate->eol_type == EOL_CRNL)
1725 : : {
1726 : : /* Get the next character */
1989 heikki.linnakangas@i 1727 [ # # # # ]:UBC 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1728 : : /* if hit_eof, c2 will become '\0' */
1860 1729 : 0 : c2 = copy_input_buf[input_buf_ptr++];
1730 : :
1989 1731 [ # # ]: 0 : if (c2 == '\n')
582 tgl@sss.pgh.pa.us 1732 [ # # ]: 0 : ereport(ERROR,
1733 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1734 : : errmsg("end-of-copy marker does not match previous newline style")));
1989 heikki.linnakangas@i 1735 [ # # ]: 0 : else if (c2 != '\r')
582 tgl@sss.pgh.pa.us 1736 [ # # ]: 0 : ereport(ERROR,
1737 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1738 : : errmsg("end-of-copy marker is not alone on its line")));
1739 : : }
1740 : :
1741 : : /* Get the next character */
1989 heikki.linnakangas@i 1742 [ - + - - ]:CBC 55 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1743 : : /* if hit_eof, c2 will become '\0' */
1860 1744 : 55 : c2 = copy_input_buf[input_buf_ptr++];
1745 : :
1989 1746 [ + - + + ]: 55 : if (c2 != '\r' && c2 != '\n')
582 tgl@sss.pgh.pa.us 1747 [ + - ]: 4 : ereport(ERROR,
1748 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1749 : : errmsg("end-of-copy marker is not alone on its line")));
1750 : :
1989 heikki.linnakangas@i 1751 [ + + + - ]: 51 : if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1752 [ - + - - ]: 51 : (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1753 [ - + - - ]: 51 : (cstate->eol_type == EOL_CR && c2 != '\r'))
1989 heikki.linnakangas@i 1754 [ # # ]:UBC 0 : ereport(ERROR,
1755 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1756 : : errmsg("end-of-copy marker does not match previous newline style")));
1757 : :
1758 : : /*
1759 : : * If there is any data on this line before the \., complain.
1760 : : */
581 tgl@sss.pgh.pa.us 1761 [ + - ]:CBC 51 : if (cstate->line_buf.len > 0 ||
1762 [ + + ]: 51 : prev_raw_ptr > cstate->input_buf_index)
1763 [ + - ]: 4 : ereport(ERROR,
1764 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1765 : : errmsg("end-of-copy marker is not alone on its line")));
1766 : :
1767 : : /*
1768 : : * Discard the \. and newline, then report EOF.
1769 : : */
1860 heikki.linnakangas@i 1770 : 47 : cstate->input_buf_index = input_buf_ptr;
1989 1771 : 47 : result = true; /* report EOF */
1772 : 47 : break;
1773 : : }
1774 : : else
1775 : : {
1776 : : /*
1777 : : * If we are here, it means we found a backslash followed by
1778 : : * something other than a period. In non-CSV mode, anything
1779 : : * after a backslash is special, so we skip over that second
1780 : : * character too. If we didn't do that \\. would be
1781 : : * considered an end-of-copy marker, while in non-CSV mode it is a
1782 : : * literal backslash followed by a period.
1783 : : */
1860 1784 : 4849 : input_buf_ptr++;
1785 : : }
1786 : : }
1787 : : } /* end of outer loop */
1788 : :
1789 : : /*
1790 : : * Transfer any still-uncopied data to line_buf.
1791 : : */
1989 1792 [ + + ]: 859210 : REFILL_LINEBUF;
1793 : :
1794 : 859210 : return result;
1795 : : }
1796 : :
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * Decimal digits map to 0..9.  Any other character is folded to lower case
 * with pg_ascii_tolower() and measured from 'a', so 'a' and 'A' both give
 * 10.  No validation is performed here; the caller visible in this file
 * (CopyReadAttributesText) tests isxdigit() before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char ch = (unsigned char) hex;

	if (!isdigit(ch))
		return pg_ascii_tolower(ch) - 'a' + 10;

	return hex - '0';
}
1808 : :
1809 : : /*
1810 : : * Parse the current line into separate attributes (fields),
1811 : : * performing de-escaping as needed.
1812 : : *
1813 : : * The input is in line_buf. We use attribute_buf to hold the result
1814 : : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1815 : : * string, or NULL when the input matches the null marker string.
1816 : : * This array is expanded as necessary.
1817 : : *
1818 : : * (Note that the caller cannot check for nulls since the returned
1819 : : * string would be the post-de-escaping equivalent, which may look
1820 : : * the same as some valid data string.)
1821 : : *
1822 : : * delim is the column delimiter string (must be just one byte for now).
1823 : : * null_print is the null marker string. Note that this is compared to
1824 : : * the pre-de-escaped input string.
1825 : : *
1826 : : * The return value is the number of fields actually read.
1827 : : */
1828 : : static int
1989 heikki.linnakangas@i 1829 :CBC 858417 : CopyReadAttributesText(CopyFromState cstate)
1830 : : {
1831 : 858417 : char delimc = cstate->opts.delim[0];
1832 : : int fieldno;
1833 : : char *output_ptr;
1834 : : char *cur_ptr;
1835 : : char *line_end_ptr;
1836 : :
1837 : : /*
1838 : : * We need a special case for zero-column tables: check that the input
1839 : : * line is empty, and return.
1840 : : */
1841 [ + + ]: 858417 : if (cstate->max_fields <= 0)
1842 : : {
1843 [ - + ]: 4 : if (cstate->line_buf.len != 0)
1989 heikki.linnakangas@i 1844 [ # # ]:UBC 0 : ereport(ERROR,
1845 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1846 : : errmsg("extra data after last expected column")));
1989 heikki.linnakangas@i 1847 :CBC 4 : return 0;
1848 : : }
1849 : :
1850 : 858413 : resetStringInfo(&cstate->attribute_buf);
1851 : :
1852 : : /*
1853 : : * The de-escaped attributes will certainly not be longer than the input
1854 : : * data line, so we can just force attribute_buf to be large enough and
1855 : : * then transfer data without any checks for enough space. We need to do
1856 : : * it this way because enlarging attribute_buf mid-stream would invalidate
1857 : : * pointers already stored into cstate->raw_fields[].
1858 : : */
1859 [ + + ]: 858413 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1860 : 4 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1861 : 858413 : output_ptr = cstate->attribute_buf.data;
1862 : :
1863 : : /* set pointer variables for loop */
1864 : 858413 : cur_ptr = cstate->line_buf.data;
1865 : 858413 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1866 : :
1867 : : /* Outer loop iterates over fields */
1868 : 858413 : fieldno = 0;
1869 : : for (;;)
1870 : 1963469 : {
1871 : 2821882 : bool found_delim = false;
1872 : : char *start_ptr;
1873 : : char *end_ptr;
1874 : : int input_len;
1875 : 2821882 : bool saw_non_ascii = false;
1876 : :
1877 : : /* Make sure there is enough space for the next value */
1878 [ + + ]: 2821882 : if (fieldno >= cstate->max_fields)
1879 : : {
1880 : 28 : cstate->max_fields *= 2;
1881 : 28 : cstate->raw_fields =
1882 : 28 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1883 : : }
1884 : :
1885 : : /* Remember start of field on both input and output sides */
1886 : 2821882 : start_ptr = cur_ptr;
1887 : 2821882 : cstate->raw_fields[fieldno] = output_ptr;
1888 : :
1889 : : /*
1890 : : * Scan data for field.
1891 : : *
1892 : : * Note that in this loop, we are scanning to locate the end of field
1893 : : * and also speculatively performing de-escaping. Once we find the
1894 : : * end-of-field, we can match the raw field contents against the null
1895 : : * marker string. Only after that comparison fails do we know that
1896 : : * de-escaping is actually the right thing to do; therefore we *must
1897 : : * not* throw any syntax errors before we've done the null-marker
1898 : : * check.
1899 : : */
1900 : : for (;;)
1901 : 14507294 : {
1902 : : char c;
1903 : :
1904 : 17329176 : end_ptr = cur_ptr;
1905 [ + + ]: 17329176 : if (cur_ptr >= line_end_ptr)
1906 : 858409 : break;
1907 : 16470767 : c = *cur_ptr++;
1908 [ + + ]: 16470767 : if (c == delimc)
1909 : : {
1910 : 1963473 : found_delim = true;
1911 : 1963473 : break;
1912 : : }
1913 [ + + ]: 14507294 : if (c == '\\')
1914 : : {
1915 [ - + ]: 4849 : if (cur_ptr >= line_end_ptr)
1989 heikki.linnakangas@i 1916 :UBC 0 : break;
1989 heikki.linnakangas@i 1917 :CBC 4849 : c = *cur_ptr++;
1918 [ + + - - : 4849 : switch (c)
+ - - -
+ ]
1919 : : {
1920 : 8 : case '0':
1921 : : case '1':
1922 : : case '2':
1923 : : case '3':
1924 : : case '4':
1925 : : case '5':
1926 : : case '6':
1927 : : case '7':
1928 : : {
1929 : : /* handle \013 */
1930 : : int val;
1931 : :
1932 : 8 : val = OCTVALUE(c);
1933 [ + + ]: 8 : if (cur_ptr < line_end_ptr)
1934 : : {
1935 : 4 : c = *cur_ptr;
1936 [ - + - - ]: 4 : if (ISOCTAL(c))
1937 : : {
1989 heikki.linnakangas@i 1938 :UBC 0 : cur_ptr++;
1939 : 0 : val = (val << 3) + OCTVALUE(c);
1940 [ # # ]: 0 : if (cur_ptr < line_end_ptr)
1941 : : {
1942 : 0 : c = *cur_ptr;
1943 [ # # # # ]: 0 : if (ISOCTAL(c))
1944 : : {
1945 : 0 : cur_ptr++;
1946 : 0 : val = (val << 3) + OCTVALUE(c);
1947 : : }
1948 : : }
1949 : : }
1950 : : }
1989 heikki.linnakangas@i 1951 :CBC 8 : c = val & 0377;
1952 [ - + - - ]: 8 : if (c == '\0' || IS_HIGHBIT_SET(c))
1953 : 8 : saw_non_ascii = true;
1954 : : }
1955 : 8 : break;
1956 : 8 : case 'x':
1957 : : /* Handle \x3F */
1958 [ + + ]: 8 : if (cur_ptr < line_end_ptr)
1959 : : {
1960 : 4 : char hexchar = *cur_ptr;
1961 : :
1962 [ - + ]: 4 : if (isxdigit((unsigned char) hexchar))
1963 : : {
1989 heikki.linnakangas@i 1964 :UBC 0 : int val = GetDecimalFromHex(hexchar);
1965 : :
1966 : 0 : cur_ptr++;
1967 [ # # ]: 0 : if (cur_ptr < line_end_ptr)
1968 : : {
1969 : 0 : hexchar = *cur_ptr;
1970 [ # # ]: 0 : if (isxdigit((unsigned char) hexchar))
1971 : : {
1972 : 0 : cur_ptr++;
1973 : 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
1974 : : }
1975 : : }
1976 : 0 : c = val & 0xff;
1977 [ # # # # ]: 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
1978 : 0 : saw_non_ascii = true;
1979 : : }
1980 : : }
1989 heikki.linnakangas@i 1981 :CBC 8 : break;
1989 heikki.linnakangas@i 1982 :UBC 0 : case 'b':
1983 : 0 : c = '\b';
1984 : 0 : break;
1985 : 0 : case 'f':
1986 : 0 : c = '\f';
1987 : 0 : break;
1989 heikki.linnakangas@i 1988 :CBC 2033 : case 'n':
1989 : 2033 : c = '\n';
1990 : 2033 : break;
1989 heikki.linnakangas@i 1991 :UBC 0 : case 'r':
1992 : 0 : c = '\r';
1993 : 0 : break;
1994 : 0 : case 't':
1995 : 0 : c = '\t';
1996 : 0 : break;
1997 : 0 : case 'v':
1998 : 0 : c = '\v';
1999 : 0 : break;
2000 : :
2001 : : /*
2002 : : * in all other cases, take the char after '\'
2003 : : * literally
2004 : : */
2005 : : }
2006 : : }
2007 : :
2008 : : /* Add c to output string */
1989 heikki.linnakangas@i 2009 :CBC 14507294 : *output_ptr++ = c;
2010 : : }
2011 : :
2012 : : /* Check whether raw input matched null marker */
2013 : 2821882 : input_len = end_ptr - start_ptr;
2014 [ + + ]: 2821882 : if (input_len == cstate->opts.null_print_len &&
2015 [ + + ]: 166188 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2016 : 2730 : cstate->raw_fields[fieldno] = NULL;
2017 : : /* Check whether raw input matched default marker */
1147 andrew@dunslane.net 2018 [ + + ]: 2819152 : else if (fieldno < list_length(cstate->attnumlist) &&
2019 [ + + ]: 2819120 : cstate->opts.default_print &&
1149 2020 [ + + ]: 76 : input_len == cstate->opts.default_print_len &&
2021 [ + - ]: 20 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2022 : 16 : {
2023 : : /* fieldno is 0-indexed and attnum is 1-indexed */
2024 : 20 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2025 : :
2026 [ + + ]: 20 : if (cstate->defexprs[m] != NULL)
2027 : : {
2028 : : /* defaults contain entries for all physical attributes */
2029 : 16 : cstate->defaults[m] = true;
2030 : : }
2031 : : else
2032 : : {
2033 : 4 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2034 : 4 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2035 : :
2036 [ + - ]: 4 : ereport(ERROR,
2037 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2038 : : errmsg("unexpected default marker in COPY data"),
2039 : : errdetail("Column \"%s\" has no default value.",
2040 : : NameStr(att->attname))));
2041 : : }
2042 : : }
2043 : : else
2044 : : {
2045 : : /*
2046 : : * At this point we know the field is supposed to contain data.
2047 : : *
2048 : : * If we de-escaped any non-7-bit-ASCII chars, make sure the
2049 : : * resulting string is valid data for the db encoding.
2050 : : */
1989 heikki.linnakangas@i 2051 [ - + ]: 2819132 : if (saw_non_ascii)
2052 : : {
1989 heikki.linnakangas@i 2053 :UBC 0 : char *fld = cstate->raw_fields[fieldno];
2054 : :
2055 : 0 : pg_verifymbstr(fld, output_ptr - fld, false);
2056 : : }
2057 : : }
2058 : :
2059 : : /* Terminate attribute value in output area */
1989 heikki.linnakangas@i 2060 :CBC 2821878 : *output_ptr++ = '\0';
2061 : :
2062 : 2821878 : fieldno++;
2063 : : /* Done if we hit EOL instead of a delim */
2064 [ + + ]: 2821878 : if (!found_delim)
2065 : 858409 : break;
2066 : : }
2067 : :
2068 : : /* Clean up state of attribute_buf */
2069 : 858409 : output_ptr--;
2070 [ - + ]: 858409 : Assert(*output_ptr == '\0');
2071 : 858409 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2072 : :
2073 : 858409 : return fieldno;
2074 : : }
2075 : :
2076 : : /*
2077 : : * Parse the current line into separate attributes (fields),
2078 : : * performing de-escaping as needed. This has exactly the same API as
2079 : : * CopyReadAttributesText, except we parse the fields according to
2080 : : * "standard" (i.e. common) CSV usage.
2081 : : */
2082 : : static int
2083 : 318 : CopyReadAttributesCSV(CopyFromState cstate)
2084 : : {
2085 : 318 : char delimc = cstate->opts.delim[0];
2086 : 318 : char quotec = cstate->opts.quote[0];
2087 : 318 : char escapec = cstate->opts.escape[0];
2088 : : int fieldno;
2089 : : char *output_ptr;
2090 : : char *cur_ptr;
2091 : : char *line_end_ptr;
2092 : :
2093 : : /*
2094 : : * We need a special case for zero-column tables: check that the input
2095 : : * line is empty, and return.
2096 : : */
2097 [ - + ]: 318 : if (cstate->max_fields <= 0)
2098 : : {
1989 heikki.linnakangas@i 2099 [ # # ]:UBC 0 : if (cstate->line_buf.len != 0)
2100 [ # # ]: 0 : ereport(ERROR,
2101 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2102 : : errmsg("extra data after last expected column")));
2103 : 0 : return 0;
2104 : : }
2105 : :
1989 heikki.linnakangas@i 2106 :CBC 318 : resetStringInfo(&cstate->attribute_buf);
2107 : :
2108 : : /*
2109 : : * The de-escaped attributes will certainly not be longer than the input
2110 : : * data line, so we can just force attribute_buf to be large enough and
2111 : : * then transfer data without any checks for enough space. We need to do
2112 : : * it this way because enlarging attribute_buf mid-stream would invalidate
2113 : : * pointers already stored into cstate->raw_fields[].
2114 : : */
2115 [ - + ]: 318 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1989 heikki.linnakangas@i 2116 :UBC 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1989 heikki.linnakangas@i 2117 :CBC 318 : output_ptr = cstate->attribute_buf.data;
2118 : :
2119 : : /* set pointer variables for loop */
2120 : 318 : cur_ptr = cstate->line_buf.data;
2121 : 318 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
2122 : :
2123 : : /* Outer loop iterates over fields */
2124 : 318 : fieldno = 0;
2125 : : for (;;)
2126 : 326 : {
2127 : 644 : bool found_delim = false;
2128 : 644 : bool saw_quote = false;
2129 : : char *start_ptr;
2130 : : char *end_ptr;
2131 : : int input_len;
2132 : :
2133 : : /* Make sure there is enough space for the next value */
2134 [ - + ]: 644 : if (fieldno >= cstate->max_fields)
2135 : : {
1989 heikki.linnakangas@i 2136 :UBC 0 : cstate->max_fields *= 2;
2137 : 0 : cstate->raw_fields =
2138 : 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
2139 : : }
2140 : :
2141 : : /* Remember start of field on both input and output sides */
1989 heikki.linnakangas@i 2142 :CBC 644 : start_ptr = cur_ptr;
2143 : 644 : cstate->raw_fields[fieldno] = output_ptr;
2144 : :
2145 : : /*
2146 : : * Scan data for field,
2147 : : *
2148 : : * The loop starts in "not quote" mode and then toggles between that
2149 : : * and "in quote" mode. The loop exits normally if it is in "not
2150 : : * quote" mode and a delimiter or line end is seen.
2151 : : */
2152 : : for (;;)
2153 : 137 : {
2154 : : char c;
2155 : :
2156 : : /* Not in quote */
2157 : : for (;;)
2158 : : {
2159 : 2045 : end_ptr = cur_ptr;
2160 [ + + ]: 2045 : if (cur_ptr >= line_end_ptr)
2161 : 314 : goto endfield;
2162 : 1731 : c = *cur_ptr++;
2163 : : /* unquoted field delimiter */
2164 [ + + ]: 1731 : if (c == delimc)
2165 : : {
2166 : 330 : found_delim = true;
2167 : 330 : goto endfield;
2168 : : }
2169 : : /* start of quoted field (or part of field) */
2170 [ + + ]: 1401 : if (c == quotec)
2171 : : {
2172 : 137 : saw_quote = true;
2173 : 137 : break;
2174 : : }
2175 : : /* Add c to output string */
2176 : 1264 : *output_ptr++ = c;
2177 : : }
2178 : :
2179 : : /* In quote */
2180 : : for (;;)
2181 : : {
2182 : 852 : end_ptr = cur_ptr;
2183 [ - + ]: 852 : if (cur_ptr >= line_end_ptr)
1989 heikki.linnakangas@i 2184 [ # # ]:UBC 0 : ereport(ERROR,
2185 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2186 : : errmsg("unterminated CSV quoted field")));
2187 : :
1989 heikki.linnakangas@i 2188 :CBC 852 : c = *cur_ptr++;
2189 : :
2190 : : /* escape within a quoted field */
2191 [ + + ]: 852 : if (c == escapec)
2192 : : {
2193 : : /*
2194 : : * peek at the next char if available, and escape it if it
2195 : : * is an escape char or a quote char
2196 : : */
2197 [ + + ]: 81 : if (cur_ptr < line_end_ptr)
2198 : : {
2199 : 47 : char nextc = *cur_ptr;
2200 : :
2201 [ + + - + ]: 47 : if (nextc == escapec || nextc == quotec)
2202 : : {
2203 : 16 : *output_ptr++ = nextc;
2204 : 16 : cur_ptr++;
2205 : 16 : continue;
2206 : : }
2207 : : }
2208 : : }
2209 : :
2210 : : /*
2211 : : * end of quoted field. Must do this test after testing for
2212 : : * escape in case quote char and escape char are the same
2213 : : * (which is the common case).
2214 : : */
2215 [ + + ]: 836 : if (c == quotec)
2216 : 137 : break;
2217 : :
2218 : : /* Add c to output string */
2219 : 699 : *output_ptr++ = c;
2220 : : }
2221 : : }
2222 : 644 : endfield:
2223 : :
2224 : : /* Terminate attribute value in output area */
2225 : 644 : *output_ptr++ = '\0';
2226 : :
2227 : : /* Check whether raw input matched null marker */
2228 : 644 : input_len = end_ptr - start_ptr;
2229 [ + + + + ]: 644 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
2230 [ + - ]: 27 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
2231 : 27 : cstate->raw_fields[fieldno] = NULL;
2232 : : /* Check whether raw input matched default marker */
1147 andrew@dunslane.net 2233 [ + - ]: 617 : else if (fieldno < list_length(cstate->attnumlist) &&
2234 [ + + ]: 617 : cstate->opts.default_print &&
1149 2235 [ + + ]: 94 : input_len == cstate->opts.default_print_len &&
2236 [ + - ]: 26 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
2237 : : {
2238 : : /* fieldno is 0-index and attnum is 1-index */
2239 : 26 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
2240 : :
2241 [ + + ]: 26 : if (cstate->defexprs[m] != NULL)
2242 : : {
2243 : : /* defaults contain entries for all physical attributes */
2244 : 22 : cstate->defaults[m] = true;
2245 : : }
2246 : : else
2247 : : {
2248 : 4 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2249 : 4 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2250 : :
2251 [ + - ]: 4 : ereport(ERROR,
2252 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2253 : : errmsg("unexpected default marker in COPY data"),
2254 : : errdetail("Column \"%s\" has no default value.",
2255 : : NameStr(att->attname))));
2256 : : }
2257 : : }
2258 : :
1989 heikki.linnakangas@i 2259 : 640 : fieldno++;
2260 : : /* Done if we hit EOL instead of a delim */
2261 [ + + ]: 640 : if (!found_delim)
2262 : 314 : break;
2263 : : }
2264 : :
2265 : : /* Clean up state of attribute_buf */
2266 : 314 : output_ptr--;
2267 [ - + ]: 314 : Assert(*output_ptr == '\0');
2268 : 314 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2269 : :
2270 : 314 : return fieldno;
2271 : : }
2272 : :
2273 : :
2274 : : /*
2275 : : * Read a binary attribute
2276 : : */
2277 : : static Datum
2278 : 100 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
2279 : : Oid typioparam, int32 typmod,
2280 : : bool *isnull)
2281 : : {
2282 : : int32 fld_size;
2283 : : Datum result;
2284 : :
2285 [ - + ]: 100 : if (!CopyGetInt32(cstate, &fld_size))
1989 heikki.linnakangas@i 2286 [ # # ]:UBC 0 : ereport(ERROR,
2287 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2288 : : errmsg("unexpected EOF in COPY data")));
1989 heikki.linnakangas@i 2289 [ + + ]:CBC 100 : if (fld_size == -1)
2290 : : {
2291 : 20 : *isnull = true;
2292 : 20 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2293 : : }
2294 [ - + ]: 80 : if (fld_size < 0)
1989 heikki.linnakangas@i 2295 [ # # ]:UBC 0 : ereport(ERROR,
2296 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2297 : : errmsg("invalid field size")));
2298 : :
2299 : : /* reset attribute_buf to empty, and load raw data in it */
1989 heikki.linnakangas@i 2300 :CBC 80 : resetStringInfo(&cstate->attribute_buf);
2301 : :
2302 : 80 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
2303 : 80 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2304 [ - + ]: 80 : fld_size) != fld_size)
1989 heikki.linnakangas@i 2305 [ # # ]:UBC 0 : ereport(ERROR,
2306 : : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
2307 : : errmsg("unexpected EOF in COPY data")));
2308 : :
1989 heikki.linnakangas@i 2309 :CBC 80 : cstate->attribute_buf.len = fld_size;
2310 : 80 : cstate->attribute_buf.data[fld_size] = '\0';
2311 : :
2312 : : /* Call the column type's binary input converter */
2313 : 80 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2314 : : typioparam, typmod);
2315 : :
2316 : : /* Trouble if it didn't eat the whole buffer */
2317 [ + + ]: 80 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2318 [ + - ]: 1 : ereport(ERROR,
2319 : : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
2320 : : errmsg("incorrect binary data format")));
2321 : :
2322 : 79 : *isnull = false;
2323 : 79 : return result;
2324 : : }
|