88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.236 2004/12/31 21:59:41 pgsql Exp $
11+ * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.237 2005/03/12 05:41:34 momjian Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
@@ -98,7 +98,6 @@ static bool fe_eof; /* true if detected end of copy data */
9898static EolType eol_type ; /* EOL type of input */
9999static int client_encoding ; /* remote side's character encoding */
100100static int server_encoding ; /* local encoding */
101- static bool embedded_line_warning ;
102101
103102/* these are just for error messages, see copy_in_error_callback */
104103static bool copy_binary ; /* is it a binary copy? */
@@ -139,7 +138,7 @@ static void CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
139138static void CopyFrom (Relation rel , List * attnumlist , bool binary , bool oids ,
140139 char * delim , char * null_print , bool csv_mode , char * quote , char * escape ,
141140 List * force_notnull_atts );
142- static bool CopyReadLine (void );
141+ static bool CopyReadLine (char * quote , char * escape );
143142static char * CopyReadAttribute (const char * delim , const char * null_print ,
144143 CopyReadResult * result , bool * isnull );
145144static char * CopyReadAttributeCSV (const char * delim , const char * null_print ,
@@ -1191,7 +1190,6 @@ CopyTo(Relation rel, List *attnumlist, bool binary, bool oids,
11911190 attr = tupDesc -> attrs ;
11921191 num_phys_attrs = tupDesc -> natts ;
11931192 attr_count = list_length (attnumlist );
1194- embedded_line_warning = false;
11951193
11961194 /*
11971195 * Get info about the columns we need to process.
@@ -1718,7 +1716,8 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
17181716 ListCell * cur ;
17191717
17201718 /* Actually read the line into memory here */
1721- done = CopyReadLine ();
1719+ done = csv_mode ?
1720+ CopyReadLine (quote , escape ) : CopyReadLine (NULL , NULL );
17221721
17231722 /*
17241723 * EOF at start of line means we're done. If we see EOF after
@@ -2006,7 +2005,7 @@ CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
20062005 * by newline.
20072006 */
20082007static bool
2009- CopyReadLine (void )
2008+ CopyReadLine (char * quote , char * escape )
20102009{
20112010 bool result ;
20122011 bool change_encoding = (client_encoding != server_encoding );
@@ -2015,6 +2014,19 @@ CopyReadLine(void)
20152014 int j ;
20162015 unsigned char s [2 ];
20172016 char * cvt ;
2017+ bool in_quote = false, last_was_esc = false, csv_mode = false;
2018+ char quotec = '\0' , escapec = '\0' ;
2019+
2020+ if (quote )
2021+ {
2022+ csv_mode = true;
2023+ quotec = quote [0 ];
2024+ escapec = escape [0 ];
2025+ /* ignore special escape processing if it's the same as quotec */
2026+ if (quotec == escapec )
2027+ escapec = '\0' ;
2028+ }
2029+
20182030
20192031 s [1 ] = 0 ;
20202032
@@ -2031,11 +2043,20 @@ CopyReadLine(void)
20312043
20322044 /*
20332045 * In this loop we only care for detecting newlines (\r and/or \n) and
2034- * the end-of-copy marker (\.). For backwards compatibility we allow
2046+ * the end-of-copy marker (\.).
2047+ *
2048+ * In Text mode, for backwards compatibility we allow
20352049 * backslashes to escape newline characters. Backslashes other than
20362050 * the end marker get put into the line_buf, since CopyReadAttribute
2037- * does its own escape processing. These four characters, and only
2038- * these four, are assumed the same in frontend and backend encodings.
2051+ * does its own escape processing.
2052+ *
2053+ * In CSV mode, CR and NL inside a quoted field are just part of the
2054+ * data value and are put in line_buf. We keep just enough state
2055+ * to know if we are currently in a quoted field or not.
2056+ *
2057+ * These four characters, and only these four, are assumed the same in
2058+ * frontend and backend encodings.
2059+ *
20392060 * We do not assume that second and later bytes of a frontend
20402061 * multibyte character couldn't look like ASCII characters.
20412062 */
@@ -2047,13 +2068,49 @@ CopyReadLine(void)
20472068 result = true;
20482069 break ;
20492070 }
2050- if (c == '\r' )
2071+
2072+ if (csv_mode )
2073+ {
2074+ /*
2075+ * Dealing with quotes and escapes here is mildly tricky. If the
2076+ * quote char is also the escape char, there's no problem - we
2077+ * just use the char as a toggle. If they are different, we need
2078+ * to ensure that we only take account of an escape inside a quoted
2079+ * field and immediately preceding a quote char, and not the
2080+ * second in an escape-escape sequence.
2081+ */
2082+
2083+ if (in_quote && c == escapec )
2084+ last_was_esc = ! last_was_esc ;
2085+ if (c == quotec && ! last_was_esc )
2086+ in_quote = ! in_quote ;
2087+ if (c != escapec )
2088+ last_was_esc = false;
2089+
2090+ /*
2091+ * updating the line count for embedded CR and/or LF chars is
2092+ * necessarily a little fragile - this test is probably about
2093+ * the best we can do.
2094+ */
2095+ if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n' ))
2096+ copy_lineno ++ ;
2097+ }
2098+
2099+ if (!in_quote && c == '\r' )
20512100 {
20522101 if (eol_type == EOL_NL )
2053- ereport (ERROR ,
2054- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2055- errmsg ("literal carriage return found in data" ),
2056- errhint ("Use \"\\r\" to represent carriage return." )));
2102+ {
2103+ if (! csv_mode )
2104+ ereport (ERROR ,
2105+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2106+ errmsg ("literal carriage return found in data" ),
2107+ errhint ("Use \"\\r\" to represent carriage return." )));
2108+ else
2109+ ereport (ERROR ,
2110+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2111+ errmsg ("unquoted carriage return found in CSV data" ),
2112+ errhint ("Use quoted CSV field to represent carriage return." )));
2113+ }
20572114 /* Check for \r\n on first line, _and_ handle \r\n. */
20582115 if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL )
20592116 {
@@ -2068,10 +2125,19 @@ CopyReadLine(void)
20682125 {
20692126 /* found \r, but no \n */
20702127 if (eol_type == EOL_CRNL )
2071- ereport (ERROR ,
2072- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2073- errmsg ("literal carriage return found in data" ),
2074- errhint ("Use \"\\r\" to represent carriage return." )));
2128+ {
2129+ if (!csv_mode )
2130+ ereport (ERROR ,
2131+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2132+ errmsg ("literal carriage return found in data" ),
2133+ errhint ("Use \"\\r\" to represent carriage return." )));
2134+ else
2135+ ereport (ERROR ,
2136+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2137+ errmsg ("unquoted carriage return found in data" ),
2138+ errhint ("Use quoted CSV field to represent carriage return." )));
2139+
2140+ }
20752141
20762142 /*
20772143 * if we got here, it is the first line and we didn't
@@ -2083,26 +2149,47 @@ CopyReadLine(void)
20832149 }
20842150 break ;
20852151 }
2086- if (c == '\n' )
2152+ if (! in_quote && c == '\n' )
20872153 {
20882154 if (eol_type == EOL_CR || eol_type == EOL_CRNL )
2089- ereport (ERROR ,
2090- (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2091- errmsg ("literal newline found in data" ),
2092- errhint ("Use \"\\n\" to represent newline." )));
2155+ {
2156+ if (!csv_mode )
2157+ ereport (ERROR ,
2158+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2159+ errmsg ("literal newline found in data" ),
2160+ errhint ("Use \"\\n\" to represent newline." )));
2161+ else
2162+ ereport (ERROR ,
2163+ (errcode (ERRCODE_BAD_COPY_FILE_FORMAT ),
2164+ errmsg ("unquoted newline found in data" ),
2165+ errhint ("Use quoted CSV field to represent newline." )));
2166+
2167+ }
20932168 eol_type = EOL_NL ;
20942169 break ;
20952170 }
2096- if (c == '\\' )
2171+
2172+ if ((line_buf .len == 0 || !csv_mode ) && c == '\\' )
20972173 {
2098- c = CopyGetChar ();
2099- if (c == EOF )
2174+ int c2 ;
2175+
2176+ if (csv_mode )
2177+ c2 = CopyPeekChar ();
2178+ else
2179+ c2 = c = CopyGetChar ();
2180+
2181+ if (c2 == EOF )
21002182 {
21012183 result = true;
2184+ if (csv_mode )
2185+ CopyDonePeek (c2 , true);
21022186 break ;
21032187 }
2104- if (c == '.' )
2188+ if (c2 == '.' )
21052189 {
2190+ if (csv_mode )
2191+ CopyDonePeek (c2 , true); /* allow keep calling GetChar() */
2192+
21062193 if (eol_type == EOL_CRNL )
21072194 {
21082195 c = CopyGetChar ();
@@ -2140,8 +2227,12 @@ CopyReadLine(void)
21402227 result = true; /* report EOF */
21412228 break ;
21422229 }
2143- /* not EOF mark, so emit \ and following char literally */
2144- appendStringInfoCharMacro (& line_buf , '\\' );
2230+
2231+ if (csv_mode )
2232+ CopyDonePeek (c2 , false); /* not a dot, so put it back */
2233+ else
2234+ /* not EOF mark, so emit \ and following char literally */
2235+ appendStringInfoCharMacro (& line_buf , '\\' );
21452236 }
21462237
21472238 appendStringInfoCharMacro (& line_buf , c );
@@ -2369,34 +2460,6 @@ CopyReadAttributeCSV(const char *delim, const char *null_print, char *quote,
23692460
23702461 for (;;)
23712462 {
2372- /* handle multiline quoted fields */
2373- if (in_quote && line_buf .cursor >= line_buf .len )
2374- {
2375- bool done ;
2376-
2377- switch (eol_type )
2378- {
2379- case EOL_NL :
2380- appendStringInfoString (& attribute_buf , "\n" );
2381- break ;
2382- case EOL_CR :
2383- appendStringInfoString (& attribute_buf , "\r" );
2384- break ;
2385- case EOL_CRNL :
2386- appendStringInfoString (& attribute_buf , "\r\n" );
2387- break ;
2388- case EOL_UNKNOWN :
2389- /* shouldn't happen - just keep going */
2390- break ;
2391- }
2392-
2393- copy_lineno ++ ;
2394- done = CopyReadLine ();
2395- if (done && line_buf .len == 0 )
2396- break ;
2397- start_cursor = line_buf .cursor ;
2398- }
2399-
24002463 end_cursor = line_buf .cursor ;
24012464 if (line_buf .cursor >= line_buf .len )
24022465 break ;
@@ -2629,25 +2692,6 @@ CopyAttributeOutCSV(char *server_string, char *delim, char *quote,
26292692 !use_quote && (c = * test_string ) != '\0' ;
26302693 test_string += mblen )
26312694 {
2632- /*
2633- * We don't know here what the surrounding line end characters
2634- * might be. It might not even be under postgres' control. So
2635- * we simple warn on ANY embedded line ending character.
2636- *
2637- * This warning will disappear when we make line parsing field-aware,
2638- * so that we can reliably read in embedded line ending characters
2639- * regardless of the file's line-end context.
2640- *
2641- */
2642-
2643- if (!embedded_line_warning && (c == '\n' || c == '\r' ) )
2644- {
2645- embedded_line_warning = true;
2646- elog (WARNING ,
2647- "CSV fields with embedded linefeed or carriage return "
2648- "characters might not be able to be reimported" );
2649- }
2650-
26512695 if (c == delimc || c == quotec || c == '\n' || c == '\r' )
26522696 use_quote = true;
26532697 if (!same_encoding )
0 commit comments