Commit e5eaa8b6 authored by V Narayanan

Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields

    
When a .CSV file for a table in the CSV engine contains
\X characters as part of unquoted fields, e.g.
    
2,naraya\nan
    
\n is not interpreted as a new line (it is however interpreted as a
newline in a quoted field).
    
The old algorithm copied the entire value for an unquoted field without
parsing the \X characters.
    
The new algorithm adds the capability to handle \X characters in the 
unquoted fields of a .CSV file.

mysql-test/r/csv.result:
  Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields
  
  Contains additional test output corresponding to the new 
  tests added.
mysql-test/t/csv.test:
  Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields
  
  Contains additional tests for testing the behaviour of the CSV 
  storage engine when the fields are not enclosed in quotes and
  contain \X characters.
storage/csv/ha_tina.cc:
  Bug#40814 CSV engine does not parse \X characters when they occur in unquoted fields
  
  Changes the parsing logic of the rows in a CSV file, to parse
  \X characters that might be present in the unquoted fields.
parent 685cf200
......@@ -5407,4 +5407,60 @@ test.t1 repair status OK
select * from t1 limit 1;
a
drop table t1;
#
# Test for the following cases
# 1) integers and strings enclosed in quotes
# 2) integers and strings not enclosed in quotes
# 3) \X characters with quotes
# 4) \X characters outside quotes
#
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
# remove the already existing .CSV file if any
# create the .CSV file that contains the hard-coded data used in
# testing
1,"integer sans quotes"
1,string sans quotes
1,quotes"in between" strings
"1",Integer with quote and string with no quote
1,"escape sequence \n \" \\ \r \a within quotes"
1,escape sequence \n \" \\ \r \a without quotes
# select from the table in which the data has been filled in using
# the hard-coded .CSV file
SELECT * FROM t1;
c1 c2
1 integer sans quotes
1 string sans quotes
1 quotes"in between" strings
1 Integer with quote and string with no quote
1 escape sequence
" \ \a within quotes
1 escape sequence
" \ \a without quotes
DROP TABLE t1;
# Test for the case when a field begins with a quote, but does not end in a
# quote.
# Note: This results in an error.
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
# remove the already existing .CSV file if any
# create the .CSV file that contains the hard-coded data used in
# testing
1,"string only at the beginning quotes
# select from the table in which the data has been filled in using
# the hard-coded .CSV file
SELECT * FROM t1;
ERROR HY000: Table 't1' is marked as crashed and should be repaired
DROP TABLE t1;
# Test for the case when a field ends with a quote, but does not begin in a
# quote.
# Note: This results in an error.
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
# remove the already existing .CSV file if any
# create the .CSV file that contains the hard-coded data used in
# testing
1,string with only ending quotes"
# select from the table in which the data has been filled in using
# the hard-coded .CSV file
SELECT * FROM t1;
ERROR HY000: Table 't1' is marked as crashed and should be repaired
DROP TABLE t1;
End of 5.1 tests
......
......@@ -1819,4 +1819,84 @@ repair table t1;
select * from t1 limit 1;
drop table t1;
#
# Bug #40814 CSV engine does not parse \X characters when they occur in unquoted fields
#
--echo #
--echo # Test for the following cases
--echo # 1) integers and strings enclosed in quotes
--echo # 2) integers and strings not enclosed in quotes
--echo # 3) \X characters with quotes
--echo # 4) \X characters outside quotes
--echo #
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
--echo # remove the already existing .CSV file if any
--remove_file $MYSQLD_DATADIR/test/t1.CSV
--echo # create the .CSV file that contains the hard-coded data used in
--echo # testing
--write_file $MYSQLD_DATADIR/test/t1.CSV
1,"integer sans quotes"
1,string sans quotes
1,quotes"in between" strings
"1",Integer with quote and string with no quote
1,"escape sequence \n \" \\ \r \a within quotes"
1,escape sequence \n \" \\ \r \a without quotes
EOF
--cat_file $MYSQLD_DATADIR/test/t1.CSV
--echo # select from the table in which the data has been filled in using
--echo # the hard-coded .CSV file
SELECT * FROM t1;
DROP TABLE t1;
--echo # Test for the case when a field begins with a quote, but does not end in a
--echo # quote.
--echo # Note: This results in an error.
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
--echo # remove the already existing .CSV file if any
--remove_file $MYSQLD_DATADIR/test/t1.CSV
--echo # create the .CSV file that contains the hard-coded data used in
--echo # testing
--write_file $MYSQLD_DATADIR/test/t1.CSV
1,"string only at the beginning quotes
EOF
--cat_file $MYSQLD_DATADIR/test/t1.CSV
--echo # select from the table in which the data has been filled in using
--echo # the hard-coded .CSV file
--error ER_CRASHED_ON_USAGE
SELECT * FROM t1;
DROP TABLE t1;
--echo # Test for the case when a field ends with a quote, but does not begin in a
--echo # quote.
--echo # Note: This results in an error.
CREATE TABLE t1(c1 INT NOT NULL, c2 VARCHAR(50) NOT NULL) ENGINE=csv;
--echo # remove the already existing .CSV file if any
--remove_file $MYSQLD_DATADIR/test/t1.CSV
--echo # create the .CSV file that contains the hard-coded data used in
--echo # testing
--write_file $MYSQLD_DATADIR/test/t1.CSV
1,string with only ending quotes"
EOF
--cat_file $MYSQLD_DATADIR/test/t1.CSV
--echo # select from the table in which the data has been filled in using
--echo # the hard-coded .CSV file
--error ER_CRASHED_ON_USAGE
SELECT * FROM t1;
DROP TABLE t1;
--echo End of 5.1 tests
......@@ -614,6 +614,33 @@ int ha_tina::find_current_row(uchar *buf)
memset(buf, 0, table->s->null_bytes);
/*
Parse the line obtained using the following algorithm
BEGIN
1) Store the EOL (end of line) for the current row
2) While some field in the current query has not yet been
filled
2.1) If the current character is a quote
2.1.1) While EOL has not been reached
a) If end of current field is reached, move
to next field and jump to step 2.3
b) If current character is a \\ handle
\\n, \\r, \\, \\"
c) else append the current character into the buffer
before checking that EOL has not been reached.
2.2) If the current character is not a quote
2.2.1) While EOL has not been reached
a) If the end of field has been reached move to the
next field and jump to step 2.3
b) If current character is a \\ handle
\\n, \\r, \\, \\"
c) else append the current character into the buffer
before checking that EOL has not been reached.
2.3) Store the current field value and jump to 2)
TERMINATE
*/
for (Field **field=table->field ; *field ; field++)
{
char curr_char;
......@@ -622,19 +649,23 @@ int ha_tina::find_current_row(uchar *buf)
if (curr_offset >= end_offset)
goto err;
curr_char= file_buff->get_value(curr_offset);
/* Handle the case where the first character is a quote */
if (curr_char == '"')
{
curr_offset++; // Incrementpast the first quote
/* Increment past the first quote */
curr_offset++;
for(; curr_offset < end_offset; curr_offset++)
/* Loop through the row to extract the values for the current field */
for ( ; curr_offset < end_offset; curr_offset++)
{
curr_char= file_buff->get_value(curr_offset);
// Need to convert line feeds!
/* check for end of the current field */
if (curr_char == '"' &&
(curr_offset == end_offset - 1 ||
file_buff->get_value(curr_offset + 1) == ','))
{
curr_offset+= 2; // Move past the , and the "
/* Move past the , and the " */
curr_offset+= 2;
break;
}
if (curr_char == '\\' && curr_offset != (end_offset - 1))
......@@ -656,7 +687,7 @@ int ha_tina::find_current_row(uchar *buf)
else // ordinary symbol
{
/*
We are at final symbol and no last quote was found =>
If we are at final symbol and no last quote was found =>
we are working with a damaged file.
*/
if (curr_offset == end_offset - 1)
......@@ -667,15 +698,41 @@ int ha_tina::find_current_row(uchar *buf)
}
else
{
for(; curr_offset < end_offset; curr_offset++)
for ( ; curr_offset < end_offset; curr_offset++)
{
curr_char= file_buff->get_value(curr_offset);
/* Move past the ,*/
if (curr_char == ',')
{
curr_offset++; // Skip the ,
curr_offset++;
break;
}
buffer.append(curr_char);
if (curr_char == '\\' && curr_offset != (end_offset - 1))
{
curr_offset++;
curr_char= file_buff->get_value(curr_offset);
if (curr_char == 'r')
buffer.append('\r');
else if (curr_char == 'n' )
buffer.append('\n');
else if (curr_char == '\\' || curr_char == '"')
buffer.append(curr_char);
else /* This could only happen with an externally created file */
{
buffer.append('\\');
buffer.append(curr_char);
}
}
else
{
/*
We are at the final symbol and a quote was found for the
unquoted field => We are working with a damaged field.
*/
if (curr_offset == end_offset - 1 && curr_char == '"')
goto err;
buffer.append(curr_char);
}
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment