Commit 4b6b3a96 authored by Alexander Barkov's avatar Alexander Barkov

Clean-up: Adding a class Term_string to share some LOAD DATA code

The new class is used for:
- FIELDS TERMINATED BY 'string'
- LINES STARTING BY 'string'
- LINES TERMINATED BY 'string'

The implementation of "FIELDS ENCLOSED BY 'char'" and "FIELDS ESCAPED BY 'char'"
should eventually also switch to this class to support multi-byte characters.
Currently multi-byte ENCLOSED and ESCAPED characters are rejected at parse time.
parent e56650d7
......@@ -61,6 +61,39 @@ XML_TAG::XML_TAG(int l, String f, String v)
}
/*
Field and line terminators must be interpreted as sequence of unsigned char.
Otherwise, non-ascii terminators will be negative on some platforms,
and positive on others (depending on the implementation of char).
*/
class Term_string
{
const uchar *m_ptr;
uint m_length;
int m_initial_byte;
public:
Term_string(const String &str) :
m_ptr(static_cast<const uchar*>(static_cast<const void*>(str.ptr()))),
m_length(str.length()),
m_initial_byte((uchar) (str.length() ? str.ptr()[0] : INT_MAX))
{ }
void set(const uchar *str, uint length, int initial_byte)
{
m_ptr= str;
m_length= length;
m_initial_byte= initial_byte;
}
void reset() { set(NULL, 0, INT_MAX); }
const uchar *ptr() const { return m_ptr; }
uint length() const { return m_length; }
int initial_byte() const { return m_initial_byte; }
bool eq(const Term_string &other) const
{
return length() == other.length() && !memcmp(ptr(), other.ptr(), length());
}
};
#define GET (stack_pos != stack ? *--stack_pos : my_b_get(&cache))
#define PUSH(A) *(stack_pos++)=(A)
......@@ -69,10 +102,10 @@ class READ_INFO {
String data; /* Read buffer */
uint fixed_length; /* Length of the fixed length record */
uint max_length; /* Max length of row */
const uchar *field_term_ptr,*line_term_ptr;
const char *line_start_ptr,*line_start_end;
uint field_term_length,line_term_length,enclosed_length;
int field_term_char,line_term_char,enclosed_char,escape_char;
Term_string m_field_term; /* FIELDS TERMINATED BY 'string' */
Term_string m_line_term; /* LINES TERMINATED BY 'string' */
Term_string m_line_start; /* LINES STARTING BY 'string' */
int enclosed_char,escape_char;
int *stack,*stack_pos;
bool found_end_of_line,start_of_line,eof;
NET *io_net;
......@@ -101,7 +134,11 @@ class READ_INFO {
int read_fixed_length(void);
int next_line(void);
char unescape(char chr);
int terminator(const uchar *ptr, uint length);
bool terminator(const uchar *ptr, uint length);
bool terminator(const Term_string &str)
{ return terminator(str.ptr(), str.length()); }
bool terminator(int chr, const Term_string &str)
{ return str.initial_byte() == chr && terminator(str); }
bool find_start_of_fields();
/* load xml */
List<XML_TAG> taglist;
......@@ -1348,8 +1385,9 @@ READ_INFO::READ_INFO(THD *thd, File file_par, uint tot_length, CHARSET_INFO *cs,
String &field_term, String &line_start, String &line_term,
String &enclosed_par, int escape, bool get_it_from_net,
bool is_fifo)
:file(file_par), fixed_length(tot_length), escape_char(escape),
found_end_of_line(false), eof(false),
:file(file_par), fixed_length(tot_length),
m_field_term(field_term), m_line_term(line_term), m_line_start(line_start),
escape_char(escape), found_end_of_line(false), eof(false),
error(false), line_cuted(false), found_null(false), read_charset(cs)
{
data.set_thread_specific();
......@@ -1358,39 +1396,17 @@ READ_INFO::READ_INFO(THD *thd, File file_par, uint tot_length, CHARSET_INFO *cs,
Otherwise, non-ascii terminators will be negative on some platforms,
and positive on others (depending on the implementation of char).
*/
field_term_ptr=
static_cast<const uchar*>(static_cast<const void*>(field_term.ptr()));
field_term_length= field_term.length();
line_term_ptr=
static_cast<const uchar*>(static_cast<const void*>(line_term.ptr()));
line_term_length= line_term.length();
level= 0; /* for load xml */
if (line_start.length() == 0)
{
line_start_ptr=0;
start_of_line= 0;
}
else
{
line_start_ptr= line_start.ptr();
line_start_end=line_start_ptr+line_start.length();
start_of_line= 1;
}
start_of_line= line_start.length() != 0;
/* If field_terminator == line_terminator, don't use line_terminator */
if (field_term_length == line_term_length &&
!memcmp(field_term_ptr,line_term_ptr,field_term_length))
{
line_term_length=0;
line_term_ptr= NULL;
}
enclosed_char= (enclosed_length=enclosed_par.length()) ?
(uchar) enclosed_par[0] : INT_MAX;
field_term_char= field_term_length ? field_term_ptr[0] : INT_MAX;
line_term_char= line_term_length ? line_term_ptr[0] : INT_MAX;
if (m_field_term.eq(m_line_term))
m_line_term.reset();
enclosed_char= enclosed_par.length() ? (uchar) enclosed_par[0] : INT_MAX;
/* Set of a stack for unget if long terminators */
uint length= MY_MAX(cs->mbmaxlen, MY_MAX(field_term_length, line_term_length)) + 1;
uint length= MY_MAX(cs->mbmaxlen, MY_MAX(m_field_term.length(),
m_line_term.length())) + 1;
set_if_bigger(length,line_start.length());
stack= stack_pos= (int*) thd->alloc(sizeof(int) * length);
......@@ -1432,7 +1448,7 @@ READ_INFO::~READ_INFO()
}
inline int READ_INFO::terminator(const uchar *ptr,uint length)
inline bool READ_INFO::terminator(const uchar *ptr, uint length)
{
int chr=0; // Keep gcc happy
uint i;
......@@ -1444,11 +1460,11 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
}
}
if (i == length)
return 1;
return true;
PUSH(chr);
while (i-- > 1)
PUSH(*--ptr);
return 0;
return false;
}
......@@ -1516,12 +1532,12 @@ int READ_INFO::read_field()
chr= escape_char;
}
#ifdef ALLOW_LINESEPARATOR_IN_STRINGS
if (chr == line_term_char)
if (chr == m_line_term.initial_byte())
#else
if (chr == line_term_char && found_enclosed_char == INT_MAX)
if (chr == m_line_term.initial_byte() && found_enclosed_char == INT_MAX)
#endif
{
if (terminator(line_term_ptr,line_term_length))
if (terminator(m_line_term))
{ // Maybe unexpected linefeed
enclosed=0;
found_end_of_line=1;
......@@ -1538,9 +1554,7 @@ int READ_INFO::read_field()
continue;
}
// End of enclosed field if followed by field_term or line_term
if (chr == my_b_EOF ||
(chr == line_term_char && terminator(line_term_ptr,
line_term_length)))
if (chr == my_b_EOF || terminator(chr, m_line_term))
{
/* Maybe unexpected linefeed */
enclosed=1;
......@@ -1549,8 +1563,7 @@ int READ_INFO::read_field()
row_end= (uchar *) data.end();
return 0;
}
if (chr == field_term_char &&
terminator(field_term_ptr,field_term_length))
if (terminator(chr, m_field_term))
{
enclosed=1;
row_start= (uchar *) data.ptr() + 1;
......@@ -1565,9 +1578,10 @@ int READ_INFO::read_field()
/* copy the found term character to 'to' */
chr= found_enclosed_char;
}
else if (chr == field_term_char && found_enclosed_char == INT_MAX)
else if (chr == m_field_term.initial_byte() &&
found_enclosed_char == INT_MAX)
{
if (terminator(field_term_ptr,field_term_length))
if (terminator(m_field_term))
{
enclosed=0;
row_start= (uchar *) data.ptr();
......@@ -1665,13 +1679,10 @@ int READ_INFO::read_fixed_length()
data.append((uchar) unescape((char) chr));
continue;
}
if (chr == line_term_char)
{
if (terminator(line_term_ptr,line_term_length))
{ // Maybe unexpected linefeed
found_end_of_line=1;
break;
}
if (terminator(chr, m_line_term))
{ // Maybe unexpected linefeed
found_end_of_line= true;
break;
}
data.append(chr);
}
......@@ -1690,14 +1701,14 @@ int READ_INFO::read_fixed_length()
int READ_INFO::next_line()
{
line_cuted=0;
start_of_line= line_start_ptr != 0;
start_of_line= m_line_start.length() != 0;
if (found_end_of_line || eof)
{
found_end_of_line=0;
return eof;
}
found_end_of_line=0;
if (!line_term_length)
if (!m_line_term.length())
return 0; // No lines
for (;;)
{
......@@ -1725,10 +1736,11 @@ int READ_INFO::next_line()
or a broken byte sequence was found.
Check if the sequence is a prefix of the "LINES TERMINATED BY" string.
*/
if ((uchar) buf[0] == line_term_char && i <= line_term_length &&
!memcmp(buf, line_term_ptr, i))
if ((uchar) buf[0] == m_line_term.initial_byte() &&
i <= m_line_term.length() &&
!memcmp(buf, m_line_term.ptr(), i))
{
if (line_term_length == i)
if (m_line_term.length() == i)
{
/*
We found a "LINES TERMINATED BY" string that consists
......@@ -1742,10 +1754,11 @@ int READ_INFO::next_line()
that still needs to be checked is (line_term_length - i).
Note, READ_INFO::terminator() assumes that the leftmost byte of the
argument is already scanned from the file and is checked to
be a known prefix (e.g. against line_term_char).
be a known prefix (e.g. against line_term.initial_char()).
So we need to pass one extra byte.
*/
if (terminator(line_term_ptr + i - 1, line_term_length - i + 1))
if (terminator(m_line_term.ptr() + i - 1,
m_line_term.length() - i + 1))
return 0;
}
/*
......@@ -1768,7 +1781,7 @@ int READ_INFO::next_line()
return 1;
continue;
}
if (buf[0] == line_term_char && terminator(line_term_ptr,line_term_length))
if (terminator(buf[0], m_line_term))
return 0;
line_cuted= true;
}
......@@ -1777,30 +1790,12 @@ int READ_INFO::next_line()
bool READ_INFO::find_start_of_fields()
{
int chr;
try_again:
do
for (int chr= GET ; chr != my_b_EOF ; chr= GET)
{
if ((chr=GET) == my_b_EOF)
{
found_end_of_line=eof=1;
return 1;
}
} while ((char) chr != line_start_ptr[0]);
for (const char *ptr=line_start_ptr+1 ; ptr != line_start_end ; ptr++)
{
chr=GET; // Eof will be checked later
if ((char) chr != *ptr)
{ // Can't be line_start
PUSH(chr);
while (--ptr != line_start_ptr)
{ // Restart with next char
PUSH( *ptr);
}
goto try_again;
}
if (terminator(chr, m_line_start))
return false;
}
return 0;
return (found_end_of_line= eof= true);
}
......@@ -1990,11 +1985,11 @@ int READ_INFO::read_xml(THD *thd)
}
// row tag should be in ROWS IDENTIFIED BY '<row>' - stored in line_term
if((tag.length() == line_term_length -2) &&
(memcmp(tag.ptr(), line_term_ptr + 1, tag.length()) == 0))
if((tag.length() == m_line_term.length() - 2) &&
(memcmp(tag.ptr(), m_line_term.ptr() + 1, tag.length()) == 0))
{
DBUG_PRINT("read_xml", ("start-of-row: %i %s %s",
level,tag.c_ptr_safe(), line_term_ptr));
level,tag.c_ptr_safe(), m_line_term.ptr()));
}
if(chr == ' ' || chr == '>')
......@@ -2061,8 +2056,8 @@ int READ_INFO::read_xml(THD *thd)
chr= my_tospace(GET);
}
if((tag.length() == line_term_length -2) &&
(memcmp(tag.ptr(), line_term_ptr + 1, tag.length()) == 0))
if((tag.length() == m_line_term.length() - 2) &&
(memcmp(tag.ptr(), m_line_term.ptr() + 1, tag.length()) == 0))
{
DBUG_PRINT("read_xml", ("found end-of-row %i %s",
level, tag.c_ptr_safe()));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment