Commit 6c0cfad1 authored by unknown's avatar unknown

Bug#16233: XML: ExtractValue() fails with special characters

ExtractValue didn't understand tag and attribute names
consisting of "tricky" national letters (e.g. accented Latin letters).
This happened because the XPath lexer recognized only the basic
Latin letters a..z as part of an identifier.

Fixed to recognize all letters by means of the new "full ctype"
functionality, which was added recently.


mysql-test/r/xml.result:
  Adding test case
mysql-test/t/xml.test:
  Adding test case
sql/item_xmlfunc.cc:
  Using the recently implemented "true" ctype functionality
      to treat all national letters as valid tag names.
      Only basic Latin letters worked so far.
strings/ctype-simple.c:
  A bug fix: ctype is an array of 257 elements;
      adding an offset of 1 to address the correct element.
parent a514095a
...@@ -615,3 +615,26 @@ select extractValue('<e>1</e>','last()'); ...@@ -615,3 +615,26 @@ select extractValue('<e>1</e>','last()');
ERROR HY000: XPATH syntax error: '' ERROR HY000: XPATH syntax error: ''
select extractValue('<e><a>1</a></e>','/e/'); select extractValue('<e><a>1</a></e>','/e/');
ERROR HY000: XPATH syntax error: '' ERROR HY000: XPATH syntax error: ''
set names utf8;
select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r')
r
select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ')
Ñ
select extractValue('<Ñ r="r"/>','/Ñ/@r');
extractValue('<Ñ r="r"/>','/Ñ/@r')
r
select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
extractValue('<r Ñ="Ñ"/>','/r/@Ñ')
Ñ
DROP PROCEDURE IF EXISTS p2;
CREATE PROCEDURE p2 ()
BEGIN
DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
SELECT EXTRACTVALUE(p,'/Ñ/r');
END//
CALL p2();
EXTRACTVALUE(p,'/Ñ/r')
A
DROP PROCEDURE p2;
...@@ -295,3 +295,23 @@ select extractValue('<e>1</e>','last()'); ...@@ -295,3 +295,23 @@ select extractValue('<e>1</e>','last()');
--error 1105 --error 1105
select extractValue('<e><a>1</a></e>','/e/'); select extractValue('<e><a>1</a></e>','/e/');
#
# Bug#16233: XML: ExtractValue() fails with special characters
#
set names utf8;
select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
select extractValue('<Ñ r="r"/>','/Ñ/@r');
select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
--disable_warnings
DROP PROCEDURE IF EXISTS p2;
--enable_warnings
DELIMITER //;
CREATE PROCEDURE p2 ()
BEGIN
DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
SELECT EXTRACTVALUE(p,'/Ñ/r');
END//
DELIMITER ;//
CALL p2();
DROP PROCEDURE p2;
...@@ -1304,30 +1304,6 @@ my_xpath_init(MY_XPATH *xpath) ...@@ -1304,30 +1304,6 @@ my_xpath_init(MY_XPATH *xpath)
} }
/*
Some ctype-alike helper functions. Note, we cannot
reuse cs->ident_map[], because in Xpath, unlike in SQL,
dash character is a valid identifier part.
*/
static int
my_xident_beg(int c)
{
return (((c) >= 'a' && (c) <= 'z') ||
((c) >= 'A' && (c) <= 'Z') ||
((c) == '_'));
}
static int
my_xident_body(int c)
{
return (((c) >= 'a' && (c) <= 'z') ||
((c) >= 'A' && (c) <= 'Z') ||
((c) >= '0' && (c) <= '9') ||
((c)=='-') || ((c) == '_'));
}
static int static int
my_xdigit(int c) my_xdigit(int c)
{ {
...@@ -1350,7 +1326,7 @@ static void ...@@ -1350,7 +1326,7 @@ static void
my_xpath_lex_scan(MY_XPATH *xpath, my_xpath_lex_scan(MY_XPATH *xpath,
MY_XPATH_LEX *lex, const char *beg, const char *end) MY_XPATH_LEX *lex, const char *beg, const char *end)
{ {
int ch; int ch, ctype, length;
for ( ; beg < end && *beg == ' ' ; beg++); // skip leading spaces for ( ; beg < end && *beg == ' ' ; beg++); // skip leading spaces
lex->beg= beg; lex->beg= beg;
...@@ -1360,20 +1336,20 @@ my_xpath_lex_scan(MY_XPATH *xpath, ...@@ -1360,20 +1336,20 @@ my_xpath_lex_scan(MY_XPATH *xpath,
lex->term= MY_XPATH_LEX_EOF; // end of line reached lex->term= MY_XPATH_LEX_EOF; // end of line reached
return; return;
} }
ch= *beg++;
if (ch > 0 && ch < 128 && simpletok[ch]) // Check ident, or a function call, or a keyword
if ((length= xpath->cs->cset->ctype(xpath->cs, &ctype,
(const uchar*) beg,
(const uchar*) end)) > 0 &&
((ctype & (_MY_L | _MY_U)) || *beg == '_'))
{ {
// a token consisting of one character found // scan untill the end of the idenfitier
lex->end= beg; for (beg+= length;
lex->term= ch; (length= xpath->cs->cset->ctype(xpath->cs, &ctype,
return; (const uchar*) beg,
} (const uchar*) end)) > 0 &&
((ctype & (_MY_L | _MY_U | _MY_NMR)) || *beg == '_' || *beg == '-') ;
if (my_xident_beg(ch)) // ident, or a function call, or a keyword beg+= length) /* no op */;
{
// scan until the end of the identifier
for ( ; beg < end && my_xident_body(*beg); beg++);
lex->end= beg; lex->end= beg;
// check if a function call // check if a function call
...@@ -1388,6 +1364,18 @@ my_xpath_lex_scan(MY_XPATH *xpath, ...@@ -1388,6 +1364,18 @@ my_xpath_lex_scan(MY_XPATH *xpath,
return; return;
} }
ch= *beg++;
if (ch > 0 && ch < 128 && simpletok[ch])
{
// a token consisting of one character found
lex->end= beg;
lex->term= ch;
return;
}
if (my_xdigit(ch)) // a sequence of digits if (my_xdigit(ch)) // a sequence of digits
{ {
for ( ; beg < end && my_xdigit(*beg) ; beg++); for ( ; beg < end && my_xdigit(*beg) ; beg++);
......
...@@ -1362,7 +1362,7 @@ int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype, ...@@ -1362,7 +1362,7 @@ int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype,
*ctype= 0; *ctype= 0;
return MY_CS_TOOSMALL; return MY_CS_TOOSMALL;
} }
*ctype= cs->ctype[*s]; *ctype= cs->ctype[*s + 1];
return 1; return 1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment