Commit 4b3826ba authored by bar@mysql.com's avatar bar@mysql.com

Bug#22638 SOUNDEX broken for international characters

Problem: SOUNDEX returned an invalid string for international
characters in multi-byte character sets.
For example: for the Chinese/Japanese 3-byte character
_utf8 0xE99885 it took only the very first byte 0xE9,
put it into the output string and then appended three
DIGIT ZERO characters, so the result was 0xE9303030 - which
is an invalid utf8 string.
Fix: make SOUNDEX() multi-byte aware, so that it puts only complete
characters into the result and thus returns only valid strings.
This patch also makes SOUNDEX() compatible with UCS2.
parent f0a95a4e
...@@ -839,6 +839,24 @@ lily ...@@ -839,6 +839,24 @@ lily
river river
drop table t1; drop table t1;
deallocate prepare stmt; deallocate prepare stmt;
set names latin1;
set character_set_connection=ucs2;
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
soundex('') soundex('he') soundex('hello all folks') soundex('#3556 in bugdb')
H000 H4142 I51231
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
hex(soundex('')) hex(soundex('he')) hex(soundex('hello all folks')) hex(soundex('#3556 in bugdb'))
0048003000300030 00480034003100340032 004900350031003200330031
select 'mood' sounds like 'mud';
'mood' sounds like 'mud'
1
select hex(soundex(_ucs2 0x041004110412));
hex(soundex(_ucs2 0x041004110412))
0410003000300030
select hex(soundex(_ucs2 0x00BF00C0));
hex(soundex(_ucs2 0x00BF00C0))
00C0003000300030
set names latin1;
create table t1(a blob, b text charset utf8, c text charset ucs2); create table t1(a blob, b text charset utf8, c text charset ucs2);
select data_type, character_octet_length, character_maximum_length select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1'; from information_schema.columns where table_name='t1';
......
...@@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test'); ...@@ -854,6 +854,18 @@ select * from t1 where soundex(a) = soundex('test');
id a id a
1 Test 1 Test
drop table t1; drop table t1;
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB)
阅000
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB))
E99885303030
select soundex(_utf8 0xD091D092D093);
soundex(_utf8 0xD091D092D093)
Б000
select hex(soundex(_utf8 0xD091D092D093));
hex(soundex(_utf8 0xD091D092D093))
D091303030
SET collation_connection='utf8_general_ci'; SET collation_connection='utf8_general_ci';
create table t1 select repeat('a',4000) a; create table t1 select repeat('a',4000) a;
delete from t1; delete from t1;
......
...@@ -572,6 +572,20 @@ select utext from t1 where utext like '%%'; ...@@ -572,6 +572,20 @@ select utext from t1 where utext like '%%';
drop table t1; drop table t1;
deallocate prepare stmt; deallocate prepare stmt;
#
# Bug#22638 SOUNDEX broken for international characters
#
set names latin1;
set character_set_connection=ucs2;
select soundex(''),soundex('he'),soundex('hello all folks'),soundex('#3556 in bugdb');
select hex(soundex('')),hex(soundex('he')),hex(soundex('hello all folks')),hex(soundex('#3556 in bugdb'));
select 'mood' sounds like 'mud';
# Cyrillic A, BE, VE
select hex(soundex(_ucs2 0x041004110412));
# Make sure that "U+00BF INVERTED QUESTION MARK" is not considered a letter
select hex(soundex(_ucs2 0x00BF00C0));
set names latin1;
# #
# Bug #14290: character_maximum_length for text fields # Bug #14290: character_maximum_length for text fields
# #
......
...@@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST'); ...@@ -702,6 +702,14 @@ select * from t1 where soundex(a) = soundex('TEST');
select * from t1 where soundex(a) = soundex('test'); select * from t1 where soundex(a) = soundex('test');
drop table t1; drop table t1;
#
# Bug#22638 SOUNDEX broken for international characters
#
select soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB);
select hex(soundex(_utf8 0xE99885E8A788E99A8FE697B6E69BB4E696B0E79A84E696B0E997BB));
select soundex(_utf8 0xD091D092D093);
select hex(soundex(_utf8 0xD091D092D093));
SET collation_connection='utf8_general_ci'; SET collation_connection='utf8_general_ci';
-- source include/ctype_filesort.inc -- source include/ctype_filesort.inc
......
...@@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec() ...@@ -1805,7 +1805,8 @@ void Item_func_soundex::fix_length_and_dec()
{ {
collation.set(args[0]->collation); collation.set(args[0]->collation);
max_length=args[0]->max_length; max_length=args[0]->max_length;
set_if_bigger(max_length,4); set_if_bigger(max_length, 4 * collation.collation->mbminlen);
tmp_value.set_charset(collation.collation);
} }
...@@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec() ...@@ -1815,14 +1816,15 @@ void Item_func_soundex::fix_length_and_dec()
else return 0 else return 0
*/ */
static char soundex_toupper(char ch) static int soundex_toupper(int ch)
{ {
return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch; return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch;
} }
static char get_scode(char *ptr)
static char get_scode(int wc)
{ {
uchar ch= soundex_toupper(*ptr); int ch= soundex_toupper(wc);
if (ch < 'A' || ch > 'Z') if (ch < 'A' || ch > 'Z')
{ {
// Thread extended alfa (country spec) // Thread extended alfa (country spec)
...@@ -1832,46 +1834,121 @@ static char get_scode(char *ptr) ...@@ -1832,46 +1834,121 @@ static char get_scode(char *ptr)
} }
/*
  Coarse letter test on a Unicode code point.

  Returns true when wc is a Basic Latin letter (A..Z or a..z) or any
  code point at or above U+00C0; returns false for everything else,
  including negative values.

  The 0xC0 threshold is a heuristic: codes between 'z' and U+00C0 are
  treated as controls and punctuation, and "U+00C0 LATIN CAPITAL LETTER A
  WITH GRAVE" is taken as the first letter after 'z'. No per-character
  classification is done above the threshold.
*/
static bool my_uni_isalpha(int wc)
{
  if (wc >= 0xC0)
    return true;
  if (wc >= 'A' && wc <= 'Z')
    return true;
  return wc >= 'a' && wc <= 'z';
}
String *Item_func_soundex::val_str(String *str) String *Item_func_soundex::val_str(String *str)
{ {
DBUG_ASSERT(fixed == 1); DBUG_ASSERT(fixed == 1);
String *res =args[0]->val_str(str); String *res =args[0]->val_str(str);
char last_ch,ch; char last_ch,ch;
CHARSET_INFO *cs= collation.collation; CHARSET_INFO *cs= collation.collation;
my_wc_t wc;
uint nchars;
int rc;
if ((null_value=args[0]->null_value)) if ((null_value= args[0]->null_value))
return 0; /* purecov: inspected */ return 0; /* purecov: inspected */
if (tmp_value.alloc(max(res->length(),4))) if (tmp_value.alloc(max(res->length(), 4 * cs->mbminlen)))
return str; /* purecov: inspected */ return str; /* purecov: inspected */
char *to= (char *) tmp_value.ptr(); char *to= (char *) tmp_value.ptr();
char *from= (char *) res->ptr(), *end=from+res->length(); char *to_end= to + tmp_value.alloced_length();
tmp_value.set_charset(cs); char *from= (char *) res->ptr(), *end= from + res->length();
while (from != end && !my_isalpha(cs,*from)) // Skip pre-space for ( ; ; ) /* Skip pre-space */
from++; /* purecov: inspected */ {
if (from == end) if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
return &my_empty_string; // No alpha characters. return &my_empty_string; /* EOL or invalid byte sequence */
*to++ = soundex_toupper(*from); // Copy first letter
last_ch = get_scode(from); // code of the first letter if (rc == 1 && cs->ctype)
// for the first 'double-letter check. {
// Loop on input letters until /* Single byte letter found */
// end of input (null) or output if (my_isalpha(cs, *from))
// letter code count = 3 {
for (from++ ; from < end ; from++) last_ch= get_scode(*from); // Code of the first letter
{ *to++= soundex_toupper(*from++); // Copy first letter
if (!my_isalpha(cs,*from)) break;
}
from++;
}
else
{
from+= rc;
if (my_uni_isalpha(wc))
{
/* Multibyte letter found */
wc= soundex_toupper(wc);
last_ch= get_scode(wc); // Code of the first letter
if ((rc= cs->cset->wc_mb(cs, wc, (uchar*) to, (uchar*) to_end)) <= 0)
{
/* Extra safety - should not really happen */
DBUG_ASSERT(false);
return &my_empty_string;
}
to+= rc;
break;
}
}
}
/*
last_ch is now set to the first 'double-letter' check.
loop on input letters until end of input
*/
for (nchars= 1 ; ; )
{
if ((rc= cs->cset->mb_wc(cs, &wc, (uchar*) from, (uchar*) end)) <= 0)
break; /* EOL or invalid byte sequence */
if (rc == 1 && cs->ctype)
{
if (!my_isalpha(cs, *from++))
continue; continue;
ch=get_scode(from); }
else
{
from+= rc;
if (!my_uni_isalpha(wc))
continue;
}
ch= get_scode(wc);
if ((ch != '0') && (ch != last_ch)) // if not skipped or double if ((ch != '0') && (ch != last_ch)) // if not skipped or double
{ {
*to++ = ch; // letter, copy to output // letter, copy to output
last_ch = ch; // save code of last input letter if ((rc= cs->cset->wc_mb(cs, (my_wc_t) ch,
(uchar*) to, (uchar*) to_end)) <= 0)
{
// Extra safety - should not really happen
DBUG_ASSERT(false);
break;
}
to+= rc;
nchars++;
last_ch= ch; // save code of last input letter
} // for next double-letter check } // for next double-letter check
} }
for (end=(char*) tmp_value.ptr()+4 ; to < end ; to++)
*to = '0'; /* Pad up to 4 characters with DIGIT ZERO, if the string is shorter */
*to=0; // end string if (nchars < 4)
{
uint nbytes= (4 - nchars) * cs->mbminlen;
cs->cset->fill(cs, to, nbytes, '0');
to+= nbytes;
}
tmp_value.length((uint) (to-tmp_value.ptr())); tmp_value.length((uint) (to-tmp_value.ptr()));
return &tmp_value; return &tmp_value;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment