Commit 8b4af2a2 authored by Alexander Nozdrin's avatar Alexander Nozdrin

Bug#55980 Character sets: supplementary character _bin ordering is wrong

Problem:
- ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned
  results in a wrong order, because old functions
  (supporting only BMP range) were used to handle these collations.
- Additionally, utf16_bin did not sort supplementary characters
  between U+D7FF and U+E000, as the WL#1213 specification requires.

include/m_ctype.h:
  Adding prototypes.
mysql-test/include/ctype_filesort2.inc:
  Adding a new shared test file.
mysql-test/t/ctype_utf8mb4.test:
  Adding tests.
strings/ctype-ucs2.c:
  - Fixing my_strnncoll[sp]_utf16_bin to compare
    binary representation instead of code points,
    to make columns with indexes sort correctly.
  - Fixing my_collation_utf32_bin_handler and
    my_collation_utf16_bin_handler to use new
    functions.
strings/ctype-utf8.c:
  - Adding my_strnxfrm[len]_unicode_full_bin()
    to handle utf8mb4_bin, utf16_bin and utf32_bin,
    using 3 bytes per weight.
    This function also performs special reordering in case of utf16_bin.
  - Fixing my_collation_utf8mb4_bin handler to use the
    new function.
parent 90eef290
......@@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO *,
uchar *dst, size_t dstlen,
const uchar *src, size_t srclen);
/*
  Store sorting weights using 3 bytes per character.
  Shared between utf8mb4_bin, utf16_bin and utf32_bin.
  Fills the whole destination buffer and returns dstlen.
*/
size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
uchar *dst, size_t dstlen,
const uchar *src, size_t srclen);
/*
  Length companion of my_strnxfrm_unicode_full_bin():
  returns ((len + 3) / mbmaxlen) * 3 for the given character set.
*/
size_t my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t);
int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str, const char *str_end,
const char *wildstr, const char *wildend,
......
#
# Testing filesort for full Unicode character sets
# with supplementary characters.
#
# The column "a" takes its character set and collation from the result of
# REPEAT('a',1), so the including test is expected to set
# collation_connection before sourcing this file.
#
# Inserted values (given as utf8mb4 byte sequences):
#   0xCE85     = U+0385    BMP, below the surrogate range
#   0xEFBE9D   = U+FF9D    BMP, above the surrogate range
#   0xF0908E84 = U+10384   supplementary
#   0xF4808080 = U+100000  supplementary
#

--echo #
--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong
--echo #
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
# Order without a key on the column.
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
# Repeat the same query once a key exists; both result sets must list the
# rows in the same order (presumably this exercises the index-based
# ordering path as opposed to filesort -- confirm with EXPLAIN if needed).
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
DROP TABLE IF EXISTS t1;
......@@ -611,6 +611,31 @@ utf16_bin 00610009
utf16_bin 0061
utf16_bin 00610020
drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
0385 CE85
D800DF84 F0908E84
DBC0DC00 F4808080
FF9D EFBE9D
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
0385 CE85
D800DF84 F0908E84
DBC0DC00 F4808080
FF9D EFBE9D
DROP TABLE IF EXISTS t1;
select @@collation_connection;
@@collation_connection
utf16_bin
......
......@@ -610,6 +610,31 @@ utf32_bin 0000006100000009
utf32_bin 00000061
utf32_bin 0000006100000020
drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
00000385 CE85
0000FF9D EFBE9D
00010384 F0908E84
00100000 F4808080
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
00000385 CE85
0000FF9D EFBE9D
00010384 F0908E84
00100000 F4808080
DROP TABLE IF EXISTS t1;
select @@collation_connection;
@@collation_connection
utf32_bin
......
......@@ -987,6 +987,31 @@ utf8mb4_bin 6109
utf8mb4_bin 61
utf8mb4_bin 6120
drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
CE85 CE85
EFBE9D EFBE9D
F0908E84 F0908E84
F4808080 F4808080
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
CE85 CE85
EFBE9D EFBE9D
F0908E84 F0908E84
F4808080 F4808080
DROP TABLE IF EXISTS t1;
select @@collation_connection;
@@collation_connection
utf8mb4_bin
......
......@@ -326,6 +326,7 @@ SET collation_connection='utf16_general_ci';
SET NAMES latin1;
SET collation_connection='utf16_bin';
-- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc
#
......
......@@ -328,6 +328,7 @@ SET collation_connection='utf32_general_ci';
SET NAMES latin1;
SET collation_connection='utf32_bin';
-- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc
#
......
......@@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_general_ci';
-- source include/ctype_german.inc
SET collation_connection='utf8mb4_bin';
-- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc
#
......
......@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
}
if (s_wc != t_wc)
{
return s_wc > t_wc ? 1 : -1;
return my_bincmp(s, s + s_res, t, t + t_res);
}
s+= s_res;
......@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
if (s_wc != t_wc)
{
return s_wc > t_wc ? 1 : -1;
return my_bincmp(s, s + s_res, t, t + t_res);
}
s+= s_res;
......@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
NULL, /* init */
my_strnncoll_utf16_bin,
my_strnncollsp_utf16_bin,
my_strnxfrm_unicode,
my_strnxfrmlen_simple,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_utf16,
my_wildcmp_utf16_bin,
my_strcasecmp_mb2_or_mb4,
......@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
NULL, /* init */
my_strnncoll_utf32_bin,
my_strnncollsp_utf32_bin,
my_strnxfrm_unicode,
my_strnxfrmlen_utf32,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_utf32,
my_wildcmp_utf32_bin,
my_strcasecmp_mb2_or_mb4,
......
......@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
/*
This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32
Store sorting weights using 2 bytes per character.
This function is shared between
- utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
which support BMP only (U+0000..U+FFFF).
- utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
which map all supplementary characters to weight 0xFFFD.
*/
size_t
my_strnxfrm_unicode(CHARSET_INFO *cs,
......@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
}
/*
  Store sorting weights using 3 bytes per character.
  This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.

  SYNOPSIS
    my_strnxfrm_unicode_full_bin()
    cs      Character set; must have MY_CS_BINSORT set in cs->state.
            cs->cset->mb_wc() is used to decode the source string.
    dst     Destination buffer for the sort key.
    dstlen  Size of the destination buffer in bytes.
    src     Source string (must not be NULL).
    srclen  Length of the source string in bytes.

  DESCRIPTION
    Every decoded character is written as a 3-byte big-endian weight.
    Decoding stops at the end of the source, at the first sequence that
    mb_wc() rejects (result <= 0), or when fewer than 3 bytes remain
    in the destination.
    The rest of the destination is padded with the weight of the space
    character ([00][00][20]); a trailing remainder of one or two bytes
    (when dstlen is not divisible by 3) is zeroed.

  RETURN
    dstlen - the destination buffer is always filled completely.
*/
size_t
my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
                             uchar *dst, size_t dstlen,
                             const uchar *src, size_t srclen)
{
  my_wc_t wc;
  uchar *de= dst + dstlen;
  const uchar *se= src + srclen;

  LINT_INIT(wc);
  DBUG_ASSERT(src);
  DBUG_ASSERT(cs->state & MY_CS_BINSORT);

  /*
    Compare the remaining space against the chunk size instead of
    computing "de - 2": for dstlen < 2 that pointer would lie before
    the start of the buffer, which is undefined behaviour.
  */
  while (de - dst >= 3)
  {
    int res;
    if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
      break;
    src+= res;

    if (cs->mbminlen == 2) /* utf16_bin */
    {
      /*
        Reorder code points to weights as follows:
        U+0000..U+D7FF    -> [00][00][00]..[00][D7][FF] BMP part #1
        U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
        U+E000..U+FFFF    -> [20][E0][00]..[20][FF][FF] BMP part #2
      */
      if (wc >= 0xE000 && wc <= 0xFFFF)
        wc+= 0x200000;
    }

    *dst++= (uchar) (wc >> 16);
    *dst++= (uchar) ((wc >> 8) & 0xFF);
    *dst++= (uchar) (wc & 0xFF);
  }

  while (de - dst >= 3) /* Fill the tail with keys for space character */
  {
    *dst++= 0x00;
    *dst++= 0x00;
    *dst++= 0x20;
  }

  /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
  if (dst < de)
  {
    *dst++= 0x00;
    if (dst < de)
      *dst= 0x00;
  }

  return dstlen;
}
/*
  Length function paired with my_strnxfrm_unicode_full_bin().

  The byte length "len" is converted to a count by dividing (len + 3)
  by cs->mbmaxlen (integer division), and 3 bytes are accounted for
  each unit of that count.
*/
size_t
my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
{
  size_t nweights= (len + 3) / cs->mbmaxlen;
  return nweights * 3;
}
#endif /* HAVE_UNIDATA */
......@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
NULL, /* init */
my_strnncoll_mb_bin,
my_strnncollsp_mb_bin,
my_strnxfrm_unicode,
my_strnxfrmlen_utf8mb4,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_mb,
my_wildcmp_mb_bin,
my_strcasecmp_mb_bin,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment