Commit e4f8cea3 authored by Alexander Barkov

MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"

parent a5f4412b
......@@ -2206,3 +2206,21 @@ DEALLOCATE PREPARE stmt;
#
# End of 10.0 tests
#
#
# Start of 10.1 tests
#
#
# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
SELECT COUNT(DISTINCT a) FROM t1;
COUNT(DISTINCT a)
1
DROP TABLE t1;
SELECT _utf32 0x10001=_utf32 0x10002;
_utf32 0x10001=_utf32 0x10002
1
#
# End of 10.1 tests
#
......@@ -956,3 +956,20 @@ DEALLOCATE PREPARE stmt;
--echo # End of 10.0 tests
--echo #
--echo #
--echo # Start of 10.1 tests
--echo #
--echo #
--echo # MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
--echo #
# Make sure that all non-BMP characters are compared as equal
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
SELECT COUNT(DISTINCT a) FROM t1;
DROP TABLE t1;
SELECT _utf32 0x10001=_utf32 0x10002;
--echo #
--echo # End of 10.1 tests
--echo #
......@@ -1892,6 +1892,34 @@ struct charset_info_st my_charset_utf16le_bin=
*/
/*
  IS_UTF32_MBHEAD4(b0,b1): true when the two leading bytes of a 4-byte unit
  are 0x00 and <= 0x10, i.e. the encoded value cannot exceed 0x10FFFF.
  IS_MB4_CHAR(b0,b1,b2,b3): same check; the two trailing bytes are not
  inspected.
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))
/*
  MY_UTF32_WC4(b0,b1,b2,b3): assemble four big-endian bytes into one
  my_wc_t value.

  Every argument is parenthesized and widened to my_wc_t before shifting.
  Shifting a promoted "int" byte >= 0x80 left by 24 would overflow a signed
  int (undefined behavior), and an unparenthesized argument expression could
  bind incorrectly to the shift operator.
*/
#define MY_UTF32_WC4(b0,b1,b2,b3) ((((my_wc_t) (b0)) << 24) + \
                                   (((my_wc_t) (b1)) << 16) + \
                                   (((my_wc_t) (b2)) << 8) + \
                                   ((my_wc_t) (b3)))
/*
  Sort weight of one utf32 character given as four big-endian bytes.

  Code points up to 0xFFFF are looked up in my_unicase_default: when the
  page for the code point exists its "sort" value is returned, otherwise the
  code point itself is the weight. Every code point above 0xFFFF yields
  MY_CS_REPLACEMENT_CHARACTER, so all such characters weigh the same.
*/
static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
                                             uchar b2, uchar b3)
{
  my_wc_t code= MY_UTF32_WC4(b0, b1, b2, b3);
  MY_UNICASE_CHARACTER *case_page;

  if (code > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  case_page= my_unicase_default.page[code >> 8];
  if (!case_page)
    return (int) code;
  return (int) case_page[code & 0xFF].sort;
}
/*
  Instantiate the utf32_general_ci comparison functions from "strcoll.ic".
  MY_FUNCTION_NAME(x) expands to my_<x>_utf32_general_ci.
  NOTE(review): the template is not visible here; presumably it generates
  my_strnncoll_utf32_general_ci and my_strnncollsp_utf32_general_ci - confirm.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
/*
  Weight of a broken byte: 0xFF0000 plus the byte value. This is above the
  largest utf32 code point (0x10FFFF), so a broken byte compares greater
  than any well-formed character.
*/
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
/* Weight of a well-formed character: case-insensitive weight */
#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.ic"
/*
  Instantiate the utf32_bin comparison functions from the same template.
  NOTE(review): the three macros are redefined here without #undef, so
  "strcoll.ic" presumably undefines them at its end - verify.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
/* Weight of a well-formed character: the code point itself */
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.ic"
/* The character-detection macros are only needed by the template above */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
static int
my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
......@@ -1899,7 +1927,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
{
if (s + 4 > e)
return MY_CS_TOOSMALL4;
*pwc= (s[0] << 24) + (s[1] << 16) + (s[2] << 8) + (s[3]);
*pwc= MY_UTF32_WC4(s[0], s[1], s[2], s[3]);
return *pwc > 0x10FFFF ? MY_CS_ILSEQ : 4;
}
......@@ -2029,144 +2057,6 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
}
/*
  Compare two utf32 strings character by character after passing each
  character through my_tosort_utf32().

  If either side fails to decode (my_utf32_uni() returns <= 0), the
  remaining bytes of both strings are compared with my_bincmp().
  When all compared characters match, the result is derived from the
  remaining lengths; with t_is_prefix set only the unconsumed part of 't'
  is taken into account.
*/
static int
my_strnncoll_utf32(CHARSET_INFO *cs,
                   const uchar *s, size_t slen,
                   const uchar *t, size_t tlen,
                   my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Incorrect string, compare by char value */
    if (left_len <= 0 || right_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf32(uni_plane, &left_wc);
    my_tosort_utf32(uni_plane, &right_wc);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
Compare two utf32 strings, discarding end space.
If one string is shorter than the other, the shorter one is treated as if
it were space-extended so that the strings have equal length.
This will ensure that the following things hold:
"a" == "a "
"a\0" < "a"
"a\0" < "a "
@param cs Character set pointer.
@param s First string to compare.
@param slen Length of 's' in bytes; asserted to be a multiple of 4.
@param t Second string to compare.
@param tlen Length of 't' in bytes; asserted to be a multiple of 4.
@param diff_if_only_endspace_difference
If set, strings that differ only in trailing spaces compare as different.
Forced to FALSE unless
VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is defined.
@return Comparison result.
@retval Negative number, if s is less than t.
@retval 0, if s is equal to t
@retval Positive number, if s > t
*/
static int
my_strnncollsp_utf32(CHARSET_INFO *cs,
const uchar *s, size_t slen,
const uchar *t, size_t tlen,
my_bool diff_if_only_endspace_difference)
{
int res;
my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
const uchar *se= s + slen, *te= t + tlen;
MY_UNICASE_INFO *uni_plane= cs->caseinfo;
DBUG_ASSERT((slen % 4) == 0);
DBUG_ASSERT((tlen % 4) == 0);
#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
diff_if_only_endspace_difference= FALSE;
#endif
/* Phase 1: compare the common part character by character */
while ( s < se && t < te )
{
int s_res= my_utf32_uni(cs, &s_wc, s, se);
int t_res= my_utf32_uni(cs, &t_wc, t, te);
if ( s_res <= 0 || t_res <= 0 )
{
/* Incorrect string, compare the remaining bytes bytewise */
return my_bincmp(s, se, t, te);
}
/*
  NOTE(review): my_tosort_utf32() presumably replaces each code point
  with its sort weight from uni_plane - confirm against its definition.
*/
my_tosort_utf32(uni_plane, &s_wc);
my_tosort_utf32(uni_plane, &t_wc);
if ( s_wc != t_wc )
{
return s_wc > t_wc ? 1 : -1;
}
s+= s_res;
t+= t_res;
}
/* Phase 2: at most one of the strings has bytes left; reuse slen/tlen for them */
slen= (size_t) (se - s);
tlen= (size_t) (te - t);
res= 0;
if (slen != tlen)
{
int s_res, swap= 1;
if (diff_if_only_endspace_difference)
res= 1; /* Assume 's' is bigger */
if (slen < tlen)
{
/* 't' is the longer string: scan its tail instead and invert the sign */
slen= tlen;
s= t;
se= te;
swap= -1;
res= -res;
}
/* Compare every remaining character of the longer string against a space */
for ( ; s < se; s+= s_res)
{
if ((s_res= my_utf32_uni(cs, &s_wc, s, se)) < 0)
{
/* A negative decode result is not expected in the tail */
DBUG_ASSERT(0);
return 0;
}
if (s_wc != ' ')
return (s_wc < ' ') ? -swap : swap;
}
}
/* Only trailing spaces differ (or nothing differs) */
return res;
}
static uint
my_ismbchar_utf32(CHARSET_INFO *cs __attribute__((unused)),
const char *b,
......@@ -2578,97 +2468,6 @@ my_wildcmp_utf32_bin(CHARSET_INFO *cs,
}
/*
  Compare two utf32 strings by code point value.

  If either side fails to decode (my_utf32_uni() returns <= 0), the
  remaining bytes of both strings are compared with my_bincmp().
  When all compared characters match, the result is derived from the
  remaining lengths; with t_is_prefix set only the unconsumed part of 't'
  is taken into account.
*/
static int
my_strnncoll_utf32_bin(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Incorrect string, compare by char value */
    if (left_len <= 0 || right_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
static inline my_wc_t
my_utf32_get(const uchar *s)
{
return
((my_wc_t) s[0] << 24) +
((my_wc_t) s[1] << 16) +
((my_wc_t) s[2] << 8) +
s[3];
}
/*
  Compare two utf32 strings by code point value, ignoring trailing spaces.

  @param cs    Unused.
  @param s     First string.
  @param slen  Length of 's' in bytes, expected to be a multiple of 4.
  @param t     Second string.
  @param tlen  Length of 't' in bytes, expected to be a multiple of 4.
  @param diff_if_only_endspace_difference  Unused.

  @return  1 or -1 on the first differing character, or on the first
           non-space character in the tail of the longer string;
           0 when the strings are equal up to trailing spaces.

  Both loops only consume complete 4-byte units. The lengths are asserted
  to be multiples of 4, but assertions may be compiled out; without the
  bounds below a stray length would make "minlen-= 4" wrap around (size_t
  is unsigned) and my_utf32_get() would read past the end of the buffers.
  A trailing incomplete unit (1..3 bytes) is ignored.
*/
static int
my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)),
                         const uchar *s, size_t slen,
                         const uchar *t, size_t tlen,
                         my_bool diff_if_only_endspace_difference
                         __attribute__((unused)))
{
  const uchar *se, *te;
  size_t minlen;

  DBUG_ASSERT((slen % 4) == 0);
  DBUG_ASSERT((tlen % 4) == 0);

  se= s + slen;
  te= t + tlen;

  /* Common part: stop as soon as fewer than 4 bytes remain on either side */
  for (minlen= MY_MIN(slen, tlen); minlen >= 4; minlen-= 4)
  {
    my_wc_t s_wc= my_utf32_get(s);
    my_wc_t t_wc= my_utf32_get(t);
    if (s_wc != t_wc)
      return s_wc > t_wc ? 1 : -1;

    s+= 4;
    t+= 4;
  }

  if (slen != tlen)
  {
    int swap= 1;
    if (slen < tlen)
    {
      /* 't' is the longer string: scan its tail and invert the sign */
      s= t;
      se= te;
      swap= -1;
    }

    /* Compare every complete remaining character against a space */
    for ( ; (size_t) (se - s) >= 4; s+= 4)
    {
      my_wc_t s_wc= my_utf32_get(s);
      if (s_wc != ' ')
        return (s_wc < ' ') ? -swap : swap;
    }
  }
  return 0;
}
static size_t
my_scan_utf32(CHARSET_INFO *cs,
const char *str, const char *end, int sequence_type)
......@@ -2696,8 +2495,8 @@ my_scan_utf32(CHARSET_INFO *cs,
static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
{
NULL, /* init */
my_strnncoll_utf32,
my_strnncollsp_utf32,
my_strnncoll_utf32_general_ci,
my_strnncollsp_utf32_general_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_generic,
......
......@@ -537,6 +537,55 @@ static STRNNCOLL_PARAM strcoll_utf16le_general_ci[]=
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xFF\xDB\xFF\xDF"), 0},/* Non-BMP MB4 vs non-BMP MB4 */
{CSTR("\x00\x00"), CSTR("\x00\xD8\x01\xDC"), -1},/* U+0000 vs non-BMP MB4 */
{CSTR("\x00\x00"), CSTR("\xFF\xDB\xFF\xDF"), -1},/* U+0000 vs non-BMP MB4 */
{NULL, 0, NULL, 0, 0}
};
/*
  Comparison cases shared by the utf32 collations: well-formed characters
  against incomplete (fewer than 4 bytes) or broken (value above 0x10FFFF)
  sequences.
  NOTE(review): the last field is presumably the expected sign of comparing
  the first string with the second - confirm against STRNNCOLL_PARAM.
*/
static STRNNCOLL_PARAM strcoll_utf32_common[]=
{
/* Minimum character: U+0000 == _utf32 0x00000000 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
/* Minimum non-BMP character: U+10000 == _utf32 0x00010000 */
{CSTR("\x00\x01\x00\x00"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x01\x00\x00"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x01\x00\x00"), CSTR("\x00\x00\x00"),-1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x01\x00\x00"), CSTR("\x00\x20\x00\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\x01\x00\x00"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
/* Maximum character: U+10FFFF == _utf32 0x0010FFFF */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\x00\x00\x00"), -1}, /* MB4 vs incomplete MB4 */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\x20\x00\x00\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\x10\xFF\xFF"), CSTR("\xFF\xFF\xFF\xFF"),-1},/* MB4 vs broken MB4 */
/* Broken MB4 vs incomplete/broken MB4 */
{CSTR("\x00\x20\x00\x00"), CSTR("\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
{CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
{CSTR("\x00\x20\x00\x00"), CSTR("\x00\x00\x00"), 1}, /* Broken MB4 vs incomplete MB4 */
{CSTR("\x00\x20\x00\x00"), CSTR("\x00\x20\x00\x01"),-1},/* Broken MB4 vs broken MB4 */
/* End-of-list marker */
{NULL, 0, NULL, 0, 0}
};
/*
  Comparison cases specific to utf32_general_ci: characters outside the BMP
  (above U+FFFF) compare as equal to each other and greater than U+0000.
  NOTE(review): the last field is presumably the expected sign of comparing
  the first string with the second - confirm against STRNNCOLL_PARAM.
*/
static STRNNCOLL_PARAM strcoll_utf32_general_ci[]=
{
/* Two non-BMP characters are compared as equal */
{CSTR("\x00\x01\x00\x00"), CSTR("\x00\x01\x00\x01"), 0}, /* non-BMP MB4 vs non-BMP MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x00"), -1}, /* U+0000 vs non-BMP MB4 */
{CSTR("\x00\x00\x00\x00"), CSTR("\x00\x01\x00\x01"), -1}, /* U+0000 vs non-BMP MB4 */
/* End-of-list marker */
{NULL, 0, NULL, 0, 0}
};
......@@ -688,6 +737,11 @@ test_strcollsp()
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
#endif
#ifdef HAVE_CHARSET_utf32
failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_common);
failed+= strcollsp(&my_charset_utf32_general_ci, strcoll_utf32_general_ci);
failed+= strcollsp(&my_charset_utf32_bin, strcoll_utf32_common);
#endif
#ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment