Commit 35d8ac35 authored by Alexander Barkov

MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"

parent 7ab7f532
...@@ -1802,5 +1802,28 @@ DROP TABLE t1; ...@@ -1802,5 +1802,28 @@ DROP TABLE t1;
--echo # --echo #
--echo # --echo #
--echo # End of tests --echo # ctype_utf8mb4.inc: Start of 10.1 tests
--echo #
--echo #
--echo # MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
--echo #
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
SELECT HEX(a) FROM t1 ORDER BY a DESC;
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
SELECT HEX(a) FROM t1 ORDER BY a DESC;
DROP TABLE t1;
--echo #
--echo # ctype_utf8mb4.inc: End of 10.1 tests
--echo #
--echo #
--echo # End of ctype_utf8mb4.inc
--echo # --echo #
...@@ -2495,5 +2495,57 @@ DROP TABLE t1; ...@@ -2495,5 +2495,57 @@ DROP TABLE t1;
# End of 5.5 tests # End of 5.5 tests
# #
# #
# End of tests # ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
# #
...@@ -2642,5 +2642,57 @@ DROP TABLE t1; ...@@ -2642,5 +2642,57 @@ DROP TABLE t1;
# End of 5.5 tests # End of 5.5 tests
# #
# #
# End of tests # ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
# #
...@@ -2642,5 +2642,57 @@ DROP TABLE t1; ...@@ -2642,5 +2642,57 @@ DROP TABLE t1;
# End of 5.5 tests # End of 5.5 tests
# #
# #
# End of tests # ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
# #
...@@ -85,7 +85,8 @@ ...@@ -85,7 +85,8 @@
IS_CONTINUATION_BYTE(b3) && \ IS_CONTINUATION_BYTE(b3) && \
(b0 >= 0xf1 || b1 >= 0x90) && \ (b0 >= 0xf1 || b1 >= 0x90) && \
(b0 <= 0xf3 || b1 <= 0x8F)) (b0 <= 0xf3 || b1 <= 0x8F))
/*
  Four-byte sequence check that additionally requires the lead byte b0
  to be below 0xF5 before delegating to IS_UTF8MB4_STEP2.
  b0 is expanded both here and inside IS_UTF8MB4_STEP2, so the arguments
  should be free of side effects.
*/
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
/* Convert individual bytes to Unicode code points */ /* Convert individual bytes to Unicode code points */
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\ #define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
...@@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src) ...@@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
} }
/**
  Compare two utf8mb4 strings character by character.

  Every character is decoded with my_mb_wc_utf8mb4() and then mapped
  through my_tosort_unicode() before the two code points are compared.
  If either string cannot be decoded at the current position, the rest
  of both strings is compared with bincmp_utf8mb4() instead.

  @param cs           Character set pointer (its caseinfo and state are used).
  @param s            First string to compare.
  @param slen         Length of 's' in bytes.
  @param t            Second string to compare.
  @param tlen         Length of 't' in bytes.
  @param t_is_prefix  When set, only the unconsumed part of 't' decides
                      the result once the common part has matched, so
                      's' compares equal if all of 't' was matched.

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
static int
my_strnncoll_utf8mb4(CHARSET_INFO *cs,
                     const uchar *s, size_t slen,
                     const uchar *t, size_t tlen,
                     my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

  while (s < s_end && t < t_end)
  {
    int s_chlen= my_mb_wc_utf8mb4(cs, &s_wc, s, s_end);
    int t_chlen= my_mb_wc_utf8mb4(cs, &t_wc, t, t_end);

    /* Badly formed input on either side: fall back to a bytewise compare */
    if (s_chlen <= 0 || t_chlen <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);

    my_tosort_unicode(uni_plane, &s_wc, cs->state);
    my_tosort_unicode(uni_plane, &t_wc, cs->state);

    if (s_wc != t_wc)
      return s_wc < t_wc ? -1 : 1;

    s+= s_chlen;
    t+= t_chlen;
  }

  /* At least one string is exhausted; decide by what is left over */
  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Compare two utf8mb4 strings, ignoring trailing spaces.

  If one string is shorter than the other, it is treated as if it were
  extended with spaces up to the length of the longer one.
  This guarantees that the following holds:
    "a"   == "a "
    "a\0" <  "a"
    "a\0" <  "a "

  Characters are decoded with my_mb_wc_utf8mb4() and mapped through
  my_tosort_unicode(). If either string cannot be decoded at the current
  position, the rest of both strings is compared with bincmp_utf8mb4().

  @param cs    Character set pointer (its caseinfo and state are used).
  @param s     First string to compare.
  @param slen  Length of 's' in bytes.
  @param t     Second string to compare.
  @param tlen  Length of 't' in bytes.
  @param diff_if_only_endspace_difference
               Set to 1 if the strings should be regarded as different
               when they differ only in trailing spaces. The flag is
               forced to FALSE unless
               VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
               is defined.

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
static int
my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool diff_if_only_endspace_difference)
{
  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  const uchar *tail;
  const uchar *tail_end;
  int longer_sign;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  while (s < s_end && t < t_end)
  {
    int s_chlen= my_mb_wc_utf8mb4(cs, &s_wc, s, s_end);
    int t_chlen= my_mb_wc_utf8mb4(cs, &t_wc, t, t_end);

    /* Badly formed input on either side: fall back to a bytewise compare */
    if (s_chlen <= 0 || t_chlen <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);

    my_tosort_unicode(uni_plane, &s_wc, cs->state);
    my_tosort_unicode(uni_plane, &t_wc, cs->state);

    if (s_wc != t_wc)
      return s_wc < t_wc ? -1 : 1;

    s+= s_chlen;
    t+= t_chlen;
  }

  /* Number of bytes left unread in each string */
  slen= (size_t) (s_end - s);
  tlen= (size_t) (t_end - t);

  if (slen == tlen)
    return 0;

  /*
    Select the string that still has bytes left. longer_sign is the
    result to return when that string turns out to be the greater one:
    1 if it is 's', -1 if it is 't'.
  */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t_end;
    longer_sign= -1;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    longer_sign= 1;
  }

  /*
    In UTF-8 every multibyte character, and every multibyte head byte,
    is greater than space. So a byte greater than space always means
    that the longer string is greater, and the tail can be scanned
    byte by byte, as in the 8bit version, without decoding complete
    multibyte sequences.
  */
  for ( ; tail < tail_end; tail++)
  {
    if (*tail < ' ')
      return -longer_sign;
    if (*tail > ' ')
      return longer_sign;
  }

  /* The strings differ only in trailing spaces */
  return diff_if_only_endspace_difference ? longer_sign : 0;
}
/** /**
Compare 0-terminated UTF8 strings. Compare 0-terminated UTF8 strings.
...@@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs, ...@@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN #undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* my_well_formed_char_length_utf8mb4 */ /* my_well_formed_char_length_utf8mb4 */
/*
  Instantiate the comparison functions for utf8mb4_general_ci by
  including strcoll.ic with the macros below. MY_FUNCTION_NAME(x)
  expands to my_<x>_utf8mb4_general_ci.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci
/* Enables the four-byte character branch in strcoll.ic */
#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
/*
  Weight of a byte that is not part of a well-formed character.
  Per MDEV-8417 such bytes are meant to compare as greater than any
  non-broken character.
*/
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0)
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1)
#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2)
/*
There is no mapping between code point and weight for non-BMP characters
in utf8mb4_general_ci. Just using code point as weight.
*/
#define WEIGHT_MB4(b0,b1,b2,b3) UTF8MB4_CODE(b0,b1,b2,b3)
#include "strcoll.ic"
/*
  Instantiate the comparison functions for utf8mb4_bin.
  MY_FUNCTION_NAME(x) expands to my_<x>_utf8mb4_bin. Every well-formed
  character uses its code point as its weight.
  NOTE(review): IS_MB4_CHAR is not redefined here; presumably the
  definition above is still in effect after strcoll.ic - confirm.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB1(b0) ((int) (uchar) (b0))
#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
#include "strcoll.ic"
static uint static uint
my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e) my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
{ {
...@@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c) ...@@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c)
static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
{ {
NULL, /* init */ NULL, /* init */
my_strnncoll_utf8mb4, my_strnncoll_utf8mb4_general_ci,
my_strnncollsp_utf8mb4, my_strnncollsp_utf8mb4_general_ci,
my_strnxfrm_unicode, my_strnxfrm_unicode,
my_strnxfrmlen_unicode, my_strnxfrmlen_unicode,
my_like_range_mb, my_like_range_mb,
...@@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler= ...@@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
{ {
NULL, /* init */ NULL, /* init */
my_strnncoll_mb_bin, my_strnncoll_utf8mb4_bin,
my_strnncollsp_mb_bin, my_strnncollsp_utf8mb4_bin,
my_strnxfrm_unicode_full_bin, my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin, my_strnxfrmlen_unicode_full_bin,
my_like_range_mb, my_like_range_mb,
......
...@@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
} }
#endif #endif
#ifdef IS_MB4_CHAR
if (str + 4 > end) /* Incomplete four-byte character */
goto bad;
if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
{
*weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
return 4; /* A valid four-byte character */
}
#endif
bad: bad:
*weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */
return 1; return 1;
...@@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), ...@@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
#undef WEIGHT_MB1 #undef WEIGHT_MB1
#undef WEIGHT_MB2 #undef WEIGHT_MB2
#undef WEIGHT_MB3 #undef WEIGHT_MB3
#undef WEIGHT_MB4
#undef WEIGHT_PAD_SPACE #undef WEIGHT_PAD_SPACE
...@@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= ...@@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
}; };
/*
  Comparison test cases involving four-byte utf8mb4 sequences:
  well-formed four-byte characters against broken or incomplete
  sequences, and broken sequences against each other.
  Each entry holds two strings followed by a number; presumably the
  number is the expected sign of comparing the first string with the
  second - confirm against STRNNCOLL_PARAM.
  The list ends with an entry whose string pointers are NULL.
*/
STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
{
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
/* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xC2"), -1}, /* MB4 vs incomplete MB2 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\x7F"),-1}, /* MB4 vs broken MB3 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0\xC0"),-1}, /* MB4 vs broken MB3 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xE0\xA0"), -1}, /* MB4 vs incomplete MB3 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80"),-1}, /* MB4 vs incomplete MB4 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\x7F"),-1},/* MB4 vs broken MB4 */
{CSTR("\xF4\x8F\xBF\xBF"), CSTR("\xF0\x90\x80\xC0"),-1},/* MB4 vs broken MB4 */
/* Broken MB4 vs incomplete/broken MB3 */
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0"), 1}, /* Broken MB4 vs incomplete MB3 */
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\x7F"),1}, /* Broken MB4 vs broken MB3 */
{CSTR("\xF0\x90\x80\x7F"), CSTR("\xE0\xA0\xC0"),1}, /* Broken MB4 vs broken MB3 */
/*
Broken MB4 vs incomplete MB4:
The three leftmost bytes are compared binary, the fourth byte is compared
to auto-padded space.
*/
{CSTR("\xF0\x90\x80\x1F"), CSTR("\xF0\x90\x80"),-1}, /* Broken MB4 vs incomplete MB4 */
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80"),1}, /* Broken MB4 vs incomplete MB4 */
/* Broken MB4 vs broken MB4 */
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\x7F"),-1},/* Broken MB4 vs broken MB4 */
{CSTR("\xF0\x90\x80\x7E"), CSTR("\xF0\x90\x80\xC0"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0}
};
static void static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{ {
...@@ -497,6 +540,12 @@ test_strcollsp() ...@@ -497,6 +540,12 @@ test_strcollsp()
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_bin, strcoll_utf8mb3_common);
#endif
#ifdef HAVE_CHARSET_utf8mb4
failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8mb4_general_ci, strcoll_utf8mb4_common);
failed+= strcollsp(&my_charset_utf8mb4_bin, strcoll_utf8mb4_common);
#endif #endif
return failed; return failed;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment