Commit b2e324a2 authored by Alexander Barkov's avatar Alexander Barkov

MDEV-8416 ucs2: compare broken bytes as "greater than any non-broken character"

MDEV-8418 utf16: compare broken bytes as "greater than any non-broken character"
parent 35d8ac35
This diff is collapsed.
...@@ -64,13 +64,16 @@ ...@@ -64,13 +64,16 @@
@return - the number of bytes scanned @return - the number of bytes scanned
The including source file must define the following macros: The including source file must define the following macros:
IS_MB1_CHAR(x) IS_MB1_CHAR(b0) - for character sets that have MB1 characters
IS_MB1_MB2HEAD_GAP(x) - optional, for better performance IS_MB1_MBHEAD_UNUSED_GAP(b0) - optional, for better performance
IS_MB2_CHAR(x,y) IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2 IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
IS_MB4_CHAR(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_PAD_SPACE WEIGHT_PAD_SPACE
WEIGHT_MB1(x) WEIGHT_MB1(b0) - for character sets that have MB1 characters
WEIGHT_MB2(x,y) WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_ILSEQ(x) WEIGHT_ILSEQ(x)
*/ */
static inline uint static inline uint
...@@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
return 0; return 0;
} }
#ifdef IS_MB1_CHAR
if (IS_MB1_CHAR(*str)) if (IS_MB1_CHAR(*str))
{ {
*weight= WEIGHT_MB1(*str); /* A valid single byte character*/ *weight= WEIGHT_MB1(*str); /* A valid single byte character*/
return 1; return 1;
} }
#endif
#ifdef IS_MB1_MBHEAD_UNUSED_GAP #ifdef IS_MB1_MBHEAD_UNUSED_GAP
/* /*
...@@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
goto bad; goto bad;
#endif #endif
#ifdef IS_MB2_CHAR
if (str + 2 > end) /* The string ended unexpectedly */ if (str + 2 > end) /* The string ended unexpectedly */
goto bad; /* Treat as a bad byte */ goto bad; /* Treat as a bad byte */
...@@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
*weight= WEIGHT_MB2(str[0], str[1]); *weight= WEIGHT_MB2(str[0], str[1]);
return 2; /* A valid two-byte character */ return 2; /* A valid two-byte character */
} }
#endif
#ifdef IS_MB3_CHAR #ifdef IS_MB3_CHAR
if (str + 3 > end) /* Incomplete three-byte character */ if (str + 3 > end) /* Incomplete three-byte character */
......
...@@ -149,7 +149,7 @@ typedef struct ...@@ -149,7 +149,7 @@ typedef struct
A1A1 - MB2 or 8BIT+8BIT A1A1 - MB2 or 8BIT+8BIT
E0E0 - MB2 E0E0 - MB2
*/ */
STRNNCOLL_PARAM strcoll_mb2_common[]= static STRNNCOLL_PARAM strcoll_mb2_common[]=
{ {
/* Compare two good sequences */ /* Compare two good sequences */
{CSTR(""), CSTR(""), 0}, {CSTR(""), CSTR(""), 0},
...@@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]= ...@@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]=
/* /*
For character sets that have good mb2 characters A1A1 and F9FE For character sets that have good mb2 characters A1A1 and F9FE
*/ */
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
{ {
/* Compare two good characters */ /* Compare two good characters */
{CSTR(""), CSTR("\xF9\xFE"), -1}, {CSTR(""), CSTR("\xF9\xFE"), -1},
...@@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= ...@@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
A1A1 - a good mb2 character A1A1 - a good mb2 character
F9FE - a bad sequence F9FE - a bad sequence
*/ */
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
{ {
/* Compare a good character to an illegal or an incomplete sequence */ /* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xF9\xFE"), -1}, {CSTR(""), CSTR("\xF9\xFE"), -1},
...@@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= ...@@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
F9 - ILSEQ or H2 F9 - ILSEQ or H2
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ) F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
*/ */
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
{ {
/* Compare two good characters */ /* Compare two good characters */
{CSTR(""), CSTR("\xA1"), -1}, {CSTR(""), CSTR("\xA1"), -1},
...@@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= ...@@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
and sort in this order: and sort in this order:
8181 < A1 < E0E0 8181 < A1 < E0E0
*/ */
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
{ {
{CSTR("\x81\x81"), CSTR("\xA1"), -1}, {CSTR("\x81\x81"), CSTR("\xA1"), -1},
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1}, {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
...@@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= ...@@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
/* /*
A shared test for eucjpms and ujis. A shared test for eucjpms and ujis.
*/ */
STRNNCOLL_PARAM strcoll_ujis[]= static STRNNCOLL_PARAM strcoll_ujis[]=
{ {
{CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */ {CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */
{CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */ {CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */
...@@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]= ...@@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]=
}; };
STRNNCOLL_PARAM strcoll_utf8mb3_common[]= static STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
{ {
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */
...@@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= ...@@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
}; };
STRNNCOLL_PARAM strcoll_utf8mb4_common[]= static STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
{ {
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */ /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
...@@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]= ...@@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
}; };
/*
  Two-byte test vectors shared by the ucs2, utf16 and utf16le collations
  (see test_strcollsp()).
  Each entry holds two byte strings and the expected comparison result
  (-1, 0 or 1). A single trailing byte cannot form a two-byte character
  ("incomplete MB2"); the expectations below have such bytes sorting after
  any well-formed character.
*/
static STRNNCOLL_PARAM strcoll_ucs2_common[]=
{
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */
{CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete */
{NULL, 0, NULL, 0, 0} /* Terminating entry (presumably what strcollsp() stops on) */
};
/*
  Tests that involve comparison to SPACE (explicit, or padded).
  Big-endian two-byte vectors around U+0020, used for the ucs2 and utf16
  collations in test_strcollsp(). The "vs empty" rows expect an empty
  string to compare as if it were padded with spaces: below for 0x1F,
  equal for 0x20, above for 0x21.
*/
static STRNNCOLL_PARAM strcoll_ucs2_space[]=
{
{CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x1F"), CSTR(""), -1}, /* MB2 vs empty */
{CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 vs empty: SPACE equals the padding */
{CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 vs empty */
{NULL, 0, NULL, 0, 0} /* Terminating entry */
};
/*
  Tests that involve comparison to SPACE (explicit, or padded).
  Little-endian counterpart of the big-endian SPACE vectors: the same
  characters with the two bytes of each unit swapped. Used for the
  utf16le collations in test_strcollsp().
*/
static STRNNCOLL_PARAM strcoll_utf16le_space[]=
{
{CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 vs empty */
{CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 vs empty: SPACE equals the padding */
{CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 vs empty */
{NULL, 0, NULL, 0, 0} /* Terminating entry */
};
/*
  Big-endian UTF-16 test vectors involving four-byte (surrogate pair)
  characters, used for the utf16 collations in test_strcollsp().
  Every entry has the expected result -1.
  In the row comments "broken" means the bytes are all present but do not
  form a valid character, and "incomplete" means the string ends too early.
*/
static STRNNCOLL_PARAM strcoll_utf16_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4 */
/*
  NOTE(review): the right operand of the row above starts with DCFF, a low
  surrogate, so it looks like "broken MB2 + incomplete" rather than an
  incomplete MB4 (compare "\xD8\x00\xDC" in the U+10000 group) - confirm
  the intended bytes.
*/
/* Ordering inside the four-byte range: good vs good, then broken vs broken */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* MB4 vs MB4 (both are well-formed pairs) */
{CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0} /* Terminating entry */
};
/*
  Little-endian UTF-16 test vectors involving four-byte (surrogate pair)
  characters, used for the utf16le collations in test_strcollsp().
  Every entry has the expected result -1.
  In the row comments "broken" means the bytes are all present but do not
  form a valid character, and "incomplete" means the string ends too early.
*/
static STRNNCOLL_PARAM strcoll_utf16le_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
/*
  NOTE(review): the second unit below is D000, while the other broken-pair
  rows in this table use E000 ("\x00\xD8\x00\xE0"). Both are outside the
  low surrogate range, so the row is still "broken", but confirm whether
  \xD0 was meant to be \xE0.
*/
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4 */
/*
  NOTE(review): the right operand of the row above starts with the unit
  DCFF, a low surrogate, so it looks like "broken MB2 + incomplete" rather
  than an incomplete MB4 (compare "\x00\xD8\x00" in the U+10000 group) -
  confirm the intended bytes.
*/
/* Ordering inside the four-byte range: good vs good, then broken vs broken */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* MB4 vs MB4 (both are well-formed pairs) */
{CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0} /* Terminating entry */
};
static void static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{ {
...@@ -528,6 +623,12 @@ test_strcollsp() ...@@ -528,6 +623,12 @@ test_strcollsp()
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
#endif #endif
#ifdef HAVE_CHARSET_ucs2
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space);
#endif
#ifdef HAVE_CHARSET_ujis #ifdef HAVE_CHARSET_ujis
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common);
...@@ -536,6 +637,21 @@ test_strcollsp() ...@@ -536,6 +637,21 @@ test_strcollsp()
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis);
#endif #endif
#ifdef HAVE_CHARSET_utf16
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
#endif
#ifdef HAVE_CHARSET_utf8 #ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment