Commit b2e324a2 authored by Alexander Barkov

MDEV-8416 ucs2: compare broken bytes as "greater than any non-broken character"

MDEV-8418 utf16: compare broken bytes as "greater than any non-broken character"
parent 35d8ac35
...@@ -1186,11 +1186,14 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), ...@@ -1186,11 +1186,14 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 #define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00
#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF #define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF
#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) #define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8)
#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) #define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC)
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) /* Test if a byte is a leading byte of a high or low surrogate head: */
#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8)
/* Test if a Unicode code point is a high or low surrogate head */
#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800)
#define MY_UTF16_WC2(a, b) ((a << 8) + b) #define MY_UTF16_WC2(a, b) ((a << 8) + b)
/* /*
a= 110110?? (<< 18) a= 110110?? (<< 18)
...@@ -1201,6 +1204,30 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), ...@@ -1201,6 +1204,30 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ #define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \
((c & 3) << 8) + d + 0x10000) ((c & 3) << 8) + d + 0x10000)
/*
  Character classification for the strcoll.ic template, utf16 (big-endian)
  byte order: b0 is the leading byte of the first code unit and b2 the
  leading byte of the second one.
  - A two-byte character is any code unit whose leading byte is not a
    surrogate head (neither high nor low).
  - A four-byte character is a high surrogate followed by a low surrogate.
*/
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b0))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b0) && MY_UTF16_LOW_HEAD(b2))
/*
  Sort weight of a two-byte utf16 character for general_ci.

  b0 and b1 are combined with MY_UTF16_WC2() (b0 is the high byte) into a
  code point. If my_unicase_default has a page for the code point's high
  byte, the "sort" value stored for the character is returned; otherwise
  the code point itself is the weight.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= MY_UTF16_WC2(b0, b1);

  if (!my_unicase_default.page[code >> 8])
    return (int) code;                    /* No case info for this page */

  return (int) my_unicase_default.page[code >> 8][code & 0xFF].sort;
}
/*
  Instantiate the strcoll.ic template for utf16_general_ci.
  MY_FUNCTION_NAME() builds the names of the generated functions
  (e.g. my_strnncoll_utf16_general_ci, my_strnncollsp_utf16_general_ci).
  WEIGHT_ILSEQ() maps a broken byte to 0xFF0000 + byte, which is above
  every weight produced by WEIGHT_MB2()/WEIGHT_MB4(), so broken bytes
  compare greater than any well-formed character.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1)
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.ic"
/*
  Instantiate the template again for utf16_bin: weights are the raw code
  points, without case folding.
  NOTE(review): MY_FUNCTION_NAME and the WEIGHT_* macros are redefined here
  without an #undef, so strcoll.ic presumably undefines them itself - confirm.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b0, b1))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b0, b1, b2, b3))
#include "strcoll.ic"
/* The classification macros are dropped here; utf16le defines its own. */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
static int static int
my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e) my_wc_t *pwc, const uchar *s, const uchar *e)
...@@ -1371,146 +1398,6 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen, ...@@ -1371,146 +1398,6 @@ my_casedn_utf16(CHARSET_INFO *cs, char *src, size_t srclen,
} }
/*
  Compare two utf16 strings character by character.

  Each character is decoded with the character set's mb_wc() routine and
  mapped through my_tosort_utf16() before being compared. If either string
  fails to decode at the current position, the remainders are compared
  with my_bincmp().

  When one string runs out first, the result is (t - te) if t_is_prefix is
  set, and the difference of the remaining byte lengths otherwise.

  Returns a negative value, 0, or a positive value.
*/
static int
my_strnncoll_utf16(CHARSET_INFO *cs,
                   const uchar *s, size_t slen,
                   const uchar *t, size_t tlen,
                   my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  while (s < s_end && t < t_end)
  {
    int s_len= mb_wc(cs, &s_code, s, s_end);
    int t_len= mb_wc(cs, &t_code, t, t_end);

    /* Malformed input on either side: fall back to a bytewise compare */
    if (s_len <= 0 || t_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf16(uni_plane, &s_code);
    my_tosort_utf16(uni_plane, &t_code);

    if (s_code > t_code)
      return 1;
    if (s_code < t_code)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Compare two utf16 strings, discarding end space.

  If one string is shorter than the other, the shorter one is treated as
  if it were extended with spaces, so that the strings have equal length.
  This ensures that the following things hold:

    "a"   == "a "
    "a\0" <  "a"
    "a\0" <  "a "

  @param cs    Character set pointer.
  @param s     First string to compare.
  @param slen  Length of 's' in bytes (checked to be even by DBUG_ASSERT).
  @param t     Second string to compare.
  @param tlen  Length of 't' in bytes (checked to be even by DBUG_ASSERT).
  @param diff_if_only_endspace_difference
               Forced to FALSE unless
               VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is
               defined. When TRUE, strings that differ only by trailing
               spaces are reported as different.

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
static int
my_strnncollsp_utf16(CHARSET_INFO *cs,
                     const uchar *s, size_t slen,
                     const uchar *t, size_t tlen,
                     my_bool diff_if_only_endspace_difference)
{
  int res;
  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
  const uchar *se= s + slen, *te= t + tlen;
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;

  DBUG_ASSERT((slen % 2) == 0);
  DBUG_ASSERT((tlen % 2) == 0);

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  /* Compare the common part character by character */
  while (s < se && t < te)
  {
    int s_res= mb_wc(cs, &s_wc, s, se);
    int t_res= mb_wc(cs, &t_wc, t, te);

    if (s_res <= 0 || t_res <= 0)
    {
      /* Incorrect string, compare bytewise */
      return my_bincmp(s, se, t, te);
    }

    my_tosort_utf16(uni_plane, &s_wc);
    my_tosort_utf16(uni_plane, &t_wc);

    if (s_wc != t_wc)
    {
      return s_wc > t_wc ? 1 : -1;
    }

    s+= s_res;
    t+= t_res;
  }

  slen= (size_t) (se - s);
  tlen= (size_t) (te - t);
  res= 0;

  if (slen != tlen)
  {
    int s_res, swap= 1;
    if (diff_if_only_endspace_difference)
      res= 1; /* Assume 's' is bigger */
    if (slen < tlen)
    {
      /* 't' is the longer string: scan its tail and invert the result */
      s= t;
      se= te;
      swap= -1;
      res= -res;
    }

    /* Compare the tail of the longer string against implicit spaces */
    for ( ; s < se; s+= s_res)
    {
      s_res= mb_wc(cs, &s_wc, s, se);
      if (s_res < 0)
      {
        DBUG_ASSERT(0);
        return 0;
      }
      if (s_res == 0)
      {
        /*
          mb_wc() consumed nothing. The main loop above treats a zero
          return as an incorrect sequence too. Without this check 's'
          would never advance and s_wc would hold a stale value, which
          can loop forever when that stale value is a space. Order the
          string with the malformed tail after the space-padded one.
        */
        return swap;
      }
      if (s_wc != ' ')
        return (s_wc < ' ') ? -swap : swap;
    }
  }
  return res;
}
static uint static uint
my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e) my_ismbchar_utf16(CHARSET_INFO *cs, const char *b, const char *e)
{ {
...@@ -1623,111 +1510,6 @@ my_wildcmp_utf16_bin(CHARSET_INFO *cs, ...@@ -1623,111 +1510,6 @@ my_wildcmp_utf16_bin(CHARSET_INFO *cs,
} }
/*
  Compare two utf16 strings by code point, without case folding.

  Characters are decoded with the character set's mb_wc() routine. If
  either string fails to decode at the current position, the remainders
  are compared with my_bincmp().

  When one string runs out first, the result is (t - te) if t_is_prefix is
  set, and the difference of the remaining byte lengths otherwise.
*/
static int
my_strnncoll_utf16_bin(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  while (s < s_end && t < t_end)
  {
    int s_len= mb_wc(cs, &s_code, s, s_end);
    int t_len= mb_wc(cs, &t_code, t, t_end);

    /* Malformed input on either side: fall back to a bytewise compare */
    if (s_len <= 0 || t_len <= 0)
      return my_bincmp(s, s_end, t, t_end);

    if (s_code > t_code)
      return 1;
    if (s_code < t_code)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Compare two utf16 strings by code point, discarding end space.

  The shorter string is treated as if it were extended with spaces to the
  length of the longer one.

  @param cs    Character set pointer.
  @param s     First string to compare.
  @param slen  Length of 's' in bytes (checked to be even by DBUG_ASSERT).
  @param t     Second string to compare.
  @param tlen  Length of 't' in bytes (checked to be even by DBUG_ASSERT).
  @param diff_if_only_endspace_difference
               Forced to FALSE unless
               VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is
               defined. When TRUE, strings that differ only by trailing
               spaces are reported as different.

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
static int
my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
                         const uchar *s, size_t slen,
                         const uchar *t, size_t tlen,
                         my_bool diff_if_only_endspace_difference)
{
  int res;
  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
  my_charset_conv_mb_wc mb_wc= cs->cset->mb_wc;
  const uchar *se= s + slen, *te= t + tlen;

  DBUG_ASSERT((slen % 2) == 0);
  DBUG_ASSERT((tlen % 2) == 0);

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  /* Compare the common part character by character */
  while (s < se && t < te)
  {
    int s_res= mb_wc(cs, &s_wc, s, se);
    int t_res= mb_wc(cs, &t_wc, t, te);

    if (s_res <= 0 || t_res <= 0)
    {
      /* Incorrect string, compare bytewise */
      return my_bincmp(s, se, t, te);
    }
    if (s_wc != t_wc)
    {
      return s_wc > t_wc ? 1 : -1;
    }

    s+= s_res;
    t+= t_res;
  }

  slen= (size_t) (se - s);
  tlen= (size_t) (te - t);
  res= 0;

  if (slen != tlen)
  {
    int s_res, swap= 1;
    if (diff_if_only_endspace_difference)
      res= 1; /* Assume 's' is bigger */
    if (slen < tlen)
    {
      /* 't' is the longer string: scan its tail and invert the result */
      s= t;
      se= te;
      swap= -1;
      res= -res;
    }

    /* Compare the tail of the longer string against implicit spaces */
    for ( ; s < se; s+= s_res)
    {
      s_res= mb_wc(cs, &s_wc, s, se);
      if (s_res < 0)
      {
        DBUG_ASSERT(0);
        return 0;
      }
      if (s_res == 0)
      {
        /*
          mb_wc() consumed nothing. The main loop above treats a zero
          return as an incorrect sequence too. Without this check 's'
          would never advance and s_wc would hold a stale value, which
          can loop forever when that stale value is a space. Order the
          string with the malformed tail after the space-padded one.
        */
        return swap;
      }
      if (s_wc != ' ')
        return (s_wc < ' ') ? -swap : swap;
    }
  }
  return res;
}
static void static void
my_hash_sort_utf16_bin(CHARSET_INFO *cs, my_hash_sort_utf16_bin(CHARSET_INFO *cs,
const uchar *pos, size_t len, ulong *nr1, ulong *nr2) const uchar *pos, size_t len, ulong *nr1, ulong *nr2)
...@@ -1747,8 +1529,8 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs, ...@@ -1747,8 +1529,8 @@ my_hash_sort_utf16_bin(CHARSET_INFO *cs,
static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler = static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
{ {
NULL, /* init */ NULL, /* init */
my_strnncoll_utf16, my_strnncoll_utf16_general_ci,
my_strnncollsp_utf16, my_strnncollsp_utf16_general_ci,
my_strnxfrm_unicode, my_strnxfrm_unicode,
my_strnxfrmlen_unicode, my_strnxfrmlen_unicode,
my_like_range_generic, my_like_range_generic,
...@@ -1877,6 +1659,24 @@ struct charset_info_st my_charset_utf16_bin= ...@@ -1877,6 +1659,24 @@ struct charset_info_st my_charset_utf16_bin=
}; };
/*
  strcoll.ic instantiations for utf16le.
  The bytes of every code unit are swapped compared to utf16, so the
  surrogate tests look at b1/b3 (the second byte of each unit) and the
  weight macros pass the bytes in swapped order (b1,b0 and b3,b2).
*/
#define IS_MB2_CHAR(b0,b1) (!MY_UTF16_SURROGATE_HEAD(b1))
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))
/* Generates the my_*_utf16le_general_ci functions */
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.ic"
/* Generates the my_*_utf16le_bin functions: weights are raw code points */
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) ((int) MY_UTF16_WC2(b1, b0))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF16_WC4(b1, b0, b3, b2))
#include "strcoll.ic"
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
static int static int
my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)), my_utf16le_uni(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e) my_wc_t *pwc, const uchar *s, const uchar *e)
...@@ -1948,6 +1748,38 @@ my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)), ...@@ -1948,6 +1748,38 @@ my_lengthsp_utf16le(CHARSET_INFO *cs __attribute__((unused)),
} }
/*
  Collation handler for utf16le_general_ci.
  The two comparison entries are the utf16le-specific functions generated
  from strcoll.ic; the remaining entries are routines that are not
  specific to the utf16le byte order.
*/
static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
{
NULL, /* init */
my_strnncoll_utf16le_general_ci,
my_strnncollsp_utf16le_general_ci,
my_strnxfrm_unicode,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf16_ci,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
my_hash_sort_utf16,
my_propagate_simple
};
/*
  Collation handler for utf16le_bin.
  The two comparison entries are the utf16le-specific functions generated
  from strcoll.ic; the remaining entries are routines that are not
  specific to the utf16le byte order.
*/
static MY_COLLATION_HANDLER my_collation_utf16le_bin_handler =
{
NULL, /* init */
my_strnncoll_utf16le_bin,
my_strnncollsp_utf16le_bin,
my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_unicode_full_bin,
my_like_range_generic,
my_wildcmp_utf16_bin,
my_strcasecmp_mb2_or_mb4,
my_instr_mb,
my_hash_sort_utf16_bin,
my_propagate_simple
};
static MY_CHARSET_HANDLER my_charset_utf16le_handler= static MY_CHARSET_HANDLER my_charset_utf16le_handler=
{ {
NULL, /* init */ NULL, /* init */
...@@ -2012,7 +1844,7 @@ struct charset_info_st my_charset_utf16le_general_ci= ...@@ -2012,7 +1844,7 @@ struct charset_info_st my_charset_utf16le_general_ci=
0, /* escape_with_backslash_is_dangerous */ 0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */ 1, /* levels_for_order */
&my_charset_utf16le_handler, &my_charset_utf16le_handler,
&my_collation_utf16_general_ci_handler &my_collation_utf16le_general_ci_handler
}; };
...@@ -2045,7 +1877,7 @@ struct charset_info_st my_charset_utf16le_bin= ...@@ -2045,7 +1877,7 @@ struct charset_info_st my_charset_utf16le_bin=
0, /* escape_with_backslash_is_dangerous */ 0, /* escape_with_backslash_is_dangerous */
1, /* levels_for_order */ 1, /* levels_for_order */
&my_charset_utf16le_handler, &my_charset_utf16le_handler,
&my_collation_utf16_bin_handler &my_collation_utf16le_bin_handler
}; };
...@@ -3058,6 +2890,31 @@ static const uchar to_upper_ucs2[] = { ...@@ -3058,6 +2890,31 @@ static const uchar to_upper_ucs2[] = {
}; };
/* Definitions for strcoll.ic */
/* In ucs2 every pair of bytes is accepted as a two-byte character */
#define IS_MB2_CHAR(x,y) (1)
/*
  Combine two bytes into a UCS2 code: b0 is the high byte, b1 the low byte.
  Both arguments are parenthesized so that the (uchar) cast applies to the
  whole argument expression rather than to its first operand only.
*/
#define UCS2_CODE(b0,b1) ((((uchar) (b0)) << 8) | ((uchar) (b1)))
/*
  Sort weight of a ucs2 character for general_ci.

  b0 and b1 are combined with UCS2_CODE() into a code point. If
  my_unicase_default has a page for the code point's high byte, the "sort"
  value stored for the character is returned; otherwise the code point
  itself is the weight.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
  my_wc_t code= UCS2_CODE(b0, b1);

  if (!my_unicase_default.page[code >> 8])
    return (int) code;                    /* No case info for this page */

  return (int) my_unicase_default.page[code >> 8][code & 0xFF].sort;
}
/*
  Instantiate strcoll.ic for ucs2_general_ci: generates the
  my_*_ucs2_general_ci functions. A broken byte gets the weight
  0xFF0000 + byte, which is above every two-byte weight.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.ic"
/*
  Instantiate strcoll.ic for ucs2_bin: the weight of a character is its
  code, without case folding.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
#include "strcoll.ic"
static int static int
my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)), my_charlen_ucs2(CHARSET_INFO *cs __attribute__((unused)),
const uchar *s, const uchar *e) const uchar *s, const uchar *e)
...@@ -3208,120 +3065,6 @@ my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)), ...@@ -3208,120 +3065,6 @@ my_fill_ucs2(CHARSET_INFO *cs __attribute__((unused)),
} }
/*
  Compare two ucs2 strings character by character.

  Each character is decoded with my_ucs2_uni() and mapped through
  my_tosort_ucs2() before being compared. If either string fails to decode
  at the current position, the difference of the current first bytes is
  returned.

  When one string runs out first, the result is (t - te) if t_is_prefix is
  set, and the difference of the remaining byte lengths otherwise.
*/
static int my_strnncoll_ucs2(CHARSET_INFO *cs,
                             const uchar *s, size_t slen,
                             const uchar *t, size_t tlen,
                             my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  while (s < s_end && t < t_end)
  {
    int s_len= my_ucs2_uni(cs, &s_code, s, s_end);
    int t_len= my_ucs2_uni(cs, &t_code, t, t_end);

    /* Incorrect string, compare by char value */
    if (s_len <= 0 || t_len <= 0)
      return ((int) s[0] - (int) t[0]);

    my_tosort_ucs2(uni_plane, &s_code);
    my_tosort_ucs2(uni_plane, &t_code);

    if (s_code > t_code)
      return 1;
    if (s_code < t_code)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/*
  Compare two ucs2 strings, discarding end space

  SYNOPSIS
    my_strnncollsp_ucs2()
    cs      character set handler (its caseinfo supplies the sort weights)
    s       First string to compare
    slen    Length of 's' in bytes; an odd trailing byte is ignored
    t       Second string to compare
    tlen    Length of 't' in bytes; an odd trailing byte is ignored
    diff_if_only_endspace_difference
            Not used

  IMPLEMENTATION
    The common part is compared two bytes at a time by sort weight.
    If one string is shorter than the other, the rest of the longer one
    is compared against spaces, as if the shorter string were
    space-extended. This ensures that the following things hold:

    "a"   == "a "
    "a\0" <  "a"
    "a\0" <  "a "

  RETURN
    < 0  s <  t
    = 0  s == t
    > 0  s >  t
*/
static int my_strnncollsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
                               const uchar *s, size_t slen,
                               const uchar *t, size_t tlen,
                               my_bool diff_if_only_endspace_difference
                               __attribute__((unused)))
{
  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
  const uchar *s_end, *t_end;
  const uchar *rest, *rest_end;
  size_t common;
  int sign;

  /* extra safety to make sure the lengths are even numbers */
  slen-= (slen & 1);
  tlen-= (tlen & 1);

  s_end= s + slen;
  t_end= t + tlen;

  common= MY_MIN(slen, tlen);
  while (common)
  {
    int s_weight= uni_plane->page[s[0]] ?
                  (int) uni_plane->page[s[0]][s[1]].sort :
                  (((int) s[0]) << 8) + (int) s[1];
    int t_weight= uni_plane->page[t[0]] ?
                  (int) uni_plane->page[t[0]][t[1]].sort :
                  (((int) t[0]) << 8) + (int) t[1];

    if (s_weight > t_weight)
      return 1;
    if (s_weight < t_weight)
      return -1;

    s+= 2;
    t+= 2;
    common-= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the tail of the longer string; invert the sign if it is 't' */
  if (slen < tlen)
  {
    rest= t;
    rest_end= t_end;
    sign= -1;
  }
  else
  {
    rest= s;
    rest_end= s_end;
    sign= 1;
  }

  for ( ; rest < rest_end; rest+= 2)
  {
    if (rest[0] == 0 && rest[1] == ' ')
      continue;                           /* A space: keep scanning */
    return (rest[0] == 0 && rest[1] < ' ') ? -sign : sign;
  }
  return 0;
}
static uint my_ismbchar_ucs2(CHARSET_INFO *cs __attribute__((unused)), static uint my_ismbchar_ucs2(CHARSET_INFO *cs __attribute__((unused)),
const char *b, const char *b,
const char *e) const char *e)
...@@ -3417,85 +3160,6 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, ...@@ -3417,85 +3160,6 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
} }
/*
  Compare two ucs2 strings by code, without case folding.

  Characters are decoded with my_ucs2_uni(). If either string fails to
  decode at the current position, the difference of the current first
  bytes is returned.

  When one string runs out first, the result is (t - te) if t_is_prefix is
  set, and the difference of the remaining byte lengths otherwise.
*/
static
int my_strnncoll_ucs2_bin(CHARSET_INFO *cs,
                          const uchar *s, size_t slen,
                          const uchar *t, size_t tlen,
                          my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(s_code), UNINIT_VAR(t_code);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  while (s < s_end && t < t_end)
  {
    int s_len= my_ucs2_uni(cs, &s_code, s, s_end);
    int t_len= my_ucs2_uni(cs, &t_code, t, t_end);

    /* Incorrect string, compare by char value */
    if (s_len <= 0 || t_len <= 0)
      return ((int) s[0] - (int) t[0]);

    if (s_code > t_code)
      return 1;
    if (s_code < t_code)
      return -1;

    s+= s_len;
    t+= t_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/*
  Compare two ucs2 strings by code, discarding end space.

  The common part is compared two bytes at a time. If one string is
  longer, its tail is compared against spaces, as if the shorter string
  were space-extended. An odd trailing byte in either length is ignored.
  Neither cs nor diff_if_only_endspace_difference is used.

  Returns -1, 0 or 1.
*/
static int my_strnncollsp_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
                                   const uchar *s, size_t slen,
                                   const uchar *t, size_t tlen,
                                   my_bool diff_if_only_endspace_difference
                                   __attribute__((unused)))
{
  const uchar *s_end, *t_end;
  const uchar *rest, *rest_end;
  size_t common;
  int sign;

  /* extra safety to make sure the lengths are even numbers */
  slen-= (slen & 1);
  tlen-= (tlen & 1);

  s_end= s + slen;
  t_end= t + tlen;

  common= MY_MIN(slen, tlen);
  while (common)
  {
    int s_code= (s[0] << 8) | s[1];
    int t_code= (t[0] << 8) | t[1];

    if (s_code > t_code)
      return 1;
    if (s_code < t_code)
      return -1;

    s+= 2;
    t+= 2;
    common-= 2;
  }

  if (slen == tlen)
    return 0;

  /* Pick the tail of the longer string; invert the sign if it is 't' */
  if (slen < tlen)
  {
    rest= t;
    rest_end= t_end;
    sign= -1;
  }
  else
  {
    rest= s;
    rest_end= s_end;
    sign= 1;
  }

  for ( ; rest < rest_end; rest+= 2)
  {
    if (rest[0] == 0 && rest[1] == ' ')
      continue;                           /* A space: keep scanning */
    return (rest[0] == 0 && rest[1] < ' ') ? -sign : sign;
  }
  return 0;
}
static static
void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, size_t len,ulong *nr1, ulong *nr2) const uchar *key, size_t len,ulong *nr1, ulong *nr2)
...@@ -3518,8 +3182,8 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)), ...@@ -3518,8 +3182,8 @@ void my_hash_sort_ucs2_bin(CHARSET_INFO *cs __attribute__((unused)),
static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
{ {
NULL, /* init */ NULL, /* init */
my_strnncoll_ucs2, my_strnncoll_ucs2_general_ci,
my_strnncollsp_ucs2, my_strnncollsp_ucs2_general_ci,
my_strnxfrm_unicode, my_strnxfrm_unicode,
my_strnxfrmlen_unicode, my_strnxfrmlen_unicode,
my_like_range_generic, my_like_range_generic,
......
...@@ -64,13 +64,16 @@ ...@@ -64,13 +64,16 @@
@return - the number of bytes scanned @return - the number of bytes scanned
The including source file must define the following macros: The including source file must define the following macros:
IS_MB1_CHAR(x) IS_MB1_CHAR(b0) - for character sets that have MB1 characters
IS_MB1_MB2HEAD_GAP(x) - optional, for better performance IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance
IS_MB2_CHAR(x,y) IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
IS_MB3_CHAR(x,y,z) - for character sets with mbmaxlen>2 IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
IS_MB4_CHAR(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_PAD_SPACE WEIGHT_PAD_SPACE
WEIGHT_MB1(x) WEIGHT_MB1(b0) - for character sets that have MB1 characters
WEIGHT_MB2(x,y) WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
WEIGHT_ILSEQ(x) WEIGHT_ILSEQ(x)
*/ */
static inline uint static inline uint
...@@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -82,11 +85,13 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
return 0; return 0;
} }
#ifdef IS_MB1_CHAR
if (IS_MB1_CHAR(*str)) if (IS_MB1_CHAR(*str))
{ {
*weight= WEIGHT_MB1(*str); /* A valid single byte character*/ *weight= WEIGHT_MB1(*str); /* A valid single byte character*/
return 1; return 1;
} }
#endif
#ifdef IS_MB1_MBHEAD_UNUSED_GAP #ifdef IS_MB1_MBHEAD_UNUSED_GAP
/* /*
...@@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -98,6 +103,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
goto bad; goto bad;
#endif #endif
#ifdef IS_MB2_CHAR
if (str + 2 > end) /* The string ended unexpectedly */ if (str + 2 > end) /* The string ended unexpectedly */
goto bad; /* Treat as a bad byte */ goto bad; /* Treat as a bad byte */
...@@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) ...@@ -106,6 +112,7 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
*weight= WEIGHT_MB2(str[0], str[1]); *weight= WEIGHT_MB2(str[0], str[1]);
return 2; /* A valid two-byte character */ return 2; /* A valid two-byte character */
} }
#endif
#ifdef IS_MB3_CHAR #ifdef IS_MB3_CHAR
if (str + 3 > end) /* Incomplete three-byte character */ if (str + 3 > end) /* Incomplete three-byte character */
......
...@@ -149,7 +149,7 @@ typedef struct ...@@ -149,7 +149,7 @@ typedef struct
A1A1 - MB2 or 8BIT+8BIT A1A1 - MB2 or 8BIT+8BIT
E0E0 - MB2 E0E0 - MB2
*/ */
STRNNCOLL_PARAM strcoll_mb2_common[]= static STRNNCOLL_PARAM strcoll_mb2_common[]=
{ {
/* Compare two good sequences */ /* Compare two good sequences */
{CSTR(""), CSTR(""), 0}, {CSTR(""), CSTR(""), 0},
...@@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]= ...@@ -210,7 +210,7 @@ STRNNCOLL_PARAM strcoll_mb2_common[]=
/* /*
For character sets that have good mb2 characters A1A1 and F9FE For character sets that have good mb2 characters A1A1 and F9FE
*/ */
STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= static STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
{ {
/* Compare two good characters */ /* Compare two good characters */
{CSTR(""), CSTR("\xF9\xFE"), -1}, {CSTR(""), CSTR("\xF9\xFE"), -1},
...@@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]= ...@@ -246,7 +246,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_mb2_F9FE[]=
A1A1 - a good mb2 character A1A1 - a good mb2 character
F9FE - a bad sequence F9FE - a bad sequence
*/ */
STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= static STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
{ {
/* Compare a good character to an illegal or an incomplete sequence */ /* Compare a good character to an illegal or an incomplete sequence */
{CSTR(""), CSTR("\xF9\xFE"), -1}, {CSTR(""), CSTR("\xF9\xFE"), -1},
...@@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]= ...@@ -283,7 +283,7 @@ STRNNCOLL_PARAM strcoll_mb2_A1A1_bad_F9FE[]=
F9 - ILSEQ or H2 F9 - ILSEQ or H2
F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ) F9FE - a bad sequence (ILSEQ+XX or H2+ILSEQ)
*/ */
STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= static STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
{ {
/* Compare two good characters */ /* Compare two good characters */
{CSTR(""), CSTR("\xA1"), -1}, {CSTR(""), CSTR("\xA1"), -1},
...@@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]= ...@@ -323,7 +323,7 @@ STRNNCOLL_PARAM strcoll_mb1_A1_bad_F9FE[]=
and sort in this order: and sort in this order:
8181 < A1 < E0E0 8181 < A1 < E0E0
*/ */
STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= static STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
{ {
{CSTR("\x81\x81"), CSTR("\xA1"), -1}, {CSTR("\x81\x81"), CSTR("\xA1"), -1},
{CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1}, {CSTR("\x81\x81"), CSTR("\xE0\xE0"), -1},
...@@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]= ...@@ -336,7 +336,7 @@ STRNNCOLL_PARAM strcoll_8181_A1_E0E0[]=
/* /*
A shared test for eucjpms and ujis. A shared test for eucjpms and ujis.
*/ */
STRNNCOLL_PARAM strcoll_ujis[]= static STRNNCOLL_PARAM strcoll_ujis[]=
{ {
{CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */ {CSTR("\x8E\xA1"), CSTR("\x8E"), -1}, /* Good MB2 vs incomplete MB2 */
{CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */ {CSTR("\x8E\xA1"), CSTR("\x8F\xA1"), -1}, /* Good MB2 vs incomplete MB3 */
...@@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]= ...@@ -347,7 +347,7 @@ STRNNCOLL_PARAM strcoll_ujis[]=
}; };
STRNNCOLL_PARAM strcoll_utf8mb3_common[]= static STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
{ {
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xC1"), -1}, /* Unused byte vs unused byte */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */ {CSTR("\xC0"), CSTR("\xFF"), -1}, /* Unused byte vs unused byte */
...@@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]= ...@@ -369,7 +369,7 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
}; };
STRNNCOLL_PARAM strcoll_utf8mb4_common[]= static STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
{ {
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */ /* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
{CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */ {CSTR("\xF0\x90\x80\x80"), CSTR("\xC0"), -1}, /* MB4 vs unused byte */
...@@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]= ...@@ -412,6 +412,101 @@ STRNNCOLL_PARAM strcoll_utf8mb4_common[]=
}; };
/*
  Comparison cases run against the ucs2, utf16 and utf16le collations.
  Each entry is {first string, second string, expected result}.
  A single trailing byte is an incomplete two-byte character.
*/
static STRNNCOLL_PARAM strcoll_ucs2_common[]=
{
{CSTR("\xC0"), CSTR("\xC1"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC0"), CSTR("\xFF"), -1}, /* Incomplete MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC0"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2"), -1}, /* MB2 vs incomplete MB2 */
{CSTR("\xC2\xA0"), CSTR("\xC2\xA1"), -1}, /* MB2 vs MB2 */
{CSTR("\xC2\xA1"), CSTR("\xC2\xA2"), -1}, /* MB2 vs MB2 */
{CSTR("\xFF\xFF"), CSTR("\x00"),-1}, /* MB2 vs incomplete */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00"),-1}, /* MB2+MB2 vs incomplete */
{CSTR("\xFF\xFF\xFF\xFF"), CSTR("\x00\x00\x00"), 1},/* MB2+MB2 vs MB2+incomplete */
{NULL, 0, NULL, 0, 0}
};
/*
  Tests that involve comparison to SPACE (explicit, or padded).
  Big-endian byte order: SPACE is the byte pair 00 20.
*/
static STRNNCOLL_PARAM strcoll_ucs2_space[]=
{
{CSTR("\x00\x1F"), CSTR("\x00\x20"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x20"), CSTR("\x00\x21"), -1}, /* MB2 vs MB2 */
{CSTR("\x00\x1F"), CSTR(""), -1}, /* MB2 vs empty */
{CSTR("\x00\x20"), CSTR(""), 0}, /* MB2 vs empty */
{CSTR("\x00\x21"), CSTR(""), 1}, /* MB2 vs empty */
{NULL, 0, NULL, 0, 0}
};
/*
  Tests that involve comparison to SPACE (explicit, or padded).
  Little-endian byte order: SPACE is the byte pair 20 00.
*/
static STRNNCOLL_PARAM strcoll_utf16le_space[]=
{
{CSTR("\x1F\x00"), CSTR("\x20\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x20\x00"), CSTR("\x21\x00"), -1}, /* MB2 vs MB2 */
{CSTR("\x1F\x00"), CSTR(""), -1}, /* MB2 vs empty */
{CSTR("\x20\x00"), CSTR(""), 0}, /* MB2 vs empty */
{CSTR("\x21\x00"), CSTR(""), 1}, /* MB2 vs empty */
{NULL, 0, NULL, 0, 0}
};
/*
  utf16 (big-endian) cases with four-byte characters (surrogate pairs)
  and broken or incomplete surrogate sequences.
*/
static STRNNCOLL_PARAM strcoll_utf16_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16 0xD800DC00 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16 0xDBFFDFFF */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xDB\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xD8\x00\xE0\x00"),-1},/* MB4 vs broken MB4 */
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\x00"), -1},/* MB4 vs broken MB2 */
/*
  NOTE(review): the second string below starts with DC, a low surrogate
  head, so it is not a truncated high+low pair; "\xDB\xFF\xDF" may have
  been intended - confirm.
*/
{CSTR("\xDB\xFF\xDF\xFF"), CSTR("\xDC\xFF\xDF"), -1},/* MB4 vs incomplete MB4 */
/* MB4 vs MB4, then broken MB4 vs broken MB4 */
{CSTR("\xD8\x00\xDC\x00"), CSTR("\xD8\x00\xDC\x01"),-1},/* MB4 vs MB4 */
{CSTR("\xDB\xFF\xE0\xFE"), CSTR("\xDB\xFF\xE0\xFF"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0}
};
/*
  utf16le (little-endian) cases with four-byte characters (surrogate
  pairs) and broken or incomplete surrogate sequences.
*/
static STRNNCOLL_PARAM strcoll_utf16le_common[]=
{
/* Minimum four-byte character: U+10000 == _utf16le 0x00D800DC */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00\xD0"),-1},/* MB4 vs broken MB4 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x00"), -1},/* MB4 vs incomplete MB4 */
/* Maximum four-byte character: U+10FFFF == _utf16le 0xFFDBFFDF */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC0"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xC2"), -1},/* MB4 vs incomplete MB2 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xDB"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xD8\x00\xE0"),-1},/* MB4 vs broken MB4 */
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\x00\xDC"), -1},/* MB4 vs broken MB2 */
/*
  NOTE(review): the first code unit of the second string below is FF DC,
  i.e. a low surrogate, so it is not a truncated high+low pair;
  "\xFF\xDB\xFF" may have been intended - confirm.
*/
{CSTR("\xFF\xDB\xFF\xDF"), CSTR("\xFF\xDC\x00"), -1},/* MB4 vs incomplete MB4 */
/* MB4 vs MB4, then broken MB4 vs broken MB4 */
{CSTR("\x00\xD8\x00\xDC"), CSTR("\x00\xD8\x01\xDC"),-1},/* MB4 vs MB4 */
{CSTR("\xFF\xDB\xFE\xE0"), CSTR("\xFF\xDB\xFF\xE0"),-1},/* Broken MB4 vs broken MB4 */
{NULL, 0, NULL, 0, 0}
};
static void static void
str2hex(char *dst, size_t dstlen, const char *src, size_t srclen) str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
{ {
...@@ -528,6 +623,12 @@ test_strcollsp() ...@@ -528,6 +623,12 @@ test_strcollsp()
failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_japanese_ci, strcoll_8181_A1_E0E0);
failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0); failed+= strcollsp(&my_charset_sjis_bin, strcoll_8181_A1_E0E0);
#endif #endif
#ifdef HAVE_CHARSET_ucs2
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_ucs2_bin, strcoll_ucs2_space);
#endif
#ifdef HAVE_CHARSET_ujis #ifdef HAVE_CHARSET_ujis
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_mb2_common);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common); failed+= strcollsp(&my_charset_ujis_bin, strcoll_mb2_common);
...@@ -536,6 +637,21 @@ test_strcollsp() ...@@ -536,6 +637,21 @@ test_strcollsp()
failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis); failed+= strcollsp(&my_charset_ujis_bin, strcoll_ujis);
#endif #endif
#ifdef HAVE_CHARSET_utf16
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_general_ci, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_ucs2_space);
failed+= strcollsp(&my_charset_utf16_bin, strcoll_utf16_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_general_ci,strcoll_utf16le_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_ucs2_common);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_space);
failed+= strcollsp(&my_charset_utf16le_bin, strcoll_utf16le_common);
#endif
#ifdef HAVE_CHARSET_utf8 #ifdef HAVE_CHARSET_utf8
failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_ci, strcoll_utf8mb3_common);
failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common); failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment