Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mariadb
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
mariadb
Commits
35d8ac35
Commit
35d8ac35
authored
Jul 06, 2015
by
Alexander Barkov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
parent
7ab7f532
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
275 additions
and
149 deletions
+275
-149
mysql-test/include/ctype_utf8mb4.inc
mysql-test/include/ctype_utf8mb4.inc
+24
-1
mysql-test/r/ctype_utf8mb4_heap.result
mysql-test/r/ctype_utf8mb4_heap.result
+53
-1
mysql-test/r/ctype_utf8mb4_innodb.result
mysql-test/r/ctype_utf8mb4_innodb.result
+53
-1
mysql-test/r/ctype_utf8mb4_myisam.result
mysql-test/r/ctype_utf8mb4_myisam.result
+53
-1
strings/ctype-utf8.c
strings/ctype-utf8.c
+30
-145
strings/strcoll.ic
strings/strcoll.ic
+13
-0
unittest/strings/strings-t.c
unittest/strings/strings-t.c
+49
-0
No files found.
mysql-test/include/ctype_utf8mb4.inc
View file @
35d8ac35
...
...
@@ -1802,5 +1802,28 @@ DROP TABLE t1;
--
echo
#
--
echo
#
--
echo
# End of tests
--
echo
# ctype_utf8mb4.inc: Start of 10.1 tests
--
echo
#
--
echo
#
--
echo
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
--
echo
#
CREATE
TABLE
t1
(
a
VARCHAR
(
10
)
CHARACTER
SET
utf8mb4
,
KEY
(
a
));
INSERT
INTO
t1
VALUES
(
0x61
);
INSERT
INTO
t1
VALUES
(
0xC280
),(
0xDFBF
);
INSERT
INTO
t1
VALUES
(
0xE0A080
),(
0xEFBFBF
);
INSERT
INTO
t1
VALUES
(
0xF0908080
),(
0xF48FBFBF
);
SELECT
HEX
(
a
)
FROM
t1
ORDER
BY
a
;
SELECT
HEX
(
a
)
FROM
t1
ORDER
BY
a
DESC
;
ALTER
TABLE
t1
MODIFY
a
VARCHAR
(
10
)
CHARACTER
SET
utf8mb4
COLLATE
utf8mb4_bin
;
SELECT
HEX
(
a
)
FROM
t1
ORDER
BY
a
;
SELECT
HEX
(
a
)
FROM
t1
ORDER
BY
a
DESC
;
DROP
TABLE
t1
;
--
echo
#
--
echo
# ctype_utf8mb4.inc: End of 10.1 tests
--
echo
#
--
echo
#
--
echo
# End of ctype_utf8mb4.inc
--
echo
#
mysql-test/r/ctype_utf8mb4_heap.result
View file @
35d8ac35
...
...
@@ -2495,5 +2495,57 @@ DROP TABLE t1;
# End of 5.5 tests
#
#
# End of tests
# ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
#
mysql-test/r/ctype_utf8mb4_innodb.result
View file @
35d8ac35
...
...
@@ -2642,5 +2642,57 @@ DROP TABLE t1;
# End of 5.5 tests
#
#
# End of tests
# ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
#
mysql-test/r/ctype_utf8mb4_myisam.result
View file @
35d8ac35
...
...
@@ -2642,5 +2642,57 @@ DROP TABLE t1;
# End of 5.5 tests
#
#
# End of tests
# ctype_utf8mb4.inc: Start of 10.1 tests
#
#
# MDEV-8417 utf8mb4: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf8mb4, KEY(a));
INSERT INTO t1 VALUES (0x61);
INSERT INTO t1 VALUES (0xC280),(0xDFBF);
INSERT INTO t1 VALUES (0xE0A080),(0xEFBFBF);
INSERT INTO t1 VALUES (0xF0908080),(0xF48FBFBF);
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
ALTER TABLE t1 MODIFY a VARCHAR(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
SELECT HEX(a) FROM t1 ORDER BY a;
HEX(a)
61
C280
DFBF
E0A080
EFBFBF
F0908080
F48FBFBF
SELECT HEX(a) FROM t1 ORDER BY a DESC;
HEX(a)
F48FBFBF
F0908080
EFBFBF
E0A080
DFBF
C280
61
DROP TABLE t1;
#
# ctype_utf8mb4.inc: End of 10.1 tests
#
#
# End of ctype_utf8mb4.inc
#
strings/ctype-utf8.c
View file @
35d8ac35
...
...
@@ -85,7 +85,8 @@
IS_CONTINUATION_BYTE(b3) && \
(b0 >= 0xf1 || b1 >= 0x90) && \
(b0 <= 0xf3 || b1 <= 0x8F))
/*
  Four-byte sequence test: the lead byte b0 must be below 0xF5, and the
  remaining conditions are delegated to IS_UTF8MB4_STEP2().
*/
#define IS_UTF8MB4_STEP3(b0,b1,b2,b3) (((uchar) (b0) < 0xF5) && \
IS_UTF8MB4_STEP2(b0,b1,b2,b3))
/* Convert individual bytes to Unicode code points */
#define UTF8MB2_CODE(b0,b1) (((my_wc_t) ((uchar) b0 & 0x1f) << 6) |\
...
...
@@ -7622,146 +7623,6 @@ my_casedn_str_utf8mb4(CHARSET_INFO *cs, char *src)
}
/**
  Compare two utf8mb4 strings character by character.

  On every step one character is decoded from each string with
  my_mb_wc_utf8mb4(), both code points are passed through
  my_tosort_unicode() and the results are compared.  If either decoder
  returns a non-positive value, the rest of both strings is handed to
  bincmp_utf8mb4() and its result is returned as is.

  @param cs           Collation; supplies 'caseinfo' and 'state'.
  @param s            First string.
  @param slen         Length of 's' in bytes.
  @param t            Second string.
  @param tlen         Length of 't' in bytes.
  @param t_is_prefix  If set, leftover bytes in 's' after 't' is exhausted
                      are ignored.

  @return  1 or -1 at the first pair of different sort values;
           otherwise (t - end of t) when t_is_prefix is set, or the
           difference between the unread byte counts of 's' and 't'.
*/
static int
my_strnncoll_utf8mb4(CHARSET_INFO *cs,
                     const uchar *s, size_t slen,
                     const uchar *t, size_t tlen,
                     my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  int left_len, right_len;

  for ( ; s < s_end && t < t_end; s+= left_len, t+= right_len)
  {
    left_len= my_mb_wc_utf8mb4(cs, &left_wc, s, s_end);
    right_len= my_mb_wc_utf8mb4(cs, &right_wc, t, t_end);

    /* A decoding failure on either side: fall back to bytewise compare */
    if (left_len <= 0 || right_len <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);

    my_tosort_unicode(folding, &left_wc, cs->state);
    my_tosort_unicode(folding, &right_wc, cs->state);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Compare two utf8mb4 strings, ignoring trailing spaces.

  The common part is compared character by character, the same way as in
  the non-space-padded comparison.  If one string is exhausted first, the
  rest of the other string is compared against spaces, as if the shorter
  string were space-extended to the length of the longer one.
  This ensures that the following things hold:

    "a"   == "a "
    "a\0" <  "a"
    "a\0" <  "a "

  @param cs    Character set pointer.
  @param s     First string to compare.
  @param slen  Length of 's' in bytes.
  @param t     Second string to compare.
  @param tlen  Length of 't' in bytes.
  @param diff_if_only_endspace_difference
               Set to 1 if the strings should be regarded as different
               when they differ only in trailing spaces.  Forced to FALSE
               unless VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
               is defined.

  @return Comparison result.
    @retval Negative number, if s is less than t.
    @retval 0, if s is equal to t.
    @retval Positive number, if s is greater than t.
*/
static int
my_strnncollsp_utf8mb4(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool diff_if_only_endspace_difference)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  const uchar *tail, *tail_end;
  int left_len, right_len;
  int longer_sign;

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  for ( ; s < s_end && t < t_end; s+= left_len, t+= right_len)
  {
    left_len= my_mb_wc_utf8mb4(cs, &left_wc, s, s_end);
    right_len= my_mb_wc_utf8mb4(cs, &right_wc, t, t_end);

    /* A decoding failure on either side: fall back to bytewise compare */
    if (left_len <= 0 || right_len <= 0)
      return bincmp_utf8mb4(s, s_end, t, t_end);

    my_tosort_unicode(folding, &left_wc, cs->state);
    my_tosort_unicode(folding, &right_wc, cs->state);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;
  }

  slen= (size_t) (s_end - s);
  tlen= (size_t) (t_end - t);

  /* Same number of unread bytes on both sides: the strings are equal */
  if (slen == tlen)
    return 0;

  /* Pick the string that still has unread bytes and remember its side */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t_end;
    longer_sign= -1;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    longer_sign= 1;
  }

  /*
    In UTF-8 every byte of a multibyte character, including the head
    byte, is greater than space.  So the first non-space byte in the tail
    decides the result, and the tail can be scanned byte by byte without
    decoding full multibyte sequences.
  */
  for ( ; tail < tail_end; tail++)
  {
    if (*tail != ' ')
      return (*tail < ' ') ? -longer_sign : longer_sign;
  }

  /* Only spaces were left in the longer string */
  return diff_if_only_endspace_difference ? longer_sign : 0;
}
/**
Compare 0-terminated UTF8 strings.
...
...
@@ -7906,6 +7767,30 @@ size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
#undef DEFINE_WELL_FORMED_CHAR_LENGTH_USING_CHARLEN
/* my_well_formed_char_length_utf8mb4 */
/*
  Parameters for including strcoll.ic for utf8mb4_general_ci.
  MY_FUNCTION_NAME(x) expands to my_<x>_utf8mb4_general_ci.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_general_ci
#define IS_MB4_CHAR(b0,b1,b2,b3) IS_UTF8MB4_STEP3(b0,b1,b2,b3)
/*
  Weight of a bad byte: 0xFF0000 plus the byte value.  This is above the
  largest four-byte code point (U+10FFFF) used by WEIGHT_MB4 below.
  NOTE(review): presumably also above every WEIGHT_MB1..MB3 value - confirm
  against the my_weight_mb*_utf8_general_ci helpers.
*/
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB1(b0) my_weight_mb1_utf8_general_ci(b0)
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf8_general_ci(b0,b1)
#define WEIGHT_MB3(b0,b1,b2) my_weight_mb3_utf8_general_ci(b0,b1,b2)
/*
There is no mapping between code point and weight for non-BMP characters
in utf8mb4_general_ci. Just using code point as weight.
*/
#define WEIGHT_MB4(b0,b1,b2,b3) UTF8MB4_CODE(b0,b1,b2,b3)
#include "strcoll.ic"
/*
  Parameters for including strcoll.ic for utf8mb4_bin: every valid
  character weighs as its code point, a bad byte as 0xFF0000 plus the byte.
  NOTE(review): IS_MB4_CHAR is not defined again here; this presumably
  relies on strcoll.ic leaving it defined after the first include - confirm.
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf8mb4_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB1(b0) ((int) (uchar) (b0))
#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
#include "strcoll.ic"
static
uint
my_ismbchar_utf8mb4
(
CHARSET_INFO
*
cs
,
const
char
*
b
,
const
char
*
e
)
{
...
...
@@ -7934,8 +7819,8 @@ my_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), uint c)
static
MY_COLLATION_HANDLER
my_collation_utf8mb4_general_ci_handler
=
{
NULL
,
/* init */
my_strnncoll_utf8mb4
,
my_strnncollsp_utf8mb4
,
my_strnncoll_utf8mb4
_general_ci
,
my_strnncollsp_utf8mb4
_general_ci
,
my_strnxfrm_unicode
,
my_strnxfrmlen_unicode
,
my_like_range_mb
,
...
...
@@ -7950,8 +7835,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler=
static
MY_COLLATION_HANDLER
my_collation_utf8mb4_bin_handler
=
{
NULL
,
/* init */
my_strnncoll_
mb
_bin
,
my_strnncollsp_
mb
_bin
,
my_strnncoll_
utf8mb4
_bin
,
my_strnncollsp_
utf8mb4
_bin
,
my_strnxfrm_unicode_full_bin
,
my_strnxfrmlen_unicode_full_bin
,
my_like_range_mb
,
...
...
strings/strcoll.ic
View file @
35d8ac35
...
...
@@ -118,6 +118,18 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
}
#endif
#ifdef IS_MB4_CHAR
if (str + 4 > end) /* Incomplete four-byte character */
goto bad;
if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
{
*weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
return 4; /* A valid four-byte character */
}
#endif
bad:
*weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */
return 1;
...
...
@@ -252,4 +264,5 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
#undef WEIGHT_MB1
#undef WEIGHT_MB2
#undef WEIGHT_MB3
#undef WEIGHT_MB4
#undef WEIGHT_PAD_SPACE
unittest/strings/strings-t.c
View file @
35d8ac35
...
...
@@ -369,6 +369,49 @@ STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
};
STRNNCOLL_PARAM
strcoll_utf8mb4_common
[]
=
{
/* Minimum four-byte character: U+10000 == _utf8 0xF0908080 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xC0
"
),
-
1
},
/* MB4 vs unused byte */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xC2
"
),
-
1
},
/* MB4 vs incomplete MB2 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xE0\xA0\x7F
"
),
-
1
},
/* MB4 vs broken MB3 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xE0\xA0\xC0
"
),
-
1
},
/* MB4 vs broken MB3 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xE0\xA0
"
),
-
1
},
/* MB4 vs incomplete MB3 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xF0\x90\x80
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xF0\x90\x80\x7F
"
),
-
1
},
/* MB4 vs broken MB4 */
{
CSTR
(
"
\xF0\x90\x80\x80
"
),
CSTR
(
"
\xF0\x90\x80\xC0
"
),
-
1
},
/* MB4 vs broken MB4 */
/* Maximum four-byte character: U+10FFFF == _utf8 0xF48FBFBF */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xC0
"
),
-
1
},
/* MB4 vs unused byte */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xC2
"
),
-
1
},
/* MB4 vs incomplete MB2 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xE0\xA0\x7F
"
),
-
1
},
/* MB4 vs broken MB3 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xE0\xA0\xC0
"
),
-
1
},
/* MB4 vs broken MB3 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xE0\xA0
"
),
-
1
},
/* MB4 vs incomplete MB3 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xF0\x90\x80
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xF0\x90\x80\x7F
"
),
-
1
},
/* MB4 vs broken MB4 */
{
CSTR
(
"
\xF4\x8F\xBF\xBF
"
),
CSTR
(
"
\xF0\x90\x80\xC0
"
),
-
1
},
/* MB4 vs broken MB4 */
/* Broken MB4 vs incomplete/broken MB3 */
{
CSTR
(
"
\xF0\x90\x80\x7F
"
),
CSTR
(
"
\xE0\xA0
"
),
1
},
/* Broken MB4 vs incomplete MB3 */
{
CSTR
(
"
\xF0\x90\x80\x7F
"
),
CSTR
(
"
\xE0\xA0\x7F
"
),
1
},
/* Broken MB4 vs broken MB3 */
{
CSTR
(
"
\xF0\x90\x80\x7F
"
),
CSTR
(
"
\xE0\xA0\xC0
"
),
1
},
/* Broken MB4 vs broken MB3 */
/*
Broken MB4 vs incomplete MB4:
The three leftmost bytes are compared binary, the fourth byte is compared
to auto-padded space.
*/
{
CSTR
(
"
\xF0\x90\x80\x1F
"
),
CSTR
(
"
\xF0\x90\x80
"
),
-
1
},
/* Broken MB4 vs incomplete MB4 */
{
CSTR
(
"
\xF0\x90\x80\x7E
"
),
CSTR
(
"
\xF0\x90\x80
"
),
1
},
/* Broken MB4 vs incomplete MB4 */
/* Broken MB4 vs broken MB4 */
{
CSTR
(
"
\xF0\x90\x80\x7E
"
),
CSTR
(
"
\xF0\x90\x80\x7F
"
),
-
1
},
/* Broken MB4 vs broken MB4 */
{
CSTR
(
"
\xF0\x90\x80\x7E
"
),
CSTR
(
"
\xF0\x90\x80\xC0
"
),
-
1
},
/* Broken MB4 vs broken MB4 */
{
NULL
,
0
,
NULL
,
0
,
0
}
};
static
void
str2hex
(
char
*
dst
,
size_t
dstlen
,
const
char
*
src
,
size_t
srclen
)
{
...
...
@@ -497,6 +540,12 @@ test_strcollsp()
failed
+=
strcollsp
(
&
my_charset_utf8_general_ci
,
strcoll_utf8mb3_common
);
failed
+=
strcollsp
(
&
my_charset_utf8_general_mysql500_ci
,
strcoll_utf8mb3_common
);
failed
+=
strcollsp
(
&
my_charset_utf8_bin
,
strcoll_utf8mb3_common
);
#endif
#ifdef HAVE_CHARSET_utf8mb4
failed
+=
strcollsp
(
&
my_charset_utf8mb4_general_ci
,
strcoll_utf8mb3_common
);
failed
+=
strcollsp
(
&
my_charset_utf8mb4_bin
,
strcoll_utf8mb3_common
);
failed
+=
strcollsp
(
&
my_charset_utf8mb4_general_ci
,
strcoll_utf8mb4_common
);
failed
+=
strcollsp
(
&
my_charset_utf8mb4_bin
,
strcoll_utf8mb4_common
);
#endif
return
failed
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment