Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mariadb
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
mariadb
Commits
e4f8cea3
Commit
e4f8cea3
authored
Jul 07, 2015
by
Alexander Barkov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
parent
a5f4412b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
120 additions
and
232 deletions
+120
-232
mysql-test/r/ctype_utf32.result
mysql-test/r/ctype_utf32.result
+18
-0
mysql-test/t/ctype_utf32.test
mysql-test/t/ctype_utf32.test
+17
-0
strings/ctype-ucs2.c
strings/ctype-ucs2.c
+31
-232
unittest/strings/strings-t.c
unittest/strings/strings-t.c
+54
-0
No files found.
mysql-test/r/ctype_utf32.result
View file @
e4f8cea3
...
...
@@ -2206,3 +2206,21 @@ DEALLOCATE PREPARE stmt;
#
# End of 10.0 tests
#
#
# Start of 10.1 tests
#
#
# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
#
CREATE TABLE t1 (a VARCHAR(10) CHARACTER SET utf32, KEY(a));
INSERT INTO t1 VALUES (0x10000),(0x10001),(0x10002);
SELECT COUNT(DISTINCT a) FROM t1;
COUNT(DISTINCT a)
1
DROP TABLE t1;
SELECT _utf32 0x10001=_utf32 0x10002;
_utf32 0x10001=_utf32 0x10002
1
#
# End of 10.1 tests
#
mysql-test/t/ctype_utf32.test
View file @
e4f8cea3
...
...
@@ -956,3 +956,20 @@ DEALLOCATE PREPARE stmt;
--
echo
# End of 10.0 tests
--
echo
#
--
echo
#
--
echo
# Start of 10.1 tests
--
echo
#
--
echo
#
--
echo
# MDEV-8419 utf32: compare broken bytes as "greater than any non-broken character"
--
echo
#
# Make sure that all non-BMP characters are compared as equal
CREATE
TABLE
t1
(
a
VARCHAR
(
10
)
CHARACTER
SET
utf32
,
KEY
(
a
));
INSERT
INTO
t1
VALUES
(
0x10000
),(
0x10001
),(
0x10002
);
SELECT
COUNT
(
DISTINCT
a
)
FROM
t1
;
DROP
TABLE
t1
;
SELECT
_utf32
0x10001
=
_utf32
0x10002
;
--
echo
#
--
echo
# End of 10.1 tests
--
echo
#
strings/ctype-ucs2.c
View file @
e4f8cea3
...
...
@@ -1892,6 +1892,34 @@ struct charset_info_st my_charset_utf16le_bin=
*/
/*
  True when (b0,b1) can start a utf32 code unit in the Unicode range:
  the first byte must be zero and the second one at most 0x10,
  i.e. the assembled value cannot exceed 0x10FFFF.
*/
#define IS_UTF32_MBHEAD4(b0,b1) (!(b0) && ((uchar) (b1) <= 0x10))
/* Only the two leading bytes decide validity; b2 and b3 are not inspected. */
#define IS_MB4_CHAR(b0,b1,b2,b3) (IS_UTF32_MBHEAD4(b0,b1))
/*
  Assemble a big-endian utf32 value from its four bytes.

  Every byte is converted to my_wc_t before it is shifted: a plain
  "b0 << 24" is evaluated in (signed) int after integer promotion, which is
  undefined for b0 >= 0x80. Arguments are parenthesized so that arbitrary
  expressions can be passed safely.
*/
#define MY_UTF32_WC4(b0,b1,b2,b3) (((my_wc_t) (uchar) (b0) << 24) + \
                                   ((my_wc_t) (uchar) (b1) << 16) + \
                                   ((my_wc_t) (uchar) (b2) << 8) + \
                                   (my_wc_t) (uchar) (b3))
/**
  Sort weight of one utf32 character for the general_ci collation.

  The four bytes are combined with MY_UTF32_WC4(). Code points up to 0xFFFF
  are looked up in my_unicase_default: when the page for the high byte
  exists, the "sort" member of the entry is the weight, otherwise the code
  point itself is. Every code point above 0xFFFF gets the same weight,
  MY_CS_REPLACEMENT_CHARACTER.

  @param b0  First (most significant) byte.
  @param b1  Second byte.
  @param b2  Third byte.
  @param b3  Fourth (least significant) byte.

  @return The weight as int.
*/
static inline int
my_weight_utf32_general_ci(uchar b0, uchar b1, uchar b2, uchar b3)
{
  my_wc_t code= MY_UTF32_WC4(b0, b1, b2, b3);
  MY_UNICASE_CHARACTER *plane;

  /* Outside the BMP: all characters share one weight */
  if (code > 0xFFFF)
    return MY_CS_REPLACEMENT_CHARACTER;

  plane= my_unicase_default.page[code >> 8];
  if (!plane)
    return (int) code;

  return (int) plane[code & 0xFF].sort;
}
/*
  Instantiate the comparison functions for utf32_general_ci.
  MY_FUNCTION_NAME(x) pastes "x" into my_<x>_utf32_general_ci; the names
  my_strnncoll_utf32_general_ci and my_strnncollsp_utf32_general_ci are
  referenced by the collation handler further down, so they are presumably
  produced by the strcoll.ic template (confirm against strcoll.ic).
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
/*
  Weight of a broken byte: 0xFF0000 plus the byte value. This is above
  0x10FFFF, the largest value IS_UTF32_MBHEAD4() accepts, so a broken byte
  compares greater than any well-formed character.
*/
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
/* Weight of a well-formed 4-byte character: case-insensitive sort weight */
#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.ic"
/*
  Same template for utf32_bin. The macros are defined again without an
  #undef here, so strcoll.ic is presumably responsible for undefining them
  (NOTE(review): confirm).
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
/* Binary collation: the weight is the code point value itself */
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) MY_UTF32_WC4(b0, b1, b2, b3))
#include "strcoll.ic"
/* The character-classification helpers are not needed past this point */
#undef IS_MB2_CHAR
#undef IS_MB4_CHAR
static
int
my_utf32_uni
(
CHARSET_INFO
*
cs
__attribute__
((
unused
)),
...
...
@@ -1899,7 +1927,7 @@ my_utf32_uni(CHARSET_INFO *cs __attribute__((unused)),
{
if
(
s
+
4
>
e
)
return
MY_CS_TOOSMALL4
;
*
pwc
=
(
s
[
0
]
<<
24
)
+
(
s
[
1
]
<<
16
)
+
(
s
[
2
]
<<
8
)
+
(
s
[
3
]);
*
pwc
=
MY_UTF32_WC4
(
s
[
0
],
s
[
1
],
s
[
2
],
s
[
3
]);
return
*
pwc
>
0x10FFFF
?
MY_CS_ILSEQ
:
4
;
}
...
...
@@ -2029,144 +2057,6 @@ my_casedn_utf32(CHARSET_INFO *cs, char *src, size_t srclen,
}
/**
  Compare two utf32 strings character by character after mapping each
  character through my_tosort_utf32().

  Both sides are decoded with my_utf32_uni(). If either decode returns a
  value <= 0, the not yet consumed remainders of both strings are compared
  with my_bincmp() and that result is returned.

  @param cs           Character set; cs->caseinfo is handed to
                      my_tosort_utf32().
  @param s            First string.
  @param slen         Length of 's' in bytes.
  @param t            Second string.
  @param tlen         Length of 't' in bytes.
  @param t_is_prefix  When set and no differing character was found, the
                      result is (t - te): zero if 't' was consumed
                      completely, negative otherwise.

  @return 1 or -1 at the first differing character; otherwise the value
          described for t_is_prefix, or the difference of the remaining
          byte counts of 's' and 't'.
*/
static int
my_strnncoll_utf32(CHARSET_INFO *cs,
                   const uchar *s, size_t slen,
                   const uchar *t, size_t tlen,
                   my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  MY_UNICASE_INFO *folding= cs->caseinfo;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Undecodable input on either side: fall back to a byte comparison */
    if (!(left_len > 0 && right_len > 0))
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf32(folding, &left_wc);
    my_tosort_utf32(folding, &right_wc);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Compare two utf32 strings, treating trailing spaces as insignificant.

  The common part is compared character by character after mapping through
  my_tosort_utf32(). If one string is longer, its remaining characters are
  compared against the space character, as if the shorter string had been
  padded with spaces. Consequently:
    "a"   == "a "
    "a\0" <  "a"
    "a\0" <  "a "

  If a character in the common part cannot be decoded (my_utf32_uni()
  returns <= 0), the remainders are compared with my_bincmp().

  @param cs        Character set pointer; cs->caseinfo is handed to
                   my_tosort_utf32().
  @param s         First string to compare.
  @param slen      Length of 's' in bytes; asserted to be a multiple of 4.
  @param t         Second string to compare.
  @param tlen      Length of 't' in bytes; asserted to be a multiple of 4.
  @param diff_if_only_endspace_difference
                   When set, strings that differ only in trailing spaces are
                   reported as different. Forced to FALSE unless
                   VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE is
                   defined.

  @return Comparison result.
  @retval Negative number, if s is less than t.
  @retval 0, if s is equal to t.
  @retval Positive number, if s is greater than t.
*/
static int
my_strnncollsp_utf32(CHARSET_INFO *cs,
                     const uchar *s, size_t slen,
                     const uchar *t, size_t tlen,
                     my_bool diff_if_only_endspace_difference)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;
  const uchar *tail, *tail_end;
  MY_UNICASE_INFO *folding= cs->caseinfo;
  size_t s_left, t_left;
  int longer_sign, verdict, step;

  DBUG_ASSERT((slen % 4) == 0);
  DBUG_ASSERT((tlen % 4) == 0);

#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
  diff_if_only_endspace_difference= FALSE;
#endif

  /* Compare the part both strings have in common */
  while (s < s_end && t < t_end)
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Incorrect string, compare bytewise */
    if (!(left_len > 0 && right_len > 0))
      return my_bincmp(s, s_end, t, t_end);

    my_tosort_utf32(folding, &left_wc);
    my_tosort_utf32(folding, &right_wc);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  s_left= (size_t) (s_end - s);
  t_left= (size_t) (t_end - t);
  if (s_left == t_left)
    return 0;

  /* Pick the longer string; its tail is compared against spaces */
  verdict= diff_if_only_endspace_difference ? 1 : 0;
  if (s_left < t_left)
  {
    tail= t;
    tail_end= t_end;
    longer_sign= -1;
    verdict= -verdict;
  }
  else
  {
    tail= s;
    tail_end= s_end;
    longer_sign= 1;
  }

  while (tail < tail_end)
  {
    step= my_utf32_uni(cs, &left_wc, tail, tail_end);
    if (step < 0)
    {
      DBUG_ASSERT(0);
      return 0;
    }
    if (left_wc != ' ')
      return (left_wc < ' ') ? -longer_sign : longer_sign;
    tail+= step;
  }
  return verdict;
}
static
uint
my_ismbchar_utf32
(
CHARSET_INFO
*
cs
__attribute__
((
unused
)),
const
char
*
b
,
...
...
@@ -2578,97 +2468,6 @@ my_wildcmp_utf32_bin(CHARSET_INFO *cs,
}
/**
  Compare two utf32 strings by code point value, without case folding.

  Both sides are decoded with my_utf32_uni(). If either decode returns a
  value <= 0, the not yet consumed remainders of both strings are compared
  with my_bincmp() and that result is returned.

  @param cs           Character set, passed through to my_utf32_uni().
  @param s            First string.
  @param slen         Length of 's' in bytes.
  @param t            Second string.
  @param tlen         Length of 't' in bytes.
  @param t_is_prefix  When set and no differing character was found, the
                      result is (t - te): zero if 't' was consumed
                      completely, negative otherwise.

  @return 1 or -1 at the first differing character; otherwise the value
          described for t_is_prefix, or the difference of the remaining
          byte counts of 's' and 't'.
*/
static int
my_strnncoll_utf32_bin(CHARSET_INFO *cs,
                       const uchar *s, size_t slen,
                       const uchar *t, size_t tlen,
                       my_bool t_is_prefix)
{
  my_wc_t UNINIT_VAR(left_wc), UNINIT_VAR(right_wc);
  const uchar *s_end= s + slen;
  const uchar *t_end= t + tlen;

  for ( ; s < s_end && t < t_end; )
  {
    int left_len= my_utf32_uni(cs, &left_wc, s, s_end);
    int right_len= my_utf32_uni(cs, &right_wc, t, t_end);

    /* Undecodable input on either side: fall back to a byte comparison */
    if (!(left_len > 0 && right_len > 0))
      return my_bincmp(s, s_end, t, t_end);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= left_len;
    t+= right_len;
  }

  if (t_is_prefix)
    return (int) (t - t_end);
  return (int) ((s_end - s) - (t_end - t));
}
/**
  Read four bytes at 's' as one big-endian value.

  No bounds or range checking is done here; the caller must make sure that
  four bytes are readable.

  @param s  Pointer to the first of the four bytes.

  @return The bytes combined, most significant byte first.
*/
static inline my_wc_t
my_utf32_get(const uchar *s)
{
  my_wc_t code= 0;
  int i;

  for (i= 0; i < 4; i++)
    code= (code << 8) + s[i];
  return code;
}
/**
  Compare two utf32 strings by raw 4-byte values, treating trailing spaces
  as insignificant.

  The common part is compared unit by unit using my_utf32_get(). If one
  string is longer, its remaining units are compared against the space
  character, as if the shorter string had been padded with spaces.

  Both lengths are expected to be multiples of 4. This is only checked by
  DBUG_ASSERT; the loops below advance in fixed steps of 4 bytes.

  @param cs    Unused.
  @param s     First string.
  @param slen  Length of 's' in bytes.
  @param t     Second string.
  @param tlen  Length of 't' in bytes.
  @param diff_if_only_endspace_difference  Unused.

  @return -1, 0 or 1.
*/
static int
my_strnncollsp_utf32_bin(CHARSET_INFO *cs __attribute__((unused)),
                         const uchar *s, size_t slen,
                         const uchar *t, size_t tlen,
                         my_bool diff_if_only_endspace_difference
                         __attribute__((unused)))
{
  const uchar *tail, *tail_end;
  size_t left;
  int longer_sign;

  DBUG_ASSERT((slen % 4) == 0);
  DBUG_ASSERT((tlen % 4) == 0);

  /* Compare the part both strings have in common */
  left= MY_MIN(slen, tlen);
  while (left)
  {
    my_wc_t left_wc= my_utf32_get(s);
    my_wc_t right_wc= my_utf32_get(t);

    if (left_wc > right_wc)
      return 1;
    if (left_wc < right_wc)
      return -1;

    s+= 4;
    t+= 4;
    left-= 4;
  }

  if (slen == tlen)
    return 0;

  /* Pick the longer string; its tail is compared against spaces */
  if (slen < tlen)
  {
    tail= t;
    tail_end= t + tlen - MY_MIN(slen, tlen);
    longer_sign= -1;
  }
  else
  {
    tail= s;
    tail_end= s + slen - MY_MIN(slen, tlen);
    longer_sign= 1;
  }

  for ( ; tail < tail_end; tail+= 4)
  {
    my_wc_t tail_wc= my_utf32_get(tail);

    if (tail_wc != ' ')
      return (tail_wc < ' ') ? -longer_sign : longer_sign;
  }
  return 0;
}
static
size_t
my_scan_utf32
(
CHARSET_INFO
*
cs
,
const
char
*
str
,
const
char
*
end
,
int
sequence_type
)
...
...
@@ -2696,8 +2495,8 @@ my_scan_utf32(CHARSET_INFO *cs,
static
MY_COLLATION_HANDLER
my_collation_utf32_general_ci_handler
=
{
NULL
,
/* init */
my_strnncoll_utf32
,
my_strnncollsp_utf32
,
my_strnncoll_utf32
_general_ci
,
my_strnncollsp_utf32
_general_ci
,
my_strnxfrm_unicode
,
my_strnxfrmlen_unicode
,
my_like_range_generic
,
...
...
unittest/strings/strings-t.c
View file @
e4f8cea3
...
...
@@ -537,6 +537,55 @@ static STRNNCOLL_PARAM strcoll_utf16le_general_ci[]=
{
CSTR
(
"
\x00\xD8\x00\xDC
"
),
CSTR
(
"
\xFF\xDB\xFF\xDF
"
),
0
},
/* Non-BMP MB4 vs non-BMP MB4 */
{
CSTR
(
"
\x00\x00
"
),
CSTR
(
"
\x00\xD8\x01\xDC
"
),
-
1
},
/* U+0000 vs non-BMP MB4 */
{
CSTR
(
"
\x00\x00
"
),
CSTR
(
"
\xFF\xDB\xFF\xDF
"
),
-
1
},
/* U+0000 vs non-BMP MB4 */
{
NULL
,
0
,
NULL
,
0
,
0
}
};
static
STRNNCOLL_PARAM
strcoll_utf32_common
[]
=
{
/* Minimum character: U+0000 == _utf32 0x00000000 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\xFF
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00\x20\x00\x00
"
),
-
1
},
/* MB4 vs broken MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\xFF\xFF\xFF\xFF
"
),
-
1
},
/* MB4 vs broken MB4 */
/* Minimum non-BMP character: U+10000 == _utf32 0x00010000 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\xFF
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\x00\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\x00\x20\x00\x00
"
),
-
1
},
/* MB4 vs broken MB4 */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\xFF\xFF\xFF\xFF
"
),
-
1
},
/* MB4 vs broken MB4 */
/* Maximum character: U+10FFFF == _utf32 0x0010FFFF */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\xFF
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\x00\x00\x00
"
),
-
1
},
/* MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\x20\x00\x00\x00
"
),
-
1
},
/* MB4 vs broken MB4 */
{
CSTR
(
"
\x00\x10\xFF\xFF
"
),
CSTR
(
"
\xFF\xFF\xFF\xFF
"
),
-
1
},
/* MB4 vs broken MB4 */
/* Broken MB4 vs incomplete/broken MB4 */
{
CSTR
(
"
\x00\x20\x00\x00
"
),
CSTR
(
"
\x00
"
),
1
},
/* Broken MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x20\x00\x00
"
),
CSTR
(
"
\x00\x00
"
),
1
},
/* Broken MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x20\x00\x00
"
),
CSTR
(
"
\x00\x00\x00
"
),
1
},
/* Broken MB4 vs incomplete MB4 */
{
CSTR
(
"
\x00\x20\x00\x00
"
),
CSTR
(
"
\x00\x20\x00\x01
"
),
-
1
},
/* Broken MB4 vs broken MB4 */
{
NULL
,
0
,
NULL
,
0
,
0
}
};
static
STRNNCOLL_PARAM
strcoll_utf32_general_ci
[]
=
{
/* Two non-BMP characters are compared as equal */
{
CSTR
(
"
\x00\x01\x00\x00
"
),
CSTR
(
"
\x00\x01\x00\x01
"
),
0
},
/* non-BMP MB4 vs non-BMP MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00\x01\x00\x00
"
),
-
1
},
/* U+0000 vs non-BMP MB4 */
{
CSTR
(
"
\x00\x00\x00\x00
"
),
CSTR
(
"
\x00\x01\x00\x01
"
),
-
1
},
/* U+0000 vs non-BMP MB4 */
{
NULL
,
0
,
NULL
,
0
,
0
}
};
...
...
@@ -688,6 +737,11 @@ test_strcollsp()
failed
+=
strcollsp
(
&
my_charset_utf16le_bin
,
strcoll_utf16le_space
);
failed
+=
strcollsp
(
&
my_charset_utf16le_bin
,
strcoll_utf16le_common
);
#endif
#ifdef HAVE_CHARSET_utf32
failed
+=
strcollsp
(
&
my_charset_utf32_general_ci
,
strcoll_utf32_common
);
failed
+=
strcollsp
(
&
my_charset_utf32_general_ci
,
strcoll_utf32_general_ci
);
failed
+=
strcollsp
(
&
my_charset_utf32_bin
,
strcoll_utf32_common
);
#endif
#ifdef HAVE_CHARSET_utf8
failed
+=
strcollsp
(
&
my_charset_utf8_general_ci
,
strcoll_utf8mb3_common
);
failed
+=
strcollsp
(
&
my_charset_utf8_general_mysql500_ci
,
strcoll_utf8mb3_common
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment