Commit aed03906 authored by tomas@poseidon.ndb.mysql.com's avatar tomas@poseidon.ndb.mysql.com

Merge tulin@bk-internal.mysql.com:/home/bk/mysql-4.1

into poseidon.ndb.mysql.com:/home/tomas/mysql-4.1-ndb
parents 60cf2213 67c9fc98
...@@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *, ...@@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *,
const char *s, uint s_length, const char *s, uint s_length,
my_match_t *match, uint nmatch); my_match_t *match, uint nmatch);
/*
  Wildcard (LIKE) comparison shared by the UTF8 and UCS2 collations.
  Characters are decoded through cs->cset->mb_wc.

  escape, w_one, w_many  - codes of the escape character and of the
                           "one character" / "many characters" wildcards
  weights                - per-plane weight tables (e.g. uni_plane), or NULL
                           to compare the decoded code points unchanged

  Returns 0 if str matches wildstr, non-zero otherwise.
*/
int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str, const char *str_end,
const char *wildstr, const char *wildend,
int escape, int w_one, int w_many,
MY_UNICASE_INFO **weights);
extern my_bool my_parse_charset_xml(const char *bug, uint len, extern my_bool my_parse_charset_xml(const char *bug, uint len,
int (*add)(CHARSET_INFO *cs)); int (*add)(CHARSET_INFO *cs));
......
...@@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin; ...@@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin;
select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
_utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%') _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%')
1 1
select convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8);
convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8)
1
select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
1
select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
1
SELECT 'a' = 'a '; SELECT 'a' = 'a ';
'a' = 'a ' 'a' = 'a '
1 1
......
...@@ -33,6 +33,14 @@ select 'A' like 'a'; ...@@ -33,6 +33,14 @@ select 'A' like 'a';
select 'A' like 'a' collate utf8_bin; select 'A' like 'a' collate utf8_bin;
select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
# Bug #6040: can't retrieve records with umlaut
# characters in case insensitive manner.
# Case insensitive search LIKE comparison
# was broken for multibyte characters:
select convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8);
select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
# #
# Check the following: # Check the following:
# "a" == "a " # "a" == "a "
......
CHARSET_INFO
============
A structure containing data for charset+collation pair implementation.
Virtual functions which use this data are collected
into separate structures MY_CHARSET_HANDLER and
MY_COLLATION_HANDLER.
typedef struct charset_info_st
{
uint number;
uint primary_number;
uint binary_number;
uint state;
const char *csname;
const char *name;
const char *comment;
uchar *ctype;
uchar *to_lower;
uchar *to_upper;
uchar *sort_order;
uint16 *tab_to_uni;
MY_UNI_IDX *tab_from_uni;
uchar state_map[256];
uchar ident_map[256];
uint strxfrm_multiply;
uint mbminlen;
uint mbmaxlen;
char max_sort_char; /* For LIKE optimization */
MY_CHARSET_HANDLER *cset;
MY_COLLATION_HANDLER *coll;
} CHARSET_INFO;
CHARSET_INFO fields description:
===============================
Numbers (identifiers)
---------------------
number - an ID uniquely identifying this charset+collation pair.
primary_number - ID of a charset+collation pair, which consists
of the same character set and the default collation of this
character set. Not really used now. Intended to optimize some
parts of the code where we need to find the default collation
using its non-default counterpart for the given character set.
binary_number - ID of a charset+collation pair, which consists
of the same character set and the binary collation of this
character set. Not really used now. Intended to optimize
"SELECT BINARY x" in the future.
Names
-----
csname - name of the character set for this charset+collation pair.
name - name of the collation for this charset+collation pair.
comment - a text comment, displayed in the "Description" column of
SHOW CHARACTER SET output.
Conversion tables
-----------------
ctype - pointer to array[257] of "type of characters"
bit mask for each character, e.g. whether a
character is a digit or a letter or a separator, etc.
to_lower - pointer to array[256] used in LCASE()
to_upper - pointer to array[256] used in UCASE()
sort_order - pointer to array[256] used for strings comparison
Unicode conversion data
-----------------------
For 8bit character sets:
tab_to_uni : array[256] of charset->Unicode translation
tab_from_uni: a structure for Unicode->charset translation
Non-8 bit charsets have their own structures per charset
hidden in the corresponding ctype-xxx.c file and don't use
tab_to_uni and tab_from_uni tables.
Parser maps
-----------
state_map[]
ident_map[]
These maps are used to quickly identify whether a character is
part of an identifier, a digit, a special character,
or part of another SQL lexical item.
Probably can be combined with ctype array in the future.
But for some reasons these two arrays are used in the parser,
while a separate ctype[] array is used in the other part of the
code, like fulltext, etc.
Misc fields
-----------
strxfrm_multiply - how many times a sort key (i.e. a string
which can be passed into memcmp() for comparison)
can be longer than the original string.
Usually it is 1. For some complex
collations it can be bigger. For example
in latin1_german2_ci, a sort key can be up to
twice as long as the original string.
e.g. Letter 'A' with two dots above is
substituted with 'AE'.
mbminlen - minimum multibyte sequence length.
Currently always 1, except for ucs2,
where it is 2.
mbmaxlen - maximum multibyte sequence length.
1 for 8bit charsets. Can be also 2 or 3.
MY_CHARSET_HANDLER
==================
MY_CHARSET_HANDLER is a collection of character-set
related routines. It is defined in m_ctype.h and has the
following set of functions:
Multibyte routines
------------------
ismbchar() - detects if the given string is a multibyte sequence
mbcharlen() - returns the length of a multibyte sequence starting with
the given character
numchars() - returns number of characters in the given string, e.g.
in SQL function CHAR_LENGTH().
charpos() - calculates the offset of the given position in the string.
Used in SQL functions LEFT(), RIGHT(), SUBSTRING(),
INSERT()
well_formed_length()
- finds the length of the correctly formed multibyte beginning.
Used in INSERTs to cut a beginning of the given string
which is
a) "well formed" according to the given character set.
b) can fit into the given data type
Terminates the string in a good position, taking into account
multibyte character boundaries.
lengthsp() - returns the length of the given string without trailing spaces.
Unicode conversion routines
---------------------------
mb_wc - converts the leading multibyte sequence into its Unicode code.
wc_mb - converts the given Unicode code into a multibyte sequence.
Case and sort conversion
------------------------
caseup_str - converts the given 0-terminated string into the upper case
casedn_str - converts the given 0-terminated string into the lower case
caseup - converts the given string into the upper case using length
casedn - converts the given string into the lower case using length
Number-to-string conversion routines
------------------------------------
snprintf()
long10_to_str()
longlong10_to_str()
The names are pretty self-explanatory.
String padding routines
-----------------------
fill() - writes the given Unicode value into the given string
with the given length. Used to pad the string, usually
with space character, according to the given charset.
String-to-number conversion routines
------------------------------------
strntol()
strntoul()
strntoll()
strntoull()
strntod()
These functions do almost the same thing as their
STDLIB counterparts, but they also:
- accept a length instead of a 0-terminator
- and are character set dependent
Simple scanner routines
-----------------------
scan() - to skip leading spaces in the given string.
Used when a string value is inserted into a numeric field.
MY_COLLATION_HANDLER
====================
strnncoll() - compares two strings according to the given collation
strnncollsp() - like the above but ignores trailing spaces
strnxfrm() - makes a sort key suitable for memcmp() corresponding
to the given string
like_range() - creates a LIKE range, for optimizer
wildcmp() - wildcard comparison, for LIKE
strcasecmp() - 0-terminated string comparison
instr() - finds the first appearance of a substring in the string
hash_sort() - calculates hash value taking into account
the collation rules, e.g. case-insensitivity,
accent sensitivity, etc.
\ No newline at end of file
...@@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)), ...@@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
} }
/*
** Compare string against string with wildcard
** 0 if matched
** -1 if not matched with wildcard
** 1 if matched with wildcard
*/
static
int my_wildcmp_ucs2(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many,
MY_UNICASE_INFO **weights)
{
int result= -1; /* Not found, using wildcards */
my_wc_t s_wc, w_wc;
int scan, plane;
while (wildstr != wildend)
{
while (1)
{
scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (scan <= 0)
return 1;
if (w_wc == (my_wc_t)escape)
{
wildstr+= scan;
scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (scan <= 0)
return 1;
}
if (w_wc == (my_wc_t)w_many)
{
result= 1; /* Found an anchor char */
break;
}
wildstr+= scan;
scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end);
if (scan <=0)
return 1;
str+= scan;
if (w_wc == (my_wc_t)w_one)
{
result= 1; /* Found an anchor char */
}
else
{
if (weights)
{
plane=(s_wc>>8) & 0xFF;
s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
plane=(w_wc>>8) & 0xFF;
w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
}
if (s_wc != w_wc)
return 1; /* No match */
}
if (wildstr == wildend)
return (str != str_end); /* Match if both are at end */
}
if (w_wc == (my_wc_t)w_many)
{ /* Found w_many */
/* Remove any '%' and '_' from the wild search string */
for ( ; wildstr != wildend ; )
{
scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (scan <= 0)
return 1;
if (w_wc == (my_wc_t)w_many)
{
wildstr+= scan;
continue;
}
if (w_wc == (my_wc_t)w_one)
{
wildstr+= scan;
scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end);
if (scan <=0)
return 1;
str+= scan;
continue;
}
break; /* Not a wild character */
}
if (wildstr == wildend)
return 0; /* Ok if w_many is last */
if (str == str_end)
return -1;
scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (scan <= 0)
return 1;
if (w_wc == (my_wc_t)escape)
{
wildstr+= scan;
scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (scan <= 0)
return 1;
}
while (1)
{
/* Skip until the first character from wildstr is found */
while (str != str_end)
{
scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str,
(const uchar*)str_end);
if (scan <= 0)
return 1;
if (weights)
{
plane=(s_wc>>8) & 0xFF;
s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
plane=(w_wc>>8) & 0xFF;
w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
}
if (s_wc == w_wc)
break;
str+= scan;
}
if (str == str_end)
return -1;
result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape,
w_one,w_many,weights);
if (result <= 0)
return result;
str+= scan;
}
}
}
return (str != str_end ? 1 : 0);
}
static static
int my_wildcmp_ucs2_ci(CHARSET_INFO *cs, int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
const char *str,const char *str_end, const char *str,const char *str_end,
const char *wildstr,const char *wildend, const char *wildstr,const char *wildend,
int escape, int w_one, int w_many) int escape, int w_one, int w_many)
{ {
return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
escape,w_one,w_many,uni_plane); escape,w_one,w_many,uni_plane);
} }
...@@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs, ...@@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
const char *wildstr,const char *wildend, const char *wildstr,const char *wildend,
int escape, int w_one, int w_many) int escape, int w_one, int w_many)
{ {
return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend, return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
escape,w_one,w_many,NULL); escape,w_one,w_many,NULL);
} }
......
...@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={ ...@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={
}; };
/*
** Compare string against string with wildcard
** This function is used in UTF8 and UCS2
**
** 0 if matched
** -1 if not matched with wildcard
** 1 if matched with wildcard
*/
int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str,const char *str_end,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many,
MY_UNICASE_INFO **weights)
{
int result= -1; /* Not found, using wildcards */
my_wc_t s_wc, w_wc;
int scan, plane;
int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
const unsigned char *s,const unsigned char *e);
mb_wc= cs->cset->mb_wc;
while (wildstr != wildend)
{
while (1)
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
if (w_wc == (my_wc_t)escape)
{
wildstr+= scan;
if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
}
if (w_wc == (my_wc_t)w_many)
{
result= 1; /* Found an anchor char */
break;
}
wildstr+= scan;
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <=0)
return 1;
str+= scan;
if (w_wc == (my_wc_t)w_one)
{
result= 1; /* Found an anchor char */
}
else
{
if (weights)
{
plane=(s_wc>>8) & 0xFF;
s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
plane=(w_wc>>8) & 0xFF;
w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
}
if (s_wc != w_wc)
return 1; /* No match */
}
if (wildstr == wildend)
return (str != str_end); /* Match if both are at end */
}
if (w_wc == (my_wc_t)w_many)
{ /* Found w_many */
/* Remove any '%' and '_' from the wild search string */
for ( ; wildstr != wildend ; )
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
if (w_wc == (my_wc_t)w_many)
{
wildstr+= scan;
continue;
}
if (w_wc == (my_wc_t)w_one)
{
wildstr+= scan;
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <=0)
return 1;
str+= scan;
continue;
}
break; /* Not a wild character */
}
if (wildstr == wildend)
return 0; /* Ok if w_many is last */
if (str == str_end)
return -1;
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <=0)
return 1;
if (w_wc == (my_wc_t)escape)
{
wildstr+= scan;
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <=0)
return 1;
}
while (1)
{
/* Skip until the first character from wildstr is found */
while (str != str_end)
{
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <=0)
return 1;
if (weights)
{
plane=(s_wc>>8) & 0xFF;
s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
plane=(w_wc>>8) & 0xFF;
w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
}
if (s_wc == w_wc)
break;
str+= scan;
}
if (str == str_end)
return -1;
result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
escape, w_one, w_many,
weights);
if (result <= 0)
return result;
str+= scan;
}
}
}
return (str != str_end ? 1 : 0);
}
#endif #endif
...@@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t) ...@@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
return my_strncasecmp_utf8(cs, s, t, len); return my_strncasecmp_utf8(cs, s, t, len);
} }
/*
  LIKE comparison used by the case insensitive UTF8 collation handler.
  Delegates to the generic Unicode matcher, with uni_plane supplying
  the character weights.
*/
static
int my_wildcmp_utf8(CHARSET_INFO *cs,
                    const char *str,const char *str_end,
                    const char *wildstr,const char *wildend,
                    int escape, int w_one, int w_many)
{
  MY_UNICASE_INFO **weight_tables= uni_plane;
  int cmp_result;

  cmp_result= my_wildcmp_unicode(cs, str, str_end,
                                 wildstr, wildend,
                                 escape, w_one, w_many,
                                 weight_tables);
  return cmp_result;
}
static int my_strnxfrm_utf8(CHARSET_INFO *cs, static int my_strnxfrm_utf8(CHARSET_INFO *cs,
uchar *dst, uint dstlen, uchar *dst, uint dstlen,
const uchar *src, uint srclen) const uchar *src, uint srclen)
...@@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = ...@@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
my_strnncollsp_utf8, my_strnncollsp_utf8,
my_strnxfrm_utf8, my_strnxfrm_utf8,
my_like_range_mb, my_like_range_mb,
my_wildcmp_mb, my_wildcmp_utf8,
my_strcasecmp_utf8, my_strcasecmp_utf8,
my_instr_mb, my_instr_mb,
my_hash_sort_utf8 my_hash_sort_utf8
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment