Commit d00be2f3 authored by Michael Widenius

Added more general support for sorting 2 characters as one (contractions)

Added support for Croatian sorting orders utf8_croatian_ci and ucs2_croatian_ci.
Patch done by Alexander Barkov. See http://www.collation-charts.org/articles/croatian.htm

mysql-test/r/ctype_uca.result:
  Added testing of Croatian sort order
mysql-test/t/ctype_uca.test:
  Added testing of Croatian sort order
parent ad3237ed
......@@ -49,6 +49,24 @@ typedef struct unicase_info_st
extern MY_UNICASE_INFO *my_unicase_default[256];
extern MY_UNICASE_INFO *my_unicase_turkish[256];
/* Array size of MY_CONTRACTION.ch (the character sequence of a contraction) */
#define MY_UCA_MAX_CONTRACTION 4
/* Array size of MY_CONTRACTION.weight (the 0-terminated weight string) */
#define MY_UCA_MAX_WEIGHT_SIZE 8
/*
  A single contraction: a sequence of characters that is sorted as one
  unit, together with the weight string assigned to that sequence.
*/
typedef struct my_contraction_t
{
my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */
} MY_CONTRACTION;
/*
  A list of contractions plus a table of per-character flags.
  NOTE(review): the size and indexing of "flags" are not visible in this
  header; confirm against the code that allocates and reads it.
*/
typedef struct my_contraction_list_t
{
size_t nitems; /* Number of items in the list */
MY_CONTRACTION *item; /* List of contractions */
char *flags; /* Character flags, e.g. "is contraction head" */
} MY_CONTRACTIONS;
typedef struct uni_ctype_st
{
uchar pctype;
......@@ -262,7 +280,7 @@ typedef struct charset_info_st
uchar *to_lower;
uchar *to_upper;
uchar *sort_order;
uint16 *contractions;
MY_CONTRACTIONS *contractions;
uint16 **sort_order_big;
uint16 *tab_to_uni;
MY_UNI_IDX *tab_from_uni;
......@@ -475,6 +493,13 @@ my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
uint my_charset_repertoire(CHARSET_INFO *cs);
/*
  Contraction lookup helpers.
  NOTE(review): the descriptions below are derived from how
  my_like_range_mb() and my_like_range_ucs2() call these functions;
  the definitions are not visible here, so confirm before relying on them.
*/
/* Callers use the result to decide whether to do any contraction checks. */
my_bool my_uca_have_contractions(CHARSET_INFO *cs);
/* Callers treat a true result as "wc may start a contraction". */
my_bool my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc);
/* Callers treat a true result as "wc may be the second part of a contraction". */
my_bool my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc);
/* Callers treat a non-NULL result as "wc1 followed by wc2 is a contraction". */
uint16 *my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2);
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
......
......@@ -159,6 +159,7 @@ insert into t1 values (_ucs2 0x01fc),(_ucs2 0x01fd),(_ucs2 0x01fe),(_ucs2 0x01ff
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
......@@ -181,7 +182,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -286,7 +287,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Ð,ð
Đ,đ
Ɖ
......@@ -400,6 +401,7 @@ CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DŽ,Dž,dŽ,dž
Đ,đ
Ɖ
Ɗ
......@@ -513,7 +515,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -622,6 +624,7 @@ CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DŽ,Dž,dŽ,dž
Đ,đ
Ɖ
Ɗ
......@@ -729,7 +732,7 @@ CH,Ch,cH,ch
Ć,ć
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -840,6 +843,7 @@ CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz
DŽ,Dž,dŽ,dž
DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
......@@ -951,7 +955,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1056,7 +1060,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1164,7 +1168,7 @@ CH,Ch,cH,ch
Ç,ç
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1275,6 +1279,7 @@ cH
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DŽ,Dž,dŽ,dž
Đ,đ
Ɖ
Ɗ
......@@ -1382,7 +1387,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1491,6 +1496,7 @@ cH
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DŽ,Dž,dŽ,dž
Đ,đ
Ɖ
Ɗ
......@@ -1599,6 +1605,7 @@ cH
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DŽ,Dž,dŽ,dž
Đ,đ
Ɖ
Ɗ
......@@ -1707,7 +1714,7 @@ cH
CH,Ch,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1813,7 +1820,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -1921,7 +1928,7 @@ CH,Ch,cH,ch
Ĉ,ĉ
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -2030,7 +2037,7 @@ C,c,Ç,ç,Ć,ć,Ĉ,ĉ,Ċ,ċ,Č,č
CH,Ch,cH,ch
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DŽ,Dž,dž,DZ,Dz,dz
DZ,Dz,DŽ,Dž,dZ,dz,dŽ,dž,DŽ,Dž,dž,DZ,Dz,dz
Đ,đ
Ɖ
Ɗ
......@@ -2121,6 +2128,118 @@ Z,z,Ź,ź,Ż,ż,Ž,ž
ǁ
ǂ
ǃ
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
group_concat(c1 order by c1)
÷
×
A,a,À,Á,Â,Ã,Ä,Å,à,á,â,ã,ä,å,Ā,ā,Ă,ă,Ą,ą,Ǎ,ǎ,Ǟ,ǟ,Ǡ,ǡ,Ǻ,ǻ
AA,Aa,aA,aa
Æ,æ,Ǣ,ǣ,Ǽ,ǽ
B,b
ƀ
Ɓ
Ƃ,ƃ
C,c,Ç,ç,Ĉ,ĉ,Ċ,ċ
CH,Ch,cH,ch
Č,č
Ć,ć
Ƈ,ƈ
D,d,Ď,ď
DZ,Dz,dZ,dz,DZ,Dz,dz
DŽ,Dž,dž,DŽ,Dž,dž
Đ,đ
Ɖ
Ɗ
Ƌ,ƌ
Ð,ð
E,e,È,É,Ê,Ë,è,é,ê,ë,Ē,ē,Ĕ,ĕ,Ė,ė,Ę,ę,Ě,ě
Ǝ,ǝ
Ə
Ɛ
F,f
Ƒ,ƒ
G,g,Ĝ,ĝ,Ğ,ğ,Ġ,ġ,Ģ,ģ,Ǧ,ǧ,Ǵ,ǵ
Ǥ,ǥ
Ɠ
Ɣ
Ƣ,ƣ
H,h,Ĥ,ĥ
ƕ,Ƕ
Ħ,ħ
I,i,Ì,Í,Î,Ï,ì,í,î,ï,Ĩ,ĩ,Ī,ī,Ĭ,ĭ,Į,į,İ,Ǐ,ǐ
IJ,Ij,iJ,ij,IJ,ij
ı
Ɨ
Ɩ
J,j,Ĵ,ĵ,ǰ
K,k,Ķ,ķ,Ǩ,ǩ
Ƙ,ƙ
L,l,Ĺ,ĺ,Ļ,ļ,Ľ,ľ
Ŀ,ŀ
lJ
LL,Ll,lL,ll
LJ,Lj,lj,LJ,Lj,lj
Ł,ł
ƚ
ƛ
M,m
N,n,Ñ,ñ,Ń,ń,Ņ,ņ,Ň,ň,Ǹ,ǹ
nJ
NJ,Nj,nj,NJ,Nj,nj
Ɲ
ƞ
Ŋ,ŋ
O,o,Ò,Ó,Ô,Õ,Ö,ò,ó,ô,õ,ö,Ō,ō,Ŏ,ŏ,Ő,ő,Ơ,ơ,Ǒ,ǒ,Ǫ,ǫ,Ǭ,ǭ
OE,Oe,oE,oe,Œ,œ
Ø,ø,Ǿ,ǿ
Ɔ
Ɵ
P,p
Ƥ,ƥ
Q,q
ĸ
R,r,Ŕ,ŕ,Ŗ,ŗ,Ř,ř
RR,Rr,rR,rr
Ʀ
S,s,Ś,ś,Ŝ,ŝ,Ş,ş,ſ
SS,Ss,sS,ss,ß
Š,š
Ʃ
ƪ
T,t,Ţ,ţ,Ť,ť
ƾ
Ŧ,ŧ
ƫ
Ƭ,ƭ
Ʈ
U,u,Ù,Ú,Û,Ü,ù,ú,û,ü,Ũ,ũ,Ū,ū,Ŭ,ŭ,Ů,ů,Ű,ű,Ų,ų,Ư,ư,Ǔ,ǔ,Ǖ,ǖ,Ǘ,ǘ,Ǚ,ǚ,Ǜ,ǜ
Ɯ
Ʊ
V,v
Ʋ
W,w,Ŵ,ŵ
X,x
Y,y,Ý,ý,ÿ,Ŷ,ŷ,Ÿ
Ƴ,ƴ
Z,z,Ź,ź,Ż,ż
ƍ
Ž,ž
Ƶ,ƶ
Ʒ,Ǯ,ǯ
Ƹ,ƹ
ƺ
Þ,þ
ƿ,Ƿ
ƻ
Ƨ,ƨ
Ƽ,ƽ
Ƅ,ƅ
ʼn
ǀ
ǁ
ǂ
ǃ
drop table t1;
SET NAMES utf8;
CREATE TABLE t1 (c varchar(255) NOT NULL COLLATE utf8_general_ci, INDEX (c));
......
......@@ -186,6 +186,7 @@ insert into t1 values (_ucs2 0x01fc),(_ucs2 0x01fd),(_ucs2 0x01fe),(_ucs2 0x01ff
insert into t1 values ('AA'),('Aa'),('aa'),('aA');
insert into t1 values ('CH'),('Ch'),('ch'),('cH');
insert into t1 values ('DZ'),('Dz'),('dz'),('dZ');
insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ');
insert into t1 values ('IJ'),('Ij'),('ij'),('iJ');
insert into t1 values ('LJ'),('Lj'),('lj'),('lJ');
insert into t1 values ('LL'),('Ll'),('ll'),('lL');
......@@ -213,6 +214,7 @@ select group_concat(c1 order by c1) from t1 group by c1 collate utf8_spanish2_ci
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_roman_ci;
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_esperanto_ci;
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_hungarian_ci;
select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci;
drop table t1;
......
......@@ -42,6 +42,7 @@ extern CHARSET_INFO my_charset_ucs2_roman_uca_ci;
extern CHARSET_INFO my_charset_ucs2_persian_uca_ci;
extern CHARSET_INFO my_charset_ucs2_esperanto_uca_ci;
extern CHARSET_INFO my_charset_ucs2_hungarian_uca_ci;
extern CHARSET_INFO my_charset_ucs2_croatian_uca_ci;
#endif
#ifdef HAVE_CHARSET_utf8
......@@ -63,6 +64,7 @@ extern CHARSET_INFO my_charset_utf8_roman_uca_ci;
extern CHARSET_INFO my_charset_utf8_persian_uca_ci;
extern CHARSET_INFO my_charset_utf8_esperanto_uca_ci;
extern CHARSET_INFO my_charset_utf8_hungarian_uca_ci;
extern CHARSET_INFO my_charset_utf8_croatian_uca_ci;
#ifdef HAVE_UTF8_GENERAL_CS
extern CHARSET_INFO my_charset_utf8_general_cs;
#endif
......@@ -152,6 +154,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
add_compiled_collation(&my_charset_ucs2_persian_uca_ci);
add_compiled_collation(&my_charset_ucs2_esperanto_uca_ci);
add_compiled_collation(&my_charset_ucs2_hungarian_uca_ci);
add_compiled_collation(&my_charset_ucs2_croatian_uca_ci);
#endif
#endif
......@@ -186,6 +189,7 @@ my_bool init_compiled_charsets(myf flags __attribute__((unused)))
add_compiled_collation(&my_charset_utf8_persian_uca_ci);
add_compiled_collation(&my_charset_utf8_esperanto_uca_ci);
add_compiled_collation(&my_charset_utf8_hungarian_uca_ci);
add_compiled_collation(&my_charset_utf8_croatian_uca_ci);
#endif
#endif
......
......@@ -567,8 +567,7 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
char *min_end= min_str + res_length;
char *max_end= max_str + res_length;
size_t maxcharlen= res_length / cs->mbmaxlen;
const char *contraction_flags= cs->contractions ?
((const char*) cs->contractions) + 0x40*0x40 : NULL;
my_bool have_contractions= my_uca_have_contractions(cs);
for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--)
{
......@@ -636,8 +635,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
'ab\min\min\min\min' and 'ab\max\max\max\max'.
*/
if (contraction_flags && ptr + 1 < end &&
contraction_flags[(uchar) *ptr])
if (have_contractions && ptr + 1 < end &&
my_uca_can_be_contraction_head(cs, (uchar) *ptr))
{
/* Ptr[0] is a contraction head. */
......@@ -659,8 +658,8 @@ my_bool my_like_range_mb(CHARSET_INFO *cs,
is not a contraction, then we put only ptr[0],
and continue with ptr[1] on the next loop.
*/
if (contraction_flags[(uchar) ptr[1]] &&
cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40])
if (my_uca_can_be_contraction_tail(cs, (uchar) ptr[1]) &&
my_uca_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1]))
{
/* Contraction found */
if (maxcharlen == 1 || min_str + 1 >= min_end)
......
This diff is collapsed.
......@@ -1526,8 +1526,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
char *min_org=min_str;
char *min_end=min_str+res_length;
size_t charlen= res_length / cs->mbmaxlen;
const char *contraction_flags= cs->contractions ?
((const char*) cs->contractions) + 0x40*0x40 : NULL;
my_bool have_contractions= my_uca_have_contractions(cs);
for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0
; ptr+=2, charlen--)
......@@ -1567,8 +1566,9 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
return 0;
}
if (contraction_flags && ptr + 3 < end &&
ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]])
if (have_contractions && ptr + 3 < end &&
ptr[0] == '\0' &&
my_uca_can_be_contraction_head(cs, (uchar) ptr[1]))
{
/* Contraction head found */
if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many))
......@@ -1581,8 +1581,9 @@ my_bool my_like_range_ucs2(CHARSET_INFO *cs,
Check if the second letter can be contraction part,
and if two letters really produce a contraction.
*/
if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] &&
cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40])
if (ptr[2] == '\0' &&
my_uca_can_be_contraction_tail(cs, (uchar) ptr[3]) &&
my_uca_contraction2_weight(cs,(uchar) ptr[1], (uchar) ptr[3]))
{
/* Contraction found */
if (charlen == 1 || min_str + 2 >= min_end)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment