Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mariadb
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
mariadb
Commits
23aee562
Commit
23aee562
authored
Jun 03, 2004
by
bar@bar.intranet.mysql.r18.ru
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Unicode collations: WL#916
XML and "collation customization" language parsers.
parent
5ddf741a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
547 additions
and
8 deletions
+547
-8
mysys/charset.c
mysys/charset.c
+496
-6
strings/ctype.c
strings/ctype.c
+51
-2
No files found.
mysys/charset.c
View file @
23aee562
...
...
@@ -21,6 +21,344 @@
#include <my_dir.h>
#include <my_xml.h>
/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
Collation language elements:
Delimiters:
space - skipped
<char> := A-Z | a-z | \uXXXX
Shift command:
<shift> := & - reset at this letter.
Diff command:
<d1> := < - Identifies a primary difference.
<d2> := << - Identifies a secondary difference.
<d3> := <<< - Identifies a tertiary difference.
Collation rules:
<ruleset> := <rule> { <ruleset> }
<rule> := <d1> <string>
| <d2> <string>
| <d3> <string>
| <shift> <char>
<string> := <char> [ <string> ]
An example, Polish collation:
&A < \u0105 <<< \u0104
&C < \u0107 <<< \u0106
&E < \u0119 <<< \u0118
&L < \u0142 <<< \u0141
&N < \u0144 <<< \u0143
&O < \u00F3 <<< \u00D3
&S < \u015B <<< \u015A
&Z < \u017A <<< \u017B
*/
/*
  Kinds of lexem the collation language scanner can report.
  The numeric values are fixed; MY_COLL_LEXEM_EOF must stay 0 because
  the rule parser loops "while the scanner returns non-zero".
*/
typedef enum my_coll_lexem_num_en
{
  MY_COLL_LEXEM_EOF   = 0,  /* End of the rule text           */
  MY_COLL_LEXEM_DIFF  = 1,  /* '<', '<<' or '<<<'             */
  MY_COLL_LEXEM_SHIFT = 4,  /* '&' - reset at the next letter */
  MY_COLL_LEXEM_CHAR  = 5,  /* A-Z, a-z or \uXXXX             */
  MY_COLL_LEXEM_ERROR = 6   /* Input not recognized           */
} my_coll_lexem_num;
/*
  Scanner state for a collation customization expression.
*/
typedef struct my_coll_lexem_st
{
  const char *beg;   /* Read cursor: next byte to be scanned              */
  const char *end;   /* One past the last byte of the expression          */
  const char *prev;  /* Position last examined; used as error context     */
  int diff;          /* Level of the last DIFF lexem: 1, 2 or 3           */
  int code;          /* Character code of the last CHAR lexem             */
} MY_COLL_LEXEM;
/*
  Initialize the collation rule lexical analyzer.

  SYNOPSIS
    my_coll_lexem_init
    lexem   Lexical analyzer state to initialize
    str     First byte of the expression to scan
    strend  One past the last byte of the expression

  RETURN VALUES
    N/A
*/
static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
                               const char *str, const char *strend)
{
  /* No lexem has been produced yet */
  lexem->code= 0;
  lexem->diff= 0;

  /* The read cursor and the error-context pointer both start at the text */
  lexem->end=  strend;
  lexem->prev= str;
  lexem->beg=  str;
}
/*
  Print a collation customization parse error together with the
  unparsed input that follows the failure point.

  SYNOPSIS
    my_coll_lexem_print_error
    lexem    Lexical analyzer to take context from (lexem->prev .. lexem->end)
    errstr   Buffer to write the error message to
    errsize  Size of errstr in bytes
    txt      Error message

  USAGE
    The result has the form "<txt> at '<context>'", where the context is
    at most the first 29 bytes starting at lexem->prev.
    Nothing is written when errsize is 0; when errsize is 1 only the
    terminating zero is stored.

  RETURN VALUES
    N/A
*/
static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
                                      char *errstr, size_t errsize,
                                      const char *txt)
{
  char tail[30];
  size_t len;

  /* With no room at all, errsize-1 would wrap around to a huge index */
  if (errsize == 0)
    return;

  errstr[errsize-1]= '\0';

  /* Only the terminator fits: do not pass a zero length to my_snprintf */
  if (errsize == 1)
    return;

  len= lexem->end - lexem->prev;
  strmake(tail, lexem->prev, min(len, sizeof(tail)-1));
  my_snprintf(errstr, errsize-1, "%s at '%s'", txt, tail);
}
/*
  Convert a hex digit into its numeric value.

  SYNOPSIS
    ch2x
    ch  hex digit to convert: '0'..'9', 'a'..'f' or 'A'..'F'

  RETURN VALUES
    an integer value in the range 0..15
    -1 if ch is not a hex digit
*/
static int ch2x(int ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';

  if (ch >= 'a' && ch <= 'f')
    return 10 + ch - 'a';

  /* Upper bound is 'F': letters 'G'..'Z' are not hex digits */
  if (ch >= 'A' && ch <= 'F')
    return 10 + ch - 'A';

  return -1;
}
/*
  Collation language lexical parser:
  scans the next lexem.

  SYNOPSIS
    my_coll_lexem_next
    lexem  Lexical analyzer, previously initialized by
           my_coll_lexem_init.

  USAGE
    Call this function in a loop until it returns MY_COLL_LEXEM_EOF
    or MY_COLL_LEXEM_ERROR.

    Space, tab, CR and LF are skipped.
    On MY_COLL_LEXEM_DIFF,  lexem->diff holds the level (1, 2 or 3).
    On MY_COLL_LEXEM_CHAR,  lexem->code holds the character code.
    On MY_COLL_LEXEM_ERROR, lexem->beg is left at the offending byte.
    lexem->prev always points at the position examined last, so it can
    be used as error context.

  RETURN VALUES
    Lexem number: eof, diff, shift, char or error.
*/
static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
{
  for ( ; lexem->beg < lexem->end ; lexem->beg++)
  {
    lexem->prev= lexem->beg;

    /* Delimiters are skipped */
    if (lexem->beg[0] == ' '  || lexem->beg[0] == '\t' ||
        lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
      continue;

    /* Shift command */
    if (lexem->beg[0] == '&')
    {
      lexem->beg++;
      return MY_COLL_LEXEM_SHIFT;
    }

    /* Diff command: count up to three consecutive '<' */
    if (lexem->beg[0] == '<')
    {
      lexem->beg++;
      lexem->diff= 1;
      while ((lexem->beg < lexem->end) &&
             (lexem->beg[0] == '<') &&
             (lexem->diff < 3))
      {
        lexem->beg++;
        lexem->diff++;
      }
      return MY_COLL_LEXEM_DIFF;
    }

    /* Plain Latin letter */
    if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
        (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
    {
      lexem->code= lexem->beg[0];
      lexem->beg++;
      return MY_COLL_LEXEM_CHAR;
    }

    /* Escaped character: \u followed by hex digits */
    if ((lexem->beg[0] == '\\') &&
        (lexem->beg+2 < lexem->end) &&
        (lexem->beg[1] == 'u'))
    {
      const char *digits= lexem->beg + 2;
      const char *p;
      unsigned int code= 0;
      int ch;

      /*
        Accumulate in an unsigned value: shifting a signed int past
        its range is undefined behaviour on long digit sequences.
      */
      for (p= digits ; (p < lexem->end) && ((ch= ch2x(p[0])) >= 0) ; p++)
        code= (code << 4) + (unsigned int) ch;

      /* "\u" without a single hex digit is not a character */
      if (p == digits)
        return MY_COLL_LEXEM_ERROR;

      lexem->code= (int) code;
      lexem->beg= p;
      return MY_COLL_LEXEM_CHAR;
    }

    return MY_COLL_LEXEM_ERROR;
  }
  return MY_COLL_LEXEM_EOF;
}
/*
  Collation rule item: one tailored character and its position
  relative to the character given by the preceding '&' command.
*/
typedef struct my_coll_rule_item_st
{
  uint base;     /* Base character (set by '&')                         */
  uint curr;     /* Current character (the one being tailored)          */
  int diff[3];   /* [0] primary, [1] secondary, [2] tertiary difference */
} MY_COLL_RULE;
/*
  Collation language syntax parser.
  Uses the lexical parser (my_coll_lexem_next).

  SYNOPSIS
    my_coll_rule_parse
    rule     Collation rule list to load to.
    mitems   Maximum number of items that fit into rule[].
    str      A string containing a collation language expression.
    strend   End of the string.
    errstr   Buffer that receives an error message; set to an empty
             string on entry.
    errsize  Size of errstr.

  USAGE
    The expression must start with '&'. Every character that follows a
    '<', '<<' or '<<<' produces one item in rule[]; a character that
    follows '&' only changes the base and resets the differences.

    NOTE(review): an expression that ends right after '&' or '<'
    (no character following) is not reported as an error.

  RETURN VALUES
    >= 0  Number of items stored in rule[]
    -1    ERROR (unknown character, syntax error or too many items);
          errstr describes the problem.
*/
static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
                              const char *str, const char *strend,
                              char *errstr, size_t errsize)
{
  /* What the parser is willing to accept next */
  enum { WANT_SHIFT= 0, WANT_SHIFT_OR_DIFF= 1, WANT_CHAR= 2 };

  MY_COLL_LEXEM lex;
  MY_COLL_RULE cur;
  my_coll_lexem_num tok;
  my_coll_lexem_num last_cmd= MY_COLL_LEXEM_ERROR;
  const char *problem;
  int expect= WANT_SHIFT;
  size_t count= 0;

  errstr[0]= '\0';
  bzero(&cur, sizeof(cur));
  my_coll_lexem_init(&lex, str, strend);

  for (;;)
  {
    tok= my_coll_lexem_next(&lex);
    if (tok == MY_COLL_LEXEM_EOF)
      break;

    if (tok == MY_COLL_LEXEM_ERROR)
    {
      problem= "Unknown character";
      goto fail;
    }

    if (expect == WANT_SHIFT)
    {
      /* Very first lexem: only '&' is allowed */
      if (tok != MY_COLL_LEXEM_SHIFT)
      {
        problem= "& expected";
        goto fail;
      }
      last_cmd= tok;
      expect= WANT_CHAR;
    }
    else if (expect == WANT_SHIFT_OR_DIFF)
    {
      /* After a character: a new command must follow */
      if (tok != MY_COLL_LEXEM_SHIFT && tok != MY_COLL_LEXEM_DIFF)
      {
        problem= "& or < expected";
        goto fail;
      }
      last_cmd= tok;
      expect= WANT_CHAR;
    }
    else if (expect == WANT_CHAR)
    {
      /* After a command: its character argument must follow */
      if (tok != MY_COLL_LEXEM_CHAR)
      {
        problem= "character expected";
        goto fail;
      }

      if (last_cmd == MY_COLL_LEXEM_SHIFT)
      {
        /* Reset: new base character, all differences start over */
        cur.base= lex.code;
        cur.diff[0]= cur.diff[1]= cur.diff[2]= 0;
      }
      else if (last_cmd == MY_COLL_LEXEM_DIFF)
      {
        cur.curr= lex.code;

        /* Bump the given level and clear all weaker levels */
        switch (lex.diff) {
        case 3:
          cur.diff[2]++;
          break;
        case 2:
          cur.diff[1]++;
          cur.diff[2]= 0;
          break;
        case 1:
          cur.diff[0]++;
          cur.diff[1]= cur.diff[2]= 0;
          break;
        default:
          break;
        }

        if (count >= mitems)
        {
          problem= "Too many rules";
          goto fail;
        }
        rule[count++]= cur;
      }
      else
      {
        problem= "Should never happen";
        goto fail;
      }
      expect= WANT_SHIFT_OR_DIFF;
    }
  }
  return (int) count;

fail:
  my_coll_lexem_print_error(&lex, errstr, errsize-1, problem);
  return -1;
}
typedef
struct
{
int
nchars
;
...
...
@@ -284,6 +622,144 @@ err:
}
#ifdef HAVE_CHARSET_ucs2
#define MY_MAX_COLL_RULE 64
/*
  This function copies a UCS2 collation from
  the default Unicode Collation Algorithm (UCA)
  weights applying tailorings, i.e. a set of
  alternative weights for some characters.

  The default UCA weights are stored in my_charset_ucs2_general_uca.
  They consist of 256 pages, 256 characters each
  (sort_order holds the per-page number of weights per character,
  sort_order_big holds the per-page weight arrays).

  If a page is not overwritten by tailoring rules,
  it is shared with the default UCA weights as is.

  If a page contains some overwritten characters, it is
  allocated. Untouched characters are copied from the
  default weights.

  SYNOPSIS
    ucs2_copy_data
    to    Collation to fill in
    from  Loaded collation description; from->sort_order holds the
          tailoring expression as a zero-terminated string

  NOTES
    Only the primary difference of a rule is applied to the weights.

  RETURN VALUES
    0 - OK
    1 - ERROR: out of memory, missing or empty tailoring expression,
        or a syntax error in it.
*/
static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
{
  MY_COLL_RULE rule[MY_MAX_COLL_RULE];
  char errstr[128];
  uchar *newlengths;
  uint16 **newweights;
  const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
  uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
  const char *rules;
  int rc, i;

  to->number= from->number ? from->number : to->number;

  if (from->csname)
    if (!(to->csname= my_once_strdup(from->csname, MYF(MY_WME))))
      goto err;

  if (from->name)
    if (!(to->name= my_once_strdup(from->name, MYF(MY_WME))))
      goto err;

  if (from->comment)
    if (!(to->comment= my_once_strdup(from->comment, MYF(MY_WME))))
      goto err;

  to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
  to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
  to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
  to->mbminlen= 2;
  to->mbmaxlen= 2;

  /*
    A collation without any tailoring text cannot be parsed:
    treat it like an empty expression instead of calling strlen(NULL).
  */
  if (!from->sort_order)
    goto err;
  rules= (const char *) from->sort_order;

  /* Parse ICU Collation Customization expression */
  if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
                              rules, rules + strlen(rules),
                              errstr, sizeof(errstr))) <= 0)
  {
    /*
      TODO: add error message reporting.
      printf("Error: %d '%s'\n", rc, errstr);
    */
    return 1;
  }

  if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*), MYF(MY_WME))))
    goto err;
  bzero(newweights, 256*sizeof(uint16*));

  if (!(newlengths= (uchar*) my_once_memdup(deflengths, 256, MYF(MY_WME))))
    goto err;

  /*
    Calculate maximum lengths for the pages
    which will be overwritten: a tailored page must be wide
    enough to hold the weights of every base character used in it.
  */
  for (i= 0; i < rc; i++)
  {
    uint pageb= (rule[i].base >> 8) & 0xFF;
    uint pagec= (rule[i].curr >> 8) & 0xFF;

    if (newlengths[pagec] < deflengths[pageb])
      newlengths[pagec]= deflengths[pageb];
  }

  for (i= 0; i < rc; i++)
  {
    uint pageb= (rule[i].base >> 8) & 0xFF;
    uint pagec= (rule[i].curr >> 8) & 0xFF;
    uint chb, chc;
    uint16 *slot;

    if (!newweights[pagec])
    {
      /* Alloc new page and copy the default UCA weights */
      uint size= 256*newlengths[pagec]*sizeof(uint16);

      if (!(newweights[pagec]= (uint16*) my_once_alloc(size, MYF(MY_WME))))
        goto err;
      bzero((void*) newweights[pagec], size);

      for (chc= 0; chc < 256; chc++)
      {
        memcpy(newweights[pagec] + chc*newlengths[pagec],
               defweights[pagec] + chc*deflengths[pagec],
               deflengths[pagec]*sizeof(uint16));
      }
    }

    /*
      Apply the alternative rule:
      shift to the base character and primary difference.
    */
    chc= rule[i].curr & 0xFF;
    chb= rule[i].base & 0xFF;
    slot= newweights[pagec] + chc*newlengths[pagec];

    /*
      Clear the whole slot first: the base character may have fewer
      weights than the slot is wide, and the remainder must be
      zero padding rather than the old weights of this character.
    */
    bzero((void*) slot, newlengths[pagec]*sizeof(uint16));
    memcpy(slot,
           defweights[pageb] + chb*deflengths[pageb],
           deflengths[pageb]*sizeof(uint16));

    /* Apply primary difference */
    slot[0]+= rule[i].diff[0];
  }

  /* Copy non-overwritten pages from the default UCA weights */
  for (i= 0; i < 256; i++)
    if (!newweights[i])
      newweights[i]= defweights[i];

  to->sort_order= newlengths;
  to->sort_order_big= newweights;

  return 0;

err:
  return 1;
}
#endif
static
my_bool
simple_cs_is_full
(
CHARSET_INFO
*
cs
)
{
return
((
cs
->
csname
&&
cs
->
tab_to_uni
&&
cs
->
ctype
&&
cs
->
to_upper
&&
...
...
@@ -315,14 +791,28 @@ static int add_collation(CHARSET_INFO *cs)
if
(
!
(
all_charsets
[
cs
->
number
]
->
state
&
MY_CS_COMPILED
))
{
simple_cs_init_functions
(
all_charsets
[
cs
->
number
]);
if
(
simple_cs_copy_data
(
all_charsets
[
cs
->
number
],
cs
))
return
MY_XML_ERROR
;
if
(
simple_cs_is_full
(
all_charsets
[
cs
->
number
]))
if
(
!
strcmp
(
cs
->
csname
,
"ucs2"
)
)
{
all_charsets
[
cs
->
number
]
->
state
|=
MY_CS_LOADED
;
#ifdef HAVE_CHARSET_ucs2
CHARSET_INFO
*
new
=
all_charsets
[
cs
->
number
];
new
->
cset
=
my_charset_ucs2_general_uca
.
cset
;
new
->
coll
=
my_charset_ucs2_general_uca
.
coll
;
if
(
ucs2_copy_data
(
new
,
cs
))
return
MY_XML_ERROR
;
new
->
state
|=
MY_CS_AVAILABLE
|
MY_CS_LOADED
;
#endif
}
else
{
simple_cs_init_functions
(
all_charsets
[
cs
->
number
]);
if
(
simple_cs_copy_data
(
all_charsets
[
cs
->
number
],
cs
))
return
MY_XML_ERROR
;
if
(
simple_cs_is_full
(
all_charsets
[
cs
->
number
]))
{
all_charsets
[
cs
->
number
]
->
state
|=
MY_CS_LOADED
;
}
all_charsets
[
cs
->
number
]
->
state
|=
MY_CS_AVAILABLE
;
}
all_charsets
[
cs
->
number
]
->
state
|=
MY_CS_AVAILABLE
;
}
else
{
...
...
strings/ctype.c
View file @
23aee562
...
...
@@ -22,6 +22,23 @@
#endif
/*
This file implements routines which parse XML-based
character set and collation description files.
Unicode collations are encoded according to
Unicode Technical Standard #35
Locale Data Markup Language (LDML)
http://www.unicode.org/reports/tr35/
and converted into ICU string according to
Collation Customization
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
*/
static
char
*
mstr
(
char
*
str
,
const
char
*
src
,
uint
l1
,
uint
l2
)
{
...
...
@@ -54,6 +71,11 @@ struct my_cs_file_section_st
#define _CS_PRIMARY_ID 15
#define _CS_BINARY_ID 16
#define _CS_CSDESCRIPT 17
#define _CS_RESET 18
#define _CS_DIFF1 19
#define _CS_DIFF2 20
#define _CS_DIFF3 21
static
struct
my_cs_file_section_st
sec
[]
=
{
...
...
@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] =
{
_CS_ORDER
,
"charsets.charset.collation.order"
},
{
_CS_FLAG
,
"charsets.charset.collation.flag"
},
{
_CS_COLLMAP
,
"charsets.charset.collation.map"
},
{
_CS_RESET
,
"charsets.charset.collation.rules.reset"
},
{
_CS_DIFF1
,
"charsets.charset.collation.rules.p"
},
{
_CS_DIFF2
,
"charsets.charset.collation.rules.s"
},
{
_CS_DIFF3
,
"charsets.charset.collation.rules.t"
},
{
0
,
NULL
}
};
...
...
@@ -109,6 +135,7 @@ typedef struct my_cs_file_info
uchar
sort_order
[
MY_CS_SORT_ORDER_TABLE_SIZE
];
uint16
tab_to_uni
[
MY_CS_TO_UNI_TABLE_SIZE
];
char
comment
[
MY_CS_CSDESCR_SIZE
];
size_t
sort_order_length
;
CHARSET_INFO
cs
;
int
(
*
add_collation
)(
CHARSET_INFO
*
cs
);
}
MY_CHARSET_LOADER
;
...
...
@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len)
struct
my_cs_file_section_st
*
s
=
cs_file_sec
(
attr
,
len
);
if
(
s
&&
(
s
->
state
==
_CS_CHARSET
))
{
bzero
(
&
i
->
cs
,
sizeof
(
i
->
cs
));
}
if
(
s
&&
(
s
->
state
==
_CS_COLLATION
))
i
->
sort_order_length
=
0
;
return
MY_XML_OK
;
}
...
...
@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len)
fill_uchar
(
i
->
ctype
,
MY_CS_CTYPE_TABLE_SIZE
,
attr
,
len
);
i
->
cs
.
ctype
=
i
->
ctype
;
break
;
case
_CS_RESET
:
case
_CS_DIFF1
:
case
_CS_DIFF2
:
case
_CS_DIFF3
:
{
/*
Convert collation description from
Locale Data Markup Language (LDML)
into ICU Collation Customization expression.
*/
char
arg
[
16
];
const
char
*
cmd
[]
=
{
"&"
,
"<"
,
"<<"
,
"<<<"
};
i
->
cs
.
sort_order
=
i
->
sort_order
;
mstr
(
arg
,
attr
,
len
,
sizeof
(
arg
)
-
1
);
if
(
i
->
sort_order_length
+
20
<
sizeof
(
i
->
sort_order
))
{
char
*
dst
=
i
->
sort_order_length
+
i
->
sort_order
;
i
->
sort_order_length
+=
sprintf
(
dst
,
" %s %s"
,
cmd
[
state
-
_CS_RESET
],
arg
);
}
}
}
return
MY_XML_OK
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment