Unicode collations: WL#916

XML and "collation customization" language parsers.
parent 5ddf741a
...@@ -21,6 +21,344 @@ ...@@ -21,6 +21,344 @@
#include <my_dir.h> #include <my_dir.h>
#include <my_xml.h> #include <my_xml.h>
/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
Collation language elements:
Delimiters:
space - skipped
<char> := A-Z | a-z | \uXXXX
Shift command:
<shift> := & - reset at this letter.
Diff command:
<d1> := < - Identifies a primary difference.
<d2> := << - Identifies a secondary difference.
<d3> := <<< - Identifies a tertiary difference.
Collation rules:
<ruleset> := <rule> { <ruleset> }
<rule> := <d1> <string>
| <d2> <string>
| <d3> <string>
| <shift> <char>
<string> := <char> [ <string> ]
An example, Polish collation:
&A < \u0105 <<< \u0104
&C < \u0107 <<< \u0106
&E < \u0119 <<< \u0118
&L < \u0142 <<< \u0141
&N < \u0144 <<< \u0143
&O < \u00F3 <<< \u00D3
&S < \u015B <<< \u015A
&Z < \u017A <<< \u0179 < \u017C <<< \u017B
*/
/*
  Kinds of lexems returned by the collation rule lexical analyzer.
  MY_COLL_LEXEM_EOF has to stay 0: the rule parser loops while the
  returned lexem number is non-zero.
*/
typedef enum my_coll_lexem_num_en
{
  MY_COLL_LEXEM_EOF=   0,   /* End of the rule string reached        */
  MY_COLL_LEXEM_DIFF=  1,   /* Difference command: <, << or <<<      */
  MY_COLL_LEXEM_SHIFT= 4,   /* Shift (reset) command: &              */
  MY_COLL_LEXEM_CHAR=  5,   /* Character: A-Z, a-z or \uXXXX         */
  MY_COLL_LEXEM_ERROR= 6    /* Input that is none of the above       */
} my_coll_lexem_num;
/*
  State of the collation rule lexical analyzer.
*/
typedef struct my_coll_lexem_st
{
  const char *beg;    /* Current scan position in the rule string      */
  const char *end;    /* End of the rule string                        */
  const char *prev;   /* Last position the scanner examined; error     */
                      /* messages quote the input starting from here   */
  int diff;           /* Level of the last difference command: 1..3    */
  int code;           /* Code of the last scanned character            */
} MY_COLL_LEXEM;
/*
Initialize collation rule lexical analyzer
SYNOPSIS
my_coll_lexem_init
lexem Lex analyzer to init
str Const string to parse
strend End of the string
USAGE
RETURN VALUES
N/A
*/
static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
                               const char *str, const char *strend)
{
  /* Nothing has been scanned yet: no difference level, no character */
  lexem->code= 0;
  lexem->diff= 0;

  /* Scanning and error context both start at the head of the string */
  lexem->prev= lexem->beg= str;
  lexem->end= strend;
}
/*
Print collation customization expression parse error, with context.
SYNOPSIS
my_coll_lexem_print_error
lexem Lex analyzer to take context from
errstr string to write error to
errsize errstr size
txt error message
USAGE
RETURN VALUES
N/A
*/
static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
                                      char *errstr, size_t errsize,
                                      const char *txt)
{
  char context[30];
  size_t remaining= lexem->end - lexem->prev;
  size_t shown= sizeof(context) - 1;

  /* Quote the unparsed input, cut down to what fits into context[] */
  if (remaining < shown)
    shown= remaining;
  /* NOTE(review): relies on strmake terminating context[] - confirm */
  strmake(context, lexem->prev, shown);

  /* Terminate the buffer explicitly before formatting the message */
  errstr[errsize-1]= '\0';
  my_snprintf(errstr, errsize-1, "%s at '%s'", txt, context);
}
/*
Convert a hex digit into its numeric value
SYNOPSIS
ch2x
ch hex digit to convert
USAGE
RETURN VALUES
an integer value in the range 0..15
-1 on error
*/
static int ch2x(int ch)
{
  /* Decimal digits map to 0..9 */
  if (ch >= '0' && ch <= '9')
    return ch - '0';

  /* Lower case hex letters map to 10..15 */
  if (ch >= 'a' && ch <= 'f')
    return 10 + ch - 'a';

  /*
    Upper case hex letters map to 10..15.
    The range must end at 'F': accepting 'G'..'Z' would produce
    values 16..35, which are outside of the documented 0..15 range.
  */
  if (ch >= 'A' && ch <= 'F')
    return 10 + ch - 'A';

  /* Not a hex digit */
  return -1;
}
/*
Collation language lexical parser:
Scans the next lexem.
SYNOPSIS
my_coll_lexem_next
lexem Lex analyzer, previously initialized by
my_coll_lexem_init.
USAGE
Call this function in a loop
RETURN VALUES
Lexem number: eof, diff, shift, char or error.
*/
static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
{
  for ( ; lexem->beg < lexem->end ; lexem->beg++)
  {
    /*
      Remember the position being examined. When a lexem is returned,
      prev points to its first character and is used as the context
      of error messages.
    */
    lexem->prev= lexem->beg;

    /* Delimiters are skipped */
    if (lexem->beg[0] == ' '  || lexem->beg[0] == '\t' ||
        lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
      continue;

    /* Shift command */
    if (lexem->beg[0] == '&')
    {
      lexem->beg++;
      return MY_COLL_LEXEM_SHIFT;
    }

    /*
      Difference command: count consecutive '<' characters, at most
      three. The count is left in lexem->diff.
    */
    if (lexem->beg[0] == '<')
    {
      for (lexem->beg++, lexem->diff= 1;
           (lexem->beg < lexem->end) &&
           (lexem->beg[0] == '<') && (lexem->diff < 3);
           lexem->beg++, lexem->diff++);
      return MY_COLL_LEXEM_DIFF;
    }

    /* A plain Latin letter stands for itself */
    if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
        (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
    {
      lexem->code= lexem->beg[0];
      lexem->beg++;
      return MY_COLL_LEXEM_CHAR;
    }

    /*
      Escape sequence: \u followed by hex digits.
      At least one hex digit is required. Without this check a "\u"
      followed by any other character would be silently accepted
      as the character with code 0; now it is reported as an error.
      The number of hex digits consumed is not limited here.
    */
    if ((lexem->beg[0] == '\\') &&
        (lexem->beg+2 < lexem->end) &&
        (lexem->beg[1] == 'u') &&
        (ch2x(lexem->beg[2]) >= 0))
    {
      int ch;

      lexem->code= 0;
      for (lexem->beg+= 2;
           (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0) ;
           lexem->beg++)
      {
        lexem->code= (lexem->code << 4) + ch;
      }
      return MY_COLL_LEXEM_CHAR;
    }

    /* Anything else is not a part of the collation language */
    return MY_COLL_LEXEM_ERROR;
  }
  return MY_COLL_LEXEM_EOF;
}
/*
Collation rule item
*/
typedef struct my_coll_rule_item_st
{
  uint base;      /* Character the rule was reset to with "&"        */
  uint curr;      /* Character that is given a new position          */
  int  diff[3];   /* Difference from base: [0] primary,              */
                  /* [1] secondary, [2] tertiary                     */
} MY_COLL_RULE;
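/*
Collation language syntax parser.
Uses lexical parser.
SYNOPSIS
my_coll_rule_parse
rule Collation rule list to load to.
mitems Maximum number of items the rule list can hold.
str A string containing collation language expression.
strend End of the string.
errstr Buffer to write an error message to.
errsize Size of the errstr buffer.
USAGE
RETURN VALUES
>= 0 - OK, the number of items loaded into the rule list.
-1 - ERROR, e.g. too many items. errstr contains the message.
*/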
static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
                              const char *str, const char *strend,
                              char *errstr, size_t errsize)
{
  /* Parser states: what kind of lexem has to come next */
  enum
  {
    EXPECT_SHIFT=   0,   /* Start of the expression: only "&" is valid */
    EXPECT_COMMAND= 1,   /* After a character: "&" or a difference     */
    EXPECT_CHAR=    2    /* After a command: its character argument    */
  };
  MY_COLL_LEXEM scanner;
  MY_COLL_RULE current;
  my_coll_lexem_num tok;
  my_coll_lexem_num command= MY_COLL_LEXEM_ERROR;  /* Last command seen */
  int state= EXPECT_SHIFT;
  size_t count= 0;

  errstr[0]= '\0';
  bzero(&current, sizeof(current));
  my_coll_lexem_init(&scanner, str, strend);

  while ((tok= my_coll_lexem_next(&scanner)) != MY_COLL_LEXEM_EOF)
  {
    if (tok == MY_COLL_LEXEM_ERROR)
    {
      my_coll_lexem_print_error(&scanner,errstr,errsize-1,"Unknown character");
      return -1;
    }

    if (state == EXPECT_SHIFT)
    {
      if (tok != MY_COLL_LEXEM_SHIFT)
      {
        my_coll_lexem_print_error(&scanner,errstr,errsize-1,"& expected");
        return -1;
      }
      command= tok;
      state= EXPECT_CHAR;
    }
    else if (state == EXPECT_COMMAND)
    {
      if (tok != MY_COLL_LEXEM_SHIFT && tok != MY_COLL_LEXEM_DIFF)
      {
        my_coll_lexem_print_error(&scanner,errstr,errsize-1,"& or < expected");
        return -1;
      }
      command= tok;
      state= EXPECT_CHAR;
    }
    else if (state == EXPECT_CHAR)
    {
      if (tok != MY_COLL_LEXEM_CHAR)
      {
        my_coll_lexem_print_error(&scanner,errstr,errsize-1,"character expected");
        return -1;
      }

      if (command == MY_COLL_LEXEM_SHIFT)
      {
        /* Reset: following differences are counted from this character */
        current.base= scanner.code;
        current.diff[0]= current.diff[1]= current.diff[2]= 0;
      }
      else if (command == MY_COLL_LEXEM_DIFF)
      {
        current.curr= scanner.code;
        /*
          Increment the counter of the given level and
          restart the counters of all weaker levels.
        */
        switch (scanner.diff) {
        case 3:
          current.diff[2]++;
          break;
        case 2:
          current.diff[1]++;
          current.diff[2]= 0;
          break;
        case 1:
          current.diff[0]++;
          current.diff[1]= 0;
          current.diff[2]= 0;
          break;
        default:
          break;
        }
        if (count >= mitems)
        {
          my_coll_lexem_print_error(&scanner,errstr,errsize-1,"Too many rules");
          return -1;
        }
        /* Every difference command produces one item in the rule list */
        rule[count++]= current;
      }
      else
      {
        my_coll_lexem_print_error(&scanner,errstr,errsize-1,"Should never happen");
        return -1;
      }
      state= EXPECT_COMMAND;
    }
  }
  return (int) count;
}
typedef struct typedef struct
{ {
int nchars; int nchars;
...@@ -284,6 +622,144 @@ err: ...@@ -284,6 +622,144 @@ err:
} }
#ifdef HAVE_CHARSET_ucs2
/* Size of the rule array which ucs2_copy_data passes to my_coll_rule_parse */
#define MY_MAX_COLL_RULE 64
/*
This function copies an UCS2 collation from
the default Unicode Collation Algorithm (UCA)
weights applying tailorings, i.e. a set of
alternative weights for some characters.
The default UCA weights are stored in my_charset_ucs2_general_uca.
They consist of 256 pages, 256 character each.
If a page is not overwritten by tailoring rules,
it is copied as is from UCA.
If a page contains some overwritten characters, it is
allocated. Untouched characters are copied from the
default weights.
*/
static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
{
/*
Fills "to" with a tailored copy of the default UCA collation.
"from" supplies the number, the names, the comment and,
in sort_order, the text of the tailoring rules.
Returns 0 on success and 1 on any failure.
*/
MY_COLL_RULE rule[MY_MAX_COLL_RULE];
char errstr[128];
uchar *newlengths;
uint16 **newweights;
/* Default UCA data: weights per character for every page, and the pages */
const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
int rc, i;
/* The number is taken from "from" only when it is set there */
to->number= from->number ? from->number : to->number;
if (from->csname)
if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
goto err;
if (from->name)
if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
goto err;
if (from->comment)
if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
goto err;
/* The remaining properties are inherited from the default UCA collation */
to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
to->mbminlen= 2;
to->mbmaxlen= 2;
/*
Parse ICU Collation Customization expression.
A parse error as well as an expression with no rules (rc == 0)
is treated as a failure.
NOTE(review): from->sort_order is passed to strlen without
a NULL check - confirm that callers always set it.
*/
if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
from->sort_order,
from->sort_order + strlen(from->sort_order),
errstr, sizeof(errstr))) <= 0)
{
/*
TODO: add error message reporting.
printf("Error: %d '%s'\n", rc, errstr);
*/
return 1;
}
/* Page table of the new collation: all 256 entries start empty */
if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
goto err;
bzero(newweights, 256*sizeof(uint16*));
/* Per-page lengths start as a copy of the default ones */
if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
goto err;
/*
Calculate maximum lengths for the pages
which will be overwritten: the page of the tailored
character must have room for the weights of the base character.
*/
for (i=0; i < rc; i++)
{
uint pageb= (rule[i].base >> 8) & 0xFF;
uint pagec= (rule[i].curr >> 8) & 0xFF;
if (newlengths[pagec] < deflengths[pageb])
newlengths[pagec]= deflengths[pageb];
}
/* Apply the rules one by one, allocating a page on its first use */
for (i=0; i < rc; i++)
{
uint pageb= (rule[i].base >> 8) & 0xFF;
uint pagec= (rule[i].curr >> 8) & 0xFF;
uint chb, chc;
if (!newweights[pagec])
{
/* Alloc new page and copy the default UCA weights */
uint size= 256*newlengths[pagec]*sizeof(uint16);
if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
goto err;
bzero((void*) newweights[pagec], size);
/*
Copy character by character: the new page may have more
weights per character than the default page.
NOTE(review): defweights[pagec] is used without a NULL check -
confirm that every default page is present.
*/
for (chc=0 ; chc < 256; chc++)
{
memcpy(newweights[pagec] + chc*newlengths[pagec],
defweights[pagec] + chc*deflengths[pagec],
deflengths[pagec]*sizeof(uint16));
}
}
/*
Apply the alternative rule:
shift to the base character and primary difference.
NOTE(review): only deflengths[pageb] weights are overwritten; if the
page of the tailored character has more weights per character,
the remaining ones keep their previous values - confirm intended.
*/
chc= rule[i].curr & 0xFF;
chb= rule[i].base & 0xFF;
memcpy(newweights[pagec] + chc*newlengths[pagec],
defweights[pageb] + chb*deflengths[pageb],
deflengths[pageb]*sizeof(uint16));
/*
Apply primary difference.
Secondary and tertiary differences (diff[1], diff[2])
are not applied here.
*/
newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
}
/* Copy non-overwritten pages from the default UCA weights */
for (i= 0; i < 256 ; i++)
if (!newweights[i])
newweights[i]= defweights[i];
to->sort_order= newlengths;
to->sort_order_big= newweights;
return 0;
err:
return 1;
}
#endif
static my_bool simple_cs_is_full(CHARSET_INFO *cs) static my_bool simple_cs_is_full(CHARSET_INFO *cs)
{ {
return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper && return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
...@@ -314,6 +790,19 @@ static int add_collation(CHARSET_INFO *cs) ...@@ -314,6 +790,19 @@ static int add_collation(CHARSET_INFO *cs)
all_charsets[cs->number]->state|= cs->state; all_charsets[cs->number]->state|= cs->state;
if (!(all_charsets[cs->number]->state & MY_CS_COMPILED)) if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
{
if (!strcmp(cs->csname,"ucs2") )
{
#ifdef HAVE_CHARSET_ucs2
CHARSET_INFO *new= all_charsets[cs->number];
new->cset= my_charset_ucs2_general_uca.cset;
new->coll= my_charset_ucs2_general_uca.coll;
if (ucs2_copy_data(new, cs))
return MY_XML_ERROR;
new->state |= MY_CS_AVAILABLE | MY_CS_LOADED;
#endif
}
else
{ {
simple_cs_init_functions(all_charsets[cs->number]); simple_cs_init_functions(all_charsets[cs->number]);
if (simple_cs_copy_data(all_charsets[cs->number],cs)) if (simple_cs_copy_data(all_charsets[cs->number],cs))
...@@ -324,6 +813,7 @@ static int add_collation(CHARSET_INFO *cs) ...@@ -324,6 +813,7 @@ static int add_collation(CHARSET_INFO *cs)
} }
all_charsets[cs->number]->state|= MY_CS_AVAILABLE; all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
} }
}
else else
{ {
/* /*
......
...@@ -22,6 +22,23 @@ ...@@ -22,6 +22,23 @@
#endif #endif
/*
This file implements routines which parse XML based
character set and collation description files.
Unicode collations are encoded according to
Unicode Technical Standard #35
Locale Data Markup Language (LDML)
http://www.unicode.org/reports/tr35/
and converted into ICU string according to
Collation Customization
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
*/
static char *mstr(char *str,const char *src,uint l1,uint l2) static char *mstr(char *str,const char *src,uint l1,uint l2)
{ {
...@@ -54,6 +71,11 @@ struct my_cs_file_section_st ...@@ -54,6 +71,11 @@ struct my_cs_file_section_st
#define _CS_PRIMARY_ID 15 #define _CS_PRIMARY_ID 15
#define _CS_BINARY_ID 16 #define _CS_BINARY_ID 16
#define _CS_CSDESCRIPT 17 #define _CS_CSDESCRIPT 17
/*
Sections of collation tailoring rules: reset, primary,
secondary and tertiary difference.
The four values must stay consecutive, starting at _CS_RESET:
the value handler uses (state - _CS_RESET) as an index
into its table of command strings.
*/
#define _CS_RESET 18
#define _CS_DIFF1 19
#define _CS_DIFF2 20
#define _CS_DIFF3 21
static struct my_cs_file_section_st sec[] = static struct my_cs_file_section_st sec[] =
{ {
...@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] = ...@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] =
{_CS_ORDER, "charsets.charset.collation.order"}, {_CS_ORDER, "charsets.charset.collation.order"},
{_CS_FLAG, "charsets.charset.collation.flag"}, {_CS_FLAG, "charsets.charset.collation.flag"},
{_CS_COLLMAP, "charsets.charset.collation.map"}, {_CS_COLLMAP, "charsets.charset.collation.map"},
{_CS_RESET, "charsets.charset.collation.rules.reset"},
{_CS_DIFF1, "charsets.charset.collation.rules.p"},
{_CS_DIFF2, "charsets.charset.collation.rules.s"},
{_CS_DIFF3, "charsets.charset.collation.rules.t"},
{0, NULL} {0, NULL}
}; };
...@@ -109,6 +135,7 @@ typedef struct my_cs_file_info ...@@ -109,6 +135,7 @@ typedef struct my_cs_file_info
uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE]; uchar sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE]; uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
char comment[MY_CS_CSDESCR_SIZE]; char comment[MY_CS_CSDESCR_SIZE];
size_t sort_order_length;
CHARSET_INFO cs; CHARSET_INFO cs;
int (*add_collation)(CHARSET_INFO *cs); int (*add_collation)(CHARSET_INFO *cs);
} MY_CHARSET_LOADER; } MY_CHARSET_LOADER;
...@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len) ...@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len)
struct my_cs_file_section_st *s= cs_file_sec(attr,len); struct my_cs_file_section_st *s= cs_file_sec(attr,len);
if ( s && (s->state == _CS_CHARSET)) if ( s && (s->state == _CS_CHARSET))
{
bzero(&i->cs,sizeof(i->cs)); bzero(&i->cs,sizeof(i->cs));
}
if (s && (s->state == _CS_COLLATION))
i->sort_order_length= 0;
return MY_XML_OK; return MY_XML_OK;
} }
...@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len) ...@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len)
fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len); fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
i->cs.ctype=i->ctype; i->cs.ctype=i->ctype;
break; break;
case _CS_RESET:
case _CS_DIFF1:
case _CS_DIFF2:
case _CS_DIFF3:
{
/*
Convert collation description from
Locale Data Markup Language (LDML)
into ICU Collation Customization expression.
*/
char arg[16];
const char *cmd[]= {"&","<","<<","<<<"};
i->cs.sort_order= i->sort_order;
mstr(arg,attr,len,sizeof(arg)-1);
if (i->sort_order_length + 20 < sizeof(i->sort_order))
{
char *dst= i->sort_order_length + i->sort_order;
i->sort_order_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg);
}
}
} }
return MY_XML_OK; return MY_XML_OK;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment