Commit bb84f61a authored by Alexander Barkov's avatar Alexander Barkov Committed by Oleksandr Byelkin

MDEV-27009 Add UCA-14.0.0 collations - adding uca-dump into build targets

- Adding uca-dump into build targets
- Adding ctype-uca.h and moving implicit weight related routines there
- Reusing implicit weight routines in ctype-uca.c and uca-dump.c
- Adding handling of command line arguments to uca-dump
- Fixing some compile-time warnings in uca-dump.c
parent 45e0373a
......@@ -24,6 +24,7 @@ usr/lib/x86_64-linux-gnu/libidbboot.a # ColumnStore header file
usr/lib/x86_64-linux-gnu/libprocessor.a # ColumnStore header file
usr/lib/x86_64-linux-gnu/libwe_xml.a # ColumnStore header file
usr/bin/test-connect-t
usr/bin/uca-dump
usr/bin/wsrep_sst_backup
usr/lib/mysql/plugin/type_test.so
usr/lib/sysusers.d/mariadb.conf # Not used (yet) in Debian systemd
......
......@@ -37,3 +37,5 @@ MAYBE_DISABLE_IPO(strings)
ADD_EXECUTABLE(conf_to_src EXCLUDE_FROM_ALL conf_to_src.c)
SET_TARGET_PROPERTIES(conf_to_src PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
TARGET_LINK_LIBRARIES(conf_to_src mysys strings)
ADD_EXECUTABLE(uca-dump uca-dump.c)
......@@ -34,6 +34,7 @@
#include "strings_def.h"
#include <m_ctype.h>
#include "ctype-uca.h"
typedef struct
{
......@@ -31689,62 +31690,13 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
/****************************************************************/
/**
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
where:
AAAA= BASE + (CP >> 15);
BBBB= (CP & 0x7FFF) | 0x8000;
There are two weights in the primary level (AAAA followed by BBBB).
There is one weight on other levels:
- 0020 on the secondary level
- 0002 on the tertiary level
*/
/**
Return BASE for an implicit weight on the primary level
According to UCA, BASE is calculated as follows:
- FB40 for Unified_Ideograph=True AND
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FB80 for Unified_Ideograph=True AND NOT
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FBC0 for any other code point
TODO: it seems we're not handling BASE correctly:
- check what are those blocks
- there are more Unified Ideograph blocks in the latest Unicode versions
*/
static inline uint16
my_uca_implicit_weight_base(my_wc_t code)
{
if (code >= 0x3400 && code <= 0x4DB5)
return 0xFB80;
if (code >= 0x4E00 && code <= 0x9FA5)
return 0xFB40;
return 0xFBC0;
}
static inline void
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
{
switch (level) {
case 1: to[0]= 0x0020; to[1]= 0; break; /* Secondary level */
case 2: to[0]= 0x0002; to[1]= 0; break; /* Tertiary level */
case 3: to[0]= 0x0001; to[1]= 0; break; /* Quaternary level */
default:
DBUG_ASSERT(0);
case 0:
break;
}
/* Primary level */
to[0]= (uint16)(code >> 15) + my_uca_implicit_weight_base(code);
to[1]= (code & 0x7FFF) | 0x8000;
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_520_implicit_weight_on_level(code, level);
to[0]= weight.weight[0];
to[1]= weight.weight[1];
to[2]= 0;
}
......@@ -31766,10 +31718,11 @@ static inline int
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{
my_wc_t wc= (scanner->page << 8) + scanner->code;
scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
MY_UCA_IMPLICIT_WEIGHT weight= my_uca_520_implicit_weight_primary(wc);
scanner->implicit[0]= weight.weight[1]; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit;
return my_uca_implicit_weight_base(wc) + (wc >> 15);
return weight.weight[0]; /* The first weight */
}
#ifndef CTYPE_UCA_H
#define CTYPE_UCA_H
/* Copyright (c) 2021, MariaDB
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; version 2
of the License.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1335 USA */
/*
Implicit weight handling is done according to
the section "Computing Implicit Weights" in
https://unicode.org/reports/tr10/#Values_For_Base_Table
(as of Unicode 14.0.0)
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
- There are two primary weights, depending on the character type and block.
- There is one weight on the secondary and tertiary levels.
AAAA and BBBB are computed using different formulas for:
- Siniform ideographic scripts
- Han
- Unassigned characters
*/
typedef struct my_uca_implict_weight_t
{
uint16 weight[2];
} MY_UCA_IMPLICIT_WEIGHT;
/*
By default, implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
where AAAA and BBBB are :
AAAA= BASE + (CP >> 15);
BBBB= (CP & 0x7FFF) | 0x8000;
This formula covers the following implicit weight subtypes:
- Core Han Unified Ideographs
- All other Han Unified Ideographs
- Unassigned characters
Every mentioned subtype passes a different BASE.
This formula does not cover Siniform ideographic scripts.
They are handled by separate functions.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_primary_default(uint16 base, my_wc_t code)
{
MY_UCA_IMPLICIT_WEIGHT res;
res.weight[0]= (uint16) ((code >> 15) + base);
res.weight[1]= (uint16) ((code & 0x7FFF)|0x8000);
return res;
}
/**
Calculate Unicode-5.2.0 implicit weight on the primary level.
According to UCA, BASE is calculated as follows:
- FB40 for Unified_Ideograph=True AND
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FB80 for Unified_Ideograph=True AND NOT
((Block=CJK_Unified_Ideograph) OR
(Block=CJK_Compatibility_Ideographs))
- FBC0 for any other code point
But for Unicode-5.2.0 and Unicode-4.0.0 we used
a simplified formula as implemented before.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_520_implicit_weight_primary(my_wc_t code)
{
uint16 base;
/*
3400;<CJK Ideograph Extension A, First>
4DB5;<CJK Ideograph Extension A, Last>
4E00;<CJK Ideograph, First>
9FA5;<CJK Ideograph, Last>
*/
if (code >= 0x3400 && code <= 0x4DB5)
base= 0xFB80;
else if (code >= 0x4E00 && code <= 0x9FA5)
base= 0xFB40;
else
base= 0xFBC0;
return my_uca_implicit_weight_primary_default(base, code);
}
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_secondary()
{
MY_UCA_IMPLICIT_WEIGHT res;
res.weight[0]= 0x0020;
res.weight[1]= 0;
return res;
}
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_tertiary()
{
MY_UCA_IMPLICIT_WEIGHT res;
res.weight[0]= 0x0002;
res.weight[1]= 0;
return res;
}
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_quaternary()
{
MY_UCA_IMPLICIT_WEIGHT res;
res.weight[0]= 0x0001;
res.weight[1]= 0;
return res;
}
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_520_implicit_weight_on_level(my_wc_t code, uint level)
{
switch (level) {
case 0:
return my_uca_520_implicit_weight_primary(code);
case 1:
return my_uca_implicit_weight_secondary();
case 2:
return my_uca_implicit_weight_tertiary();
default:
break;
}
return my_uca_implicit_weight_quaternary();
}
#endif /* CTYPE_UCA_H */
......@@ -20,13 +20,14 @@
#include <stdlib.h>
#include <string.h>
typedef unsigned char uchar;
typedef unsigned short uint16;
#include "my_global.h"
#include "m_ctype.h"
#include "ctype-uca.h"
struct uca_item_st
{
uchar num;
uint16 weight[4][9];
uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE];
};
#if 0
......@@ -43,35 +44,139 @@ struct uca_item_st
#define MAX_ALLOWED_CODE 0x10FFFF
/* Name that goes into all array names */
static const char *global_name_prefix= "uca520";
typedef struct opt_st
{
const char *name_prefix; /* Name that goes into all array names */
const char *filename; /* The filename or "-" for stdin */
uint levels; /* The number of levels to dump */
} OPT;
static OPT defaults=
{
"uca",
"-",
3
};
/* Name prefix that goes into page weight array names after global_name_prefix */
static char *pname_prefix[]= {"_p", "_p", "_p"};
static const char *pname_prefix[]= {"_p", "_p", "_p"};
/* Name suffix that goes into page weight array names after page number */
static char *pname_suffix[]= {"", "_w2", "_w3"};
static const char *pname_suffix[]= {"", "_w2", "_w3"};
void usage(const char *prog)
{
printf("Usage:\n");
printf("%s [options] filename\n", prog);
}
static inline int lstrncmp(const char *str, const LEX_CSTRING lstr)
{
return strncmp(lstr.str, str, lstr.length);
}
int process_option(OPT *options, const char *opt)
{
static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")};
static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")};
if (!lstrncmp(opt, opt_name_prefix))
{
options->name_prefix= opt + opt_name_prefix.length;
return 0;
}
if (!lstrncmp(opt, opt_levels))
{
options->levels= (uint) strtoul(opt + opt_levels.length, NULL, 10);
if (options->levels < 1 || options->levels > 3)
{
printf("Bad --levels value\n");
return 1;
}
return 0;
}
printf("Unknown option: %s\n", opt);
return 1;
}
int process_options(OPT *options, int ac, char **av)
{
int i;
for (i= 1; i < ac; i++)
{
if (!strncmp(av[i], "--", 2))
{
if (process_option(options, av[i]))
return 1;
}
else
{
if (i + 1 != ac)
{
usage(av[0]);
return 1;
}
options->filename= av[i];
return 0;
}
}
usage(av[0]);
return 1;
}
FILE *open_file(const char *name)
{
if (!strcmp(name, "-"))
return stdin;
return fopen(name, "r");
}
void close_file(FILE *file)
{
if (file != stdin)
fclose(file);
}
int main(int ac, char **av)
{
char str[256];
char str[1024];
char *weights[64];
static struct uca_item_st uca[MAX_ALLOWED_CODE+1];
size_t code, w;
my_wc_t code;
uint w;
int pageloaded[MY_UCA_NPAGES];
FILE *file;
OPT options= defaults;
if (process_options(&options, ac, av))
return 1;
if (!(file= open_file(options.filename)))
{
printf("Could not open %s for reading\n", options.filename);
return 1;
}
bzero(uca, sizeof(uca));
bzero(pageloaded, sizeof(pageloaded));
while (fgets(str,sizeof(str),stdin))
while (fgets(str, sizeof(str), file))
{
char *comment;
char *weight;
char *s;
size_t codenum;
code= strtol(str,NULL,16);
code= (my_wc_t) strtol(str,NULL,16);
if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
continue;
......@@ -116,7 +221,9 @@ int main(int ac, char **av)
uca[code].num++;
}
for (w=0; w < uca[code].num; w++)
set_if_smaller(uca[code].num, MY_UCA_MAX_WEIGHT_SIZE-1);
for (w=0; w < uca[code].num ; w++)
{
size_t partnum;
......@@ -125,9 +232,8 @@ int main(int ac, char **av)
while (*s)
{
char *endptr;
size_t part;
part= strtol(s+1,&endptr,16);
uca[code].weight[partnum][w]= part;
uint part= (uint) strtoul(s + 1, &endptr, 16);
uca[code].weight[partnum][w]= (uint16) part;
s= endptr;
partnum++;
}
......@@ -135,45 +241,24 @@ int main(int ac, char **av)
/* Mark that a character from this page was loaded */
pageloaded[code >> MY_UCA_PSHIFT]++;
}
close_file(file);
/* Now set implicit weights */
for (code=0; code <= MAX_ALLOWED_CODE; code++)
{
size_t base, aaaa, bbbb;
uint level;
if (uca[code].num)
continue;
/*
3400;<CJK Ideograph Extension A, First>
4DB5;<CJK Ideograph Extension A, Last>
4E00;<CJK Ideograph, First>
9FA5;<CJK Ideograph, Last>
*/
if (code >= 0x3400 && code <= 0x4DB5)
base= 0xFB80;
else if (code >= 0x4E00 && code <= 0x9FA5)
base= 0xFB40;
else
base= 0xFBC0;
aaaa= base + (code >> 15);
bbbb= (code & 0x7FFF) | 0x8000;
uca[code].weight[0][0]= aaaa;
uca[code].weight[0][1]= bbbb;
uca[code].weight[1][0]= 0x0020;
uca[code].weight[1][1]= 0x0000;
uca[code].weight[2][0]= 0x0002;
uca[code].weight[2][1]= 0x0000;
uca[code].weight[3][0]= 0x0001;
uca[code].weight[3][2]= 0x0000;
for (level= 0; level < 4; level++)
{
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_520_implicit_weight_on_level(code, level);
uca[code].weight[level][0]= weight.weight[0];
uca[code].weight[level][1]= weight.weight[1];
}
uca[code].num= 2;
}
......@@ -184,7 +269,7 @@ int main(int ac, char **av)
printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK);
printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT);
for (w=0; w<3; w++)
for (w=0; w < options.levels; w++)
{
size_t page;
int pagemaxlen[MY_UCA_NPAGES];
......@@ -259,7 +344,7 @@ int main(int ac, char **av)
default: mchars= uca[code].num;
}
pagemaxlen[page]= maxnum;
pagemaxlen[page]= (int) maxnum;
/*
......@@ -268,12 +353,12 @@ int main(int ac, char **av)
printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n",
global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
(int) page*MY_UCA_NCHARS, (int) maxnum);
for (offs=0; offs < MY_UCA_NCHARS; offs++)
{
uint16 weight[8];
uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];
size_t num, i;
code= page*MY_UCA_NCHARS+offs;
......@@ -324,7 +409,7 @@ int main(int ac, char **av)
}
printf("const uchar %s_length%s[%d]={\n",
global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
for (page=0; page < MY_UCA_NPAGES; page++)
{
printf("%d%s%s",pagemaxlen[page],page<MY_UCA_NPAGES-1?",":"",(page+1) % 16 ? "":"\n");
......@@ -333,7 +418,7 @@ int main(int ac, char **av)
printf("static const uint16 *%s_weight%s[%d]={\n",
global_name_prefix, pname_suffix[w], MY_UCA_NPAGES);
options.name_prefix, pname_suffix[w], MY_UCA_NPAGES);
for (page=0; page < MY_UCA_NPAGES; page++)
{
const char *comma= page < MY_UCA_NPAGES-1 ? "," : "";
......@@ -342,7 +427,7 @@ int main(int ac, char **av)
printf("NULL %s%s%s", w ? " ": "", comma , nline);
else
printf("%s%s%03X%s%s%s",
global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w],
comma, nline);
}
printf("};\n");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment