Commit 0736c03d authored by Alexander Barkov, committed by Oleksandr Byelkin

MDEV-27009 Add UCA-14.0.0 collations - Adding implicit weight handling for Unicode-14.0.0

1. Adding separate functions for different Unicode versions
  - my_uca_520_implicit_weight_primary()
   It calculates implicit weights according to the old algorithm
   that we used to dump Unicode-5.2.0 weights.

  - my_uca_1400_implicit_weight_primary()
    It calculates implicit weights according to
    https://unicode.org/reports/tr10/#Values_For_Base_Table
    as of November 2021, Unicode version 14.0.0.

2. Adding the "@version" line recognition when dumping allkeys.txt.
   Implicit weights are dumped according to @version.

3. Dumping the scanned version as a "#define"

4. Removing the dumping of MY_UCA_NPAGES, MY_UCA_NCHARS, MY_UCA_CMASK and
   MY_UCA_PSHIFT, as they are defined in ctype-uca.c. Also removing the dumping
   of "main()", as it is not needed. The intent is to generate an *.h file which
   can be put directly into the MariaDB source tree.

5. Adding a structure MY_DUCET. It now contains weights for single
   characters and version related members. Later we'll add contractions
   and logical positions in here.
parent bb84f61a
......@@ -31694,7 +31694,7 @@ static inline void
my_uca_implicit_weight_put(uint16 *to, my_wc_t code, uint level)
{
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_520_implicit_weight_on_level(code, level);
weight= my_uca_implicit_weight_on_level(520, code, level);
to[0]= weight.weight[0];
to[1]= weight.weight[1];
to[2]= 0;
......@@ -17,6 +17,7 @@
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1335 USA */
/*
  Pack a three-component version x.y.z into a single unsigned number
  computed as x*100 + y*10 + z, e.g. (5,2,0) -> 520 and (14,0,0) -> 1400.
  Note that y or z values above 9 carry into the next decimal position.
*/
#define MY_UCA_VERSION_ID(x,y,z) ((uint) ((x) * 100 + (y) * 10 + (z)))
/*
Implicit weight handling is done according to
......@@ -105,6 +106,18 @@ my_uca_520_implicit_weight_primary(my_wc_t code)
}
#include "ctype-uca1400.h"
/*
  Return the primary implicit weight of a code point, using the rules
  that match the given UCA version id: ids below 1400 use the 5.2.0
  routine, ids from 1400 upwards use the 14.0.0 routine.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_primary(uint version, my_wc_t code)
{
  if (version < 1400)
    return my_uca_520_implicit_weight_primary(code);
  return my_uca_1400_implicit_weight_primary(code);
}
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_implicit_weight_secondary()
{
......@@ -136,11 +149,11 @@ my_uca_implicit_weight_quaternary()
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_520_implicit_weight_on_level(my_wc_t code, uint level)
my_uca_implicit_weight_on_level(uint version, my_wc_t code, uint level)
{
switch (level) {
case 0:
return my_uca_520_implicit_weight_primary(code);
return my_uca_implicit_weight_primary(version, code);
case 1:
return my_uca_implicit_weight_secondary();
case 2:
......
#ifndef CTYPE_UCA_1400_H
#define CTYPE_UCA_1400_H
/* Copyright (c) 2021, MariaDB
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; version 2
of the License.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1335 USA */
/*
  Test whether a code point falls into one of the Tangut blocks:

    17000..187FF; Tangut [6144]
    18800..18AFF; Tangut Components [768]
    18D00..18D7F; Tangut Supplement [128]

  NOTE(review): the whole block ranges are tested, not only the code
  points that are actually assigned inside them - confirm this is intended.
*/
static inline my_bool
my_uca_1400_is_assigned_tangut(my_wc_t code)
{
  if (code >= 0x17000 && code <= 0x187FF)
    return 1;                                   /* Tangut */
  if (code >= 0x18800 && code <= 0x18AFF)
    return 1;                                   /* Tangut Components */
  return code >= 0x18D00 && code <= 0x18D7F;    /* Tangut Supplement */
}
/*
  Build the primary implicit weight pair for a Tangut code point:
  the first weight is the fixed value 0xFB00, the second one is the
  offset of the code point from 0x17000 with the bit 0x8000 set.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_1400_implicit_weight_primary_tangut(my_wc_t code)
{
  MY_UCA_IMPLICIT_WEIGHT pair;
  const uint16 offset= (uint16) (code - 0x17000);
  pair.weight[0]= 0xFB00;
  pair.weight[1]= (uint16) (offset | 0x8000);
  return pair;
}
/*
  Test whether a code point falls into the Nushu block:

    1B170..1B2FF; Nushu [400]
*/
static inline my_bool
my_uca_1400_is_assigned_nushu(my_wc_t code)
{
  /* Inside the block means: neither below its start nor above its end */
  return !(code < 0x1B170 || code > 0x1B2FF);
}
/*
  Build the primary implicit weight pair for a Nushu code point:
  the first weight is the fixed value 0xFB01, the second one is the
  offset of the code point from 0x1B170 with the bit 0x8000 set.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_1400_implicit_weight_primary_nushu(my_wc_t code)
{
  MY_UCA_IMPLICIT_WEIGHT pair;
  const uint16 offset= (uint16) (code - 0x1B170);
  pair.weight[0]= 0xFB01;
  pair.weight[1]= (uint16) (offset | 0x8000);
  return pair;
}
/*
  Test whether a code point falls into the Khitan Small Script block:

    18B00..18CFF; Khitan Small Script [512]
*/
static inline my_bool
my_uca_1400_is_assigned_khitan_small_script(my_wc_t code)
{
  if (code < 0x18B00)
    return 0;
  return code <= 0x18CFF;
}
/*
  Build the primary implicit weight pair for a Khitan Small Script code
  point: the first weight is the fixed value 0xFB02, the second one is the
  offset of the code point from 0x18B00 with the bit 0x8000 set.
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_1400_implicit_weight_primary_khitan(my_wc_t code)
{
  MY_UCA_IMPLICIT_WEIGHT pair;
  const uint16 offset= (uint16) (code - 0x18B00);
  pair.weight[0]= 0xFB02;
  pair.weight[1]= (uint16) (offset | 0x8000);
  return pair;
}
/*
  Test for a "core" Han ideograph, i.e. a code point with
    Unified_Ideograph=True AND
    ((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs))

  https://www.unicode.org/Public/14.0.0/ucd/Blocks.txt
    4E00..9FFF; CJK Unified Ideographs
    F900..FAFF; CJK Compatibility Ideographs

  https://www.unicode.org/Public/14.0.0/ucd/PropList.txt
    4E00..9FFF ; Unified_Ideograph # Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
    FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F
    FA11       ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11
    FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14
    FA1F       ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA1F
    FA21       ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA21
    FA23..FA24 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPATIBILITY IDEOGRAPH-FA24
    FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPATIBILITY IDEOGRAPH-FA29
*/
static inline my_bool
my_uca_1400_is_core_han_unified_ideograph(my_wc_t code)
{
  /* The whole CJK Unified Ideographs block */
  if (code >= 0x4E00 && code <= 0x9FFF)
    return 1;

  /* The few unified ideographs inside CJK Compatibility Ideographs */
  switch (code) {
  case 0xFA0E: case 0xFA0F:
  case 0xFA11:
  case 0xFA13: case 0xFA14:
  case 0xFA1F:
  case 0xFA21:
  case 0xFA23: case 0xFA24:
  case 0xFA27: case 0xFA28: case 0xFA29:
    return 1;
  default:
    return 0;
  }
}
/*
  Test for a Han ideograph outside of the "core" set, i.e. a code point with
    Unified_Ideograph=True AND NOT
    ((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs))

  https://www.unicode.org/Public/14.0.0/ucd/Blocks.txt
    3400..4DBF;   CJK Unified Ideographs Extension A
    20000..2A6DF; CJK Unified Ideographs Extension B
    2A700..2B73F; CJK Unified Ideographs Extension C
    2B740..2B81F; CJK Unified Ideographs Extension D
    2B820..2CEAF; CJK Unified Ideographs Extension E
    2CEB0..2EBEF; CJK Unified Ideographs Extension F
    30000..3134F; CJK Unified Ideographs Extension G

  https://www.unicode.org/Public/14.0.0/ucd/PropList.txt
    3400..4DBF   ; Unified_Ideograph # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
    20000..2A6DF ; Unified_Ideograph # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
    2A700..2B738 ; Unified_Ideograph # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
    2B740..2B81D ; Unified_Ideograph # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
    2B820..2CEA1 ; Unified_Ideograph # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
    2CEB0..2EBE0 ; Unified_Ideograph # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
    30000..3134A ; Unified_Ideograph # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
*/
static inline my_bool
my_uca_1400_is_other_han_unified_ideograph(my_wc_t code)
{
  /* Upper bounds follow PropList.txt, not the (larger) block ends */
  if (code >= 0x3400 && code <= 0x4DBF)
    return 1;                                   /* Extension A */
  if (code >= 0x20000 && code <= 0x2A6DF)
    return 1;                                   /* Extension B */
  if (code >= 0x2A700 && code <= 0x2B738)
    return 1;                                   /* Extension C */
  if (code >= 0x2B740 && code <= 0x2B81D)
    return 1;                                   /* Extension D */
  if (code >= 0x2B820 && code <= 0x2CEA1)
    return 1;                                   /* Extension E */
  if (code >= 0x2CEB0 && code <= 0x2EBE0)
    return 1;                                   /* Extension F */
  return code >= 0x30000 && code <= 0x3134A;    /* Extension G */
}
/*
  Calculate the primary implicit weight of a code point using the
  Unicode-14.0.0 rules. See section "Computing Implicit Weights" in
  https://unicode.org/reports/tr10/#Values_For_Base_Table

  Tangut, Nushu and Khitan Small Script code points are handled by their
  dedicated routines. All other code points go through the default
  routine, which gets a base value depending on the class of the code point:
    0xFB40 - core Han unified ideographs
    0xFB80 - other Han unified ideographs
    0xFBC0 - any other code point (unassigned)
*/
static inline MY_UCA_IMPLICIT_WEIGHT
my_uca_1400_implicit_weight_primary(my_wc_t code)
{
  if (my_uca_1400_is_assigned_tangut(code))
    return my_uca_1400_implicit_weight_primary_tangut(code);

  if (my_uca_1400_is_assigned_nushu(code))
    return my_uca_1400_implicit_weight_primary_nushu(code);

  if (my_uca_1400_is_assigned_khitan_small_script(code))
    return my_uca_1400_implicit_weight_primary_khitan(code);

  return my_uca_implicit_weight_primary_default(
           my_uca_1400_is_core_han_unified_ideograph(code)  ? 0xFB40 :
           my_uca_1400_is_other_han_unified_ideograph(code) ? 0xFB80 :
                                                              0xFBC0,
           code);
}
#endif /* CTYPE_UCA_1400_H */
......@@ -24,12 +24,6 @@
#include "m_ctype.h"
#include "ctype-uca.h"
struct uca_item_st
{
uchar num;
uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE];
};
#if 0
#define MY_UCA_NPAGES 1024
#define MY_UCA_NCHARS 64
......@@ -61,6 +55,27 @@ static OPT defaults=
};
/*
  Weights of one DUCET entry.
  weight[level][i] is the i-th weight on the given level (four levels),
  weight_length is the number of weights stored per level.
*/
typedef struct my_ducet_weight_st
{
uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE];
size_t weight_length;
} MY_DUCET_WEIGHT;
/*
  DUCET data of a single character. At the moment it only carries
  the weights of the character.
*/
typedef struct my_ducet_single_char_t
{
MY_DUCET_WEIGHT weight;
} MY_DUCET_SINGLE_CHAR;
/*
  Everything that was loaded from allkeys.txt:
  - single_chars: weights of single characters, indexed by code point
  - version:      numeric version id, see MY_UCA_VERSION_ID()
  - version_str:  the version as text, taken from the "@version" line
*/
typedef struct my_allkeys_st
{
MY_DUCET_SINGLE_CHAR single_chars[MAX_ALLOWED_CODE+1];
uint version;
char version_str[32];
} MY_DUCET;
/* Name prefix that goes into page weight array names after global_name_prefix */
static const char *pname_prefix[]= {"_p", "_p", "_p"};
......@@ -146,11 +161,80 @@ void close_file(FILE *file)
}
/*
  Remove trailing '\r', '\n', space and tab characters from a string.
  The string is modified in place: every trimmed character is
  overwritten with '\0'. Returns its argument.
*/
char *strrtrim(char *str)
{
  size_t len= strlen(str);
  while (len > 0)
  {
    const char last= str[len - 1];
    if (last != '\r' && last != '\n' && last != ' ' && last != '\t')
      break;
    str[--len]= '\0';
  }
  return str;
}
/*
  Parse a line starting with '@'.

  As of 14.0.0, allkeys.txt has @version and @implicitweights lines.
  Only @version is parsed here. Any other '@' line is accepted and ignored.

  It could also be possible to parse @implicitweights to automatically
  generate routines responsible for implicit weight handling for Siniform
  ideographic scripts (Tangut, Nushu, Khitan). But as there are only a few
  of them at the moment, it was easier to write these routines manually
  in ctype-uca1400.h (included from ctype-uca.h).
  So @implicitweights lines are ignored here.

  For a @version line:
  - ducet->version_str gets the text following "@version ",
    truncated if it does not fit.
  - ducet->version gets MY_UCA_VERSION_ID() of up to three dot-separated
    non-negative decimal components. Missing trailing components count as 0.

  Returns FALSE on success, or if the line is not a @version line.
  Returns TRUE if the version number is malformed. In this case
  ducet->version is left untouched.
*/
my_bool parse_at_line(MY_DUCET *ducet, const char *str)
{
  static const LEX_CSTRING version= {STRING_WITH_LEN("@version ")};
  const char *src;
  long n[3]= {0, 0, 0};
  uint pos;

  if (strncmp(str, version.str, version.length))
    return FALSE; /* Not a @version line */

  /*
    Examples:
      @version 4.0.0
      @version 5.2.0
      @version 14.0.0
  */
  src= str + version.length;

  /*
    snprintf() always terminates the result within the given size.
    Its return value is the length of the untruncated text, so it must
    not be used as an index into version_str.
  */
  (void) snprintf(ducet->version_str, sizeof(ducet->version_str), "%s", src);

  for (pos= 0 ; pos < 3; pos++)
  {
    char *endptr;
    n[pos]= strtol(src, &endptr, 10);
    if (endptr == src || n[pos] < 0)
      return TRUE; /* No digits at all, or a negative component */
    if (*endptr == '.')
    {
      src= endptr + 1; /* Step over the dot to the next component */
      continue;
    }
    if (*endptr != '\r' && *endptr != '\n' && *endptr != 0)
      return TRUE; /* Unexpected character after the number */
    break; /* End of the line: never step over the terminating zero */
  }
  ducet->version= MY_UCA_VERSION_ID(n[0], n[1], n[2]);
  return FALSE;
}
/*
  Print the scanned allkeys.txt version as a "#define" line of the form
    #define <name_prefix>_version <numeric id> / * <version text> * /
  surrounded by empty lines.
*/
static void
print_version(const MY_DUCET *ducet, const OPT *opt)
{
  printf("\n");
  /* ducet->version is a uint, so it is printed with %u rather than %d */
  printf("#define %s_version %u /* %s */\n",
         opt->name_prefix, ducet->version, ducet->version_str);
  printf("\n");
}
int main(int ac, char **av)
{
char str[1024];
char *weights[64];
static struct uca_item_st uca[MAX_ALLOWED_CODE+1];
static MY_DUCET ducet;
my_wc_t code;
uint w;
int pageloaded[MY_UCA_NPAGES];
......@@ -166,7 +250,7 @@ int main(int ac, char **av)
return 1;
}
bzero(uca, sizeof(uca));
bzero(&ducet, sizeof(ducet));
bzero(pageloaded, sizeof(pageloaded));
while (fgets(str, sizeof(str), file))
......@@ -176,6 +260,12 @@ int main(int ac, char **av)
char *s;
size_t codenum;
if (str[0] == '@')
{
parse_at_line(&ducet, strrtrim(str));
continue;
}
code= (my_wc_t) strtol(str,NULL,16);
if (str[0]=='#' || (code > MAX_ALLOWED_CODE))
......@@ -212,18 +302,18 @@ int main(int ac, char **av)
continue;
}
uca[code].num= 0;
ducet.single_chars[code].weight.weight_length= 0;
s= strtok(weight, " []");
while (s)
{
weights[uca[code].num]= s;
weights[ducet.single_chars[code].weight.weight_length]= s;
s= strtok(NULL, " []");
uca[code].num++;
ducet.single_chars[code].weight.weight_length++;
}
set_if_smaller(uca[code].num, MY_UCA_MAX_WEIGHT_SIZE-1);
set_if_smaller(ducet.single_chars[code].weight.weight_length, MY_UCA_MAX_WEIGHT_SIZE-1);
for (w=0; w < uca[code].num ; w++)
for (w=0; w < ducet.single_chars[code].weight.weight_length ; w++)
{
size_t partnum;
......@@ -233,7 +323,7 @@ int main(int ac, char **av)
{
char *endptr;
uint part= (uint) strtoul(s + 1, &endptr, 16);
uca[code].weight[partnum][w]= (uint16) part;
ducet.single_chars[code].weight.weight[partnum][w]= (uint16) part;
s= endptr;
partnum++;
}
......@@ -249,25 +339,22 @@ int main(int ac, char **av)
{
uint level;
if (uca[code].num)
if (ducet.single_chars[code].weight.weight_length)
continue;
for (level= 0; level < 4; level++)
{
MY_UCA_IMPLICIT_WEIGHT weight;
weight= my_uca_520_implicit_weight_on_level(code, level);
uca[code].weight[level][0]= weight.weight[0];
uca[code].weight[level][1]= weight.weight[1];
weight= my_uca_implicit_weight_on_level(ducet.version, code, level);
ducet.single_chars[code].weight.weight[level][0]= weight.weight[0];
ducet.single_chars[code].weight.weight[level][1]= weight.weight[1];
}
uca[code].num= 2;
ducet.single_chars[code].weight.weight_length= 2;
}
printf("#include \"my_uca.h\"\n");
printf("#define MY_UCA_NPAGES %d\n",MY_UCA_NPAGES);
printf("#define MY_UCA_NCHARS %d\n",MY_UCA_NCHARS);
printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK);
printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT);
printf("/*\n");
printf(" Generated from allkeys.txt version '%s'\n", ducet.version_str);
printf("*/\n");
for (w=0; w < options.levels; w++)
{
......@@ -304,8 +391,8 @@ int main(int ac, char **av)
code= page*MY_UCA_NCHARS+offs;
/* Calculate only non-zero weights */
for (num=0, i=0; i < uca[code].num; i++)
if (uca[code].weight[w][i])
for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++)
if (ducet.single_chars[code].weight.weight[w][i])
num++;
maxnum= maxnum < num ? num : maxnum;
......@@ -314,13 +401,13 @@ int main(int ac, char **av)
if (w == 1 && num == 1)
{
/* 0020 0000 ... */
if (uca[code].weight[w][0] == 0x0020)
if (ducet.single_chars[code].weight.weight[w][0] == 0x0020)
ndefs++;
}
else if (w == 2 && num == 1)
{
/* 0002 0000 ... */
if (uca[code].weight[w][0] == 0x0002)
if (ducet.single_chars[code].weight.weight[w][0] == 0x0002)
ndefs++;
}
}
......@@ -341,7 +428,7 @@ int main(int ac, char **av)
case 2: mchars= 8; break;
case 3: mchars= 9; break;
case 4: mchars= 8; break;
default: mchars= uca[code].num;
default: mchars= ducet.single_chars[code].weight.weight_length;
}
pagemaxlen[page]= (int) maxnum;
......@@ -366,11 +453,11 @@ int main(int ac, char **av)
bzero(weight,sizeof(weight));
/* Copy non-zero weights */
for (num=0, i=0; i < uca[code].num; i++)
for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++)
{
if (uca[code].weight[w][i])
if (ducet.single_chars[code].weight.weight[w][i])
{
weight[num]= uca[code].weight[w][i];
weight[num]= ducet.single_chars[code].weight.weight[w][i];
num++;
}
}
......@@ -433,7 +520,7 @@ int main(int ac, char **av)
printf("};\n");
}
print_version(&ducet, &options);
printf("int main(void){ return 0;};\n");
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment