Commit 63120090 authored by Alexander Barkov

MDEV-10262 ucs2_thai_520_w2: wrong implicit weights on the secondary level

parent 61492ea5
......@@ -131,6 +131,7 @@ typedef struct my_uca_level_info_st
uchar *lengths;
uint16 **weights;
MY_CONTRACTIONS contractions;
uint levelno;
} MY_UCA_WEIGHT_LEVEL;
......
......@@ -22,6 +22,13 @@ SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 2)) FROM t1;
SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 3)) FROM t1;
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
......
......@@ -14033,6 +14033,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 E39080 FB80B4000020
F001 EF8081 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
......@@ -14714,6 +14727,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 3400 FB80B4000020
F001 F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
......
......@@ -6663,6 +6663,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 3400 FB80B4000020
F001 F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
......
......@@ -6683,6 +6683,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 00003400 FB80B4000020
F001 0000F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
......
......@@ -5373,6 +5373,19 @@ Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 E39080 FB80B4000020
F001 EF8081 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2))
......
......@@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400=
0, /* nitems */
NULL, /* item */
NULL /* flags */
}
},
0 /* levelno */
},
},
......@@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS, /* nitems */
thai_contractions, /* item */
NULL /* flags */
}
},
0 /* levelno */
},
{
0x10FFFF, /* maxchar */
......@@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS_W2, /* nitems */
thai_contractions_w2, /* item */
NULL /* flags */
}
},
1 /* levelno */
},
},
......@@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520=
{ /* Contractions: */
0, /* nitems */
NULL, /* item */
NULL /* flags */
}
NULL /* flags */
},
0 /* levelno */
},
},
......@@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
/****************************************************************/
/**
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
where:
AAAA= BASE + (CP >> 15);
BBBB= (CP & 0x7FFF) | 0x8000;
There are two weights in the primary level (AAAA followed by BBBB).
There is one weight on other levels:
- 0020 on the secondary level
- 0002 on the tertiary level
*/
/**
  Select the BASE value used to build an implicit weight on the primary level.

  According to UCA, BASE is:
  - FB40 for Unified_Ideograph=True AND
    ((Block=CJK_Unified_Ideograph) OR
     (Block=CJK_Compatibility_Ideographs))
  - FB80 for Unified_Ideograph=True AND NOT
    ((Block=CJK_Unified_Ideograph) OR
     (Block=CJK_Compatibility_Ideographs))
  - FBC0 for any other code point

  This implementation recognizes only two fixed ranges:
  U+4E00..U+9FA5 yields FB40 and U+3400..U+4DB5 yields FB80;
  everything else yields FBC0.

  TODO: it seems we're not handling BASE correctly:
  - check what are those blocks
  - there are more Unified Ideograph blocks in the latest Unicode versions

  @param code  Code point to classify
  @return      0xFB40, 0xFB80 or 0xFBC0
*/
static inline uint16
my_uca_implicit_weight_base(my_wc_t code)
{
  uint16 base= 0xFBC0;                         /* Any other code point */

  if (code >= 0x4E00 && code <= 0x9FA5)
    base= 0xFB40;
  else if (code >= 0x3400 && code <= 0x4DB5)
    base= 0xFB80;

  return base;
}
/**
Return implicit UCA weight
Return an implicit UCA weight for the primary level.
Used for characters that do not have assigned UCA weights.
@param scanner UCA weight scanner
@return The leading implicit weight.
The second weight is stored in scanner->implicit[0]
and is later returned on the next my_uca_scanner_next_any() call.
*/
static inline int
my_uca_scanner_next_implicit(my_uca_scanner *scanner)
my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{
scanner->code= (scanner->page << 8) + scanner->code;
scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000;
scanner->implicit[1]= 0;
my_wc_t wc= (scanner->page << 8) + scanner->code;
scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit;
scanner->page= scanner->page >> 7;
if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5)
scanner->page+= 0xFB80;
else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5)
scanner->page+= 0xFB40;
else
scanner->page+= 0xFBC0;
return scanner->page;
return my_uca_implicit_weight_base(wc) + (wc >> 15);
}
/**
  Produce an implicit weight for the level the scanner is working on,
  as identified by scanner->level->levelno.

  - Level 0 (primary): delegates to my_uca_scanner_next_implicit_primary().
  - Level 1 (secondary): returns 0x0020.
  - Level 2 (tertiary): returns 0x0002.

  On every level other than the primary one, scanner->wbeg is pointed
  at nochar before returning (presumably so that no further weights are
  produced for this character -- confirm against the definition of nochar).

  @param scanner  UCA weight scanner
  @return         The implicit weight; 0 for an unexpected level number
                  (that path is guarded by DBUG_ASSERT(0))
*/
static inline int
my_uca_scanner_next_implicit(my_uca_scanner *scanner)
{
  const unsigned int levelno= scanner->level->levelno;

  if (levelno == 0)                            /* Primary level */
    return my_uca_scanner_next_implicit_primary(scanner);

  /* All remaining levels emit at most one weight per character */
  scanner->wbeg= nochar;

  if (levelno == 1)                            /* Secondary level */
    return 0x0020;
  if (levelno == 2)                            /* Tertiary level */
    return 0x0002;

  DBUG_ASSERT(0);                              /* Unknown level number */
  return 0;
}
/*
The same two functions for any character set
*/
......@@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
size_t i, npages= (src->maxchar + 1) / 256;
dst->maxchar= src->maxchar;
dst->levelno= src->levelno;
if (check_rules(loader, rules, dst, src))
return TRUE;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment