Commit d8f172c1 authored by Alexander Barkov's avatar Alexander Barkov Committed by Oleksandr Byelkin

MDEV-27266 Improve UCA collation performance for utf8mb3 and utf8mb4

Adding two levels of optimization:

1. For every byte pair [00..FF][00..FF] which:
  a. consists of two ASCII characters or makes a well-formed two-byte character
  b. whose total weight string fits into 4 weights
     (concatenated weight string in case of two ASCII characters,
     or a single weight string in case of a two-byte character)
  c. whose weight is context independent (i.e. does not depend on contractions
     or previous context pairs)
  store weights in a separate array of MY_UCA_2BYTES_ITEM,
  so during scanner_next() we can scan two bytes at a time.
  Byte pairs that do not match the conditions a-c are marked in this array
  as not applicable for optimization and scanned as before.

2. For every byte pair which is applicable for optimization in #1,
   and which produces only one or two weights, store
   weights in one more array of MY_UCA_WEIGHT2. So in the beginning
   of strnncoll*() we can skip equal prefixes using an even more efficient
   loop. This loop consumes two bytes at a time. The loop scans while the
   two bytes on both sides produce weight strings of equal length
   (i.e. one weight on both sides, or two weights on both sides).
   This allows efficient comparison of:
   - Context independent sequences consisting of two ASCII characters
   - Context independent 2-byte characters
   - Contractions consisting of two ASCII characters, e.g. Czech "ch".
   - Some tricky cases: "ss" vs "SHARP S"
     ("ss" produces two weights, 0xC39F also produces two weights)
parent a0858b2c
......@@ -141,6 +141,58 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2);
/*
A fixed-size holder for up to two 16-bit weights.
It is the element type of the array
MY_UCA_LEVEL_BOOSTER::weight_strings_2bytes_to_1_or_2_weights,
which covers the cases when 2 input bytes produce 1 or 2 weights.
NOTE(review): how the "only one weight" and "not applicable" cases
are encoded in weight[] is not visible in this header fragment -
confirm against the initialization code.
*/
typedef struct my_uca_weight2_t
{
uint16 weight[2];
} MY_UCA_WEIGHT2;
/*
In DUCET as of Unicode-14.0.0:
- All characters in the range U+0000..U+007F (i.e. using one byte in utf8)
have not more than two weights on all weight levels.
- All characters in the range U+0080..U+07FF (i.e. using two bytes in utf8)
have not more than four weights on all weight levels.
Therefore the limit of 4 weights should cover all byte pairs
(i.e. two ASCII characters or one 2-byte character)
that are a subject for the "process 2 bytes at a time" optimization.
If some collation reorders any character from the mentioned ranges
in the way that it produces more weights, such character will not
be optimized, but will be correctly processed by the slower mb_wc-based
method (1 character at a time).
*/
#define MY_UCA_2BYTES_MAX_WEIGHT_SIZE (4+1) /* Including 0 terminator */
/*
The weight string produced by one pair of input bytes.
The array has room for MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 (i.e. 4) weights
plus the 0 terminator.
It is the element type of MY_UCA_LEVEL_BOOSTER::weight_strings_2bytes.
NOTE(review): the marker used for byte pairs that are not applicable
for the optimization is not visible here - see
my_uca_2bytes_item_is_applicable().
*/
typedef struct my_uca_2bytes_item_t
{
uint16 weight[MY_UCA_2BYTES_MAX_WEIGHT_SIZE];
} MY_UCA_2BYTES_ITEM;
/*
Per-level lookup tables for the "process 2 bytes at a time" optimization.
Both arrays have 0x10000 elements, i.e. one element for every possible
pair of byte values [00..FF][00..FF].
NOTE(review): the exact index formula for a byte pair is implemented
by the accessor helpers (e.g. my_uca_level_booster_2bytes_item_addr_const)
and is not visible in this fragment.
*/
typedef struct my_uca_level_booster_t
{
/*
A helper array to process 2 bytes at a time during string comparison.
It maps all 2-byte sequences that make:
- two ASCII characters or
- one 2-byte character
to their weights. The weight length is limited to
MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights.
This array is used in the main loop optimization.
*/
MY_UCA_2BYTES_ITEM weight_strings_2bytes[0x10000];
/*
A helper array to process 2 bytes at a time during string comparison,
in an even more efficient way than the above one.
The weight size is limited to 2 weights, so it's used for the cases
when 2 input bytes produce 1 or 2 weights.
This limit makes the code using this array even simpler and faster.
This array is used for prefix optimization.
*/
MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000];
} MY_UCA_LEVEL_BOOSTER;
typedef struct my_uca_contraction_hash_t
{
size_t nitems_alloced;
......@@ -157,6 +209,7 @@ typedef struct my_uca_level_info_st
MY_CONTRACTIONS contractions;
uint levelno;
MY_UCA_CONTRACTION_HASH contraction_hash;
MY_UCA_LEVEL_BOOSTER *booster;
} MY_UCA_WEIGHT_LEVEL;
......
......@@ -78,6 +78,45 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
my_wc_t currwc= 0;
const uint16 *cweight;
#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
if (scanner->sbeg + 1 < scanner->send)
{
const MY_UCA_2BYTES_ITEM *ww;
ww= my_uca_level_booster_2bytes_item_addr_const(scanner->level->booster,
scanner->sbeg[0],
scanner->sbeg[1]);
if (my_uca_2bytes_item_is_applicable(ww))
{
/*
Byte pairs that make 2-byte head characters in previous
context pairs are marked as not applicable for optimization
during the collation initialization. So when we come here
sbeg[0] and sbeg[1] are:
- either two ASCII characters
- or one 2-byte character which IS NOT a previous context head
Just remember sbeg[1] as the previous character for simplicity.
This may erroneously interpret bytes 0x80..0x9F as previous context
head characters U+0080..U+009F. However, CLDR does not have any real
collations that use these characters as previous context heads.
*/
scanner->page= 0;
scanner->code= (int) scanner->sbeg[1];
scanner->sbeg+= 2;
if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
{
/*
TODO: add support for scanner_next_with_nchars and do this:
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
*/
return weight;
}
continue; /* Ignorable character */
}
/* 2 byte optimization is not applicable, go the slow path */
}
#endif
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
......
This diff is collapsed.
......@@ -95,6 +95,15 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
my_uca_scanner tscanner;
int s_res;
int t_res;
#if MY_UCA_ASCII_OPTIMIZE
{
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
s, slen, t, tlen);
s+= prefix, slen-= prefix;
t+= prefix, tlen-= prefix;
}
#endif
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
......@@ -204,6 +213,15 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
my_uca_scanner sscanner, tscanner;
int s_res, t_res;
#if MY_UCA_ASCII_OPTIMIZE
{
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
s, slen, t, tlen);
s+= prefix, slen-= prefix;
t+= prefix, tlen-= prefix;
}
#endif
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
......@@ -432,6 +450,18 @@ MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
size_t s_nchars_left= nchars;
size_t t_nchars_left= nchars;
/*
TODO: strnncollsp_nchars_onelevel
#if MY_UCA_ASCII_OPTIMIZE
{
size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
s, slen, t, tlen);
s+= prefix, slen-= prefix;
t+= prefix, tlen-= prefix;
}
#endif
*/
my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
......
......@@ -1341,7 +1341,7 @@ strnncollsp_char_one(CHARSET_INFO *cs, const STRNNCOLLSP_CHAR_PARAM *p)
str2hex(ahex, sizeof(ahex), p->a.str, p->a.length);
str2hex(bhex, sizeof(bhex), p->b.str, p->b.length);
diag("%-25s %-12s %-12s %3d %7d %7d%s",
cs->cs_name.str, ahex, bhex, (int) p->nchars, p->res, res,
cs->coll_name.str, ahex, bhex, (int) p->nchars, p->res, res,
eqres(res, p->res) ? "" : " FAILED");
if (!eqres(res, p->res))
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment