MDEV-27266 Improve UCA collation performance for utf8mb3 and utf8mb4

Adding two levels of optimization: 1. For every byte pair [00..FF][00..FF] which: a. consists of two ASCII characters or makes a well-formed two-byte character b. whose total weight string fits into 4 weights (concatenated weight string in case of two ASCII characters, or a single weight string in case of a two-byte character) c. whose weight is context independent (i.e. does not depend on contractions or previous context pairs) store weights in a separate array of MY_UCA_2BYTES_ITEM, so during scanner_next() we can scan two bytes at a time. Byte pairs that do not match the conditions a-c are marked in this array as not applicable for optimization and scanned as before. 2. For every byte pair which is applicable for optimization in #1, and which produces only one or two weights, store weights in one more array of MY_UCA_WEIGHT2. So in the beginning of strnncoll*() we can skip equal prefixes using an even more efficient loop. This loop consumes two bytes at a time. The loop scans while the two bytes on both sides produce weight strings of equal length (i.e. one weight on both sides, or two weights on both sides). This allows efficient comparison of: - Context independent sequences consisting of two ASCII characters - Context independent 2-byte characters - Contractions consisting of two ASCII characters, e.g. Czech "ch". - Some tricky cases: "ss" vs "SHARP S" ("ss" produces two weights, 0xC39F also produces two weights)

MDEV-27266 Improve UCA collation performance for utf8mb3 and utf8mb4
Adding two levels of optimization: 1. For every byte pair [00..FF][00..FF] which: a. consists of two ASCII characters or makes a well-formed two-byte character b. whose total weight string fits into 4 weights (concatenated weight string in case of two ASCII characters, or a single weight string in case of a two-byte character) c. whose weight is context independent (i.e. does not depend on contractions or previous context pairs) store weights in a separate array of MY_UCA_2BYTES_ITEM, so during scanner_next() we can scan two bytes at a time. Byte pairs that do not match the conditions a-c are marked in this array as not applicable for optimization and scanned as before. 2. For every byte pair which is applicable for optimization in #1, and which produces only one or two weights, store weights in one more array of MY_UCA_WEIGHT2. So in the beginning of strnncoll*() we can skip equal prefixes using an even more efficient loop. This loop consumes two bytes at a time. The loop scans while the two bytes on both sides produce weight strings of equal length (i.e. one weight on both sides, or two weights on both sides). This allows efficient comparison of: - Context independent sequences consisting of two ASCII characters - Context independent 2-byte characters - Contractions consisting of two ASCII characters, e.g. Czech "ch". - Some tricky cases: "ss" vs "SHARP S" ("ss" produces two weights, 0xC39F also produces two weights)
d8f172c1 · Alexander Barkov · Oleksandr Byelkin · a0858b2c · d8f172c1 · d8f172c1
Commit d8f172c1 authored Feb 25, 2022 by Alexander Barkov Committed by Oleksandr Byelkin Aug 10, 2022
5 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -141,6 +141,58 @@ const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
                                         my_wc_t wc1, my_wc_t wc2);
+typedef struct my_uca_weight2_t
+{
+  uint16 weight[2];
+} MY_UCA_WEIGHT2;
+/*
+  In DUCET as of Unicode-14.0.0:
+  - All characters in the range U+0000..U+007F (i.e. using one byte in utf8)
+    have no more than two weights on all weight levels.
+  - All characters in the range U+0080..U+07FF (i.e. using two bytes in utf8)
+    have no more than four weights on all weight levels.
+  Therefore the limit of 4 weights should cover all byte pairs
+  (i.e. two ASCII characters or one 2-byte character)
+  that are subject to the "process 2 bytes at a time" optimization.
+  If some collation reorders any character from the mentioned ranges
+  in such a way that it produces more weights, that character will not
+  be optimized, but will still be processed correctly by the slower
+  mb_wc-based method (1 character at a time).
+*/
+#define MY_UCA_2BYTES_MAX_WEIGHT_SIZE (4+1) /* Including 0 terminator */
+typedef struct my_uca_2bytes_item_t
+{
+  uint16 weight[MY_UCA_2BYTES_MAX_WEIGHT_SIZE];
+} MY_UCA_2BYTES_ITEM;
+typedef struct my_uca_level_booster_t
+{
+  /*
+    A helper array to process 2 bytes at a time during string comparison.
+    It maps all 2-bytes sequences that make:
+    - two ASCII characters or
+    - one 2-byte character
+    to their weights. The weight length is limited to
+    MY_UCA_2BYTES_MAX_WEIGHT_SIZE-1 weights.
+    This array is used in the main loop optimization.
+  */
+  MY_UCA_2BYTES_ITEM weight_strings_2bytes[0x10000];
+  /*
+    A helper array to process 2 bytes at a time during string comparison,
+    in an even more efficient way than the one above.
+    The weight size is limited to 2 weights, so it's used for the cases
+    when 2 input bytes produce 1 or 2 weights.
+    This limit makes the code using this array even simpler and faster.
+    This array is used for prefix optimization.
+  */
+  MY_UCA_WEIGHT2 weight_strings_2bytes_to_1_or_2_weights[0x10000];
+} MY_UCA_LEVEL_BOOSTER;
 typedef struct my_uca_contraction_hash_t
 {
  size_t nitems_alloced;
@@ -157,6 +209,7 @@ typedef struct my_uca_level_info_st
  MY_CONTRACTIONS contractions;
  uint    levelno;
  MY_UCA_CONTRACTION_HASH contraction_hash;
+  MY_UCA_LEVEL_BOOSTER *booster;
 } MY_UCA_WEIGHT_LEVEL;

--- a/strings/ctype-uca-scanner_next.inl
+++ b/strings/ctype-uca-scanner_next.inl
@@ -78,6 +78,45 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
    my_wc_t currwc= 0;
    const uint16 *cweight;
+#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
+    if (scanner->sbeg + 1 < scanner->send)
+    {
+      const MY_UCA_2BYTES_ITEM *ww;
+      ww= my_uca_level_booster_2bytes_item_addr_const(scanner->level->booster,
+                                                      scanner->sbeg[0],
+                                                      scanner->sbeg[1]);
+      if (my_uca_2bytes_item_is_applicable(ww))
+      {
+        /*
+          Byte pairs that make 2-byte head characters in previous
+          context pairs are marked as not applicable for optimization
+          during the collation initialization. So when we come here
+          sbeg[0] and sbeg[1] are:
+          - either two ASCII characters
+          - or one 2-byte character which IS NOT a previous context head
+          Just remember sbeg[1] as the previous character for simplicity.
+          This may erroneously interpret bytes 0x80..0x9F as previous context
+          head characters U+0080..U+009F. However, CLDR does not have any real
+          collations that use these characters as previous context heads.
+        */
+        scanner->page= 0;
+        scanner->code= (int) scanner->sbeg[1];
+        scanner->sbeg+= 2;
+        if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
+        {
+          /*
+            TODO: add support for scanner_next_with_nchars and do this:
+            SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+          */
+          return weight;
+        }
+        continue; /* Ignorable character */
+      }
+      /* 2 byte optimization is not applicable, go the slow path */
+    }
+#endif
    /* Get next character */
 #if MY_UCA_ASCII_OPTIMIZE
    /* Get next ASCII character */

--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
--- a/strings/ctype-uca.inl
+++ b/strings/ctype-uca.inl
@@ -95,6 +95,15 @@ MY_FUNCTION_NAME(strnncoll_onelevel)(CHARSET_INFO *cs,
  my_uca_scanner tscanner;
  int s_res;
  int t_res;
+#if MY_UCA_ASCII_OPTIMIZE
+{
+  size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+                                                          s, slen, t, tlen);
+  s+= prefix, slen-= prefix;
+  t+= prefix, tlen-= prefix;
+}
+#endif
  my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
  my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
@@ -204,6 +213,15 @@ MY_FUNCTION_NAME(strnncollsp_onelevel)(CHARSET_INFO *cs,
  my_uca_scanner sscanner, tscanner;
  int s_res, t_res;
+#if MY_UCA_ASCII_OPTIMIZE
+{
+  size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+                                                          s, slen, t, tlen);
+  s+= prefix, slen-= prefix;
+  t+= prefix, tlen-= prefix;
+}
+#endif
  my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
  my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);
@@ -432,6 +450,18 @@ MY_FUNCTION_NAME(strnncollsp_nchars_onelevel)(CHARSET_INFO *cs,
  size_t s_nchars_left= nchars;
  size_t t_nchars_left= nchars;
+/*
+TODO: strnncollsp_nchars_onelevel
+#if MY_UCA_ASCII_OPTIMIZE
+{
+  size_t prefix= my_uca_level_booster_equal_prefix_length(level->booster,
+                                                          s, slen, t, tlen);
+  s+= prefix, slen-= prefix;
+  t+= prefix, tlen-= prefix;
+}
+#endif
+*/
  my_uca_scanner_init_any(&sscanner, cs, level, s, slen);
  my_uca_scanner_init_any(&tscanner, cs, level, t, tlen);

--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -1341,7 +1341,7 @@ strnncollsp_char_one(CHARSET_INFO *cs, const STRNNCOLLSP_CHAR_PARAM *p)
  str2hex(ahex, sizeof(ahex), p->a.str, p->a.length);
  str2hex(bhex, sizeof(bhex), p->b.str, p->b.length);
  diag("%-25s %-12s %-12s %3d %7d %7d%s",
-       cs->cs_name.str, ahex, bhex, (int) p->nchars, p->res, res,
+       cs->coll_name.str, ahex, bhex, (int) p->nchars, p->res, res,
       eqres(res, p->res) ? "" : " FAILED");
  if (!eqres(res, p->res))
  {