MDEV-27042 UCA: Resetting contractions to ignorable does not work well

The weight scanner routine scanner_next() did not properly handle the cases
when a contraction produces no weights (is ignorable).

Adding a helper routine my_uca_scanner_set_weight() and using it in all cases:
- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character

Also adding two other helper routines:
- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()
to avoid using scanner->wbeg directly inside scanner_next().
This reduces the probability of similar future bugs.

MDEV-27042 UCA: Resetting contractions to ignorable does not work well
The weight scanner routine scanner_next() did not properly handle the cases
when a contraction produces no weights (is ignorable).

Adding a helper routine my_uca_scanner_set_weight() and using it in all cases:
- A single ASCII character
- A contraction starting with an ASCII character
- A multi-byte character
- A contraction starting with a multi-byte character

Also adding two other helper routines:
- my_uca_scanner_next_expansion_weight()
- my_uca_scanner_set_weight_outside_maxchar()
to avoid using scanner->wbeg directly inside scanner_next().
This reduces the probability of similar future bugs.
f9ad8072 · Alexander Barkov · 0a3d1d10 · f9ad8072 · f9ad8072 · f9ad8072
Commit f9ad8072 authored Nov 14, 2021 by Alexander Barkov
5 changed files
--- a/mysql-test/main/ctype_ldml.result
+++ b/mysql-test/main/ctype_ldml.result
--- a/mysql-test/main/ctype_ldml.test
+++ b/mysql-test/main/ctype_ldml.test
@@ -33,6 +33,7 @@ SELECT * FROM t1 ORDER BY phone;
 SELECT * FROM t1 WHERE phone='+7(912)800-80-01';
 SELECT * FROM t1 WHERE phone='79128008001';
 SELECT * FROM t1 WHERE phone='7 9 1 2 8 0 0 8 0 0 1';
+SELECT * FROM t1 WHERE phone='tel.79128008001';
 DROP TABLE t1;
 show collation like 'utf8mb3_test_ci';
@@ -615,3 +616,23 @@ SELECT 'a' COLLATE utf8_czech_test_bad_w2;
 SHOW COLLATION LIKE 'latin1_test_replace';
 --error ER_UNKNOWN_COLLATION
 SELECT 'foo' = 'foo ' COLLATE latin1_test_replace;
+--echo #
+--echo # MDEV-27042 UCA: Resetting contractions to ignorable does not work well
+--echo #
+CREATE TABLE t1 (
+  phone VARCHAR(64) CHARACTER SET utf8 COLLATE utf8_phone_ci
+);
+INSERT INTO t1 VALUES ('123');
+INSERT INTO t1 VALUES ('tel.123');
+INSERT INTO t1 VALUES ('tél.123');
+INSERT INTO t1 VALUES ('tèl.123');
+INSERT INTO t1 VALUES ('ťel.123');
+INSERT INTO t1 VALUES ('ťèl.123');
+INSERT INTO t1 VALUES ('tex.123');
+SELECT * FROM t1 WHERE phone='123' ORDER BY BINARY phone;
+SELECT * FROM t1 WHERE phone<>'123' ORDER BY BINARY phone;
+SELECT phone, HEX(WEIGHT_STRING(phone)) FROM t1 ORDER BY phone, BINARY phone;
+DROP TABLE t1;
--- a/mysql-test/std_data/ldml/Index.xml
+++ b/mysql-test/std_data/ldml/Index.xml
@@ -9,6 +9,9 @@
          <i>\u0029</i> <!-- right parenthesis -->
          <i>\u002B</i> <!-- plus -->
          <i>\u002D</i> <!-- hyphen -->
+          <i>tel.</i>
+          <i>tél.</i>
+          <i>ťel.</i>
      </rules>
    </collation>
    <collation name="utf8mb3_test_ci" id="353">

--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -31175,6 +31175,33 @@ static const uint16 nochar[]= {0,0};
 #define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
 #define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
+static inline uint16
+my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner)
+{
+  if (scanner->wbeg[0])
+    return *scanner->wbeg++;
+  return 0;
+}
+static inline uint16
+my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight)
+{
+  scanner->wbeg= weight + 1;
+  return *weight;
+}
+static inline uint16
+my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner)
+{
+  /* Return 0xFFFD as weight for all characters outside BMP */
+  scanner->wbeg= nochar;
+  return 0xFFFD;
+}
 /********** Helper functions to handle contraction ************/
--- a/strings/ctype-uca.ic
+++ b/strings/ctype-uca.ic
@@ -40,20 +40,16 @@
 static inline int
 MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
 {
-  /*
+  uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
-    Check if the weights for the previous character have been
+  if (weight)
-    already fully scanned. If yes, then get the next character and
+    return weight; /* Next expansion weight found */
-    initialize wbeg and wlength to its weight string.
-  */
-  if (scanner->wbeg[0])      /* More weights left from the previous step: */
-    return *scanner->wbeg++; /* return the next weight from expansion     */
-  do
+  for ( ; ; )
  {
    const uint16 *wpage;
    int mblen;
    my_wc_t currwc;
+    const uint16 *cweight;
    /* Get next character */
 #if MY_UCA_ASCII_OPTIMIZE
@@ -64,23 +60,21 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
      scanner->sbeg+= 1;
 #if MY_UCA_COMPILE_CONTRACTIONS
-      if (my_uca_needs_context_handling(scanner->level, currwc))
+      if (my_uca_needs_context_handling(scanner->level, currwc) &&
+          (cweight= my_uca_context_weight_find(scanner, currwc)))
      {
-        const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
+        if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
-        if (cweight)
+          return weight;
-        {
+        continue; /* Ignorable contraction */
-          scanner->wbeg= cweight + 1;
-          return *cweight;
-        }
      }
 #endif
      scanner->page= 0;
      scanner->code= (int) currwc;
-      scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
+      cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
-      if (scanner->wbeg[0])
+      if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
-        return *scanner->wbeg++;
+        return weight;
-      continue;
+      continue; /* Ignorable character */
    }
    else
 #endif
@@ -109,21 +103,15 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
    scanner->sbeg+= mblen;
    if (currwc > scanner->level->maxchar)
-    {
+      return my_uca_scanner_set_weight_outside_maxchar(scanner);
-      /* Return 0xFFFD as weight for all characters outside BMP */
-      scanner->wbeg= nochar;
-      return 0xFFFD;
-    }
 #if MY_UCA_COMPILE_CONTRACTIONS
-    if (my_uca_needs_context_handling(scanner->level, currwc))
+    if (my_uca_needs_context_handling(scanner->level, currwc) &&
+        (cweight= my_uca_context_weight_find(scanner, currwc)))
    {
-      const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
+      if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
-      if (cweight)
+        return weight;
-      {
+      continue; /* Ignorable contraction */
-        scanner->wbeg= cweight + 1;
-        return *cweight;
-      }
    }
 #endif
@@ -136,11 +124,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
      return my_uca_scanner_next_implicit(scanner);
    /* Calculate pointer to w[0]'s weight, using page and offset */
-    scanner->wbeg= wpage +
+    cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
-                   scanner->code * scanner->level->lengths[scanner->page];
+    if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
-  } while (!scanner->wbeg[0]); /* Skip ignorable characters */
+      return weight;
+    continue; /* Ignorable character */
+  }
-  return *scanner->wbeg++;
+  return 0;
 }