MDEV-8415 utf8: compare broken bytes as "greater than any non-broken character"

9ad8ff66 · Alexander Barkov · 95d07ee4 · 9ad8ff66 · 9ad8ff66 · 9ad8ff66
Commit 9ad8ff66 authored Jul 03, 2015 by Alexander Barkov
5 changed files
--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -5594,11 +5594,11 @@ a?z
 ab
 az
 aЀ
+aր
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
 PREPARE stmt FROM @query;
 EXECUTE stmt;
 ch
-aր
 z
 # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');
@@ -5802,11 +5802,11 @@ a?z
 ab
 az
 aЀ
+aր
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
 PREPARE stmt FROM @query;
 EXECUTE stmt;
 ch
-aր
 z
 # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');
@@ -6002,11 +6002,11 @@ a?z
 ab
 az
 aЀ
+aր
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch>''a', 0xD1,''' ORDER BY ch');
 PREPARE stmt FROM @query;
 EXECUTE stmt;
 ch
-aր
 z
 # 0xEA9A96 would be a good 3-byte character, 0xEA9A is an incomplete sequence
 SET @query=CONCAT('SELECT ch FROM t1 WHERE ch=''a', 0xEA9A,''' ORDER BY ch');

--- a/storage/myisam/myisamdef.h
+++ b/storage/myisam/myisamdef.h
@@ -394,7 +394,7 @@ struct st_myisam_info
 #define PACK_TYPE_SELECTED      1       /* Bits in field->pack_type */
 #define PACK_TYPE_SPACE_FIELDS  2
 #define PACK_TYPE_ZERO_FILL     4
-#define MI_FOUND_WRONG_KEY 32738        /* Impossible value from ha_key_cmp */
+#define MI_FOUND_WRONG_KEY 0x7FFFFFFF   /* Impossible value from ha_key_cmp */

 #define MI_MAX_KEY_BLOCK_SIZE   (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
 #define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))

--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -26,8 +26,30 @@
 #define EILSEQ ENOENT
 #endif

-#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)
-
+/* Detect special bytes and sequences */
+#define IS_CONTINUATION_BYTE(c)   (((uchar) (c) ^ 0x80) < 0x40)
+
+#define IS_UTF8MB2(b0,b1)         (((uchar) (b0) < 0xE0) && \
+                                  IS_CONTINUATION_BYTE((uchar) b1))
+#define IS_UTF8MB3(b0,b1,b2)      (((uchar) (b0) < 0xF0) && \
+                                   IS_CONTINUATION_BYTE(b1) && \
+                                   IS_CONTINUATION_BYTE(b2) && \
+                                   ((uchar) b0 >= 0xe1 || (uchar) b1 >= 0xa0))
+
+/* Convert individual bytes to Unicode code points */
+#define UTF8MB2_CODE(b0,b1)       (((my_wc_t) ((uchar) b0 & 0x1f) << 6)  |\
+                                   ((my_wc_t) ((uchar) b1 ^ 0x80)))
+#define UTF8MB3_CODE(b0,b1,b2)    (((my_wc_t) ((uchar) b0 & 0x0f) << 12) |\
+                                   ((my_wc_t) ((uchar) b1 ^ 0x80) << 6)  |\
+                                   ((my_wc_t) ((uchar) b2 ^ 0x80)))
+
+/* Definitions for strcoll.ic */
+#define IS_MB1_CHAR(x)              ((uchar) (x) < 0x80)
+#define IS_MB1_MBHEAD_UNUSED_GAP(x) ((uchar) (x) < 0xC2)
+#define IS_MB2_CHAR(x,y)            IS_UTF8MB2(x,y)
+#define IS_MB3_CHAR(x,y,z)          IS_UTF8MB3(x,y,z)
+
+/* Collation names */
 #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
 #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
 #define MY_UTF8MB3_BIN        MY_UTF8MB3 "_bin"
@@ -4848,7 +4870,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;

-    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }
  else if (c < 0xf0)
@@ -4860,10 +4882,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

-    *pwc = ((my_wc_t) (c & 0x0f) << 12)   |
-           ((my_wc_t) (s[1] ^ 0x80) << 6) |
-            (my_wc_t) (s[2] ^ 0x80);
-
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }
 #ifdef UNICODE_32BIT
@@ -4954,7 +4973,7 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
    if (!((s[1] ^ 0x80) < 0x40))
      return MY_CS_ILSEQ;

-    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }

@@ -4965,10 +4984,7 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

-    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
-          ((my_wc_t) (s[1] ^ 0x80) << 6) |
-           (my_wc_t) (s[2] ^ 0x80);
-
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }
  return MY_CS_ILSEQ;
@@ -5193,148 +5209,6 @@ static size_t my_casedn_str_utf8(CHARSET_INFO *cs, char *src)
 }


-static int my_strnncoll_utf8(CHARSET_INFO *cs,
-                             const uchar *s, size_t slen,
-                             const uchar *t, size_t tlen,
-                             my_bool t_is_prefix)
-{
-  int s_res,t_res;
-  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
-  const uchar *se=s+slen;
-  const uchar *te=t+tlen;
-  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-  while ( s < se && t < te )
-  {
-    s_res=my_utf8_uni(cs,&s_wc, s, se);
-    t_res=my_utf8_uni(cs,&t_wc, t, te);
-
-    if ( s_res <= 0 || t_res <= 0 )
-    {
-      /* Incorrect string, compare byte by byte value */
-      return bincmp(s, se, t, te);
-    }
-
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
-
-    if ( s_wc != t_wc )
-    {
-      return  s_wc > t_wc ? 1 : -1;
-    }
-
-    s+=s_res;
-    t+=t_res;
-  }
-  return (int) (t_is_prefix ? t-te : ((se-s) - (te-t)));
-}
-
-
-
-/*
-  Compare strings, discarding end space
-
-  SYNOPSIS
-    my_strnncollsp_utf8()
-    cs                  character set handler
-    a                   First string to compare
-    a_length            Length of 'a'
-    b                   Second string to compare
-    b_length            Length of 'b'
-    diff_if_only_endspace_difference
-		        Set to 1 if the strings should be regarded as different
-                        if they only difference in end space
-
-  IMPLEMENTATION
-    If one string is shorter as the other, then we space extend the other
-    so that the strings have equal length.
-
-    This will ensure that the following things hold:
-
-    "a"  == "a "
-    "a\0" < "a"
-    "a\0" < "a "
-
-  RETURN
-    < 0  a <  b
-    = 0  a == b
-    > 0  a > b
-*/
-
-static int my_strnncollsp_utf8(CHARSET_INFO *cs,
-                               const uchar *s, size_t slen,
-                               const uchar *t, size_t tlen,
-                               my_bool diff_if_only_endspace_difference)
-{
-  int s_res, t_res, res;
-  my_wc_t UNINIT_VAR(s_wc), UNINIT_VAR(t_wc);
-  const uchar *se= s+slen, *te= t+tlen;
-  MY_UNICASE_INFO *uni_plane= cs->caseinfo;
-
-#ifndef VARCHAR_WITH_DIFF_ENDSPACE_ARE_DIFFERENT_FOR_UNIQUE
-  diff_if_only_endspace_difference= 0;
-#endif
-
-  while ( s < se && t < te )
-  {
-    s_res=my_utf8_uni(cs,&s_wc, s, se);
-    t_res=my_utf8_uni(cs,&t_wc, t, te);
-
-    if ( s_res <= 0 || t_res <= 0 )
-    {
-      /* Incorrect string, compare byte by byte value */
-      return bincmp(s, se, t, te);
-    }
-
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
-    
-    if ( s_wc != t_wc )
-    {
-      return  s_wc > t_wc ? 1 : -1;
-    }
-
-    s+=s_res;
-    t+=t_res;
-  }
-
-  slen= (size_t) (se-s);
-  tlen= (size_t) (te-t);
-  res= 0;
-
-  if (slen != tlen)
-  {
-    int swap= 1;
-    if (diff_if_only_endspace_difference)
-      res= 1;                                   /* Assume 'a' is bigger */
-    if (slen < tlen)
-    {
-      slen= tlen;
-      s= t;
-      se= te;
-      swap= -1;
-      res= -res;
-    }
-    /*
-      This following loop uses the fact that in UTF-8
-      all multibyte characters are greater than space,
-      and all multibyte head characters are greater than
-      space. It means if we meet a character greater
-      than space, it always means that the longer string
-      is greater. So we can reuse the same loop from the
-      8bit version, without having to process full multibute
-      sequences.
-    */
-    for ( ; s < se; s++)
-    {
-      if (*s != ' ')
-	return (*s < ' ') ? -swap : swap;
-    }
-  }
-  return res;
-}
-
-
 /*
  Compare 0-terminated UTF8 strings.

@@ -5535,6 +5409,75 @@ my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
 /* my_well_formed_char_length_utf8 */


+static inline int my_weight_mb1_utf8_general_ci(uchar b)
+{
+  return (int) plane00[b & 0xFF].sort;
+}
+
+
+static inline int my_weight_mb2_utf8_general_ci(uchar b0, uchar b1)
+{
+  my_wc_t wc= UTF8MB2_CODE(b0, b1);
+  MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8];
+  return (int) (page ? page[wc & 0xFF].sort : wc);
+}
+
+
+static inline int my_weight_mb3_utf8_general_ci(uchar b0, uchar b1, uchar b2)
+{
+  my_wc_t wc= UTF8MB3_CODE(b0, b1, b2);
+  MY_UNICASE_CHARACTER *page= my_unicase_pages_default[wc >> 8];
+  return (int) (page ? page[wc & 0xFF].sort : wc);
+}
+
+
+#define MY_FUNCTION_NAME(x)    my_ ## x ## _utf8_general_ci
+#define WEIGHT_ILSEQ(x)        (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x)          my_weight_mb1_utf8_general_ci(x)
+#define WEIGHT_MB2(x,y)        my_weight_mb2_utf8_general_ci(x,y)
+#define WEIGHT_MB3(x,y,z)      my_weight_mb3_utf8_general_ci(x,y,z)
+#include "strcoll.ic"
+
+
+static inline int my_weight_mb1_utf8_general_mysql500_ci(uchar b)
+{
+  return (int) plane00_mysql500[b & 0xFF].sort;
+}
+
+
+static inline int my_weight_mb2_utf8_general_mysql500_ci(uchar b0, uchar b1)
+{
+  my_wc_t wc= UTF8MB2_CODE(b0, b1);
+  MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8];
+  return (int) (page ? page[wc & 0xFF].sort : wc);
+}
+
+
+static inline int
+my_weight_mb3_utf8_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
+{
+  my_wc_t wc= UTF8MB3_CODE(b0, b1, b2);
+  MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8];
+  return (int) (page ? page[wc & 0xFF].sort : wc);
+}
+
+
+#define MY_FUNCTION_NAME(x)    my_ ## x ## _utf8_general_mysql500_ci
+#define WEIGHT_ILSEQ(x)        (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x)          my_weight_mb1_utf8_general_mysql500_ci(x)
+#define WEIGHT_MB2(x,y)        my_weight_mb2_utf8_general_mysql500_ci(x,y)
+#define WEIGHT_MB3(x,y,z)      my_weight_mb3_utf8_general_mysql500_ci(x,y,z)
+#include "strcoll.ic"
+
+
+#define MY_FUNCTION_NAME(x)    my_ ## x ## _utf8_bin
+#define WEIGHT_ILSEQ(x)        (0xFF0000 + (uchar) (x))
+#define WEIGHT_MB1(x)          ((int) (uchar) (x))
+#define WEIGHT_MB2(x,y)        ((int) UTF8MB2_CODE(x,y))
+#define WEIGHT_MB3(x,y,z)      ((int) UTF8MB3_CODE(x,y,z))
+#include "strcoll.ic"
+
+
 static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
 {
  int  res= my_charlen_utf8(cs, (const uchar*) b, (const uchar*) e);
@@ -5567,8 +5510,24 @@ static uint my_mbcharlen_utf8(CHARSET_INFO *cs  __attribute__((unused)),
 static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
 {
    NULL,               /* init */
-    my_strnncoll_utf8,
-    my_strnncollsp_utf8,
+    my_strnncoll_utf8_general_ci,
+    my_strnncollsp_utf8_general_ci,
+    my_strnxfrm_unicode,
+    my_strnxfrmlen_unicode,
+    my_like_range_mb,
+    my_wildcmp_utf8,
+    my_strcasecmp_utf8,
+    my_instr_mb,
+    my_hash_sort_utf8,
+    my_propagate_complex
+};
+
+
+static MY_COLLATION_HANDLER my_collation_utf8_general_mysql500_ci_handler =
+{
+    NULL,               /* init */
+    my_strnncoll_utf8_general_mysql500_ci,
+    my_strnncollsp_utf8_general_mysql500_ci,
    my_strnxfrm_unicode,
    my_strnxfrmlen_unicode,
    my_like_range_mb,
@@ -5583,8 +5542,8 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler =
 static MY_COLLATION_HANDLER my_collation_utf8_bin_handler =
 {
    NULL,		/* init */
-    my_strnncoll_mb_bin,
-    my_strnncollsp_mb_bin,
+    my_strnncoll_utf8_bin,
+    my_strnncollsp_utf8_bin,
    my_strnxfrm_unicode,
    my_strnxfrmlen_unicode,
    my_like_range_mb,
@@ -5693,7 +5652,7 @@ struct charset_info_st my_charset_utf8_general_mysql500_ci=
  0,                          /* escape_with_backslash_is_dangerous */
  1,                                            /* levels_for_order   */
  &my_charset_utf8_handler,
-  &my_collation_utf8_general_ci_handler
+  &my_collation_utf8_general_mysql500_ci_handler
 };


@@ -7166,8 +7125,8 @@ my_ismbchar_filename(CHARSET_INFO *cs, const char *str, const char *end)
 static MY_COLLATION_HANDLER my_collation_filename_handler =
 {
    NULL,               /* init */
-    my_strnncoll_utf8,
-    my_strnncollsp_utf8,
+    my_strnncoll_simple,
+    my_strnncollsp_simple,
    my_strnxfrm_unicode,
    my_strnxfrmlen_unicode,
    my_like_range_mb,
@@ -7410,7 +7369,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;

-    *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }
  else if (c < 0xf0)
@@ -7422,9 +7381,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

-    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
-          ((my_wc_t) (s[1] ^ 0x80) << 6) |
-           (my_wc_t) (s[2] ^ 0x80);
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }
  else if (c < 0xf5)
@@ -7492,7 +7449,7 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
    if (!IS_CONTINUATION_BYTE(s[1]))
      return MY_CS_ILSEQ;

-    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    *pwc= UTF8MB2_CODE(c, s[1]);
    return 2;
  }

@@ -7502,10 +7459,7 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
          IS_CONTINUATION_BYTE(s[2]) &&
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;
-    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
-          ((my_wc_t) (s[1] ^ 0x80) << 6) |
-           (my_wc_t) (s[2] ^ 0x80);
-
+    *pwc= UTF8MB3_CODE(c, s[1], s[2]);
    return 3;
  }
  else if (c < 0xf5)

--- a/strings/strcoll.ic
+++ b/strings/strcoll.ic
@@ -65,7 +65,9 @@

  The including source file must define the following macros:
  IS_MB1_CHAR(x)
+  IS_MB1_MB2HEAD_GAP(x)  - optional, for better performance
  IS_MB2_CHAR(x,y)
+  IS_MB3_CHAR(x,y,z)     - for character sets with mbmaxlen>2
  WEIGHT_PAD_SPACE
  WEIGHT_MB1(x)
  WEIGHT_MB2(x,y)
@@ -86,6 +88,16 @@ MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
    return 1;
  }

+#ifdef IS_MB1_MBHEAD_UNUSED_GAP
+  /*
+    Quickly filter out unused bytes that are neither MB1 nor MBHEAD.
+    E.g. [0x80..0xC1] in utf8. This allows using simplified conditions
+    in IS_MB2_CHAR(), IS_MB3_CHAR(), etc.
+  */
+  if (IS_MB1_MBHEAD_UNUSED_GAP(*str))
+    goto bad;
+#endif
+
  if (str + 2 > end)                     /* The string ended unexpectedly */
    goto bad;                            /* Treat as a bad byte */


--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -347,6 +347,28 @@ STRNNCOLL_PARAM strcoll_ujis[]=
 };


+STRNNCOLL_PARAM strcoll_utf8mb3_common[]=
+{
+  {CSTR("\xC0"),     CSTR("\xC1"),        -1},    /* Unused byte vs unused byte */
+  {CSTR("\xC0"),     CSTR("\xFF"),        -1},    /* Unused byte vs unused byte */
+  {CSTR("\xC2\xA1"), CSTR("\xC0"),        -1},    /* MB2 vs unused byte */
+  {CSTR("\xC2\xA1"), CSTR("\xC2"),        -1},    /* MB2 vs incomplete MB2 */
+  {CSTR("\xC2\xA1"), CSTR("\xC2\xA2"),    -1},    /* MB2 vs MB2 */
+  {CSTR("\xC2\xA1"), CSTR("\xE0\xA0\x7F"),-1},    /* MB2 vs broken MB3 */
+  {CSTR("\xC2\xA1"), CSTR("\xE0\xA0\x80"),-1},    /* MB2 vs MB3 */
+  {CSTR("\xC2\xA1"), CSTR("\xE0\xA0\xBF"),-1},    /* MB2 vs MB3 */
+  {CSTR("\xC2\xA1"), CSTR("\xE0\xA0\xC0"),-1},    /* MB2 vs broken MB3 */
+  {CSTR("\xC2\xA1"), CSTR("\xE0\xA0"),     -1},   /* MB2 vs incomplete MB3 */
+  {CSTR("\xE0\xA0\x7E"), CSTR("\xE0\xA0\x7F"),-1},/* Broken MB3 vs broken MB3 */
+  {CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0"),    -1},/* MB3 vs incomplete MB3 */
+  {CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\x7F"),-1},/* MB3 vs broken MB3 */
+  {CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\xBF"),-1},/* MB3 vs MB3 */
+  {CSTR("\xE0\xA0\x80"), CSTR("\xE0\xA0\xC0"),-1},/* MB3 vs broken MB3 */
+  {CSTR("\xE0\xA0\xC0"), CSTR("\xE0\xA0\xC1"),-1},/* Broken MB3 vs broken MB3 */
+  {NULL, 0, NULL, 0, 0}
+};
+
+
 static void
 str2hex(char *dst, size_t dstlen, const char *src, size_t srclen)
 {
@@ -470,6 +492,11 @@ test_strcollsp()
  failed+= strcollsp(&my_charset_ujis_bin,         strcoll_mb2_A1A1_mb2_F9FE);
  failed+= strcollsp(&my_charset_ujis_japanese_ci, strcoll_ujis);
  failed+= strcollsp(&my_charset_ujis_bin,         strcoll_ujis);
+#endif
+#ifdef HAVE_CHARSET_utf8
+  failed+= strcollsp(&my_charset_utf8_general_ci,          strcoll_utf8mb3_common);
+  failed+= strcollsp(&my_charset_utf8_general_mysql500_ci, strcoll_utf8mb3_common);
+  failed+= strcollsp(&my_charset_utf8_bin,                 strcoll_utf8mb3_common);
 #endif
  return failed;
 }