Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE

RESULTING MY_WC_T RESULT IS NOT USED Issue: the handler functions my_ismbchar_utf8 and my_well_formed_len_mb for charset utf8 call the Unicode conversion function to validate a character and to find its length. Because of this, the instructions that convert utf8 to Unicode are executed for no use. A similar issue exists with charset utf8mb4. Solution: reorganized the code such that the character validation part of the Unicode conversion handler is extracted (duplicated) into a separate function. Hence my_ismbchar_utf8 and my_well_formed_len_mb will call the new function, which only validates and returns the length of the mb (utf8) character. A similar fix is applied for charset utf8mb4.

Bug #14057034 : WASTED CPU CYCLES IN MY_UTF8_UNI WHERE
RESULTING MY_WC_T RESULT IS NOT USED Issue: the handler functions my_ismbchar_utf8 and my_well_formed_len_mb for charset utf8 call the Unicode conversion function to validate a character and to find its length. Because of this, the instructions that convert utf8 to Unicode are executed for no use. A similar issue exists with charset utf8mb4. Solution: reorganized the code such that the character validation part of the Unicode conversion handler is extracted (duplicated) into a separate function. Hence my_ismbchar_utf8 and my_well_formed_len_mb will call the new function, which only validates and returns the length of the mb (utf8) character. A similar fix is applied for charset utf8mb4.
8934a80b · mithun · aec08569 · 8934a80b
Commit 8934a80b authored Nov 12, 2013 by mithun
Hide whitespace changes
Inline Side-by-side

Showing with 229 additions and 35 deletions

strings/ctype-utf8.c strings/ctype-utf8.c +229 -35

No files found.
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -27,6 +27,7 @@
 #define EILSEQ ENOENT
 #endif

+#define IS_CONTINUATION_BYTE(c) (((c) ^ 0x80) < 0x40)

 #define MY_UTF8MB3_GENERAL_CI MY_UTF8MB3 "_general_ci"
 #define MY_UTF8MB3_GENERAL_CS MY_UTF8MB3 "_general_cs"
@@ -57,6 +58,46 @@
 #define HAVE_UNIDATA
 #endif

+
+#if defined(HAVE_CHARSET_utf8) || defined(HAVE_CHARSET_utf8mb4)
+
+static inline
+int my_valid_mbcharlen_utf8mb3(const uchar *s, const uchar *e)
+{
+  uchar c;
+
+  DBUG_ASSERT(s < e);
+  c= s[0];
+  if (c < 0x80)
+    return 1;
+
+  if (c < 0xc2)
+    return MY_CS_ILSEQ;
+
+  if (c < 0xe0)
+  {
+    if (s+2 > e) /* We need 2 characters */
+      return MY_CS_TOOSMALL2;
+
+    if (!(IS_CONTINUATION_BYTE(s[1])))
+      return MY_CS_ILSEQ;
+
+    return 2;
+  }
+
+  DBUG_ASSERT(c < 0xf0);
+  if (s+3 > e) /* We need 3 characters */
+    return MY_CS_TOOSMALL3;
+
+  if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
+          (c >= 0xe1 || s[1] >= 0xa0)))
+    return MY_CS_ILSEQ;
+
+  return 3;
+}
+
+#endif  /*HAVE_CHARSET_utf8 || HAVE_CHARSET_utf8mb4*/
+
 #ifdef HAVE_UNIDATA

 #include "my_uctype.h"
@@ -2287,7 +2328,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if (s+2 > e) /* We need 2 characters */
      return MY_CS_TOOSMALL2;

-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;

    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -2298,7 +2339,7 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if (s+3 > e) /* We need 3 characters */
      return MY_CS_TOOSMALL3;

-    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

@@ -2314,9 +2355,9 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if (s+4 > e) /* We need 4 characters */
      return MY_CS_TOOSMALL4;

-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
          (c >= 0xf1 || s[1] >= 0x90)))
      return MY_CS_ILSEQ;

@@ -2332,10 +2373,10 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if (s+5 >e) /* We need 5 characters */
      return MY_CS_TOOSMALL5;

-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
-          (s[4] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
          (c >= 0xf9 || s[1] >= 0x88)))
      return MY_CS_ILSEQ;

@@ -2351,11 +2392,11 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
    if ( s+6 >e ) /* We need 6 characters */
      return MY_CS_TOOSMALL6;

-    if (!((s[1] ^ 0x80) < 0x40   &&
-          (s[2] ^ 0x80) < 0x40   &&
-          (s[3] ^ 0x80) < 0x40   &&
-          (s[4] ^ 0x80) < 0x40   &&
-          (s[5] ^ 0x80) < 0x40   &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          IS_CONTINUATION_BYTE(s[5]) &&
          (c >= 0xfd || s[1] >= 0x84)))
      return MY_CS_ILSEQ;

@@ -2399,11 +2440,11 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
    return 2;
  }
-  
+
  if (c < 0xf0)
  {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

@@ -2892,10 +2933,90 @@ size_t my_strnxfrmlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
 }


+static
+int my_valid_mbcharlen_utf8(CHARSET_INFO *cs __attribute__((unused)),
+                            const uchar *s, const uchar *e)
+{
+  uchar c;
+
+  if (s >= e)
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0xf0)
+    return my_valid_mbcharlen_utf8mb3(s, e);
+
+#ifdef UNICODE_32BIT
+  if (c < 0xf8 && sizeof(my_wc_t)*8 >= 32)
+  {
+    if (s+4 > e) /* We need 4 characters */
+      return MY_CS_TOOSMALL4;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          (c >= 0xf1 || s[1] >= 0x90)))
+      return MY_CS_ILSEQ;
+
+    return 4;
+  }
+  if (c < 0xfc && sizeof(my_wc_t)*8 >= 32)
+  {
+    if (s+5 >e) /* We need 5 characters */
+      return MY_CS_TOOSMALL5;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          (c >= 0xf9 || s[1] >= 0x88)))
+      return MY_CS_ILSEQ;
+
+    return 5;
+  }
+  if (c < 0xfe && sizeof(my_wc_t)*8 >= 32)
+  {
+    if ( s+6 >e ) /* We need 6 characters */
+      return MY_CS_TOOSMALL6;
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          IS_CONTINUATION_BYTE(s[4]) &&
+          IS_CONTINUATION_BYTE(s[5]) &&
+          (c >= 0xfd || s[1] >= 0x84)))
+      return MY_CS_ILSEQ;
+
+    return 6;
+  }
+#endif
+  return MY_CS_ILSEQ;
+}
+
+static size_t
+my_well_formed_len_utf8(CHARSET_INFO *cs, const char *b, const char *e,
+                        size_t pos, int *error)
+{
+  const char *b_start= b;
+  *error= 0;
+  while (pos)
+  {
+    int mb_len;
+
+    if ((mb_len= my_valid_mbcharlen_utf8(cs, (uchar*) b, (uchar*) e)) <= 0)
+    {
+      *error= b < e ? 1 : 0;
+      break;
+    }
+    b+= mb_len;
+    pos--;
+  }
+  return (size_t) (b - b_start);
+}
+
 static uint my_ismbchar_utf8(CHARSET_INFO *cs,const char *b, const char *e)
 {
-  my_wc_t wc;
-  int  res= my_utf8_uni(cs,&wc, (const uchar*)b, (const uchar*)e);
+  int  res= my_valid_mbcharlen_utf8(cs, (const uchar*)b, (const uchar*)e);
  return (res>1) ? res : 0;
 }

@@ -2944,7 +3065,7 @@ MY_CHARSET_HANDLER my_charset_utf8_handler=
    my_mbcharlen_utf8,
    my_numchars_mb,
    my_charpos_mb,
-    my_well_formed_len_mb,
+    my_well_formed_len_utf8,
    my_lengthsp_8bit,
    my_numcells_mb,
    my_utf8_uni,
@@ -4714,7 +4835,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
    if (s + 2 > e) /* We need 2 characters */
      return MY_CS_TOOSMALL2;

-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!(IS_CONTINUATION_BYTE(s[1])))
      return MY_CS_ILSEQ;

    *pwc= ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
@@ -4725,7 +4846,7 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
    if (s + 3 > e) /* We need 3 characters */
      return MY_CS_TOOSMALL3;

-    if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) && IS_CONTINUATION_BYTE(s[2]) &&
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;

@@ -4758,9 +4879,9 @@ my_mb_wc_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
      [F4][80..8F][80..BF][80..BF]
    */

-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
          (c >= 0xf1 || s[1] >= 0x90) &&
          (c <= 0xf3 || s[1] <= 0x8F)))
      return MY_CS_ILSEQ;
@@ -4796,17 +4917,17 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),

  if (c < 0xe0)
  {
-    if (!((s[1] ^ 0x80) < 0x40))
+    if (!IS_CONTINUATION_BYTE(s[1]))
      return MY_CS_ILSEQ;

    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
    return 2;
  }
-  
+
  if (c < 0xf0)
  {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
          (c >= 0xe1 || s[1] >= 0xa0)))
      return MY_CS_ILSEQ;
    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
@@ -4817,9 +4938,9 @@ my_mb_wc_utf8mb4_no_range(CHARSET_INFO *cs __attribute__((unused)),
  }
  else if (c < 0xf5)
  {
-    if (!((s[1] ^ 0x80) < 0x40 &&
-          (s[2] ^ 0x80) < 0x40 &&
-          (s[3] ^ 0x80) < 0x40 &&
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
          (c >= 0xf1 || s[1] >= 0x90) &&
          (c <= 0xf3 || s[1] <= 0x8F)))
      return MY_CS_ILSEQ;
@@ -5308,11 +5429,84 @@ my_strnxfrmlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), size_t len)
 }


+static int
+my_valid_mbcharlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
+                           const uchar *s, const uchar *e)
+{
+  uchar c;
+
+  if (s >= e)
+    return MY_CS_TOOSMALL;
+
+  c= s[0];
+  if (c < 0xf0)
+    return my_valid_mbcharlen_utf8mb3(s, e);
+
+  if (c < 0xf5)
+  {
+    if (s + 4 > e) /* We need 4 characters */
+      return MY_CS_TOOSMALL4;
+
+    /*
+      UTF-8 quick four-byte mask:
+      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      Encoding allows to encode U+00010000..U+001FFFFF
+
+      The maximum character defined in the Unicode standard is U+0010FFFF.
+      Higher characters U+00110000..U+001FFFFF are not used.
+
+      11110000.10010000.10xxxxxx.10xxxxxx == F0.90.80.80 == U+00010000 (min)
+      11110100.10001111.10111111.10111111 == F4.8F.BF.BF == U+0010FFFF (max)
+
+      Valid codes:
+      [F0][90..BF][80..BF][80..BF]
+      [F1][80..BF][80..BF][80..BF]
+      [F2][80..BF][80..BF][80..BF]
+      [F3][80..BF][80..BF][80..BF]
+      [F4][80..8F][80..BF][80..BF]
+    */
+
+    if (!(IS_CONTINUATION_BYTE(s[1]) &&
+          IS_CONTINUATION_BYTE(s[2]) &&
+          IS_CONTINUATION_BYTE(s[3]) &&
+          (c >= 0xf1 || s[1] >= 0x90) &&
+          (c <= 0xf3 || s[1] <= 0x8F)))
+      return MY_CS_ILSEQ;
+
+    return 4;
+  }
+
+  return MY_CS_ILSEQ;
+}
+
+
+static
+size_t my_well_formed_len_utf8mb4(CHARSET_INFO *cs,
+                                  const char *b, const char *e,
+                                  size_t pos, int *error)
+{
+  const char *b_start= b;
+  *error= 0;
+  while (pos)
+  {
+    int mb_len;
+
+    if ((mb_len= my_valid_mbcharlen_utf8mb4(cs, (uchar*) b, (uchar*) e)) <= 0)
+    {
+      *error= b < e ? 1 : 0;
+      break;
+    }
+    b+= mb_len;
+    pos--;
+  }
+  return (size_t) (b - b_start);
+}
+
+
 static uint
 my_ismbchar_utf8mb4(CHARSET_INFO *cs, const char *b, const char *e)
 {
-  my_wc_t wc;
-  int res= my_mb_wc_utf8mb4(cs,&wc, (const uchar*)b, (const uchar*)e);
+  int res= my_valid_mbcharlen_utf8mb4(cs, (const uchar*)b, (const uchar*)e);
  return (res > 1) ? res : 0;
 }

@@ -5373,7 +5567,7 @@ MY_CHARSET_HANDLER my_charset_utf8mb4_handler=
  my_mbcharlen_utf8mb4,
  my_numchars_mb,
  my_charpos_mb,
-  my_well_formed_len_mb,
+  my_well_formed_len_utf8mb4,
  my_lengthsp_8bit,
  my_numcells_mb,
  my_mb_wc_utf8mb4,