MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES`

contains broken procedure body when used shielding quotes inside.

MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES`
contains broken procedure body when used shielding quotes inside.
ff2d92b1 · Alexander Barkov · 28a36f61 · ff2d92b1 · ff2d92b1 · ff2d92b1
Commit ff2d92b1 authored Feb 24, 2016 by Alexander Barkov
8 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -533,6 +533,7 @@ struct my_charset_handler_st

 extern MY_CHARSET_HANDLER my_charset_8bit_handler;
 extern MY_CHARSET_HANDLER my_charset_ucs2_handler;
+extern MY_CHARSET_HANDLER my_charset_utf8_handler;


 /*
@@ -889,6 +890,18 @@ uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                  const char *from, uint32 from_length,
                  CHARSET_INFO *from_cs, uint *errors);

+/**
+  An extended version of my_convert(), to pass non-default mb_wc() and wc_mb().
+  For example, String::copy_printable() which is used in
+  Protocol::store_warning() uses this to escape control
+  and non-convertable characters.
+*/
+uint32 my_convert_using_func(char *to, uint32 to_length, CHARSET_INFO *to_cs,
+                             my_charset_conv_wc_mb mb_wc,
+                             const char *from, uint32 from_length,
+                             CHARSET_INFO *from_cs,
+                             my_charset_conv_mb_wc wc_mb,
+                             uint *errors);
 /*
  Convert a string between two character sets.
  Bad byte sequences as well as characters that cannot be

--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -10259,5 +10259,146 @@ Warnings:
 Note	1003	select `test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`c` = 'A')
 DROP TABLE t1;
 #
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'test\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'test\0';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '''', """";
+SELECT '<tab>\t<tab>\t<tab>';
+SELECT '<nl>\n<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'test\0';
+SELECT N'''', N"""", N'''', N"""";
+SELECT N'<tab>\t<tab>\t<tab>';
+SELECT N'<nl>\n<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'test\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+ROUTINE_DEFINITION
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'test\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+body_utf8
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT 'test';
+SELECT 'test\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+#
 # End of 10.1 tests
 #
--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3382,5 +3382,19 @@ SET NAMES utf8mb4;
 SELECT * FROM `test😁😁test`;
 ERROR HY000: Invalid utf8mb4 character string: 'test\xF0\x9F\x98\x81\xF0\x9F\x98\x81test'
 #
+# MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+#
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('😎','x😎','😎y','x😎y');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+ROUTINE_DEFINITION
+RETURN CONCAT('?','x?','?y','x?y')
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+body_utf8
+RETURN CONCAT('?','x?','?y','x?y')
+DROP FUNCTION f1;
+#
 # End of 10.1 tests
 #
--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -1871,6 +1871,82 @@ SELECT * FROM t1 WHERE c>=_utf8'a' COLLATE utf8_general_ci AND c='A';
 DROP TABLE t1;


+--echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """", '\'', "\"";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'test\0';
+SELECT N'''', N"""", N'\'', N"\"";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+DELIMITER $$;
+CREATE PROCEDURE p1()
+BEGIN
+SELECT CONCAT('ABC = ''',1,''''), CONCAT('ABC = ',2);
+SELECT '''', """";
+SELECT '<tab>	<tab>\t<tab>';
+SELECT '<nl>
+<nl>\n<nl>';
+SELECT 'test';
+SELECT 'tëst';
+SELECT 'test\0';
+SELECT 'tëst\0';
+SELECT _binary'test';
+SELECT _binary'test\0';
+SELECT N'''', N"""";
+SELECT N'<tab>	<tab>\t<tab>';
+SELECT N'<nl>
+<nl>\n<nl>';
+SELECT N'test';
+SELECT N'tëst';
+SELECT N'test\0';
+SELECT N'tëst\0';
+END$$
+DELIMITER ;$$
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='p1';
+SELECT body_utf8 FROM mysql.proc WHERE name='p1';
+DROP PROCEDURE p1;
+SET @@SQL_MODE=default;
+
+
+# TODO: Uncomment the below test whe we fix:
+# MDEV-9623INFORMATION_SCHEMA.ROUTINES.ROUTINE_DEFINITION does not handle binary literals well
+#
+#SET NAMES binary;
+#CREATE FUNCTION f1() RETURNS TEXT RETURN CONCAT('i','й');
+#SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+#WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+#SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+#DROP FUNCTION f1;
+#SET NAMES utf8;
+
+
 --echo #
 --echo # End of 10.1 tests
 --echo #
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1904,6 +1904,18 @@ SET NAMES utf8mb4;
 --error ER_INVALID_CHARACTER_STRING
 SELECT * FROM `test😁😁test`;

+--echo #
+--echo # MDEV-7231 Field ROUTINE_DEFINITION in INFORMATION_SCHEMA.`ROUTINES` contains broken procedure body when used shielding quotes inside.
+--echo #
+# Non-BMP characters should be replaced to '?' in ROUTINE_DEFINITION/body_utf8
+SET NAMES utf8mb4;
+CREATE FUNCTION f1() RETURNS TEXT CHARACTER SET utf8mb4
+RETURN CONCAT('😎','x😎','😎y','x😎y');
+SELECT ROUTINE_DEFINITION FROM INFORMATION_SCHEMA.ROUTINES
+WHERE ROUTINE_SCHEMA='test' AND SPECIFIC_NAME ='f1';
+SELECT body_utf8 FROM mysql.proc WHERE name='f1';
+DROP FUNCTION f1;
+
 --echo #
 --echo # End of 10.1 tests
 --echo #
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -324,9 +324,7 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
  DBUG_ASSERT(begin_ptr);
  DBUG_ASSERT(m_cpp_buf <= begin_ptr && begin_ptr <= m_cpp_buf + m_buf_length);

-  uint body_utf8_length=
-    (m_buf_length / thd->variables.character_set_client->mbminlen) *
-    my_charset_utf8_bin.mbmaxlen;
+  uint body_utf8_length= get_body_utf8_maximum_length(thd);

  m_body_utf8= (char *) thd->alloc(body_utf8_length + 1);
  m_body_utf8_ptr= m_body_utf8;
@@ -335,6 +333,22 @@ void Lex_input_stream::body_utf8_start(THD *thd, const char *begin_ptr)
  m_cpp_utf8_processed_ptr= begin_ptr;
 }

+
+uint Lex_input_stream::get_body_utf8_maximum_length(THD *thd)
+{
+  /*
+    String literals can grow during escaping:
+    1a. Character string '<TAB>' can grow to '\t', 3 bytes to 4 bytes growth.
+    1b. Character string '1000 times <TAB>' grows from
+        1002 to 2002 bytes (including quotes), which gives a little bit
+        less than 2 times growth.
+    "2" should be a reasonable multiplier that safely covers escaping needs.
+  */
+  return (m_buf_length / thd->variables.character_set_client->mbminlen) *
+          my_charset_utf8_bin.mbmaxlen * 2/*for escaping*/;
+}
+
+
 /**
  @brief The operation appends unprocessed part of pre-processed buffer till
  the given pointer (ptr) and sets m_cpp_utf8_processed_ptr to end_ptr.
@@ -402,15 +416,15 @@ void Lex_input_stream::body_utf8_append(const char *ptr)
                  operation.
 */

-void Lex_input_stream::body_utf8_append_literal(THD *thd,
+void Lex_input_stream::body_utf8_append_ident(THD *thd,
                                              const LEX_STRING *txt,
-                                                CHARSET_INFO *txt_cs,
                                              const char *end_ptr)
 {
  if (!m_cpp_utf8_processed_ptr)
    return;

  LEX_STRING utf_txt;
+  CHARSET_INFO *txt_cs= thd->charset();

  if (!my_charset_same(txt_cs, &my_charset_utf8_general_ci))
  {
@@ -434,6 +448,189 @@ void Lex_input_stream::body_utf8_append_literal(THD *thd,
  m_cpp_utf8_processed_ptr= end_ptr;
 }

+
+
+
+extern "C" {
+
+/**
+  Escape a character. Consequently puts "escape" and "wc" characters into
+  the destination utf8 string.
+  @param cs     - the character set (utf8)
+  @param escape - the escape character (backslash, single quote, double quote)
+  @param wc     - the character to be escaped
+  @param str    - the destination string
+  @param end    - the end of the destination string
+  @returns      - a code according to the wc_mb() convension.
+*/
+int my_wc_mb_utf8_with_escape(CHARSET_INFO *cs, my_wc_t escape, my_wc_t wc,
+                              uchar *str, uchar *end)
+{
+  DBUG_ASSERT(escape > 0);
+  if (str + 1 >= end)
+    return MY_CS_TOOSMALL2;  // Not enough space, need at least two bytes.
+  *str= escape;
+  int cnvres= my_charset_utf8_handler.wc_mb(cs, wc, str + 1, end);
+  if (cnvres > 0)
+    return cnvres + 1;       // The character was normally put
+  if (cnvres == MY_CS_ILUNI)
+    return MY_CS_ILUNI;      // Could not encode "wc" (e.g. non-BMP character)
+  DBUG_ASSERT(cnvres <= MY_CS_TOOSMALL);
+  return cnvres - 1;         // Not enough space
+}
+
+
+/**
+  Optionally escape a character.
+  If "escape" is non-zero, then both "escape" and "wc" are put to
+  the destination string. Otherwise, only "wc" is put.
+  @param cs     - the character set (utf8)
+  @param wc     - the character to be optionally escaped
+  @param escape - the escape character, or 0
+  @param ewc    - the escaped replacement of "wc" (e.g. 't' for '\t')
+  @param str    - the destination string
+  @param end    - the end of the destination string
+  @returns      - a code according to the wc_mb() conversion.
+*/
+int my_wc_mb_utf8_opt_escape(CHARSET_INFO *cs,
+                             my_wc_t wc, my_wc_t escape, my_wc_t ewc,
+                             uchar *str, uchar *end)
+{
+  return escape ? my_wc_mb_utf8_with_escape(cs, escape, ewc, str, end) :
+                  my_charset_utf8_handler.wc_mb(cs, wc, str, end);
+}
+
+/**
+  Encode a character with optional backlash escaping and quote escaping.
+  Quote marks are escaped using another quote mark.
+  Additionally, if "escape" is non-zero, then special characters are
+  also escaped using "escape".
+  Otherwise (if "escape" is zero, e.g. in case of MODE_NO_BACKSLASH_ESCAPES),
+  then special characters are not escaped and handled as normal characters.
+
+  @param cs        - the character set (utf8)
+  @param wc        - the character to be encoded
+  @param str       - the destination string
+  @param end       - the end of the destination string
+  @param sep       - the string delimiter (e.g. ' or ")
+  @param escape    - the escape character (backslash, or 0)
+  @returns         - a code according to the wc_mb() convension.
+*/
+int my_wc_mb_utf8_escape(CHARSET_INFO *cs, my_wc_t wc, uchar *str, uchar *end,
+                         my_wc_t sep, my_wc_t escape)
+{
+  DBUG_ASSERT(escape == 0 || escape == '\\');
+  DBUG_ASSERT(sep == '"' || sep == '\'');
+  switch (wc) {
+  case 0:      return my_wc_mb_utf8_opt_escape(cs, wc, escape, '0', str, end);
+  case '\t':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 't', str, end);
+  case '\r':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'r', str, end);
+  case '\n':   return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'n', str, end);
+  case '\032': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'Z', str, end);
+  case '\'':
+  case '\"':
+    if (wc == sep)
+      return my_wc_mb_utf8_with_escape(cs, wc, wc, str, end);
+  }
+  return my_charset_utf8_handler.wc_mb(cs, wc, str, end); // No escaping needed
+}
+
+
+/** wc_mb() compatible routines for all sql_mode and delimiter combinations */
+int my_wc_mb_utf8_escape_single_quote_and_backslash(CHARSET_INFO *cs,
+                                                    my_wc_t wc,
+                                                    uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_double_quote_and_backslash(CHARSET_INFO *cs,
+                                                    my_wc_t wc,
+                                                    uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '"', '\\');
+}
+
+
+int my_wc_mb_utf8_escape_single_quote(CHARSET_INFO *cs, my_wc_t wc,
+                                      uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', 0);
+}
+
+
+int my_wc_mb_utf8_escape_double_quote(CHARSET_INFO *cs, my_wc_t wc,
+                                      uchar *str, uchar *end)
+{
+  return my_wc_mb_utf8_escape(cs, wc, str, end, '"', 0);
+}
+
+}; // End of extern "C"
+
+
+/**
+  Get an escaping function, depending on the current sql_mode and the
+  string separator.
+*/
+my_charset_conv_wc_mb
+Lex_input_stream::get_escape_func(THD *thd, my_wc_t sep) const
+{
+  return thd->backslash_escapes() ?
+         (sep == '"' ? my_wc_mb_utf8_escape_double_quote_and_backslash:
+                       my_wc_mb_utf8_escape_single_quote_and_backslash) :
+         (sep == '"' ? my_wc_mb_utf8_escape_double_quote:
+                       my_wc_mb_utf8_escape_single_quote);
+}
+
+
+/**
+  Append a text literal to the end of m_body_utf8.
+  The string is escaped according to the current sql_mode and the
+  string delimiter (e.g. ' or ").
+
+  @param thd       - current THD
+  @param txt       - the string to be appended to m_body_utf8.
+                     Note, the string must be already unescaped.
+  @param cs        - the character set of the string
+  @param end_ptr   - m_cpp_utf8_processed_ptr will be set to this value
+                     (see body_utf8_append_ident for details)
+  @param sep       - the string delimiter (single or double quote)
+*/
+void Lex_input_stream::body_utf8_append_escape(THD *thd,
+                                               const LEX_STRING *txt,
+                                               CHARSET_INFO *cs,
+                                               const char *end_ptr,
+                                               my_wc_t sep)
+{
+  DBUG_ASSERT(sep == '\'' || sep == '"');
+  if (!m_cpp_utf8_processed_ptr)
+    return;
+  uint errors;
+  /**
+    We previously alloced m_body_utf8 to be able to store the query with all
+    strings properly escaped. See get_body_utf8_maximum_length().
+    So here we have guaranteedly enough space to append any string literal
+    with escaping. Passing txt->length*2 as "available space" is always safe.
+    For better safety purposes we could calculate get_body_utf8_maximum_length()
+    every time we append a string, but this would affect performance negatively,
+    so let's check that we don't get beyond the allocated buffer in
+    debug build only.
+  */
+  DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(thd) >=
+              m_body_utf8_ptr + txt->length * 2);
+  uint32 cnv_length= my_convert_using_func(m_body_utf8_ptr, txt->length * 2,
+                                           &my_charset_utf8_general_ci,
+                                           get_escape_func(thd, sep),
+                                           txt->str, txt->length,
+                                           cs, cs->cset->mb_wc,
+                                           &errors);
+  m_body_utf8_ptr+= cnv_length;
+  *m_body_utf8_ptr= 0;
+  m_cpp_utf8_processed_ptr= end_ptr;
+}
+
+
 void Lex_input_stream::add_digest_token(uint token, LEX_YYSTYPE yylval)
 {
  if (m_digest != NULL)
@@ -797,14 +994,14 @@ Lex_input_stream::unescape(CHARSET_INFO *cs, char *to,
  Fix sometimes to do only one scan of the string
 */

-bool Lex_input_stream::get_text(LEX_STRING *dst, int pre_skip, int post_skip)
+bool Lex_input_stream::get_text(LEX_STRING *dst, uint sep,
+                                int pre_skip, int post_skip)
 {
-  reg1 uchar c,sep;
+  reg1 uchar c;
  uint found_escape=0;
  CHARSET_INFO *cs= m_thd->charset();

  tok_bitmap= 0;
-  sep= yyGetLast();                        // String should end with this
  while (! eof())
  {
    c= yyGet();
@@ -1169,6 +1366,8 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
      return((int) c);

    case MY_LEX_IDENT_OR_NCHAR:
+    {
+      uint sep;
      if (lip->yyPeek() != '\'')
      {
 	state= MY_LEX_IDENT;
@@ -1176,14 +1375,20 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
      }
      /* Found N'string' */
      lip->yySkip();                         // Skip '
-      if (lip->get_text(&yylval->lex_str, 2, 1))
+      if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 2, 1))
      {
 	state= MY_LEX_CHAR;             // Read char by char
 	break;
      }
+
+      lip->body_utf8_append(lip->m_cpp_text_start);
+      lip->body_utf8_append_escape(thd, &yylval->lex_str,
+                                   national_charset_info,
+                                   lip->m_cpp_text_end, sep);
+
      lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
      return(NCHAR_STRING);
-
+    }
    case MY_LEX_IDENT_OR_HEX:
      if (lip->yyPeek() == '\'')
      {					// Found x'hex-number'
@@ -1286,8 +1491,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)

      lip->body_utf8_append(lip->m_cpp_text_start);

-      lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
-                                    lip->m_cpp_text_end);
+      lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end);

      return(result_state);			// IDENT or IDENT_QUOTED

@@ -1391,8 +1595,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)

      lip->body_utf8_append(lip->m_cpp_text_start);

-      lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
-                                    lip->m_cpp_text_end);
+      lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end);

      return(result_state);

@@ -1435,8 +1638,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)

      lip->body_utf8_append(lip->m_cpp_text_start);

-      lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
-                                    lip->m_cpp_text_end);
+      lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end);

      return(IDENT_QUOTED);
    }
@@ -1541,23 +1743,23 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)
      }
      /* " used for strings */
    case MY_LEX_STRING:			// Incomplete text string
-      if (lip->get_text(&yylval->lex_str, 1, 1))
+    {
+      uint sep;
+      if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 1, 1))
      {
 	state= MY_LEX_CHAR;		// Read char by char
 	break;
      }
-
+      CHARSET_INFO *strcs= lip->m_underscore_cs ? lip->m_underscore_cs : cs;
      lip->body_utf8_append(lip->m_cpp_text_start);

-      lip->body_utf8_append_literal(thd, &yylval->lex_str,
-        lip->m_underscore_cs ? lip->m_underscore_cs : cs,
-        lip->m_cpp_text_end);
-
+      lip->body_utf8_append_escape(thd, &yylval->lex_str, strcs,
+                                   lip->m_cpp_text_end, sep);
      lip->m_underscore_cs= NULL;

      lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
      return(TEXT_STRING);
-
+    }
    case MY_LEX_COMMENT:			//  Comment
      lex->select_lex.options|= OPTION_FOUND_COMMENT;
      while ((c = lip->yyGet()) != '\n' && c) ;
@@ -1806,8 +2008,7 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd)

      lip->body_utf8_append(lip->m_cpp_text_start);

-      lip->body_utf8_append_literal(thd, &yylval->lex_str, cs,
-                                    lip->m_cpp_text_end);
+      lip->body_utf8_append_ident(thd, &yylval->lex_str, lip->m_cpp_text_end);

      return(result_state);
    }

--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -1807,6 +1807,7 @@ class Lex_input_stream
 {
  size_t unescape(CHARSET_INFO *cs, char *to,
                  const char *str, const char *end, int sep);
+  my_charset_conv_wc_mb get_escape_func(THD *thd, my_wc_t sep) const;
 public:
  Lex_input_stream()
  {
@@ -2077,14 +2078,23 @@ class Lex_input_stream
    return (uint) (m_body_utf8_ptr - m_body_utf8);
  }

+  /**
+    Get the maximum length of the utf8-body buffer.
+    The utf8 body can grow because of the character set conversion and escaping.
+  */
+  uint get_body_utf8_maximum_length(THD *thd);
+
  void body_utf8_start(THD *thd, const char *begin_ptr);
  void body_utf8_append(const char *ptr);
  void body_utf8_append(const char *ptr, const char *end_ptr);
-  void body_utf8_append_literal(THD *thd,
+  void body_utf8_append_ident(THD *thd,
                              const LEX_STRING *txt,
-                                CHARSET_INFO *txt_cs,
                              const char *end_ptr);
-
+  void body_utf8_append_escape(THD *thd,
+                               const LEX_STRING *txt,
+                               CHARSET_INFO *txt_cs,
+                               const char *end_ptr,
+                               my_wc_t sep);
  /** Current thread. */
  THD *m_thd;

@@ -2105,7 +2115,7 @@ class Lex_input_stream
  /** LALR(2) resolution, value of the look ahead token.*/
  LEX_YYSTYPE lookahead_yylval;

-  bool get_text(LEX_STRING *to, int pre_skip, int post_skip);
+  bool get_text(LEX_STRING *to, uint sep, int pre_skip, int post_skip);

  void add_digest_token(uint token, LEX_YYSTYPE yylval);


--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -1030,19 +1030,18 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
  @return Number of bytes copied to 'to' string
 */

-static uint32
-my_convert_internal(char *to, uint32 to_length,
-                    CHARSET_INFO *to_cs,
+uint32
+my_convert_using_func(char *to, uint32 to_length,
+                      CHARSET_INFO *to_cs, my_charset_conv_wc_mb wc_mb,
                      const char *from, uint32 from_length,
-                    CHARSET_INFO *from_cs, uint *errors)
+                      CHARSET_INFO *from_cs, my_charset_conv_mb_wc mb_wc,
+                      uint *errors)
 {
  int         cnvres;
  my_wc_t     wc;
  const uchar *from_end= (const uchar*) from + from_length;
  char *to_start= to;
  uchar *to_end= (uchar*) to + to_length;
-  my_charset_conv_mb_wc mb_wc= from_cs->cset->mb_wc;
-  my_charset_conv_wc_mb wc_mb= to_cs->cset->wc_mb;
  uint error_count= 0;

  while (1)
@@ -1119,8 +1118,11 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
    immediately switch to slow mb_wc->wc_mb method.
  */
  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
-    return my_convert_internal(to, to_length, to_cs,
-                               from, from_length, from_cs, errors);
+    return my_convert_using_func(to, to_length,
+                                 to_cs, to_cs->cset->wc_mb,
+                                 from, from_length,
+                                 from_cs, from_cs->cset->mb_wc,
+                                 errors);

  length= length2= MY_MIN(to_length, from_length);

@@ -1152,8 +1154,10 @@ my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
      uint32 copied_length= length2 - length;
      to_length-= copied_length;
      from_length-= copied_length;
-      return copied_length + my_convert_internal(to, to_length, to_cs,
+      return copied_length + my_convert_using_func(to, to_length, to_cs,
+                                                   to_cs->cset->wc_mb,
                                                   from, from_length, from_cs,
+                                                   from_cs->cset->mb_wc,
                                                   errors);
    }
  }