MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion

Item_func_group_concat::print() did not take into account that
Item_func_group_concat::separator can be of a different character set
than the "String *str" (the destination that is being printed to).

Therefore, printing did not work correctly for:
- non-ASCII separators when GROUP_CONCAT is done on 8bit data
  or multi-byte data with mbminlen==1.
- all separators (even including simple ones like comma)
  when GROUP_CONCAT is done on ucs2/utf16/utf32 data (mbminlen>1).

Because of this problem, VIEW definitions did not print correctly to
their FRM files. This later led to a wrong SELECT and SHOW CREATE output.

Fix:
- Adding new String methods:
    bool append_for_single_quote_using_mb_wc(const char *str, size_t length, CHARSET_INFO *cs);
    bool append_for_single_quote_opt_convert(const char *str, size_t length, CHARSET_INFO *cs)
  which perform both escaping and character set conversion at the same time.
- Adding a new String method escaped_wc_for_single_quote(),
  to reuse the code between the old and the new methods.
- Fixing Item_func_group_concat::print() to use the new method
  append_for_single_quote_opt_convert().

MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion
Item_func_group_concat::print() did not take into account that
Item_func_group_concat::separator can be of a different character set
than the "String *str" (the destination that is being printed to).

Therefore, printing did not work correctly for:
- non-ASCII separators when GROUP_CONCAT is done on 8bit data
  or multi-byte data with mbminlen==1.
- all separators (even including simple ones like comma)
  when GROUP_CONCAT is done on ucs2/utf16/utf32 data (mbminlen>1).

Because of this problem, VIEW definitions did not print correctly to
their FRM files. This later led to a wrong SELECT and SHOW CREATE output.

Fix:
- Adding new String methods:
    bool append_for_single_quote_using_mb_wc(const char *str, size_t length, CHARSET_INFO *cs);
    bool append_for_single_quote_opt_convert(const char *str, size_t length, CHARSET_INFO *cs)
  which perform both escaping and character set conversion at the same time.
- Adding a new String method escaped_wc_for_single_quote(),
  to reuse the code between the old and the new methods.
- Fixing Item_func_group_concat::print() to use the new method
  append_for_single_quote_opt_convert().
0fc123c5 · Alexander Barkov · 58df2097 · 0fc123c5 · 0fc123c5 · 0fc123c5
Commit 0fc123c5 authored Mar 27, 2024 by Alexander Barkov
7 changed files
--- a/mysql-test/main/ctype_ucs.result
+++ b/mysql-test/main/ctype_ucs.result
@@ -6520,5 +6520,25 @@ SELECT 1 COLLATE latin1_swedish_ci;
 ERROR 42000: COLLATION 'latin1_swedish_ci' is not valid for CHARACTER SET 'ucs2'
 SET NAMES utf8;
 #
+# MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion
+#
+SET NAMES utf8mb3, @@collation_connection=ucs2_general_ci;
+CREATE TABLE t1 (c VARCHAR(10)) CHARACTER SET ucs2;
+INSERT INTO t1 VALUES ('a'),('A');
+CREATE OR REPLACE VIEW v1 AS
+SELECT COUNT(*) AS cnt, GROUP_CONCAT(c) AS c1 FROM t1 GROUP BY c;
+SELECT * FROM v1;
+cnt	c1
+2	a,A
+SELECT HEX(c1) FROM v1;
+HEX(c1)
+0061002C0041
+SHOW CREATE VIEW v1;
+View	Create View	character_set_client	collation_connection
+v1	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select count(0) AS `cnt`,group_concat(`t1`.`c` separator ',') AS `c1` from `t1` group by `t1`.`c`	utf8	ucs2_general_ci
+DROP VIEW v1;
+DROP TABLE t1;
+SET NAMES utf8mb3;
+#
 # End of 10.5 tests
 #
--- a/mysql-test/main/ctype_ucs.test
+++ b/mysql-test/main/ctype_ucs.test
@@ -1189,6 +1189,23 @@ SELECT HEX(1 COLLATE ucs2_bin);
 SELECT 1 COLLATE latin1_swedish_ci;
 SET NAMES utf8;
+--echo #
+--echo # MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion
+--echo #
+SET NAMES utf8mb3, @@collation_connection=ucs2_general_ci;
+CREATE TABLE t1 (c VARCHAR(10)) CHARACTER SET ucs2;
+INSERT INTO t1 VALUES ('a'),('A');
+CREATE OR REPLACE VIEW v1 AS
+  SELECT COUNT(*) AS cnt, GROUP_CONCAT(c) AS c1 FROM t1 GROUP BY c;
+SELECT * FROM v1;
+SELECT HEX(c1) FROM v1;
+SHOW CREATE VIEW v1;
+DROP VIEW v1;
+DROP TABLE t1;
+SET NAMES utf8mb3;
 --echo #
 --echo # End of 10.5 tests
 --echo #
--- a/mysql-test/main/func_gconcat.result
+++ b/mysql-test/main/func_gconcat.result
@@ -1517,4 +1517,24 @@ deallocate prepare stmt;
 set join_cache_level=default;
 set group_concat_max_len=default;
 drop table t1,t2;
+#
+# MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion
+#
+SET NAMES utf8, @@collation_connection=latin1_swedish_ci;
+CREATE TABLE t1 (c VARCHAR(10)) CHARACTER SET latin1;
+INSERT INTO t1 VALUES ('a'),('A');
+CREATE OR REPLACE VIEW v1 AS
+SELECT GROUP_CONCAT(c SEPARATOR 'ß') AS c1 FROM t1 GROUP BY c;
+SELECT * FROM v1;
+c1
+aßA
+SELECT HEX(c1) FROM v1;
+HEX(c1)
+61DF41
+SHOW CREATE VIEW v1;
+View	Create View	character_set_client	collation_connection
+v1	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select group_concat(`t1`.`c` separator 'ß') AS `c1` from `t1` group by `t1`.`c`	utf8	latin1_swedish_ci
+DROP VIEW v1;
+DROP TABLE t1;
+SET NAMES latin1;
 # End of 10.5 tests
--- a/mysql-test/main/func_gconcat.test
+++ b/mysql-test/main/func_gconcat.test
@@ -1105,4 +1105,20 @@ set group_concat_max_len=default;
 drop table t1,t2;
+--echo #
+--echo # MDEV-33772 Bad SEPARATOR value in GROUP_CONCAT on character set conversion
+--echo #
+SET NAMES utf8, @@collation_connection=latin1_swedish_ci;
+CREATE TABLE t1 (c VARCHAR(10)) CHARACTER SET latin1;
+INSERT INTO t1 VALUES ('a'),('A');
+CREATE OR REPLACE VIEW v1 AS
+  SELECT GROUP_CONCAT(c SEPARATOR 'ß') AS c1 FROM t1 GROUP BY c;
+SELECT * FROM v1;
+SELECT HEX(c1) FROM v1;
+SHOW CREATE VIEW v1;
+DROP VIEW v1;
+DROP TABLE t1;
+SET NAMES latin1;
 --echo # End of 10.5 tests
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@@ -4587,7 +4587,7 @@ void Item_func_group_concat::print(String *str, enum_query_type query_type)
  if (sum_func() == GROUP_CONCAT_FUNC)
  {
    str->append(STRING_WITH_LEN(" separator \'"));
-    str->append_for_single_quote(separator->ptr(), separator->length());
+    str->append_for_single_quote_opt_convert(*separator);
    str->append(STRING_WITH_LEN("\'"));
  }

--- a/sql/sql_string.cc
+++ b/sql/sql_string.cc
@@ -1126,26 +1126,45 @@ bool String::append_for_single_quote(const char *st, size_t len)
  int chlen;
  for (; st < end; st++)
  {
-    switch (*st)
+    char ch2= (char) (uchar) escaped_wc_for_single_quote((uchar) *st);
+    if (ch2)
    {
-    case '\\':   APPEND(STRING_WITH_LEN("\\\\")); break;
+      if (append('\\') || append(ch2))
-    case '\0':   APPEND(STRING_WITH_LEN("\\0")); break;
+        return true;
-    case '\'':   APPEND(STRING_WITH_LEN("\\'")); break;
+      continue;
-    case '\n':   APPEND(STRING_WITH_LEN("\\n")); break;
-    case '\r':   APPEND(STRING_WITH_LEN("\\r")); break;
-    case '\032': APPEND(STRING_WITH_LEN("\\Z")); break;
-    default:     if ((chlen=charset()->charlen(st, end)) > 0)
-                 {
-                   APPEND(st, chlen);
-                   st+= chlen-1;
-                 }
-                 else
-                   APPEND(*st);
    }
+    if ((chlen= charset()->charlen(st, end)) > 0)
+    {
+     APPEND(st, chlen);
+      st+= chlen-1;
+    }
+    else
+      APPEND(*st);
  }
  return 0;
 }
+bool String::append_for_single_quote_using_mb_wc(const char *src,
+                                                 size_t length,
+                                                 CHARSET_INFO *cs)
+{
+  DBUG_ASSERT(&my_charset_bin != charset());
+  DBUG_ASSERT(&my_charset_bin != cs);
+  const uchar *str= (const uchar *) src;
+  const uchar *end= (const uchar *) src + length;
+  int chlen;
+  my_wc_t wc;
+  for ( ; (chlen= cs->cset->mb_wc(cs, &wc, str, end)) > 0; str+= chlen)
+  {
+    my_wc_t wc2= escaped_wc_for_single_quote(wc);
+    if (wc2 ? (append_wc('\\') || append_wc(wc2)) : append_wc(wc))
+      return true;
+  }
+  return false;
+}
 void String::print(String *str) const
 {
  str->append_for_single_quote(Ptr, str_length);

--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -1134,6 +1134,42 @@ class String: public Charset, public Binary_string
      print_with_conversion(to, cs);
  }
+  static my_wc_t escaped_wc_for_single_quote(my_wc_t ch)
+  {
+    switch (ch)
+    {
+    case '\\':   return '\\';
+    case '\0':   return '0';
+    case '\'':   return '\'';
+    case '\n':   return 'n';
+    case '\r':   return 'r';
+    case '\032': return 'Z';
+    }
+    return 0;
+  }
+  // Append for single quote using mb_wc/wc_mb Unicode conversion
+  bool append_for_single_quote_using_mb_wc(const char *str, size_t length,
+                                           CHARSET_INFO *cs);
+  // Append for single quote with optional mb_wc/wc_mb conversion
+  bool append_for_single_quote_opt_convert(const char *str,
+                                           size_t length,
+                                           CHARSET_INFO *cs)
+  {
+    return charset() == &my_charset_bin || cs == &my_charset_bin  ||
+           my_charset_same(charset(), cs) ?
+           append_for_single_quote(str, length) :
+           append_for_single_quote_using_mb_wc(str, length, cs);
+  }
+  bool append_for_single_quote_opt_convert(const String &str)
+  {
+    return append_for_single_quote_opt_convert(str.ptr(),
+                                               str.length(),
+                                               str.charset());
+  }
  bool append_for_single_quote(const char *st, size_t len);
  bool append_for_single_quote(const String *s)
  {