MDEV-26743 InnoDB: CHAR+nopad does not work well

The patch for "MDEV-25440: Indexed CHAR ... broken with NO_PAD collations" fixed these scenarios from MDEV-26743:
- Basic Latin letter vs equal accented letter
- Two letters vs equal (but space padded) expansion

However, this scenario was still broken:
- Basic Latin letter (but followed by an ignorable character) vs equal accented letter

Fix:

When a string with trailing ignorable characters is processed for a NOPAD collation, like:
  '<non-ignorable><ignorable><ignorable>'
the string gets virtually converted to:
  '<non-ignorable><ignorable><ignorable><space><space><space>...'

After the fix the code works differently in these two cases:
1. <space> fits into the "nchars" limit
2. <space> does not fit into the "nchars" limit

Details:

1. If "nchars" is large enough (4+ in this example), return weights as follows:
  '[weight-for-non-ignorable, 1 char] [weight-for-space-character, 3 chars]'
  i.e. the weight for the virtual trailing space character now indicates that it corresponds to a total of 3 characters:
  - two ignorable characters
  - one virtual trailing space character

2. If "nchars" is small (3), then the virtual trailing space character does not fit into the "nchars" limit, so return 0x00 as the weight, e.g.:
  '[weight-for-non-ignorable, 1 char] [0x00, 2 chars]'

Adding corresponding MTR tests and unit tests.

MDEV-26743 InnoDB: CHAR+nopad does not work well
The patch for "MDEV-25440: Indexed CHAR ... broken with NO_PAD collations" fixed these scenarios from MDEV-26743:
- Basic Latin letter vs equal accented letter
- Two letters vs equal (but space padded) expansion

However, this scenario was still broken:
- Basic Latin letter (but followed by an ignorable character) vs equal accented letter

Fix:

When a string with trailing ignorable characters is processed for a NOPAD collation, like:
  '<non-ignorable><ignorable><ignorable>'
the string gets virtually converted to:
  '<non-ignorable><ignorable><ignorable><space><space><space>...'

After the fix the code works differently in these two cases:
1. <space> fits into the "nchars" limit
2. <space> does not fit into the "nchars" limit

Details:

1. If "nchars" is large enough (4+ in this example), return weights as follows:
  '[weight-for-non-ignorable, 1 char] [weight-for-space-character, 3 chars]'
  i.e. the weight for the virtual trailing space character now indicates that it corresponds to a total of 3 characters:
  - two ignorable characters
  - one virtual trailing space character

2. If "nchars" is small (3), then the virtual trailing space character does not fit into the "nchars" limit, so return 0x00 as the weight, e.g.:
  '[weight-for-non-ignorable, 1 char] [0x00, 2 chars]'

Adding corresponding MTR tests and unit tests.
1710b645 · Alexander Barkov · d6872f9c · 1710b645 · 1710b645 · 1710b645
Commit 1710b645 authored Oct 01, 2021 by Alexander Barkov
4 changed files
--- a/mysql-test/suite/innodb/r/no_pad.result
+++ b/mysql-test/suite/innodb/r/no_pad.result
@@ -5,3 +5,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC;
 INSERT INTO t1 VALUES ('',2);
 ALTER TABLE t1 ROW_FORMAT=REDUNDANT;
 DROP TABLE t1;
+#
+# MDEV-26743 InnoDB: CHAR+nopad does not work well
+#
+#
+# Basic Latin letter vs equal accented letter
+#
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES ('a'),('ä');
+ERROR 23000: Duplicate entry 'ä' for key 'PRIMARY'
+DROP TABLE t1;
+#
+# Two letters vs equal (but space padded) expansion
+#
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES ('ss'),('ß');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1;
+HEX(a)
+7373
+C39F20
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
+#
+# Basic Latin letter (but followed by an ignorable character) vs equal accented letter
+#
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(3), PRIMARY KEY(a)) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1 ORDER BY HEX(a);
+HEX(a)
+610120
+C3A42020
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1 ORDER BY HEX(a);
+HEX(a)
+6101
+C3A420
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
--- a/mysql-test/suite/innodb/t/no_pad.test
+++ b/mysql-test/suite/innodb/t/no_pad.test
@@ -8,3 +8,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC;
 INSERT INTO t1 VALUES ('',2);
 ALTER TABLE t1 ROW_FORMAT=REDUNDANT;
 DROP TABLE t1;
+--echo #
+--echo # MDEV-26743 InnoDB: CHAR+nopad does not work well
+--echo #
+--echo #
+--echo # Basic Latin letter vs equal accented letter
+--echo #
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+--error ER_DUP_ENTRY
+INSERT INTO t1 VALUES ('a'),('ä');
+DROP TABLE t1;
+--echo #
+--echo # Two letters vs equal (but space padded) expansion
+--echo #
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES ('ss'),('ß');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1;
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
+--echo #
+--echo # Basic Latin letter (but followed by an ignorable character) vs equal accented letter
+--echo #
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(3), PRIMARY KEY(a)) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1 ORDER BY HEX(a);
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
+SET NAMES utf8mb3;
+CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
+INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
+SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
+SELECT HEX(a) FROM t1 ORDER BY HEX(a);
+SET sql_mode=DEFAULT;
+DROP TABLE t1;
--- a/strings/ctype-uca.inl
+++ b/strings/ctype-uca.inl
@@ -335,8 +335,20 @@ MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner,
        flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES ?
        my_space_weight(scanner->level) : 0;
-      res.nchars= 1;
      (*generated)++;
+      res.nchars++; /* Count all ignorable characters and the padded space */
+      if (res.nchars > nchars)
+      {
+        /*
+          We scanned a number of ignorable characters at the end of the
+          string and reached the "nchars" limit, so the virtual padded space
+          does not fit. This is possible with CONCAT('a', x'00') with
+          nchars=2 on the second iteration when we scan the x'00'.
+        */
+        if (scanner->cs->state & MY_CS_NOPAD)
+          res.weight= 0;
+        res.nchars= (uint) nchars;
+      }
    }
    else if (res.nchars > nchars)
    {

--- a/unittest/strings/strings-t.c
+++ b/unittest/strings/strings-t.c
@@ -911,6 +911,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_ci[]=
  {{CSTR("ss")},             {CSTR(UTF8_sz)},                   4,  TCHAR,  0},
  {{CSTR("ss")},             {CSTR(UTF8_sz)},                 100,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 0,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 1,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 2,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 3,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},               100,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 0,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 1,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 2,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 3,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 4,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},               100,  TCHAR,  0},
  {{NULL, 0},                {NULL, 0},                         0,    0,  0}
 };
@@ -938,6 +951,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_nopad_ci[]=
  {{CSTR("ss")},             {CSTR(UTF8_sz)},                   4,  TVCHAR,  0},
  {{CSTR("ss")},             {CSTR(UTF8_sz)},                 100,  TVCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 0,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 1,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 2,  TCHAR,  -1},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},                 3,  TCHAR,  0},
+  {{CSTR("a" "\x01")},       {CSTR(UTF8_auml)},               100,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 0,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 1,  TCHAR,  0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 2,  TCHAR,  -1},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 3,  TCHAR,  -1},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},                 4,  TCHAR,   0},
+  {{CSTR("a" "\x01\x01")},   {CSTR(UTF8_auml)},               100,  TCHAR,   0},
  {{NULL, 0},                {NULL, 0},                         0,    0,  0}
 };