Commit 1710b645 authored by Alexander Barkov

MDEV-26743 InnoDB: CHAR+nopad does not work well

The patch for "MDEV-25440: Indexed CHAR ... broken with NO_PAD collations"
fixed these scenarios from MDEV-26743:
- Basic latin letter vs equal accented letter
- Two letters vs equal (but space padded) expansion

However, this scenario was still broken:
- Basic latin letter (but followed by an ignorable character)
  vs equal accented letter

Fix:
When a NOPAD collation processes a string with trailing ignorable
characters, like:
  '<non-ignorable><ignorable><ignorable>'

the string gets virtually converted to:
  '<non-ignorable><ignorable><ignorable><space><space><space>...'

After the fix the code works differently in these two cases:
1. <space> fits into the "nchars" limit
2. <space> does not fit into the "nchars" limit

Details:

1. If "nchars" is large enough (4+ in this example),
   return weights as follows:

  '[weight-for-non-ignorable, 1 char] [weight-for-space-character, 3 chars]'

  i.e. the weight for the virtual trailing space character now indicates
  that it corresponds to a total of 3 characters:
  - two ignorable characters
  - one virtual trailing space character

2. If "nchars" is small (3), then the virtual trailing space character
   does not fit into the "nchars" limit, so return 0x00 as weight, e.g.:

  '[weight-for-non-ignorable, 1 char] [0x00, 2 chars]'

Adding corresponding MTR tests and unit tests.
parent d6872f9c
...@@ -5,3 +5,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC; ...@@ -5,3 +5,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC;
INSERT INTO t1 VALUES ('',2); INSERT INTO t1 VALUES ('',2);
ALTER TABLE t1 ROW_FORMAT=REDUNDANT; ALTER TABLE t1 ROW_FORMAT=REDUNDANT;
DROP TABLE t1; DROP TABLE t1;
#
# MDEV-26743 InnoDB: CHAR+nopad does not work well
#
#
# Basic Latin letter vs equal accented letter
#
SET NAMES utf8mb3;
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
INSERT INTO t1 VALUES ('a'),('ä');
ERROR 23000: Duplicate entry 'ä' for key 'PRIMARY'
DROP TABLE t1;
#
# Two letters vs equal (but space padded) expansion
#
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
INSERT INTO t1 VALUES ('ss'),('ß');
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
SELECT HEX(a) FROM t1;
HEX(a)
7373
C39F20
SET sql_mode=DEFAULT;
DROP TABLE t1;
#
# Basic Latin letter (but followed by an ignorable character) vs equal accented letter
#
SET NAMES utf8mb3;
CREATE TABLE t1 (a CHAR(3), PRIMARY KEY(a)) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
SELECT HEX(a) FROM t1 ORDER BY HEX(a);
HEX(a)
610120
C3A42020
SET sql_mode=DEFAULT;
DROP TABLE t1;
SET NAMES utf8mb3;
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
SELECT HEX(a) FROM t1 ORDER BY HEX(a);
HEX(a)
6101
C3A420
SET sql_mode=DEFAULT;
DROP TABLE t1;
...@@ -8,3 +8,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC; ...@@ -8,3 +8,49 @@ ALTER TABLE t1 ROW_FORMAT=DYNAMIC;
INSERT INTO t1 VALUES ('',2); INSERT INTO t1 VALUES ('',2);
ALTER TABLE t1 ROW_FORMAT=REDUNDANT; ALTER TABLE t1 ROW_FORMAT=REDUNDANT;
DROP TABLE t1; DROP TABLE t1;
# MDEV-26743: key comparison for an InnoDB CHAR primary key that uses a
# NOPAD Unicode collation. Every scenario below inserts two values into the
# primary key and checks whether the second one is treated as a duplicate.
--echo #
--echo # MDEV-26743 InnoDB: CHAR+nopad does not work well
--echo #
--echo #
--echo # Basic Latin letter vs equal accented letter
--echo #
SET NAMES utf8mb3;
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
# The accented letter must be rejected as a duplicate of the plain letter.
--error ER_DUP_ENTRY
INSERT INTO t1 VALUES ('a'),('ä');
DROP TABLE t1;
--echo #
--echo # Two letters vs equal (but space padded) expansion
--echo #
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
# No --error is expected here: both values must be accepted as distinct keys.
INSERT INTO t1 VALUES ('ss'),('ß');
# PAD_CHAR_TO_FULL_LENGTH makes the SELECT return CHAR values padded to the
# column length, so HEX() also exposes the trailing space padding.
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
SELECT HEX(a) FROM t1;
SET sql_mode=DEFAULT;
DROP TABLE t1;
--echo #
--echo # Basic Latin letter (but followed by an ignorable character) vs equal accented letter
--echo #
SET NAMES utf8mb3;
# CHAR(3): the two-character value CONCAT('a',0x01) leaves room for one
# padding character. 0x01 is the ignorable character named in the title above.
CREATE TABLE t1 (a CHAR(3), PRIMARY KEY(a)) CHARACTER SET utf8mb3 COLLATE utf8mb3_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
# No --error is expected here: both rows must be inserted.
INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
# ORDER BY HEX(a) fixes the row order of the recorded output.
SELECT HEX(a) FROM t1 ORDER BY HEX(a);
SET sql_mode=DEFAULT;
DROP TABLE t1;
SET NAMES utf8mb3;
# Same values with CHAR(2): CONCAT('a',0x01) now fills the whole column.
# NOTE(review): presumably this is the "padded space does not fit" case of
# the fix - confirm against the collation unit tests.
CREATE TABLE t1 (a CHAR(2), PRIMARY KEY(a)) COLLATE utf8_unicode_nopad_ci ENGINE=InnoDB ROW_FORMAT=COMPACT;
# No --error is expected here: both rows must be inserted.
INSERT INTO t1 VALUES (CONCAT('a',_utf8mb3 0x01)),('ä');
SET sql_mode=PAD_CHAR_TO_FULL_LENGTH;
SELECT HEX(a) FROM t1 ORDER BY HEX(a);
SET sql_mode=DEFAULT;
DROP TABLE t1;
...@@ -335,8 +335,20 @@ MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner, ...@@ -335,8 +335,20 @@ MY_FUNCTION_NAME(scanner_next_pad_trim)(my_uca_scanner *scanner,
flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES ? flags & MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES ?
my_space_weight(scanner->level) : 0; my_space_weight(scanner->level) : 0;
res.nchars= 1;
(*generated)++; (*generated)++;
res.nchars++; /* Count all ignorable characters and the padded space */
if (res.nchars > nchars)
{
/*
We scanned a number of ignorable characters at the end of the
string and reached the "nchars" limit, so the virtual padded space
does not fit. This is possible with CONCAT('a', x'00') with
nchars=2 on the second iteration when we scan the x'00'.
*/
if (scanner->cs->state & MY_CS_NOPAD)
res.weight= 0;
res.nchars= (uint) nchars;
}
} }
else if (res.nchars > nchars) else if (res.nchars > nchars)
{ {
......
...@@ -911,6 +911,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_ci[]= ...@@ -911,6 +911,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_ci[]=
{{CSTR("ss")}, {CSTR(UTF8_sz)}, 4, TCHAR, 0}, {{CSTR("ss")}, {CSTR(UTF8_sz)}, 4, TCHAR, 0},
{{CSTR("ss")}, {CSTR(UTF8_sz)}, 100, TCHAR, 0}, {{CSTR("ss")}, {CSTR(UTF8_sz)}, 100, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 0, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 1, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 2, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 3, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 100, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 0, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 1, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 2, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 3, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 4, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 100, TCHAR, 0},
{{NULL, 0}, {NULL, 0}, 0, 0, 0} {{NULL, 0}, {NULL, 0}, 0, 0, 0}
}; };
...@@ -938,6 +951,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_nopad_ci[]= ...@@ -938,6 +951,19 @@ static STRNNCOLLSP_CHAR_PARAM strnncollsp_char_utf8mb3_unicode_nopad_ci[]=
{{CSTR("ss")}, {CSTR(UTF8_sz)}, 4, TVCHAR, 0}, {{CSTR("ss")}, {CSTR(UTF8_sz)}, 4, TVCHAR, 0},
{{CSTR("ss")}, {CSTR(UTF8_sz)}, 100, TVCHAR, 0}, {{CSTR("ss")}, {CSTR(UTF8_sz)}, 100, TVCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 0, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 1, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 2, TCHAR, -1},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 3, TCHAR, 0},
{{CSTR("a" "\x01")}, {CSTR(UTF8_auml)}, 100, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 0, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 1, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 2, TCHAR, -1},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 3, TCHAR, -1},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 4, TCHAR, 0},
{{CSTR("a" "\x01\x01")}, {CSTR(UTF8_auml)}, 100, TCHAR, 0},
{{NULL, 0}, {NULL, 0}, 0, 0, 0} {{NULL, 0}, {NULL, 0}, 0, 0, 0}
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment