MDEV-30695 Refactor case folding data types in Asian collations

This is a non-functional change and should not change the server behavior.

Casefolding information is now stored in items of a new data type, MY_CASEFOLD_CHARACTER:

    typedef struct casefold_info_char_t
    {
      uint32 toupper;
      uint32 tolower;
    } MY_CASEFOLD_CHARACTER;

Before this change, casefolding tables for Asian collations were stored in:

    typedef struct unicase_info_char_st
    {
      uint32 toupper;
      uint32 tolower;
      uint32 sort;
    } MY_UNICASE_CHARACTER;

The "sort" member was not used in the code handling Asian collations; it only wasted space (it is used only by the Unicode _general_ci and _general_mysql500_ci collations).

Unicode collations (at least UCA and _bin) should also be refactored later, but as a separate task.

MDEV-30695 Refactor case folding data types in Asian collations
This is a non-functional change and should not change the server behavior.

Casefolding information is now stored in items of a new data type, MY_CASEFOLD_CHARACTER:

    typedef struct casefold_info_char_t
    {
      uint32 toupper;
      uint32 tolower;
    } MY_CASEFOLD_CHARACTER;

Before this change, casefolding tables for Asian collations were stored in:

    typedef struct unicase_info_char_st
    {
      uint32 toupper;
      uint32 tolower;
      uint32 sort;
    } MY_UNICASE_CHARACTER;

The "sort" member was not used in the code handling Asian collations; it only wasted space (it is used only by the Unicode _general_ci and _general_mysql500_ci collations).

Unicode collations (at least UCA and _bin) should also be refactored later, but as a separate task.
33f8f92b · Alexander Barkov · 7e341cc7 · 33f8f92b · 33f8f92b · 33f8f92b
Commit 33f8f92b authored Feb 21, 2023 by Alexander Barkov
20 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -80,10 +80,26 @@ typedef const struct my_charset_handler_st MY_CHARSET_HANDLER;
 typedef const struct my_collation_handler_st MY_COLLATION_HANDLER;

 typedef const struct unicase_info_st MY_UNICASE_INFO;
+typedef const struct casefold_info_st MY_CASEFOLD_INFO;
 typedef const struct uni_ctype_st MY_UNI_CTYPE;
 typedef const struct my_uni_idx_st MY_UNI_IDX;
 typedef uint16 decimal_digits_t;

+
+typedef struct casefold_info_char_t
+{
+  uint32 toupper;
+  uint32 tolower;
+} MY_CASEFOLD_CHARACTER;
+
+
+struct casefold_info_st
+{
+  my_wc_t maxchar;
+  MY_CASEFOLD_CHARACTER **page;
+};
+
+
 typedef struct unicase_info_char_st
 {
  uint32 toupper;
@@ -755,6 +771,7 @@ struct charset_info_st
  MY_UCA_INFO *uca;
  const uint16 *tab_to_uni;
  MY_UNI_IDX  *tab_from_uni;
+  MY_CASEFOLD_INFO *casefold;
  MY_UNICASE_INFO *caseinfo;
  const uchar  *state_map;
  const uchar  *ident_map;

--- a/strings/conf_to_src.c
+++ b/strings/conf_to_src.c
@@ -408,6 +408,7 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
  }

  fprintf(f,"  NULL,                       /* from_uni      */\n");
+  fprintf(f,"  NULL,                       /* casefold      */\n");
  fprintf(f,"  &my_unicase_default,        /* caseinfo      */\n");
  fprintf(f,"  NULL,                       /* state map     */\n");
  fprintf(f,"  NULL,                       /* ident map     */\n");

--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -622,6 +622,7 @@ struct charset_info_st my_charset_bin =
    NULL,			/* uca           */
    NULL,			/* tab_to_uni    */
    NULL,			/* tab_from_uni  */
+    NULL,                       /* casefold     */
    &my_unicase_default,        /* caseinfo     */
    NULL,			/* state_map    */
    NULL,			/* ident_map    */

--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
--- a/strings/ctype-czech.c
+++ b/strings/ctype-czech.c
@@ -617,6 +617,7 @@ struct charset_info_st my_charset_latin2_czech_cs =
    NULL,		/* uca          */
    tab_8859_2_uni,	/* tab_to_uni   */
    idx_uni_8859_2,	/* tab_from_uni */
+    NULL,               /* casefold     */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */

--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
--- a/strings/ctype-extra.c
+++ b/strings/ctype-extra.c
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -448,6 +448,7 @@ struct charset_info_st my_charset_latin1=
    NULL,		/* uca          */
    cs_to_uni,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
+    NULL,               /* casefold     */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
@@ -479,6 +480,7 @@ struct charset_info_st my_charset_latin1_nopad=
    NULL,                         /* uca              */
    cs_to_uni,                    /* tab_to_uni       */
    NULL,                         /* tab_from_uni     */
+    NULL,                         /* casefold     */
    &my_unicase_default,          /* caseinfo         */
    NULL,                         /* state_map        */
    NULL,                         /* ident_map        */
@@ -760,6 +762,7 @@ struct charset_info_st my_charset_latin1_german2_ci=
  NULL,					/* uca          */
  cs_to_uni,				/* tab_to_uni   */
  NULL,					/* tab_from_uni */
+  NULL,                                 /* casefold     */
  &my_unicase_default,                  /* caseinfo     */
  NULL,					/* state_map    */
  NULL,					/* ident_map    */
@@ -791,6 +794,7 @@ struct charset_info_st my_charset_latin1_bin=
  NULL,					/* uca          */
  cs_to_uni,				/* tab_to_uni   */
  NULL,					/* tab_from_uni */
+  NULL,                                 /* casefold     */
  &my_unicase_default,                  /* caseinfo     */
  NULL,					/* state_map    */
  NULL,					/* ident_map    */
@@ -822,6 +826,7 @@ struct charset_info_st my_charset_latin1_nopad_bin=
  NULL,                                /* uca              */
  cs_to_uni,                           /* tab_to_uni       */
  NULL,                                /* tab_from_uni     */
+  NULL,                                /* casefold         */
  &my_unicase_default,                 /* caseinfo         */
  NULL,                                /* state_map        */
  NULL,                                /* ident_map        */

--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -63,11 +63,11 @@ size_t my_casedn_str_mb(CHARSET_INFO * cs, char *str)
 }


-static inline MY_UNICASE_CHARACTER*
+static inline MY_CASEFOLD_CHARACTER*
 get_case_info_for_ch(CHARSET_INFO *cs, uint page, uint offs)
 {
-  MY_UNICASE_CHARACTER *p;
-  return cs->caseinfo && (p= cs->caseinfo->page[page]) ? &p[offs] : NULL;
+  MY_CASEFOLD_CHARACTER *p;
+  return cs->casefold && (p= cs->casefold->page[page]) ? &p[offs] : NULL;
 }


@@ -97,7 +97,7 @@ my_casefold_mb(CHARSET_INFO *cs,
    size_t mblen= my_ismbchar(cs, src, srcend);
    if (mblen)
    {
-      MY_UNICASE_CHARACTER *ch;
+      MY_CASEFOLD_CHARACTER *ch;
      if ((ch= get_case_info_for_ch(cs, (uchar) src[0], (uchar) src[1])))
      {
        int code= is_upper ? ch->toupper : ch->tolower;

--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
--- a/strings/ctype-tis620.c
+++ b/strings/ctype-tis620.c
@@ -955,6 +955,7 @@ struct charset_info_st my_charset_tis620_thai_ci=
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
+    NULL,               /* casefold     */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
@@ -985,6 +986,7 @@ struct charset_info_st my_charset_tis620_bin=
    NULL,		/* uca          */
    NULL,		/* tab_to_uni   */
    NULL,		/* tab_from_uni */
+    NULL,               /* casefold     */
    &my_unicase_default,/* caseinfo     */
    NULL,		/* state_map    */
    NULL,		/* ident_map    */
@@ -1016,6 +1018,7 @@ struct charset_info_st my_charset_tis620_thai_nopad_ci=
    NULL,                  /* uca              */
    NULL,                  /* tab_to_uni       */
    NULL,                  /* tab_from_uni     */
+    NULL,                  /* casefold         */
    &my_unicase_default,   /* caseinfo         */
    NULL,                  /* state_map        */
    NULL,                  /* ident_map        */
@@ -1047,6 +1050,7 @@ struct charset_info_st my_charset_tis620_nopad_bin=
    NULL,                  /* uca              */
    NULL,                  /* tab_to_uni       */
    NULL,                  /* tab_from_uni     */
+    NULL,                  /* casefold         */
    &my_unicase_default,   /* caseinfo         */
    NULL,                  /* state_map        */
    NULL,                  /* ident_map        */

--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
--- a/strings/ctype-win1250ch.c
+++ b/strings/ctype-win1250ch.c
@@ -710,6 +710,7 @@ struct charset_info_st my_charset_cp1250_czech_cs =
  NULL,				/* uca          */
  tab_cp1250_uni,		/* tab_to_uni   */
  idx_uni_cp1250,		/* tab_from_uni */
+  NULL,                         /* casefold     */
  &my_unicase_default,          /* caseinfo     */
  NULL,				/* state_map    */
  NULL,				/* ident_map    */