Commit 0a3d1d10 authored by Alexander Barkov

Refactoring for MDEV-27042 and MDEV-27009

This patch prepares the code for upcoming changes:

MDEV-27009 Add UCA-14.0.0 collations
MDEV-27042 UCA: Resetting contractions to ignorable does not work well

1. Adding "const" qualifiers to return type and parameters in functions:
- my_uca_contraction2_weight()
- my_wmemcmp()
- my_uca_contraction_weight()
- my_uca_scanner_contraction_find()
- my_uca_previous_context_find()
- my_uca_context_weight_find()

2. Adding a helper function my_uca_true_contraction_eq()

3. Changing how scanner->wbeg is set during context weight handling.
   It was previously set inside functions:
   - my_uca_scanner_contraction_find()
   - my_uca_previous_context_find()
   Now it's set inside scanner_next(), which makes the code more symmetric
   for context-free and context-dependent sequences.
   This makes the upcoming fix for MDEV-27042 simpler.
parent 86891b85
...@@ -135,7 +135,7 @@ typedef struct my_contraction_list_t ...@@ -135,7 +135,7 @@ typedef struct my_contraction_list_t
my_bool my_uca_can_be_contraction_head(const MY_CONTRACTIONS *c, my_wc_t wc); my_bool my_uca_can_be_contraction_head(const MY_CONTRACTIONS *c, my_wc_t wc);
my_bool my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc); my_bool my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc);
uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c, const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2); my_wc_t wc1, my_wc_t wc2);
......
...@@ -31358,7 +31358,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag) ...@@ -31358,7 +31358,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag)
@retval ptr - contraction weight array @retval ptr - contraction weight array
*/ */
uint16 * const uint16 *
my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2) my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2)
{ {
MY_CONTRACTION *c, *last; MY_CONTRACTION *c, *last;
...@@ -31443,13 +31443,29 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc) ...@@ -31443,13 +31443,29 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
@retval non-zero - strings are different @retval non-zero - strings are different
*/ */
static int static inline int
my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len) my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len)
{ {
return memcmp(a, b, len * sizeof(my_wc_t)); return memcmp(a, b, len * sizeof(my_wc_t));
} }
/*
  Compare one contraction list entry against a wide character string.

  @param c    - the contraction entry to test
  @param wc   - the wide character string to compare with
  @param len  - the number of characters in wc

  The entry matches only when all of the following hold:
  - its character sequence is exactly len characters long, i.e. it is
    either zero-terminated at position len, or len already reaches
    MY_UCA_MAX_CONTRACTION so there is no terminator slot to inspect;
  - it is a true contraction rather than a previous context pair
    (entries with with_context set never match);
  - its first len characters are equal to wc.

  @retval 0        - the entry does not match
  @retval non-zero - the entry is a true contraction equal to wc
*/
static inline my_bool
my_uca_true_contraction_eq(const MY_CONTRACTION *c,
                           const my_wc_t *wc, size_t len)
{
  /* The entry is longer than the string: it has more characters after len */
  if (len < MY_UCA_MAX_CONTRACTION && c->ch[len] != 0)
    return 0;

  /* Previous context pairs are not true contractions */
  if (c->with_context)
    return 0;

  return !my_wmemcmp(c->ch, wc, len);
}
/** /**
Check if a string is a contraction, Check if a string is a contraction,
and return its weight array on success. and return its weight array on success.
...@@ -31463,7 +31479,7 @@ my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len) ...@@ -31463,7 +31479,7 @@ my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
@retval ptr - contraction weight array @retval ptr - contraction weight array
*/ */
static inline uint16 * static inline const uint16 *
my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
{ {
MY_CONTRACTION *c, *last; MY_CONTRACTION *c, *last;
...@@ -31471,9 +31487,7 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) ...@@ -31471,9 +31487,7 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
for (c= list->item, last= c + list->nitems; c < last; c++) for (c= list->item, last= c + list->nitems; c < last; c++)
{ {
if ((len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) && if (my_uca_true_contraction_eq(c, wc, len))
!c->with_context &&
!my_wmemcmp(c->ch, wc, len))
return c->weight; return c->weight;
} }
return NULL; return NULL;
...@@ -31495,12 +31509,15 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) ...@@ -31495,12 +31509,15 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
@retval ptr - contraction weight array @retval ptr - contraction weight array
*/ */
static uint16 * static const uint16 *
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc) my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc)
{ {
size_t clen= 1; size_t clen= 1;
int flag; int flag;
const uchar *s, *beg[MY_UCA_MAX_CONTRACTION]; const uchar *s, *beg[MY_UCA_MAX_CONTRACTION];
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
wc[0]= currwc;
memset((void*) beg, 0, sizeof(beg)); memset((void*) beg, 0, sizeof(beg));
/* Scan all contraction candidates */ /* Scan all contraction candidates */
...@@ -31520,13 +31537,12 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc) ...@@ -31520,13 +31537,12 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
/* Find among candidates the longest real contraction */ /* Find among candidates the longest real contraction */
for ( ; clen > 1; clen--) for ( ; clen > 1; clen--)
{ {
uint16 *cweight; const uint16 *cweight;
if (my_uca_can_be_contraction_tail(&scanner->level->contractions, if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
wc[clen - 1]) && wc[clen - 1]) &&
(cweight= my_uca_contraction_weight(&scanner->level->contractions, (cweight= my_uca_contraction_weight(&scanner->level->contractions,
wc, clen))) wc, clen)))
{ {
scanner->wbeg= cweight + 1;
scanner->sbeg= beg[clen - 1]; scanner->sbeg= beg[clen - 1];
return cweight; return cweight;
} }
...@@ -31549,20 +31565,16 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc) ...@@ -31549,20 +31565,16 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
@retval ptr - contraction weight array @retval ptr - contraction weight array
*/ */
static uint16 * static const uint16 *
my_uca_previous_context_find(my_uca_scanner *scanner, my_uca_previous_context_find(const MY_CONTRACTIONS *list,
my_wc_t wc0, my_wc_t wc1) my_wc_t wc0, my_wc_t wc1)
{ {
const MY_CONTRACTIONS *list= &scanner->level->contractions;
MY_CONTRACTION *c, *last; MY_CONTRACTION *c, *last;
for (c= list->item, last= c + list->nitems; c < last; c++) for (c= list->item, last= c + list->nitems; c < last; c++)
{ {
if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1]) if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1])
{
scanner->wbeg= c->weight + 1;
return c->weight; return c->weight;
} }
}
return NULL; return NULL;
} }
...@@ -31584,10 +31596,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner, ...@@ -31584,10 +31596,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
@retval NULL if could not find any contextual weights for wc[0] @retval NULL if could not find any contextual weights for wc[0]
@retval non null pointer to a zero-terminated weight string otherwise @retval non null pointer to a zero-terminated weight string otherwise
*/ */
static inline uint16 * static inline const uint16 *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc) my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc)
{ {
uint16 *cweight; const uint16 *cweight;
my_wc_t prevwc;
DBUG_ASSERT(scanner->level->contractions.nitems); DBUG_ASSERT(scanner->level->contractions.nitems);
/* /*
If we have scanned a character which can have previous context, If we have scanned a character which can have previous context,
...@@ -31599,21 +31612,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc) ...@@ -31599,21 +31612,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
context at the moment. CLDR does not have longer sequences. context at the moment. CLDR does not have longer sequences.
*/ */
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
wc[0]) && currwc) &&
scanner->wbeg != nochar && /* if not the very first character */ scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions, my_uca_can_be_previous_context_head(&scanner->level->contractions,
(wc[1]= ((scanner->page << 8) + (prevwc= ((scanner->page << 8) +
scanner->code))) && scanner->code))) &&
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0]))) (cweight= my_uca_previous_context_find(&scanner->level->contractions,
prevwc, currwc)))
{ {
scanner->page= scanner->code= 0; /* Clear for the next character */ scanner->page= scanner->code= 0; /* Clear for the next character */
return cweight; return cweight;
} }
else if (my_uca_can_be_contraction_head(&scanner->level->contractions, else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
wc[0])) currwc))
{ {
/* Check if w[0] starts a contraction */ /* Check if w[0] starts a contraction */
if ((cweight= my_uca_scanner_contraction_find(scanner, wc))) if ((cweight= my_uca_scanner_contraction_find(scanner, currwc)))
return cweight; return cweight;
} }
return NULL; return NULL;
...@@ -52,28 +52,31 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) ...@@ -52,28 +52,31 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
do do
{ {
const uint16 *wpage; const uint16 *wpage;
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
int mblen; int mblen;
my_wc_t currwc;
/* Get next character */ /* Get next character */
#if MY_UCA_ASCII_OPTIMIZE #if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */ /* Get next ASCII character */
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80) if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
{ {
wc[0]= scanner->sbeg[0]; currwc= scanner->sbeg[0];
scanner->sbeg+= 1; scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS #if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0])) if (my_uca_needs_context_handling(scanner->level, currwc))
{ {
uint16 *cweight= my_uca_context_weight_find(scanner, wc); const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
if (cweight) if (cweight)
{
scanner->wbeg= cweight + 1;
return *cweight; return *cweight;
} }
}
#endif #endif
scanner->page= 0; scanner->page= 0;
scanner->code= (int) wc[0]; scanner->code= (int) currwc;
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
if (scanner->wbeg[0]) if (scanner->wbeg[0])
return *scanner->wbeg++; return *scanner->wbeg++;
...@@ -82,7 +85,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) ...@@ -82,7 +85,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
else else
#endif #endif
/* Get next MB character */ /* Get next MB character */
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg, if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg,
scanner->send)) <= 0)) scanner->send)) <= 0))
{ {
if (scanner->sbeg >= scanner->send) if (scanner->sbeg >= scanner->send)
...@@ -105,7 +108,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) ...@@ -105,7 +108,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
} }
scanner->sbeg+= mblen; scanner->sbeg+= mblen;
if (wc[0] > scanner->level->maxchar) if (currwc > scanner->level->maxchar)
{ {
/* Return 0xFFFD as weight for all characters outside BMP */ /* Return 0xFFFD as weight for all characters outside BMP */
scanner->wbeg= nochar; scanner->wbeg= nochar;
...@@ -113,17 +116,20 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) ...@@ -113,17 +116,20 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
} }
#if MY_UCA_COMPILE_CONTRACTIONS #if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0])) if (my_uca_needs_context_handling(scanner->level, currwc))
{ {
uint16 *cweight= my_uca_context_weight_find(scanner, wc); const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
if (cweight) if (cweight)
{
scanner->wbeg= cweight + 1;
return *cweight; return *cweight;
} }
}
#endif #endif
/* Process single character */ /* Process single character */
scanner->page= wc[0] >> 8; scanner->page= currwc >> 8;
scanner->code= wc[0] & 0xFF; scanner->code= currwc & 0xFF;
/* If weight page for w[0] does not exist, then calculate algoritmically */ /* If weight page for w[0] does not exist, then calculate algoritmically */
if (!(wpage= scanner->level->weights[scanner->page])) if (!(wpage= scanner->level->weights[scanner->page]))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment