Commit 0a3d1d10 authored by Alexander Barkov

Refactoring for MDEV-27042 and MDEV-27009

This patch prepares the code for upcoming changes:

MDEV-27009 Add UCA-14.0.0 collations
MDEV-27042 UCA: Resetting contractions to ignorable does not work well

1. Adding "const" qualifiers to return type and parameters in functions:
- my_uca_contraction2_weight()
- my_wmemcmp()
- my_uca_contraction_weight()
- my_uca_scanner_contraction_find()
- my_uca_previous_context_find()
- my_uca_context_weight_find()

2. Adding a helper function my_uca_true_contraction_eq()

3. Changing how scanner->wbeg is set during context weight handling.
   It was previously set inside functions:
   - my_uca_scanner_contraction_find()
   - my_uca_previous_context_find()
   Now it's set inside scanner_next(), which makes the code more symmetric
   for context-free and context-dependent sequences.
   This makes the upcoming fix for MDEV-27042 simpler.
parent 86891b85
......@@ -135,8 +135,8 @@ typedef struct my_contraction_list_t
my_bool my_uca_can_be_contraction_head(const MY_CONTRACTIONS *c, my_wc_t wc);
my_bool my_uca_can_be_contraction_tail(const MY_CONTRACTIONS *c, my_wc_t wc);
uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2);
const uint16 *my_uca_contraction2_weight(const MY_CONTRACTIONS *c,
my_wc_t wc1, my_wc_t wc2);
/* Collation weights on a single level (e.g. primary, secondary, tertiary) */
......
......@@ -31358,7 +31358,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag)
@retval ptr - contraction weight array
*/
uint16 *
const uint16 *
my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2)
{
MY_CONTRACTION *c, *last;
......@@ -31443,13 +31443,29 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
@retval non-zero - strings are different
*/
static int
my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
static inline int
my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len)
{
return memcmp(a, b, len * sizeof(my_wc_t));
}
/**
  Test if the MY_CONTRACTION instance is equal to the wide
  string with the given length.
  Note, only true contractions are checked,
  while previous context pairs always return FALSE.

  @param c    - the contraction to test
  @param wc   - the wide character string to compare with
  @param len  - the number of characters in wc to compare

  @retval FALSE - c is a previous context pair (c->with_context is set),
                  or c->ch does not match wc
  @retval TRUE  - c is a true contraction and its characters match wc
*/
static inline my_bool
my_uca_true_contraction_eq(const MY_CONTRACTION *c,
const my_wc_t *wc, size_t len)
{
/*
  Length check first: either len reaches MY_UCA_MAX_CONTRACTION,
  or c->ch[len] is 0, i.e. c->ch has no further characters after
  the first len ones (presumably unused c->ch slots are zero - confirm
  against the code that fills MY_CONTRACTION).
*/
return (len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
!c->with_context &&
!my_wmemcmp(c->ch, wc, len);
}
/**
Check if a string is a contraction,
and return its weight array on success.
......@@ -31463,7 +31479,7 @@ my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
@retval ptr - contraction weight array
*/
static inline uint16 *
static inline const uint16 *
my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
{
MY_CONTRACTION *c, *last;
......@@ -31471,9 +31487,7 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
for (c= list->item, last= c + list->nitems; c < last; c++)
{
if ((len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
!c->with_context &&
!my_wmemcmp(c->ch, wc, len))
if (my_uca_true_contraction_eq(c, wc, len))
return c->weight;
}
return NULL;
......@@ -31495,12 +31509,15 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
@retval ptr - contraction weight array
*/
static uint16 *
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
static const uint16 *
my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc)
{
size_t clen= 1;
int flag;
const uchar *s, *beg[MY_UCA_MAX_CONTRACTION];
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
wc[0]= currwc;
memset((void*) beg, 0, sizeof(beg));
/* Scan all contraction candidates */
......@@ -31520,13 +31537,12 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
/* Find among candidates the longest real contraction */
for ( ; clen > 1; clen--)
{
uint16 *cweight;
const uint16 *cweight;
if (my_uca_can_be_contraction_tail(&scanner->level->contractions,
wc[clen - 1]) &&
(cweight= my_uca_contraction_weight(&scanner->level->contractions,
wc, clen)))
{
scanner->wbeg= cweight + 1;
scanner->sbeg= beg[clen - 1];
return cweight;
}
......@@ -31549,19 +31565,15 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc)
@retval ptr - contraction weight array
*/
static uint16 *
my_uca_previous_context_find(my_uca_scanner *scanner,
static const uint16 *
my_uca_previous_context_find(const MY_CONTRACTIONS *list,
my_wc_t wc0, my_wc_t wc1)
{
const MY_CONTRACTIONS *list= &scanner->level->contractions;
MY_CONTRACTION *c, *last;
for (c= list->item, last= c + list->nitems; c < last; c++)
{
if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1])
{
scanner->wbeg= c->weight + 1;
return c->weight;
}
}
return NULL;
}
......@@ -31584,10 +31596,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
@retval NULL if could not find any contextual weights for wc[0]
@retval non null pointer to a zero-terminated weight string otherwise
*/
static inline uint16 *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
static inline const uint16 *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc)
{
uint16 *cweight;
const uint16 *cweight;
my_wc_t prevwc;
DBUG_ASSERT(scanner->level->contractions.nitems);
/*
If we have scanned a character which can have previous context,
......@@ -31599,21 +31612,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
context at the moment. CLDR does not have longer sequences.
*/
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
wc[0]) &&
currwc) &&
scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions,
(wc[1]= ((scanner->page << 8) +
(prevwc= ((scanner->page << 8) +
scanner->code))) &&
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
(cweight= my_uca_previous_context_find(&scanner->level->contractions,
prevwc, currwc)))
{
scanner->page= scanner->code= 0; /* Clear for the next character */
return cweight;
}
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
wc[0]))
currwc))
{
/* Check if currwc starts a contraction */
if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
if ((cweight= my_uca_scanner_contraction_find(scanner, currwc)))
return cweight;
}
return NULL;
......@@ -52,28 +52,31 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
do
{
const uint16 *wpage;
my_wc_t wc[MY_UCA_MAX_CONTRACTION];
int mblen;
my_wc_t currwc;
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
{
wc[0]= scanner->sbeg[0];
currwc= scanner->sbeg[0];
scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0]))
if (my_uca_needs_context_handling(scanner->level, currwc))
{
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
if (cweight)
{
scanner->wbeg= cweight + 1;
return *cweight;
}
}
#endif
scanner->page= 0;
scanner->code= (int) wc[0];
scanner->code= (int) currwc;
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
if (scanner->wbeg[0])
return *scanner->wbeg++;
......@@ -82,8 +85,8 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
else
#endif
/* Get next MB character */
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
scanner->send)) <= 0))
if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg,
scanner->send)) <= 0))
{
if (scanner->sbeg >= scanner->send)
return -1; /* No more bytes, end of line reached */
......@@ -105,7 +108,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
}
scanner->sbeg+= mblen;
if (wc[0] > scanner->level->maxchar)
if (currwc > scanner->level->maxchar)
{
/* Return 0xFFFD as weight for all characters outside BMP */
scanner->wbeg= nochar;
......@@ -113,17 +116,20 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
}
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0]))
if (my_uca_needs_context_handling(scanner->level, currwc))
{
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
const uint16 *cweight= my_uca_context_weight_find(scanner, currwc);
if (cweight)
{
scanner->wbeg= cweight + 1;
return *cweight;
}
}
#endif
/* Process single character */
scanner->page= wc[0] >> 8;
scanner->code= wc[0] & 0xFF;
scanner->page= currwc >> 8;
scanner->code= currwc & 0xFF;
/* If weight page for w[0] does not exist, then calculate algorithmically */
if (!(wpage= scanner->level->weights[scanner->page]))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment