Commit 15ae97b1 authored by Marko Mäkelä

MDEV-32578 row_merge_fts_doc_tokenize() handles parser plugin inconsistently

When mysql/mysql-server@0c954c2289a75d90d1088356b1092437ebf45a1d
added a plugin interface for FULLTEXT INDEX tokenization to MySQL 5.7,
fts_tokenize_ctx::processed_len got a second meaning, which is only
partly implemented in row_merge_fts_doc_tokenize().
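
As a concrete illustration of the two meanings, consider this minimal sketch (hypothetical, simplified names; not the actual InnoDB code):

typedef unsigned long ulint;	/* InnoDB-style integer type */

/* Simplified stand-in for fts_tokenize_ctx. */
struct tokenize_ctx_sketch {
	ulint	processed_len = 0;
};

/* Hypothetical helper: with the built-in tokenizer, processed_len
advances by the number of document bytes consumed; with a plugin
parser, it only counts row_merge_fts_doc_tokenize_by_parser() calls.
Comparing it against the document byte length doc->text.f_len is
therefore only meaningful in the built-in case. */
void advance(tokenize_ctx_sketch& t_ctx, bool plugin_parser,
	     ulint bytes_consumed)
{
	if (plugin_parser) {
		t_ctx.processed_len++;	/* one more parser invocation */
	} else {
		t_ctx.processed_len += bytes_consumed;	/* bytes of doc->text */
	}
}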

This inconsistency could cause a crash when using FULLTEXT...WITH PARSER.
A test case that would crash MySQL 8.0 when using an n-gram parser and
single-character words would fail to crash in MySQL 5.7, because the
buf_full condition in row_merge_fts_doc_tokenize() was not met.

This change is inspired by
mysql/mysql-server@38e9a0779aeea2d197c727e306a910c56b26a47c
that appeared in MySQL 5.7.44.
parent 728bca44
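
For reference, the corrected loop condition in row_merge_fts_doc_tokenize() (the @@ -506 hunk below) can be read as follows; a minimal sketch with hypothetical names, not the actual InnoDB code:

/* With a plugin parser, keep looping while the parser has not been
invoked yet (processed_len == 0) or while tokens from an earlier
invocation remain buffered in fts_token_list; with the built-in
tokenizer, loop until every byte of the document text is consumed. */
bool keep_tokenizing(bool plugin_parser, unsigned long processed_len,
		     unsigned long buffered_tokens, unsigned long doc_len)
{
	return plugin_parser
		? (processed_len == 0 || buffered_tokens != 0)
		: processed_len < doc_len;	/* counts bytes here */
}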
@@ -108,7 +108,10 @@ typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t;

 /** Structure stores information from string tokenization operation */
 struct fts_tokenize_ctx {
-	ulint		processed_len;	/*!< processed string length */
+	/** the processed string length in bytes
+	(when using the built-in tokenizer),
+	or the number of row_merge_fts_doc_tokenize_by_parser() calls */
+	ulint		processed_len;
 	ulint		init_pos;	/*!< doc start position */
 	ulint		buf_used;	/*!< the sort buffer (ID) when
 					tokenization stops, which
@@ -119,6 +122,7 @@ struct fts_tokenize_ctx {
 	ib_rbt_t*	cached_stopword;/*!< in: stopword list */
 	dfield_t	sort_field[FTS_NUM_FIELDS_SORT];
 					/*!< in: sort field */
+	/** parsed tokens (when using an external parser) */
 	fts_token_list_t	fts_token_list;

 	fts_tokenize_ctx() :
...
@@ -506,7 +506,10 @@ row_merge_fts_doc_tokenize(

 	/* Tokenize the data and add each word string, its corresponding
 	doc id and position to sort buffer */
-	while (t_ctx->processed_len < doc->text.f_len) {
+	while (parser
+	       ? (!t_ctx->processed_len
+		  || UT_LIST_GET_LEN(t_ctx->fts_token_list))
+	       : t_ctx->processed_len < doc->text.f_len) {
 		ulint		idx = 0;
 		ulint		cur_len;
 		doc_id_t	write_doc_id;
@@ -847,7 +850,8 @@ DECLARE_THREAD(fts_parallel_tokenization)(
 			/* Not yet finish processing the "doc" on hand,
 			continue processing it */
 			ut_ad(doc.text.f_str);
-			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(buf[0]->index->parser
+			      || t_ctx.processed_len < doc.text.f_len);
 		}

 		processed = row_merge_fts_doc_tokenize(
@@ -857,7 +861,8 @@ DECLARE_THREAD(fts_parallel_tokenization)(
 		/* Current sort buffer full, need to recycle */
 		if (!processed) {
-			ut_ad(t_ctx.processed_len < doc.text.f_len);
+			ut_ad(buf[0]->index->parser
+			      || t_ctx.processed_len < doc.text.f_len);
 			ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
 			break;
 		}
...