Commit 1503a472 authored by unknown's avatar unknown

Optimize LIKE with turbo-boyer-more algoritm


Docs/manual.texi:
  Added info about LIKE optimization
mysql-test/r/func_like.result:
  Test of new LIKE optimization
mysql-test/t/func_like.test:
  Test of new LIKE optimization
parent 03aebd40
......@@ -26728,6 +26728,12 @@ In the first statement, the @code{LIKE} value begins with a wildcard
character. In the second statement, the @code{LIKE} value is not a
constant.
MySQL 4.0 does another optimization on @code{LIKE}. If you are using
@code{... LIKE "%string%"} and @code{string} is longer than 3 characters
then MySQL will use the turbo-boyer-more algorithm to once initialize
the pattern for the string and then use this pattern to quickly search
after the given string.
@findex IS NULL, and indexes
@cindex indexes, and @code{IS NULL}
Searching using @code{column_name IS NULL} will use indexes if column_name
......@@ -49310,6 +49316,8 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}.
@itemize @bullet
@item
Use turbo-boyer-more to speed up @code{LIKE "%keyword%"} searches.
@item
Fixed bug in @code{DROP DATABASE} with symlink.
@item
Fixed crash in @code{REPAIR ... USE_FRM}.
......@@ -15,4 +15,15 @@ test
select * from t1 where a like "te_t";
a
test
select * from t1 where a like "%a%";
a
a
abc
abcd
select * from t1 where a like "%abcd%";
a
abcd
select * from t1 where a like "%abc\d%";
a
abcd
drop table t1;
......@@ -9,4 +9,12 @@ select * from t1 where a like "abc%";
select * from t1 where a like "ABC%";
select * from t1 where a like "test%";
select * from t1 where a like "te_t";
#
# The following will test the boyer-more code
#
select * from t1 where a like "%a%";
select * from t1 where a like "%abcd%";
select * from t1 where a like "%abc\d%";
drop table t1;
......@@ -1228,23 +1228,23 @@ void Item_func_like::fix_length_and_dec()
// cmp_type=STRING_RESULT; // For quick select
}
longlong Item_func_like::val_int()
{
String *res,*res2;
res=args[0]->val_str(&tmp_value1);
String* res = args[0]->val_str(&tmp_value1);
if (args[0]->null_value)
{
null_value=1;
return 0;
}
res2=args[1]->val_str(&tmp_value2);
String* res2 = args[1]->val_str(&tmp_value2);
if (args[1]->null_value)
{
null_value=1;
return 0;
}
null_value=0;
if (canDoTurboBM)
return turboBM_matches(res->ptr(), res->length()) ? 1 : 0;
if (binary)
return wild_compare(*res,*res2,escape) ? 0 : 1;
else
......@@ -1268,6 +1268,51 @@ Item_func::optimize_type Item_func_like::select_optimize() const
return OPTIMIZE_NONE;
}
bool Item_func_like::fix_fields(THD *thd,struct st_table_list *tlist)
{
if (Item_bool_func2::fix_fields(thd, tlist))
return 1;
/*
TODO--we could do it for non-const, but we'd have to
recompute the tables for each row--probably not worth it.
*/
if (args[1]->const_item() && !(specialflag & SPECIAL_NO_NEW_FUNC))
{
String* res2 = args[1]->val_str(&tmp_value2);
const size_t len = res2->length();
const char* first = res2->ptr();
const char* last = first + len - 1;
/*
len must be > 2 ('%pattern%')
heuristic: only do TurboBM for pattern_len > 2
*/
if (len > MIN_TURBOBM_PATTERN_LEN + 2 &&
*first == wild_many &&
*last == wild_many)
{
const char* tmp = first + 1;
for ( ; *tmp != wild_many && *tmp != wild_one && *tmp != escape; tmp++) ;
canDoTurboBM = tmp == last;
}
if (canDoTurboBM)
{
pattern = first + 1;
pattern_len = len - 2;
DBUG_PRINT("TurboBM", ("Initializing pattern: '%s'...", first));
int* suff = (int*)thd->alloc(sizeof(int[pattern_len + 1]));
bmGs = (int*)thd->alloc(sizeof(int[pattern_len + 1]));
bmBc = (int*)thd->alloc(sizeof(int[alphabet_size]));
turboBM_compute_good_suffix_shifts(suff);
turboBM_compute_bad_character_shifts();
DBUG_PRINT("turboBM",("done"));
}
}
return 0;
}
#ifdef USE_REGEX
bool
......@@ -1307,7 +1352,6 @@ Item_func_regex::fix_fields(THD *thd,TABLE_LIST *tables)
return 0;
}
longlong Item_func_regex::val_int()
{
char buff[MAX_FIELD_WIDTH];
......@@ -1364,3 +1408,215 @@ Item_func_regex::~Item_func_regex()
}
#endif /* USE_REGEX */
#ifdef LIKE_CMP_TOUPPER
#define likeconv(A) (uchar) toupper(A)
#else
#define likeconv(A) (uchar) my_sort_order[(uchar) (A)]
#endif
/**********************************************************************
turboBM_compute_suffixes()
Precomputation dependent only on pattern_len.
**********************************************************************/
void Item_func_like::turboBM_compute_suffixes(int* suff)
{
const int plm1 = pattern_len - 1;
int f = 0;
int g = plm1;
int* const splm1 = suff + plm1;
*splm1 = pattern_len;
if (binary)
{
int i;
for (i = pattern_len - 2; i >= 0; i--)
{
int tmp = *(splm1 + i - f);
if (g < i && tmp < i - g)
suff[i] = tmp;
else
{
if (i < g)
g = i; // g = min(i, g)
f = i;
while (g >= 0 && pattern[g] == pattern[g + plm1 - f])
g--;
suff[i] = f - g;
}
}
}
else
{
int i;
for (i = pattern_len - 2; 0 <= i; --i)
{
int tmp = *(splm1 + i - f);
if (g < i && tmp < i - g)
suff[i] = tmp;
else
{
if (i < g)
g = i; // g = min(i, g)
f = i;
while (g >= 0 && likeconv(pattern[g]) == likeconv(pattern[g + plm1 - f]))
g--;
suff[i] = f - g;
}
}
}
}
/**********************************************************************
turboBM_compute_good_suffix_shifts()
Precomputation dependent only on pattern_len.
**********************************************************************/
void Item_func_like::turboBM_compute_good_suffix_shifts(int* suff)
{
turboBM_compute_suffixes(suff);
int* end = bmGs + pattern_len;
int* k;
for (k = bmGs; k < end; k++)
*k = pattern_len;
int tmp;
int i;
int j = 0;
const int plm1 = pattern_len - 1;
for (i = plm1; i > -1; i--)
{
if (suff[i] == i + 1)
{
for (tmp = plm1 - i; j < tmp; j++)
{
int* tmp2 = bmGs + j;
if (*tmp2 == pattern_len)
*tmp2 = tmp;
}
}
}
int* tmp2;
for (tmp = plm1 - i; j < tmp; j++)
{
tmp2 = bmGs + j;
if (*tmp2 == pattern_len)
*tmp2 = tmp;
}
tmp2 = bmGs + plm1;
for (i = 0; i <= pattern_len - 2; i++)
*(tmp2 - suff[i]) = plm1 - i;
}
/**********************************************************************
turboBM_compute_bad_character_shifts()
Precomputation dependent on pattern_len.
**********************************************************************/
void Item_func_like::turboBM_compute_bad_character_shifts()
{
int* i;
int* end = bmBc + alphabet_size;
for (i = bmBc; i < end; i++)
*i = pattern_len;
int j;
const int plm1 = pattern_len - 1;
if (binary)
for (j = 0; j < plm1; j++)
bmBc[pattern[j]] = plm1 - j;
else
for (j = 0; j < plm1; j++)
bmBc[likeconv(pattern[j])] = plm1 - j;
}
/**********************************************************************
turboBM_matches()
Search for pattern in text, returns true/false for match/no match
**********************************************************************/
bool Item_func_like::turboBM_matches(const char* text, int text_len) const
{
register int bcShift;
register int turboShift;
int shift = pattern_len;
int j = 0;
int u = 0;
const int plm1 = pattern_len - 1;
const int tlmpl = text_len - pattern_len;
/* Searching */
if (binary)
{
while (j <= tlmpl)
{
register int i = plm1;
while (i >= 0 && pattern[i] == text[i + j])
{
i--;
if (i == plm1 - shift)
i -= u;
}
if (i < 0)
return true;
register const int v = plm1 - i;
turboShift = u - v;
bcShift = bmBc[text[i + j]] - plm1 + i;
shift = max(turboShift, bcShift);
shift = max(shift, bmGs[i]);
if (shift == bmGs[i])
u = min(pattern_len - shift, v);
else
{
if (turboShift < bcShift)
shift = max(shift, u + 1);
u = 0;
}
j += shift;
}
return false;
}
else
{
while (j <= tlmpl)
{
register int i = plm1;
while (i >= 0 && likeconv(pattern[i]) == likeconv(text[i + j]))
{
i--;
if (i == plm1 - shift)
i -= u;
}
if (i < 0)
return true;
register const int v = plm1 - i;
turboShift = u - v;
bcShift = bmBc[likeconv(text[i + j])] - plm1 + i;
shift = max(turboShift, bcShift);
shift = max(shift, bmGs[i]);
if (shift == bmGs[i])
u = min(pattern_len - shift, v);
else
{
if (turboShift < bcShift)
shift = max(shift, u + 1);
u = 0;
}
j += shift;
}
return false;
}
}
......@@ -478,15 +478,40 @@ class Item_func_isnotnull :public Item_bool_func
class Item_func_like :public Item_bool_func2
{
char escape;
public:
Item_func_like(Item *a,Item *b, char* escape_arg) :Item_bool_func2(a,b),escape(*escape_arg)
// Turbo Boyer-Moore data
bool canDoTurboBM; // pattern is '%abcd%' case
const char* pattern;
int pattern_len;
// TurboBM buffers, *this is owner
int* bmGs; // good suffix shift table, size is pattern_len + 1
int* bmBc; // bad character shift table, size is alphabet_size
void turboBM_compute_suffixes(int* suff);
void turboBM_compute_good_suffix_shifts(int* suff);
void turboBM_compute_bad_character_shifts();
bool turboBM_matches(const char* text, int text_len) const;
enum { alphabet_size = 256 };
public:
Item_func_like::Item_func_like(Item *a,Item *b, char* escape_arg) :
Item_bool_func2(a,b),
escape(*escape_arg),
canDoTurboBM(false),
pattern(0),
pattern_len(0),
bmGs(0),
bmBc(0)
{}
longlong val_int();
enum Functype functype() const { return LIKE_FUNC; }
optimize_type select_optimize() const;
cond_result eq_cmp_result() const { return COND_TRUE; }
const char *func_name() const { return "like"; }
void fix_length_and_dec();
bool fix_fields(THD *thd,struct st_table_list *tlist);
};
#ifdef USE_REGEX
......
......@@ -122,6 +122,13 @@ bfill((A)->null_flags,(A)->null_bytes,255);\
#define TE_INFO_LENGTH 3
#define MTYP_NOEMPTY_BIT 128
/*
* Minimum length pattern before Turbo Boyer-Moore is used
* for SELECT "text" LIKE "%pattern%", excluding the two
* wildcards in class Item_func_like.
*/
#define MIN_TURBOBM_PATTERN_LEN 3
/* Include prototypes for unireg */
#include "mysqld_error.h"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment