Commit 1c9d0d06 authored by unknown's avatar unknown

boolean fulltext search without an index


myisam/ft_dump.c:
  some help added
parent 05f04742
......@@ -29167,8 +29167,6 @@ mysql> select STRCMP('text', 'text');
relevance - similarity measure between the text in columns
@code{(col1,col2,...)} and the query @code{expr}. Relevance is a
positive floating-point number. Zero relevance means no similarity.
For @code{MATCH ... AGAINST()} to work, a @strong{FULLTEXT} index
must be created first. @xref{CREATE TABLE, , @code{CREATE TABLE}}.
@code{MATCH ... AGAINST()} is available in MySQL version
3.23.23 or later. @code{IN BOOLEAN MODE} extension was added in version
4.0.1. For details and usage examples @pxref{Fulltext Search}.
......@@ -33828,9 +33826,10 @@ mysql> SELECT * FROM articles WHERE MATCH (title,body) AGAINST (
This query retrieved all the rows that contain the word @code{MySQL}
(note: the 50% threshold is gone), but do @strong{not} contain the word
@code{YourSQL}. Note that it does not auto-magically sort rows in
@code{YourSQL}. Note, that it does not auto-magically sort rows in
decreasing relevance order (the last row has the highest relevance,
as it contains @code{MySQL} twice).
as it contains @code{MySQL} twice). Boolean fulltext search can also
work without a @code{FULLTEXT} index, but it will be @strong{slow}.
Boolean fulltext search supports the following operators:
......@@ -33890,10 +33889,12 @@ order), but rank ``gates to hell'' higher than ``bill gates''.
@itemize @bullet
@item
All parameters to the @code{MATCH} function must be columns from the
same table that is part of the same fulltext index.
same table that is part of the same fulltext index, unless this
@code{MATCH} is @code{IN BOOLEAN MODE}.
@item
Column list between @code{MATCH} and @code{AGAINST} must match exactly
a column list in the @code{FULLTEXT} index definition.
a column list in the @code{FULLTEXT} index definition, unless this
@code{MATCH} is @code{IN BOOLEAN MODE}.
@item
The argument to @code{AGAINST} must be a constant string.
@end itemize
......@@ -45853,6 +45854,9 @@ Our TODO section contains what we plan to have in 4.0. @xref{TODO MySQL 4.0}.
@itemize @bullet
@item
@code{MATCH ... AGAINST(... IN BOOLEAN MODE)} can now work
without a @code{FULLTEXT} index.
@item
Added @file{myisam/ft_dump} utility for low-level inspection
of @code{FULLTEXT} indexes.
@item
......@@ -32,7 +32,7 @@ extern "C" {
typedef struct st_ft_info FT_INFO;
struct _ft_vft {
int (*read_next)(FT_INFO *, char *);
float (*find_relevance)(FT_INFO *, my_off_t, byte *);
float (*find_relevance)(FT_INFO *, byte *, uint);
void (*close_search)(FT_INFO *);
float (*get_relevance)(FT_INFO *);
void (*reinit_search)(FT_INFO *);
......
......@@ -226,6 +226,7 @@ enum ha_base_keytype {
/* Other constants */
#define HA_NAMELEN 64 /* Max length of saved filename */
#define NO_SUCH_KEY ((uint)~0) /* used as a key no. */
/* Internal constants in databases */
......
......@@ -152,13 +152,16 @@ void _ftb_init_index_search(FT_INFO *ftb)
int i, r;
FTB_WORD *ftbw;
MI_INFO *info=ftb->info;
MI_KEYDEF *keyinfo=info->s->keyinfo+ftb->keynr;
my_off_t keyroot=info->s->state.key_root[ftb->keynr];
MI_KEYDEF *keyinfo;
my_off_t keyroot;
if (ftb->state != READY)
if (ftb->state != READY || ftb->keynr == NO_SUCH_KEY)
return;
ftb->state=INDEX_SEARCH;
keyinfo=info->s->keyinfo+ftb->keynr;
keyroot=info->s->state.key_root[ftb->keynr];
for (i=ftb->queue.elements; i; i--)
{
ftbw=(FTB_WORD *)(ftb->queue.root[i]);
......@@ -352,14 +355,17 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
return my_errno=HA_ERR_END_OF_FILE;
}
float ft_boolean_find_relevance(FT_INFO *ftb, my_off_t docid, byte *record)
float ft_boolean_find_relevance(FT_INFO *ftb, byte *record, uint length)
{
TREE ptree;
FT_WORD word;
FTB_WORD *ftbw;
FTB_EXPR *ftbe;
uint i;
my_off_t docid=ftb->info->lastpos;
if (docid == HA_POS_ERROR)
return -2.0;
if (ftb->state == READY || ftb->state == INDEX_DONE)
{
for (i=1; i<=ftb->queue.elements; i++)
......@@ -382,11 +388,13 @@ float ft_boolean_find_relevance(FT_INFO *ftb, my_off_t docid, byte *record)
ftb->state=SCAN;
}
else if (ftb->state != SCAN)
return -2.0;
return -3.0;
bzero(&ptree, sizeof(ptree));
if (_mi_ft_parse(& ptree, ftb->info, ftb->keynr, record))
return -3.0;
if ((ftb->keynr==NO_SUCH_KEY)
? ft_parse(& ptree, record, length)
: _mi_ft_parse(& ptree, ftb->info, ftb->keynr, record))
return -4.0;
for (i=1; i<=ftb->queue.elements; i++)
{
......
......@@ -159,7 +159,7 @@ int main(int argc,char *argv[])
return 0;
}
const char *options="dscve:h";
const char *options="dscvh";
static void get_options(int argc, char *argv[])
{
......@@ -184,7 +184,15 @@ static void get_options(int argc, char *argv[])
static void usage(char *argv[])
{
printf("Use: %s [-%s] <table_name> <key_no>\n", *argv, options);
printf("
Use: %s [-%s] <table_name> <index_no>
-d dump index (incl. data offsets and word weights)
-s report global stats
-c calculate per-word stats (counts and global weights)
-v be verbose
-h this text\n
", *argv, options);
exit(1);
}
......
......@@ -169,7 +169,7 @@ static int FT_DOC_cmp(FT_DOC *a, FT_DOC *b)
FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
uint query_len, my_bool presort)
{
TREE *wtree, allocated_wtree;
TREE allocated_wtree, *wtree=&allocated_wtree;
ALL_IN_ONE aio;
FT_DOC *dptr;
FT_INFO *dlist=NULL;
......@@ -193,7 +193,7 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, byte *query,
init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
NULL, NULL);
if(!(wtree=ft_parse(&allocated_wtree,query,query_len)))
if(ft_parse(&allocated_wtree,query,query_len))
goto err;
if(tree_walk(wtree, (tree_walk_action)&walk_and_match, &aio,
......@@ -247,11 +247,15 @@ int ft_nlq_read_next(FT_INFO *handler, char *record)
return my_errno;
}
float ft_nlq_find_relevance(FT_INFO *handler, my_off_t docid,
byte *record __attribute__((unused)))
float ft_nlq_find_relevance(FT_INFO *handler,
byte *record __attribute__((unused)), uint length __attribute__((unused)))
{
int a,b,c;
FT_DOC *docs=handler->doc;
my_off_t docid=handler->info->lastpos;
if (docid == HA_POS_ERROR)
return -5.0;
/* Assuming docs[] is sorted by dpos... */
......
......@@ -206,7 +206,7 @@ byte ft_simple_get_word(byte **start, byte *end, FT_WORD *word)
return 0;
}
TREE * ft_parse(TREE *wtree, byte *doc, int doclen)
int ft_parse(TREE *wtree, byte *doc, int doclen)
{
byte *end=doc+doclen;
FT_WORD w;
......@@ -221,10 +221,10 @@ TREE * ft_parse(TREE *wtree, byte *doc, int doclen)
if (!tree_insert(wtree, &w, 0))
goto err;
}
return wtree;
return 0;
err:
delete_tree(wtree);
return NULL;
return 1;
}
......@@ -28,7 +28,7 @@
/**************************************************************/
/* parses a document i.e. calls _mi_ft_parse for every keyseg */
/* parses a document i.e. calls ft_parse for every keyseg */
uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
{
byte *pos;
......@@ -57,11 +57,11 @@ uint _mi_ft_parse(TREE *parsed, MI_INFO *info, uint keynr, const byte *record)
}
else
len=keyseg->length;
if (!(ft_parse(parsed, pos, len)))
if (ft_parse(parsed, pos, len))
return 1;
}
/* Handle the case where all columns are NULL */
if (!is_tree_inited(parsed) && !(ft_parse(parsed, (byte*) "", 0)))
if (!is_tree_inited(parsed) && ft_parse(parsed, (byte*) "", 0))
return 1;
else
return 0;
......
......@@ -120,14 +120,14 @@ uint _ft_make_key(MI_INFO *, uint , byte *, FT_WORD *, my_off_t);
byte ft_get_word(byte **, byte *, FT_WORD *, FTB_PARAM *);
byte ft_simple_get_word(byte **, byte *, FT_WORD *);
TREE * ft_parse(TREE *, byte *, int);
int ft_parse(TREE *, byte *, int);
FT_WORD * ft_linearize(/*MI_INFO *, uint, byte *, */TREE *);
FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, byte *, const byte *);
const struct _ft_vft _ft_vft_nlq;
FT_INFO *ft_init_nlq_search(MI_INFO *, uint, byte *, uint, my_bool);
int ft_nlq_read_next(FT_INFO *, char *);
float ft_nlq_find_relevance(FT_INFO *, my_off_t, byte *);
float ft_nlq_find_relevance(FT_INFO *, byte *, uint);
void ft_nlq_close_search(FT_INFO *);
float ft_nlq_get_relevance(FT_INFO *);
my_off_t ft_nlq_get_docid(FT_INFO *);
......@@ -136,7 +136,7 @@ void ft_nlq_reinit_search(FT_INFO *);
const struct _ft_vft _ft_vft_boolean;
FT_INFO *ft_init_boolean_search(MI_INFO *, uint, byte *, uint, my_bool);
int ft_boolean_read_next(FT_INFO *, char *);
float ft_boolean_find_relevance(FT_INFO *, my_off_t, byte *);
float ft_boolean_find_relevance(FT_INFO *, byte *, uint);
void ft_boolean_close_search(FT_INFO *);
float ft_boolean_get_relevance(FT_INFO *);
my_off_t ft_boolean_get_docid(FT_INFO *);
......
......@@ -10,9 +10,15 @@ INSERT INTO t1 VALUES('MySQL has now support', 'for full-text search'),
('Only MyISAM tables','support collections'),
('Function MATCH ... AGAINST()','is used to do a search'),
('Full-text search in MySQL', 'implements vector space model');
# nl search
select * from t1 where MATCH(a,b) AGAINST ("collections");
select * from t1 where MATCH(a,b) AGAINST ("indexes");
select * from t1 where MATCH(a,b) AGAINST ("indexes collections");
# boolean search
select * from t1 where MATCH(a,b) AGAINST("support -collections" IN BOOLEAN MODE);
select * from t1 where MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE);
select * from t1 where MATCH(a,b) AGAINST("support +collections" IN BOOLEAN MODE);
......@@ -22,6 +28,13 @@ select * from t1 where MATCH(a,b) AGAINST("+search" IN BOOLEAN MODE);
select * from t1 where MATCH(a,b) AGAINST("+search +(support vector)" IN BOOLEAN MODE);
select * from t1 where MATCH(a,b) AGAINST("+search -(support vector)" IN BOOLEAN MODE);
select *, MATCH(a,b) AGAINST("support collections" IN BOOLEAN MODE) as x from t1;
# boolean w/o index:
select * from t1 where MATCH a AGAINST ("search" IN BOOLEAN MODE);
#update/delete with fulltext index
delete from t1 where a like "MySQL%";
update t1 set a='some test foobar' where MATCH a,b AGAINST ('model');
delete from t1 where MATCH(a,b) AGAINST ("indexes");
......
......@@ -2004,6 +2004,9 @@ void Item_func_match::init_search(bool no_order)
return;
}
if (key == NO_SUCH_KEY)
concat=new Item_func_concat_ws (new Item_string(" ",1), fields);
String *ft_tmp=0;
char tmp1[FT_QUERY_MAXLEN];
String tmp2(tmp1,sizeof(tmp1));
......@@ -2015,7 +2018,8 @@ void Item_func_match::init_search(bool no_order)
tmp2.set("",0);
}
ft_handler_init(ft_tmp->ptr(), ft_tmp->length(), join_key && !no_order);
ft_handler=table->file->ft_init_ext(mode, key,
ft_tmp->ptr(), ft_tmp->length(), join_key && !no_order);
if (join_key)
{
......@@ -2032,12 +2036,11 @@ bool Item_func_match::fix_fields(THD *thd,struct st_table_list *tlist)
maybe_null=1;
join_key=0;
/* Serg:
I'd rather say now that const_item is assumed in quite a bit of
places, so it would be difficult to remove; If it would ever to be
removed, this should include modifications to find_best and auto_close
as complement to auto_init code above.
*/
/* const_item is assumed in quite a few places, so it would be difficult
to remove. If it were ever to be removed, the change should include
modifications to find_best and auto_close as a complement to the
auto_init code above.
*/
if (Item_func::fix_fields(thd,tlist) || !const_item())
{
my_error(ER_WRONG_ARGUMENTS,MYF(0),"AGAINST");
......@@ -2051,21 +2054,20 @@ bool Item_func_match::fix_fields(THD *thd,struct st_table_list *tlist)
if (item->type() == Item::REF_ITEM)
li.replace(item= *((Item_ref *)item)->ref);
if (item->type() != Item::FIELD_ITEM || !item->used_tables())
{
my_error(ER_WRONG_ARGUMENTS,MYF(0),"MATCH");
return 1;
}
key=NO_SUCH_KEY;
used_tables_cache|=item->used_tables();
}
/* check that all columns come from the same table */
if (count_bits(used_tables_cache) != 1)
key=NO_SUCH_KEY;
const_item_cache=0;
table=((Item_field *)fields.head())->field->table;
record=table->record[0];
if (key == NO_SUCH_KEY && mode != FT_BOOL)
{
my_error(ER_WRONG_ARGUMENTS,MYF(0),"MATCH");
return 1;
}
const_item_cache=0;
table=((Item_field *)fields.head())->field->table;
record=table->record[0];
return 0;
}
......@@ -2074,6 +2076,10 @@ bool Item_func_match::fix_index()
List_iterator_fast<Item> li(fields);
Item_field *item;
uint ft_to_key[MAX_KEY], ft_cnt[MAX_KEY], fts=0, key;
uint max_cnt=0, mkeys=0;
if (this->key == NO_SUCH_KEY)
return 0;
for (key=0 ; key<table->keys ; key++)
{
......@@ -2087,11 +2093,7 @@ bool Item_func_match::fix_index()
}
if (!fts)
{
my_printf_error(ER_FT_MATCHING_KEY_NOT_FOUND,
ER(ER_FT_MATCHING_KEY_NOT_FOUND),MYF(0));
return 1;
}
goto err;
while ((item=(Item_field*)(li++)))
{
......@@ -2108,7 +2110,6 @@ bool Item_func_match::fix_index()
}
}
uint max_cnt=0, mkeys=0;
for (key=0 ; key<fts ; key++)
{
if (ft_cnt[key] > max_cnt)
......@@ -2139,6 +2140,12 @@ bool Item_func_match::fix_index()
return 0;
}
err:
if (mode == FT_BOOL)
{
this->key=NO_SUCH_KEY;
return 0;
}
my_printf_error(ER_FT_MATCHING_KEY_NOT_FOUND,
ER(ER_FT_MATCHING_KEY_NOT_FOUND),MYF(0));
return 1;
......@@ -2174,61 +2181,18 @@ double Item_func_match::val()
join_key=0;
}
my_off_t docid=table->file->row_position();
if ((null_value=(docid==HA_OFFSET_ERROR)))
return 0.0;
else
return ft_handler->please->find_relevance(ft_handler, docid, record);
}
#if 0
double Item_func_match_nl::val()
{
if (ft_handler==NULL)
init_search(1);
if ((null_value= (ft_handler==NULL)))
return 0.0;
if (join_key)
if (key == NO_SUCH_KEY)
{
if (table->file->ft_handler)
return ft_handler->please->get_relevance(ft_handler);
join_key=0;
String *a=concat->val_str(&value);
if (null_value=(a==0))
return 0;
return ft_handler->please->find_relevance(ft_handler,
(byte *)a->ptr(), a->length());
}
my_off_t docid=table->file->row_position();
if ((null_value=(docid==HA_OFFSET_ERROR)))
return 0.0;
else
return ft_handler->please->find_relevance(ft_handler, docid, record);
return ft_handler->please->find_relevance(ft_handler, record, 0);
}
double Item_func_match_bool::val()
{
if (ft_handler==NULL)
init_search(1);
if ((null_value= (ft_handler==NULL)))
return 0.0;
if (join_key)
{
if (table->file->ft_handler)
return ft_handler->please->get_relevance(ft_handler);
join_key=0;
}
return ft_handler->please->find_relevance(ft_handler, docid, record);
//null_value=1;
//return -1.0;
}
#endif
/***************************************************************************
System variables
This has to be recoded after we get more than 3 system variables
......
......@@ -862,15 +862,18 @@ class Item_func_match :public Item_real_func
{
public:
List<Item> fields;
Item *concat;
String value;
TABLE *table;
uint key;
uint key, mode;
bool join_key;
Item_func_match *master;
FT_INFO * ft_handler;
byte *record;
Item_func_match(List<Item> &a, Item *b): Item_real_func(b),
fields(a), table(0), join_key(0), master(0), ft_handler(0) {}
fields(a), table(0), join_key(0), master(0), ft_handler(0),
key(0), concat(0) {}
~Item_func_match()
{
if (!master && ft_handler)
......@@ -880,8 +883,8 @@ class Item_func_match :public Item_real_func
if(join_key)
table->file->ft_handler=0;
}
if (concat) delete concat;
}
virtual int ft_handler_init(const byte *key, uint keylen, bool presort) =0;
enum Functype functype() const { return FT_FUNC; }
void update_used_tables() {}
bool fix_fields(THD *thd,struct st_table_list *tlist);
......@@ -896,26 +899,16 @@ class Item_func_match :public Item_real_func
class Item_func_match_nl :public Item_func_match
{
public:
Item_func_match_nl(List<Item> &a, Item *b): Item_func_match(a,b) {}
Item_func_match_nl(List<Item> &a, Item *b):
Item_func_match(a,b) { mode=FT_NL; }
const char *func_name() const { return "match_nl"; }
// double val();
int ft_handler_init(const byte *query, uint querylen, bool presort)
{
ft_handler=table->file->ft_init_ext(FT_NL,key, query, querylen, presort);
return 0;
}
};
class Item_func_match_bool :public Item_func_match
{
public:
Item_func_match_bool(List<Item> &a, Item *b): Item_func_match(a,b) {}
Item_func_match_bool(List<Item> &a, Item *b):
Item_func_match(a,b) { mode=FT_BOOL; }
const char *func_name() const { return "match_bool"; }
// double val();
int ft_handler_init(const byte *query, uint querylen, bool presort)
{
ft_handler=table->file->ft_init_ext(FT_BOOL,key, query, querylen, presort);
return 0;
}
};
......@@ -1457,7 +1457,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
{
Item *item;
/*
I, (Sergei) too lazy to implement proper recursive descent here,
I'm (Sergei) too lazy to implement proper recursive descent here,
and anyway, nobody will use such stupid queries
that will require it :-)
Maybe later...
......@@ -1474,7 +1474,7 @@ add_ft_keys(DYNAMIC_ARRAY *keyuse_array,
}
}
if (!cond_func)
if (!cond_func || cond_func->key == NO_SUCH_KEY)
return;
KEYUSE keyuse;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment