Commit 19b0c089, authored by Hugo Wen, committed by Sergei Golubchik

MDEV-33408 Initial support for vector DELETE and UPDATE

When the source row is deleted, mark the corresponding node in the HNSW
index as deleted by setting its `tref` to NULL. An index on `tref` is added
to the secondary table so that the nodes to be marked can be found quickly.

The nodes marked as deleted will still be used for search, but will not
be included in the final query results.

Skipping deleted nodes during graph traversal, or leaving them out of the
neighbor lists of newly inserted nodes, could hurt performance, so for now
deleted nodes are only filtered out of the search results.

- for some reason the `write_set` bit of the indexed column is not set
  during the delete, so the `bitmap_is_set()` check in
  `TABLE::open_hlindexes_for_write()` is temporarily commented out

All new code of the whole pull request, including one or several files
that are either new files or modified ones, are contributed under the
BSD-new license. I am contributing on behalf of my employer Amazon Web
Services, Inc.
parent b4b9d7e2
......@@ -197,6 +197,51 @@ id1 id2 vec_distance(t1.v, t2.v)
9 8 1.2575258643523053
7 8 1.288239696195716
8 7 1.288239696195716
delete from t1 where v = x'7b713f3e5258323f80d1113d673b2b3f66e3583f';
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
id d
10 0.256948729687565
3 0.344061212052452
7 0.5394116168863548
insert t1 (v) values (x'7b713f3e5258323f80d1113d673b2b3f66e3583f');
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
id d
11 0.22278176178224385
10 0.256948729687565
3 0.344061212052452
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
id d
11 0.22278176178224385
10 0.256948729687565
3 0.344061212052452
7 0.5394116168863548
5 0.5884475540369749
update t1 set v=x'76EDFC3E4B57243F10F8423FB158713F020BAA3E' where v=x'6CA1D43E9DF91B3FE580DA3E1C247D3F147CF33E';
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
id d
11 0.22278176178224385
3 0.344061212052452
7 0.5394116168863548
10 0.5577650851591898
5 0.5884475540369749
delete from t1;
insert t1 (v) values (x'e360d63ebe554f3fcdbc523f4522193f5236083d'),
(x'f511303f72224a3fdd05fe3eb22a133ffae86a3f'),
(x'f09baa3ea172763f123def3e0c7fe53e288bf33e'),
(x'b97a523f2a193e3eb4f62e3f2d23583e9dd60d3f'),
(x'f7c5df3e984b2b3e65e59d3d7376db3eac63773e'),
(x'de01453ffa486d3f10aa4d3fdd66813c71cb163f'),
(x'76edfc3e4b57243f10f8423fb158713f020bda3e'),
(x'56926c3fdf098d3e2c8c5e3d1ad4953daa9d0b3e'),
(x'7b713f3e5258323f80d1113d673b2b3f66e3583f'),
(x'6ca1d43e9df91b3fe580da3e1c247d3f147cf33e');
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
id d
20 0.22278176178224385
21 0.256948729687565
14 0.344061212052452
18 0.5394116168863548
16 0.5884475540369749
insert t1 (v) values ('');
ERROR 22007: Incorrect vector value: '...' for column `test`.`t1`.`v` at row 1
insert t1 (v) values (x'1234');
......
......@@ -36,6 +36,33 @@ select id>0,vec_distance(v, NULL) d from t1 order by d limit 3;
select id>0,vec_distance(v, x'123456') d from t1 order by d limit 3;
select t1.id as id1, t2.id as id2, vec_distance(t1.v, t2.v) from t1, t1 as t2 order by 3,1,2;
# test delete
delete from t1 where v = x'7b713f3e5258323f80d1113d673b2b3f66e3583f';
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
# test insert deleted vec
insert t1 (v) values (x'7b713f3e5258323f80d1113d673b2b3f66e3583f');
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 3;
# test update
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
update t1 set v=x'76EDFC3E4B57243F10F8423FB158713F020BAA3E' where v=x'6CA1D43E9DF91B3FE580DA3E1C247D3F147CF33E';
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
# test delete all and reinsert
delete from t1;
insert t1 (v) values (x'e360d63ebe554f3fcdbc523f4522193f5236083d'),
(x'f511303f72224a3fdd05fe3eb22a133ffae86a3f'),
(x'f09baa3ea172763f123def3e0c7fe53e288bf33e'),
(x'b97a523f2a193e3eb4f62e3f2d23583e9dd60d3f'),
(x'f7c5df3e984b2b3e65e59d3d7376db3eac63773e'),
(x'de01453ffa486d3f10aa4d3fdd66813c71cb163f'),
(x'76edfc3e4b57243f10f8423fb158713f020bda3e'),
(x'56926c3fdf098d3e2c8c5e3d1ad4953daa9d0b3e'),
(x'7b713f3e5258323f80d1113d673b2b3f66e3583f'),
(x'6ca1d43e9df91b3fe580da3e1c247d3f147cf33e');
select id,vec_distance(v, x'b047263c9f87233fcfd27e3eae493e3f0329f43e') d from t1 order by d limit 5;
--error ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
insert t1 (v) values ('');
--error ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
......
......@@ -5475,6 +5475,11 @@ handler::ha_delete_all_rows()
m_lock_type == F_WRLCK);
mark_trx_read_write();
int err= 0;
if ((err= table->open_hlindexes_for_write()) ||
(err= table->hlindexes_on_delete_all()))
return err;
return delete_all_rows();
}
......@@ -8138,7 +8143,7 @@ int handler::ha_write_row(const uchar *buf)
{ error= write_row(buf); })
MYSQL_INSERT_ROW_DONE(error);
if (!error && !((error= table->update_hlindexes())))
if (!error && !((error= table->hlindexes_on_insert())))
{
rows_stats.inserted++;
Log_func *log_func= Write_rows_log_event::binlog_row_logging_function;
......@@ -8171,6 +8176,9 @@ int handler::ha_update_row(const uchar *old_data, const uchar *new_data)
DBUG_ASSERT(new_data == table->record[0]);
DBUG_ASSERT(old_data == table->record[1]);
if (table->open_hlindexes_for_write())
return 1;
uint saved_status= table->status;
error= ha_check_overlaps(old_data, new_data);
......@@ -8189,7 +8197,7 @@ int handler::ha_update_row(const uchar *old_data, const uchar *new_data)
{ error= update_row(old_data, new_data);})
MYSQL_UPDATE_ROW_DONE(error);
if (likely(!error))
if (likely(!error) && !(error= table->hlindexes_on_update()))
{
rows_stats.updated++;
Log_func *log_func= Update_rows_log_event::binlog_row_logging_function;
......@@ -8262,10 +8270,13 @@ int handler::ha_delete_row(const uchar *buf)
mark_trx_read_write();
increment_statistics(&SSV::ha_delete_count);
if (table->open_hlindexes_for_write())
return 1;
TABLE_IO_WAIT(tracker, PSI_TABLE_DELETE_ROW, active_index, error,
{ error= delete_row(buf);})
MYSQL_DELETE_ROW_DONE(error);
if (likely(!error))
if (likely(!error) && !(error= table->hlindexes_on_delete()))
{
rows_stats.deleted++;
Log_func *log_func= Delete_rows_log_event::binlog_row_logging_function;
......
......@@ -9898,7 +9898,8 @@ int TABLE::open_hlindexes_for_write()
{
KEY *key= s->key_info + i;
for (uint j=0; j < key->usable_key_parts; j++)
if (bitmap_is_set(write_set, key->key_part[j].fieldnr - 1))
// TODO WHY?
// if (bitmap_is_set(write_set, key->key_part[j].fieldnr - 1))
{
if (hlindex_open(i))
return 1;
......@@ -9918,7 +9919,7 @@ int TABLE::reset_hlindexes()
return 0;
}
int TABLE::update_hlindexes()
int TABLE::hlindexes_on_insert()
{
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
if (hlindex && hlindex->in_use)
......@@ -9927,6 +9928,43 @@ int TABLE::update_hlindexes()
return 0;
}
int TABLE::hlindexes_on_update()
{
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
if (!hlindex || !hlindex->in_use)
return 0;
int err;
// mark deleted node invalid and insert node for new row
if ((err= mhnsw_invalidate(this, this->record[1], key_info + s->keys)) ||
(err= mhnsw_insert(this, key_info + s->keys)))
return err;
return 0;
}
int TABLE::hlindexes_on_delete()
{
DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
if (!hlindex || !hlindex->in_use)
return 0;
if (int err= mhnsw_invalidate(this, this->record[0], key_info + s->keys))
return err;
return 0;
}
/*
  Empty the high-level (vector) index when all rows of the table are deleted.

  Nothing is done when the table has no hlindex or the hlindex is not in use.

  @return 0 on success or when there is nothing to do, otherwise the error
          code returned by ha_delete_all_rows() of the hlindex table
*/
int TABLE::hlindexes_on_delete_all()
{
  DBUG_ASSERT(s->total_keys - s->keys == (hlindex != NULL));
  if (!hlindex || !hlindex->in_use)
    return 0;
  // do not swallow the result: handler::ha_delete_all_rows() checks it
  return hlindex->file->ha_delete_all_rows();
}
int TABLE::hlindex_first(uint nr, Item *item, ulonglong limit)
{
DBUG_ASSERT(s->total_keys - s->keys == 1);
......
......@@ -1795,7 +1795,10 @@ struct TABLE
int hlindex_next();
int open_hlindexes_for_write();
int update_hlindexes();
int hlindexes_on_insert();
int hlindexes_on_update();
int hlindexes_on_delete();
int hlindexes_on_delete_all();
int reset_hlindexes();
void prepare_triggers_for_insert_stmt_or_event();
......
......@@ -16,6 +16,7 @@
*/
#include <my_global.h>
#include "key.h" // key_copy()
#include "vector_mhnsw.h"
#include "item_vectorfunc.h"
#include <scope.h>
......@@ -141,7 +142,7 @@ class FVectorNode: public FVector
public:
Neighborhood *neighbors= nullptr;
uint8_t max_layer;
bool stored;
bool stored:1, deleted:1;
FVectorNode(MHNSW_Context *ctx_, const void *gref_);
FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
......@@ -532,15 +533,16 @@ float *FVectorNode::make_vec(const void *v)
}
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *gref_)
: FVector(), ctx(ctx_), stored(true)
: FVector(), ctx(ctx_), stored(true), deleted(false)
{
memcpy(gref(), gref_, gref_len());
}
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
const void *vec_)
: FVector(), ctx(ctx_), stored(false)
: FVector(), ctx(ctx_), stored(false), deleted(false)
{
DBUG_ASSERT(tref_);
memset(gref(), 0xff, gref_len()); // important: larger than any real gref
memcpy(tref(), tref_, tref_len());
vec= make_vec(vec_);
......@@ -589,9 +591,13 @@ int FVectorNode::load_from_record(TABLE *graph)
return 0;
String buf, *v= graph->field[FIELD_TREF]->val_str(&buf);
if (unlikely(!v || v->length() != tref_len()))
deleted= graph->field[FIELD_TREF]->is_null();
if (!deleted)
{
if (unlikely(v->length() != tref_len()))
return my_errno= HA_ERR_CRASHED;
memcpy(tref(), v->ptr(), v->length());
}
v= graph->field[FIELD_VEC]->val_str(&buf);
if (unlikely(!v))
......@@ -761,8 +767,13 @@ int FVectorNode::save(TABLE *graph)
DBUG_ASSERT(neighbors);
graph->field[FIELD_LAYER]->store(max_layer, false);
if (deleted)
graph->field[FIELD_TREF]->set_null();
else
{
graph->field[FIELD_TREF]->set_notnull();
graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
}
graph->field[FIELD_VEC]->store_binary((uchar*)vec, ctx->byte_len);
size_t total_size= 0;
......@@ -825,7 +836,7 @@ static int update_second_degree_neighbors(MHNSW_Context *ctx, TABLE *graph,
static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
Neighborhood *start_nodes, uint ef, size_t layer,
Neighborhood *result)
Neighborhood *result, bool skip_deleted)
{
DBUG_ASSERT(start_nodes->num > 0);
result->num= 0;
......@@ -847,13 +858,15 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
{
Visited *v= visited.create(start_nodes->links[i]);
candidates.push(v);
if (skip_deleted && v->node->deleted)
continue;
if (best.elements() < ef)
best.push(v);
else if (v->distance_to_target < best.top()->distance_to_target)
best.replace_top(v);
}
float furthest_best= best.top()->distance_to_target;
float furthest_best= FLT_MAX;
while (candidates.elements())
{
const Visited &cur= *candidates.pop();
......@@ -880,13 +893,17 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target,
if (best.elements() < ef)
{
candidates.push(v);
if (skip_deleted && v->node->deleted)
continue;
best.push(v);
furthest_best= best.top()->distance_to_target;
}
else if (v->distance_to_target < furthest_best)
{
best.replace_top(v);
candidates.push(v);
if (skip_deleted && v->node->deleted)
continue;
best.replace_top(v);
furthest_best= best.top()->distance_to_target;
}
}
......@@ -982,7 +999,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
for (cur_layer= max_layer; cur_layer > target_layer; cur_layer--)
{
if (int err= search_layer(ctx, graph, *target, &start_nodes, 1, cur_layer,
&candidates))
&candidates, false))
return err;
std::swap(start_nodes, candidates);
}
......@@ -991,7 +1008,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
{
uint max_neighbors= ctx->max_neighbors(cur_layer);
if (int err= search_layer(ctx, graph, *target, &start_nodes,
ef_construction, cur_layer, &candidates))
ef_construction, cur_layer, &candidates, false))
return err;
if (int err= select_neighbors(ctx, graph, cur_layer, *target, candidates,
......@@ -1061,13 +1078,13 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
{
if (int err= search_layer(ctx, graph, target, &start_nodes, 1, cur_layer,
&candidates))
&candidates, false))
return err;
std::swap(start_nodes, candidates);
}
if (int err= search_layer(ctx, graph, target, &start_nodes, ef, 0,
&candidates))
&candidates, true))
return err;
if (limit > candidates.num)
......@@ -1110,6 +1127,60 @@ void mhnsw_free(TABLE_SHARE *share)
graph_share->hlindex_data= 0;
}
/*
  Mark the graph node that corresponds to a row of the base table as deleted.

  The node is looked up through the index on tref using the position of the
  row, and its tref column is set to NULL. The node is updated in place, it
  is not removed from the graph table.

  @param table    base table; table->hlindex is the graph table
  @param rec      record buffer of the affected row, its position is taken
                  with handler::position()
  @param keyinfo  the vector key of the base table

  @return 0 on success, 1 if the vector value of the key field has an
          invalid length, otherwise a handler error code
*/
int mhnsw_invalidate(TABLE *table, uchar *rec, KEY *keyinfo)
{
  TABLE *graph= table->hlindex;
  Field *vec_field= keyinfo->key_part->field;
  String buf, *res= vec_field->val_str(&buf);
  handler *h= table->file;
  int err= 0;

  /* metadata are checked on open */
  DBUG_ASSERT(graph);
  DBUG_ASSERT(keyinfo->algorithm == HA_KEY_ALG_VECTOR);
  DBUG_ASSERT(keyinfo->usable_key_parts == 1);
  DBUG_ASSERT(vec_field->binary());
  DBUG_ASSERT(vec_field->cmp_type() == STRING_RESULT);
  DBUG_ASSERT(res); // ER_INDEX_CANNOT_HAVE_NULL
  DBUG_ASSERT(h->ref_length <= graph->field[1]->field_length);
  DBUG_ASSERT(h->ref_length <= graph->field[2]->field_length);

  if (res->length() == 0 || res->length() % 4)
    return 1;

  // use index on tref
  if ((err= graph->file->ha_index_init(1, 0)))
    return err;

  // target record: build the search key from the position of the row
  h->position(rec);
  graph->field[FIELD_TREF]->set_notnull();
  graph->field[FIELD_TREF]->store_binary(
    reinterpret_cast<const char *>(h->ref), h->ref_length);

  uchar *key= (uchar*)alloca(graph->key_info[1].key_length);
  key_copy(key, graph->record[0], graph->key_info + 1,
           graph->key_info[1].key_length);

  err= graph->file->ha_index_read_map(graph->record[1], key,
                                      HA_WHOLE_KEY,
                                      HA_READ_KEY_EXACT);

  // Deleted tref not found in index, should not happen
  DBUG_ASSERT(err != HA_ERR_KEY_NOT_FOUND);

  // any lookup error (not only "key not found") must stop the update,
  // otherwise record[1] does not hold a valid row
  if (!err)
  {
    // record[1] holds the row that was found, it is the old image of the
    // update; record[0] becomes the new image with tref set to NULL
    restore_record(graph, record[1]);
    graph->field[FIELD_TREF]->set_null();
    err= graph->file->ha_update_row(graph->record[1], graph->record[0]);
  }

  // close the index opened above on every path, success or failure
  int end_err= graph->file->ha_index_end();
  return err ? err : end_err;
}
const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
{
const char templ[]="CREATE TABLE i ( "
......@@ -1117,7 +1188,8 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
" tref varbinary(%u), "
" vec blob not null, "
" neighbors blob not null, "
" key (layer)) ";
" key (layer), "
" key (ref)) ";
size_t len= sizeof(templ) + 32;
char *s= thd->alloc(len);
len= my_snprintf(s, len, templ, ref_length);
......
......@@ -24,6 +24,7 @@
const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length);
int mhnsw_insert(TABLE *table, KEY *keyinfo);
int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit);
int mhnsw_invalidate(TABLE *table, uchar *rec, KEY *keyinfo);
int mhnsw_next(TABLE *table);
void mhnsw_free(TABLE_SHARE *share);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment