Commit 1249210b authored by Sergei Golubchik

mhnsw: configurable parameters

1. introduce alpha. the value of 1.1 is optimal, so hard-code it.

2. define ef and efConstruction in terms of limit and M, that is,
   ef = ef_limit_multiplier * limit
   efConstruction = ef_construction_multiplier * M

3. ef_construction_multiplier=4 is almost always optimal, so
   hard-code it too

4. rename hnsw_max_connection_per_layer to mhnsw_max_edges_per_node
   (max_connection is rather ambiguous in MariaDB) and add a help text

5. rename hnsw_ef_search to mhnsw_limit_multiplier and add a help text
parent a91bc7b7
......@@ -400,11 +400,6 @@ The following specify which files/extra groups are read (specified before remain
height-balanced, DOUBLE_PREC_HB - double precision
height-balanced, JSON_HB - height-balanced, stored as
JSON
--hnsw-ef-constructor=#
hnsw_ef_constructor
--hnsw-ef-search=# hnsw_ef_search
--hnsw-max-connection-per-layer=#
hnsw_max_connection_per_layer
--host-cache-size=# How many host names should be cached to avoid resolving
(Automatically configured unless set explicitly)
--idle-readonly-transaction-timeout=#
......@@ -693,6 +688,17 @@ The following specify which files/extra groups are read (specified before remain
Unused. Deprecated, will be removed in a future release.
--metadata-locks-hash-instances=#
Unused. Deprecated, will be removed in a future release.
--mhnsw-limit-multiplier=#
Defines the number of result candidates to look for in
the vector index for ORDER BY ... LIMIT N queries.
Specified in term of LIMIT (1 means look for exactly N
candidates, 2 means look for 2*N, etc). Larger values
means the search will be slower, but the result will be
closer to perfect
--mhnsw-max-edges-per-node=#
Larger values means slower INSERT, larger index size and
higher memory consumption, but better search results. Not
used for SELECT
--min-examined-row-limit=#
Alias for log_slow_min_examined_row_limit. Don't write
queries to slow log that examine fewer rows than that
......@@ -1697,9 +1703,6 @@ gtid-strict-mode FALSE
help TRUE
histogram-size 254
histogram-type JSON_HB
hnsw-ef-constructor 10
hnsw-ef-search 10
hnsw-max-connection-per-layer 50
host-cache-size 279
idle-readonly-transaction-timeout 0
idle-transaction-timeout 0
......@@ -1788,6 +1791,8 @@ max-write-lock-count 18446744073709551615
memlock FALSE
metadata-locks-cache-size 1024
metadata-locks-hash-instances 8
mhnsw-limit-multiplier 2
mhnsw-max-edges-per-node 15
min-examined-row-limit 0
mrr-buffer-size 262144
myisam-block-size 1024
......
......@@ -1422,36 +1422,6 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_EF_CONSTRUCTOR
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_constructor
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_EF_SEARCH
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_search
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_MAX_CONNECTION_PER_LAYER
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_max_connection_per_layer
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HOSTNAME
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
......@@ -2402,6 +2372,26 @@ NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_LIMIT_MULTIPLIER
VARIABLE_SCOPE SESSION
VARIABLE_TYPE DOUBLE
VARIABLE_COMMENT Defines the number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. Specified in term of LIMIT (1 means look for exactly N candidates, 2 means look for 2*N, etc). Larger values means the search will be slower, but the result will be closer to perfect
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 100
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_MAX_EDGES_PER_NODE
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Larger values means slower INSERT, larger index size and higher memory consumption, but better search results. Not used for SELECT
NUMERIC_MIN_VALUE 2
NUMERIC_MAX_VALUE 200
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MIN_EXAMINED_ROW_LIMIT
VARIABLE_SCOPE SESSION
VARIABLE_TYPE BIGINT UNSIGNED
......
......@@ -919,9 +919,8 @@ typedef struct system_variables
Charset_collation_map_st character_set_collations;
/* Temporary for HNSW tests */
uint hnsw_max_connection_per_layer;
uint hnsw_ef_constructor;
uint hnsw_ef_search;
uint mhnsw_max_edges_per_node;
double mhnsw_limit_multiplier;
} SV;
/**
......
......@@ -7373,22 +7373,18 @@ static Sys_var_enum Sys_block_encryption_mode(
SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG),
block_encryption_mode_values, DEFAULT(0));
/* Temporary for HNSW tests */
static Sys_var_uint Sys_hnsw_ef_search(
"hnsw_ef_search",
"hnsw_ef_search",
SESSION_VAR(hnsw_ef_search), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
static Sys_var_uint Sys_hnsw_ef_constructor(
"hnsw_ef_constructor",
"hnsw_ef_constructor",
SESSION_VAR(hnsw_ef_constructor), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
static Sys_var_uint Sys_hnsw_max_connection_per_layer(
"hnsw_max_connection_per_layer",
"hnsw_max_connection_per_layer",
SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(50),
BLOCK_SIZE(1));
/*
  Session variable mhnsw_limit_multiplier: a DOUBLE in the range 1..100,
  default 2.

  It controls the search side of the vector index: mhnsw_first() multiplies
  the LIMIT of the query by this value to get the number of candidates
  (ef_search) it looks for, so the candidate list scales with LIMIT instead
  of being a fixed number.
*/
static Sys_var_double Sys_mhnsw_limit_multiplier(
"mhnsw_limit_multiplier",
"Defines the number of result candidates to look for in the "
"vector index for ORDER BY ... LIMIT N queries. Specified in term "
"of LIMIT (1 means look for exactly N candidates, 2 means look for "
"2*N, etc). Larger values means the search will be slower, but the "
"result will be closer to perfect",
SESSION_VAR(mhnsw_limit_multiplier), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(1, 100), DEFAULT(2));
/*
  Session variable mhnsw_max_edges_per_node: an unsigned integer in the
  range 2..200, default 15.

  It is read on the INSERT side of the vector index: mhnsw_insert() uses it
  as the maximum number of neighbors of a node (twice that on layer 0) and
  computes 1 / std::log() of it as a normalization factor;
  mhnsw_hlindex_table_def() also uses it when formatting the index table
  definition.

  NOTE(review): the lower bound of 2 presumably exists to keep that
  std::log() value non-zero - confirm before changing the range.
*/
static Sys_var_uint Sys_mhnsw_max_edges_per_node(
"mhnsw_max_edges_per_node",
"Larger values means slower INSERT, larger index size and higher "
"memory consumption, but better search results. Not used for SELECT",
SESSION_VAR(mhnsw_max_edges_per_node), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(2, 200), DEFAULT(15), BLOCK_SIZE(1));
......@@ -21,6 +21,14 @@
#include "key.h"
#include <scope.h>
// Algorithm parameters (hard-coded, not user-configurable)
//
// Values that were best by test (fastest construction with recall > 99%
// for ef=20, limit=10); each bracket presumably lists
// [ ef_construction_multiplier, alpha, M ] - TODO confirm the order:
// for random-xs-20-euclidean (9000) [ 3, 1.1, M=7 ]
// for mnist-784-euclidean (60000) [ 4, 1.1, M=13 ]
// for sift-128-euclidean (1000000) [ 4, 1.1, M>64 ] (98% with M=64)

// mhnsw_insert() searches each layer for
// ef_construction_multiplier * max_neighbors candidates
static const double ef_construction_multiplier = 4;

// select_neighbors() discards a candidate when
// distance_to(already selected neighbor) * alpha < distance to the target
static const double alpha = 1.1;
class MHNSW_Context;
class FVector: public Sql_alloc
......@@ -230,7 +238,7 @@ static int select_neighbors(MHNSW_Context *ctx, size_t layer,
bool discard= false;
for (const FVectorNode &neigh : neighbors)
{
if ((discard= vec->distance_to(neigh) < target_dist))
if ((discard= vec->distance_to(neigh) * alpha < target_dist))
break;
}
if (!discard)
......@@ -427,7 +435,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
if (res->length() == 0 || res->length() % 4)
return bad_value_on_insert(vec_field);
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.hnsw_max_connection_per_layer);
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.mhnsw_max_edges_per_node);
table->file->position(table->record[0]);
......@@ -495,15 +503,14 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
for (longlong cur_layer= new_node_layer; cur_layer >= 0; cur_layer--)
{
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
? thd->variables.mhnsw_max_edges_per_node * 2
: thd->variables.mhnsw_max_edges_per_node;
if (int err= search_layer(&ctx, start_nodes,
thd->variables.hnsw_ef_constructor, cur_layer,
&candidates))
static_cast<uint>(ef_construction_multiplier * max_neighbors),
cur_layer, &candidates))
return err;
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
? thd->variables.hnsw_max_connection_per_layer * 2
: thd->variables.hnsw_max_connection_per_layer;
if (int err= select_neighbors(&ctx, cur_layer, target, candidates,
max_neighbors))
return err;
......@@ -567,8 +574,9 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
FVector target(&ctx, res->ptr());
ctx.target= &target;
ulonglong ef_search= std::max<ulonglong>( //XXX why not always limit?
thd->variables.hnsw_ef_search, limit);
// this auto-scales ef with the limit, providing more adequate
// behavior than a fixed ef_search
uint ef_search= static_cast<uint>(limit * thd->variables.mhnsw_limit_multiplier);
for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
{
......@@ -619,6 +627,6 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
size_t len= sizeof(templ) + 32;
char *s= thd->alloc(len);
len= my_snprintf(s, len, templ, ref_length, 2 * ref_length *
thd->variables.hnsw_max_connection_per_layer);
thd->variables.mhnsw_max_edges_per_node);
return {s, len};
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment