nexedi / MariaDB / Commits / 67c6d511

Commit 67c6d511, authored 21 years ago by sergefp@mysql.com
Precise read time estimates for index_merge/Unique
Parent: 20295cf1

Showing 13 changed files with 604 additions and 210 deletions (+604, -210)
    mysql-test/r/index_merge.result    +1    -1
    mysql-test/t/index_merge.test      +1    -1
    sql/filesort.cc                    +4    -4
    sql/ha_berkeley.h                  +1    -1
    sql/ha_innodb.cc                   +2    -1
    sql/ha_innodb.h                    +1    -1
    sql/handler.h                      +2    -2
    sql/mysql_priv.h                   +20   -0
    sql/opt_range.cc                   +337  -152
    sql/opt_range.h                    +51   -45
    sql/records.cc                     +0    -1
    sql/sql_class.h                    +2    -1
    sql/uniques.cc                     +182  -0
mysql-test/r/index_merge.result (view file @ 67c6d511)

-drop table if exists t0, t1, t2, t3;
+drop table if exists t0, t1, t2, t3,t4;
 create table t0
 (
 key1 int not null,
...
mysql-test/t/index_merge.test (view file @ 67c6d511)

...
@@ -3,7 +3,7 @@
 #
 --disable_warnings
-drop table if exists t0, t1, t2, t3;
+drop table if exists t0, t1, t2, t3, t4;
 --enable_warnings

 # Create and fill a table with simple keys
...
sql/filesort.cc (view file @ 67c6d511)

...
@@ -88,9 +88,9 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
 #endif
   FILESORT_INFO table_sort;
   /*
-    don't use table->sort in filesort as it is also used by
-    QUICK_INDEX_MERGE_SELECT. work with a copy of it and put it back at the
-    end when index_merge select has finished with it.
+    Don't use table->sort in filesort as it is also used by
+    QUICK_INDEX_MERGE_SELECT. Work with a copy and put it back at the end
+    when index_merge select has finished with it.
   */
   memcpy(&table_sort, &table->sort, sizeof(FILESORT_INFO));
   table->sort.io_cache= NULL;
...
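The comment in this hunk describes a plain save-and-restore of shared state. A minimal sketch of that pattern, with illustrative stand-in types (FILESORT_INFO_sketch, TABLE_sketch and filesort_sketch are not the server's definitions):

  // Sketch only: work on a private copy of the shared sort state and write it
  // back when done, so a concurrent user of table->sort is not clobbered.
  #include <cstring>

  struct FILESORT_INFO_sketch { void *io_cache; /* ... */ };
  struct TABLE_sketch { FILESORT_INFO_sketch sort; };

  static void filesort_sketch(TABLE_sketch *table)
  {
    FILESORT_INFO_sketch table_sort;
    std::memcpy(&table_sort, &table->sort, sizeof(table_sort)); // take a copy
    table->sort.io_cache = nullptr;                             // detach the shared IO cache

    /* ... sorting uses table_sort only ... */

    std::memcpy(&table->sort, &table_sort, sizeof(table_sort)); // put it back at the end
  }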
sql/ha_berkeley.h (view file @ 67c6d511)

...
@@ -167,7 +167,7 @@ class ha_berkeley: public handler
   longlong get_auto_increment();
   void print_error(int error, myf errflag);
   uint8 table_cache_type() { return HA_CACHE_TBL_TRANSACT; }
-  bool primary_key_is_clustered_covering() { return true; }
+  bool primary_key_is_clustered() { return true; }
 };

 extern bool berkeley_skip, berkeley_shared_data;
...
sql/ha_innodb.cc (view file @ 67c6d511)

...
@@ -2003,7 +2003,8 @@ build_template(
         update field->query_id so that the formula
         thd->query_id == field->query_id did not work. */
-        ibool index_contains_field = dict_index_contains_col_or_prefix(index, i);
+        ibool index_contains_field =
+            dict_index_contains_col_or_prefix(index, i);

         if (templ_type == ROW_MYSQL_REC_FIELDS
             && ((prebuilt->read_just_key && !index_contains_field) ||
...
sql/ha_innodb.h (view file @ 67c6d511)

...
@@ -187,7 +187,7 @@ class ha_innobase: public handler
   void init_table_handle_for_HANDLER();
   longlong get_auto_increment();
   uint8 table_cache_type() { return HA_CACHE_TBL_ASKTRANSACT; }
-  bool primary_key_is_clustered_covering() { return true; }
+  bool primary_key_is_clustered() { return true; }
 };

 extern bool innodb_skip;
...
sql/handler.h (view file @ 67c6d511)

...
@@ -378,10 +378,10 @@ class handler :public Sql_alloc
   /*
     RETURN
-      true   primary key (if there is one) is clustered key covering all fields
+      true   Primary key (if there is one) is clustered key covering all fields
       false  otherwise
   */
-  virtual bool primary_key_is_clustered_covering() { return false; }
+  virtual bool primary_key_is_clustered() { return false; }
 };

 /* Some extern variables used with handlers */
...
sql/mysql_priv.h (view file @ 67c6d511)

...
@@ -118,6 +118,26 @@ extern CHARSET_INFO *national_charset_info, *table_alias_charset;
 */
 #define TIME_FOR_COMPARE   5	// 5 compares == one read

+/*
+  Number of comparisons of table rowids equivalent to reading one row from a
+  table.
+*/
+#define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*2)
+
+/*
+  For sequential disk seeks the cost formula is:
+    DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * #blocks_to_skip
+
+  The cost of average seek
+    DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK = 1.0.
+*/
+#define DISK_SEEK_BASE_COST ((double)0.5)
+
+#define BLOCKS_IN_AVG_SEEK  128
+
+#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)
+
 /*
   Number of rows in a reference table when refereed through a not unique key.
   This value is only used when we don't know anything about the key
...
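As a quick illustration of the constants added above (not part of the patch; seek_cost is a hypothetical helper), the sequential-seek formula costs exactly one average-seek unit when BLOCKS_IN_AVG_SEEK blocks are skipped:

  #include <cstdio>

  #define TIME_FOR_COMPARE        5                    /* 5 compares == one read */
  #define TIME_FOR_COMPARE_ROWID  (TIME_FOR_COMPARE*2)
  #define DISK_SEEK_BASE_COST     ((double)0.5)
  #define BLOCKS_IN_AVG_SEEK      128
  #define DISK_SEEK_PROP_COST     ((double)0.5/BLOCKS_IN_AVG_SEEK)

  /* Hypothetical helper: cost of a sequential seek that skips n blocks. */
  static double seek_cost(long blocks_to_skip)
  {
    return DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * blocks_to_skip;
  }

  int main()
  {
    std::printf("skip 0 blocks:   %.3f\n", seek_cost(0));                  /* 0.500 */
    std::printf("skip 128 blocks: %.3f\n", seek_cost(BLOCKS_IN_AVG_SEEK)); /* 1.000 */
    std::printf("skip 512 blocks: %.3f\n", seek_cost(512));                /* 2.500 */
    return 0;
  }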
sql/opt_range.cc (view file @ 67c6d511)

    (diff collapsed: +337, -152; not shown)
sql/opt_range.h (view file @ 67c6d511)

...
@@ -118,11 +118,13 @@ class QUICK_RANGE_SELECT : public QUICK_SELECT_I
 protected:
   friend void print_quick_sel_range(QUICK_RANGE_SELECT *quick,
                                     const key_map *needed_reg);
   friend QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
                                                       struct st_table_ref *ref);
   friend bool get_quick_keys(struct st_qsel_param *param,
                              QUICK_RANGE_SELECT *quick, KEY_PART *key,
                              SEL_ARG *key_tree, char *min_key, uint min_key_flag,
                              char *max_key, uint max_key_flag);
   friend QUICK_RANGE_SELECT *get_quick_select(struct st_qsel_param *, uint idx,
                                               SEL_ARG *key_tree,
...
@@ -160,58 +162,62 @@ class QUICK_RANGE_SELECT : public QUICK_SELECT_I
 /*
-  QUICK_INDEX_MERGE_SELECT - index_merge acces method quick select.
+  QUICK_INDEX_MERGE_SELECT - index_merge access method quick select.
   QUICK_INDEX_MERGE_SELECT uses
    * QUICK_RANGE_SELECTs to get rows
    * Unique class to remove duplicate rows

-  INDEX MERGE OPTIMIZER
-    Current implementation doesn't detect all cases where index_merge could be
-    used, in particular:
-     * index_merge will never be used if range scan is possible (even if range
-       scan is more expensive)
-     * index_merge+'using index' is not supported (this the consequence of the
-       above restriction)
-     * If WHERE part contains complex nested AND and OR conditions, some ways to
-       retrieve rows using index_merge will not be considered. The choice of
-       read plan may depend on the order of conjuncts/disjuncts in WHERE part of
-       the query, see comments near SEL_IMERGE::or_sel_tree_with_checks and
-       imerge_list_or_list function for details.
-     * there is no "index_merge_ref" method (but index_merge on non-first table
-       in join is possible with 'range checked for each record').
-    See comments around SEL_IMERGE class and test_quick_select for more details.
+  INDEX MERGE OPTIMIZER
+    Current implementation doesn't detect all cases where index_merge could
+    be used, in particular:
+     * index_merge will never be used if range scan is possible (even if
+       range scan is more expensive)
+     * index_merge+'using index' is not supported (this the consequence of
+       the above restriction)
+     * If WHERE part contains complex nested AND and OR conditions, some ways
+       to retrieve rows using index_merge will not be considered. The choice
+       of read plan may depend on the order of conjuncts/disjuncts in WHERE
+       part of the query, see comments near imerge_list_or_list and
+       SEL_IMERGE::or_sel_tree_with_checks functions for details.
+     * There is no "index_merge_ref" method (but index_merge on non-first
+       table in join is possible with 'range checked for each record').
+    See comments around SEL_IMERGE class and test_quick_select for more
+    details.

-  ROW RETRIEVAL ALGORITHM
-    index_merge uses Unique class for duplicates removal. Index merge takes
-    advantage of clustered covering primary key (CCPK) if the table has one.
-    The algorithm is as follows:
-
-    prepare()  //implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique
-    {
-      activate 'index only';
-      while(retrieve next row for non-CCPK scan)
-      {
-        if (there is a CCPK scan and row will be retrieved by it)
-          skip this row;
-        else
-          put rowid into Unique;
-      }
-      deactivate 'index only';
-    }
-
-    fetch()  //implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls
-    {
-      retrieve all rows from row pointers stored in Unique;
-      free Unique;
-      retrieve all rows for CCPK scan;
-    }
+  ROW RETRIEVAL ALGORITHM
+    index_merge uses Unique class for duplicates removal. index_merge takes
+    advantage of Clustered Primary Key (CPK) if the table has one.
+    The index_merge algorithm consists of two phases:
+
+    Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique):
+    prepare()
+    {
+      activate 'index only';
+      while(retrieve next row for non-CPK scan)
+      {
+        if (there is a CPK scan and row will be retrieved by it)
+          skip this row;
+        else
+          put its rowid into Unique;
+      }
+      deactivate 'index only';
+    }
+
+    Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next
+             calls):
+    fetch()
+    {
+      retrieve all rows from row pointers stored in Unique;
+      free Unique;
+      retrieve all rows for CPK scan;
+    }
 */
 class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I
...
@@ -239,10 +245,10 @@ class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I
   /* last element in quick_selects list */
   QUICK_RANGE_SELECT* last_quick_select;

-  /* quick select that uses Covering Clustered Primary Key (NULL if none) */
+  /* quick select that uses clustered primary key (NULL if none) */
   QUICK_RANGE_SELECT* pk_quick_select;

-  /* true if this select is currently doing a CCPK scan */
+  /* true if this select is currently doing a clustered PK scan */
   bool doing_pk_scan;

   Unique *unique;
...
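A rough sketch of the two-phase scheme described in the comment above, using std::set<RowId> as a stand-in for the server's Unique class and plain callbacks in place of QUICK_RANGE_SELECTs; all names here are illustrative, not the actual server API:

  #include <functional>
  #include <set>
  #include <vector>

  using RowId = unsigned long;

  struct IndexMergeSketch
  {
    // Secondary-key range scans that yield rowids ("index only" retrieval).
    std::vector<std::function<bool(RowId*)>> non_cpk_scans;
    // Predicate: would the clustered-PK range scan return this row anyway?
    std::function<bool(RowId)> covered_by_cpk_scan;
    std::function<void(RowId)> fetch_row_by_rowid;
    std::function<void()>      run_cpk_scan;

    std::set<RowId> unique;   // stand-in for the Unique class

    // Phase 1: collect and de-duplicate rowids from all non-CPK scans,
    // skipping rows that the CPK scan will deliver by itself.
    void prepare()
    {
      RowId rowid;
      for (auto &scan : non_cpk_scans)
        while (scan(&rowid))
          if (!covered_by_cpk_scan || !covered_by_cpk_scan(rowid))
            unique.insert(rowid);
    }

    // Phase 2: fetch full rows by the stored rowids, then the CPK range rows.
    void fetch()
    {
      for (RowId rowid : unique)
        fetch_row_by_rowid(rowid);
      unique.clear();
      if (run_cpk_scan)
        run_cpk_scan();
    }
  };

The real implementation additionally switches the handler into 'index only' mode during phase 1, so only rowids rather than full rows are read from the secondary indexes.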
sql/records.cc (view file @ 67c6d511)

...
@@ -98,7 +98,6 @@ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table,
     }
   }
   else if (select && select->quick)
-           //&& (select->quick->get_type() != QUICK_SELECT_I::QS_TYPE_INDEX_MERGE))
   {
     DBUG_PRINT("info",("using rr_quick"));
     info->read_record=rr_quick;
...
sql/sql_class.h (view file @ 67c6d511)

...
@@ -1233,7 +1233,8 @@ class Unique :public Sql_alloc
   }

   bool get(TABLE *table);
+  static double get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+                             ulong max_in_memory_size);

   friend int unique_write_to_file(gptr key, element_count count, Unique *unique);
   friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique);
 };
...
sql/uniques.cc (view file @ 67c6d511)

...
@@ -63,12 +63,194 @@ Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
                comp_func_fixed_arg);
   /* If the following fail's the next add will also fail */
   my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16);
+  /*
+    If you change the following, change it in get_max_elements function, too.
+  */
   max_elements= max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+size);
   open_cached_file(&file, mysql_tmpdir, TEMP_PREFIX, DISK_BUFFER_SIZE,
                    MYF(MY_WME));
 }

+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#define M_E (exp(1))
+
+inline double log2_n_fact(double x)
+{
+  return (2*( ((x)+1)*log(((x)+1)/M_E) + log(2*M_PI*((x)+1))/2 ) / log(2));
+}
+
+/*
+  Calculate cost of merge_buffers call.
+
+  NOTE
+    See comment near Unique::get_use_cost for cost formula derivation.
+*/
+static double get_merge_buffers_cost(uint *buff_sizes, uint elem_size,
+                                     int last, int f, int t)
+{
+  uint sum= 0;
+  for (int i= f; i <= t; i++)
+    sum+= buff_sizes[i];
+  buff_sizes[last]= sum;
+
+  int n_buffers= t - f + 1;
+  double buf_length= sum*elem_size;
+
+  return (((double)buf_length/(n_buffers+1))/IO_SIZE)*2*n_buffers +
+         buf_length*log(n_buffers) / (TIME_FOR_COMPARE_ROWID * log(2.0));
+}
+
+/*
+  Calculate cost of merging buffers into one in Unique::get, i.e. calculate
+  how long (in terms of disk seeks) the two call
+    merge_many_buffs(...);
+    merge_buffers(...);
+  will take.
+
+  SYNOPSIS
+    get_merge_many_buffs_cost()
+      alloc         memory pool to use
+      maxbuffer     # of full buffers.
+      max_n_elems   # of elements in first maxbuffer buffers.
+      last_n_elems  # of elements in last buffer.
+      elem_size     size of buffer element.
+
+  NOTES
+    It is assumed that maxbuffer+1 buffers are merged, first maxbuffer buffers
+    contain max_n_elems each, last buffer contains last_n_elems elements.
+
+    The current implementation does a dumb simulation of merge_many_buffs
+    actions.
+
+  RETURN
+    >=0  Cost of merge in disk seeks.
+     <0  Out of memory.
+*/
+static double get_merge_many_buffs_cost(MEM_ROOT *alloc,
+                                        uint maxbuffer, uint max_n_elems,
+                                        uint last_n_elems, int elem_size)
+{
+  register int i;
+  double total_cost= 0.0;
+  int lastbuff;
+  uint *buff_sizes;
+
+  if (!(buff_sizes= (uint*)alloc_root(alloc, sizeof(uint) * (maxbuffer + 1))))
+    return -1.0;
+
+  for (i= 0; i < (int)maxbuffer; i++)
+    buff_sizes[i]= max_n_elems;
+
+  buff_sizes[maxbuffer]= last_n_elems;
+
+  if (maxbuffer >= MERGEBUFF2)
+  {
+    /* Simulate merge_many_buff */
+    while (maxbuffer >= MERGEBUFF2)
+    {
+      lastbuff= 0;
+      for (i= 0; i <= (int)maxbuffer - MERGEBUFF*3/2; i+= MERGEBUFF)
+        total_cost+= get_merge_buffers_cost(buff_sizes, elem_size,
+                                            lastbuff++, i, i+MERGEBUFF-1);
+      total_cost+= get_merge_buffers_cost(buff_sizes, elem_size,
+                                          lastbuff++, i, maxbuffer);
+      maxbuffer= (uint)lastbuff - 1;
+    }
+  }
+  /* Simulate final merge_buff call. */
+  total_cost+= get_merge_buffers_cost(buff_sizes, elem_size, 0, 0, maxbuffer);
+  return total_cost;
+}
+
+/*
+  Calclulate cost of using Unique for processing nkeys elements of size
+  key_size using max_in_memory_size memory.
+
+  RETURN
+    Use cost as # of disk seeks.
+
+  NOTES
+    cost(using_unqiue) =
+      cost(create_trees) +  (see #1)
+      cost(merge) +         (see #2)
+      cost(read_result)     (see #3)
+
+    1. Cost of trees creation
+      For each Unique::put operation there will be 2*log2(n+1) elements
+      comparisons, where n runs from 1 tree_size (we assume that all added
+      elements are different). Together this gives:
+
+      n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!) =
+        = 2*ln((N+1)!) / ln(2) = {using Stirling formula} =
+        = 2*( (N+1)*ln((N+1)/e) + (1/2)*ln(2*pi*(N+1)) / ln(2).
+
+      then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
+
+      Total cost of creating trees:
+        (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
+
+    2. Cost of merging.
+      If only one tree is created by Unique no merging will be necessary.
+      Otherwise, we model execution of merge_many_buff function and count
+      #of merges. (The reason behind this is that number of buffers is small,
+      while size of buffers is big and we don't want to loose precision with
+      O(x)-style formula)
+
+    3. If only one tree is created by Unique no disk io will happen.
+      Otherwise, ceil(key_len*n_keys) disk seeks are necessary. We assume
+      these will be random seeks.
+*/
+double Unique::get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+                            ulong max_in_memory_size)
+{
+  ulong max_elements_in_tree;
+  ulong last_tree_elems;
+  int   n_full_trees;  /* number of trees in unique - 1 */
+  double result;
+
+  max_elements_in_tree=
+    max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);
+
+  n_full_trees=    nkeys / max_elements_in_tree;
+  last_tree_elems= nkeys % max_elements_in_tree;
+
+  /* Calculate cost of creating trees */
+  result= log2_n_fact(last_tree_elems);
+  if (n_full_trees)
+    result+= n_full_trees * log2_n_fact(max_elements_in_tree);
+  result/= TIME_FOR_COMPARE_ROWID;
+
+  /* Calculate cost of merging */
+  if (!n_full_trees)
+    return result;

+  /* There is more then one tree and merging is necessary. */
+  /* Add cost of writing all trees to disk. */
+  result+= n_full_trees * ceil(key_size*max_elements_in_tree / IO_SIZE);
+  result+= ceil(key_size*last_tree_elems / IO_SIZE);
+
+  /* Cost of merge */
+  result+= get_merge_many_buffs_cost(alloc, n_full_trees,
+                                     max_elements_in_tree,
+                                     last_tree_elems, key_size);
+  /*
+    Add cost of reading the resulting sequence, assuming there were no
+    duplicate elements.
+  */
+  result+= ceil((double)key_size*nkeys/IO_SIZE);
+  return result;
+}
+
 Unique::~Unique()
 {
   close_cached_file(&file);
...
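The tree-creation term in Unique::get_use_cost relies on the Stirling approximation of 2*log2((N+1)!). A standalone sanity check (not part of the patch; log2_n_fact_exact is an illustrative helper) compares the closed form used by log2_n_fact with a direct summation:

  #include <cmath>
  #include <cstdio>

  #ifndef M_PI
  #define M_PI 3.14159265358979323846
  #endif
  #ifndef M_E
  #define M_E  2.71828182845904523536
  #endif

  /* Same closed form as log2_n_fact() in the patch: 2*log2((x+1)!) via Stirling. */
  static double log2_n_fact_approx(double x)
  {
    return (2*( ((x)+1)*log(((x)+1)/M_E) + log(2*M_PI*((x)+1))/2 ) / log(2));
  }

  /* Reference value by direct summation: 2 * sum_{k=2..n+1} log2(k). */
  static double log2_n_fact_exact(unsigned n)
  {
    double s= 0.0;
    for (unsigned k= 2; k <= n + 1; k++)
      s+= std::log2((double)k);
    return 2.0*s;
  }

  int main()
  {
    const unsigned sizes[]= {10, 1000, 100000};
    for (unsigned n : sizes)
      std::printf("n=%-7u  approx=%14.2f  exact=%14.2f\n",
                  n, log2_n_fact_approx(n), log2_n_fact_exact(n));
    return 0;
  }

Dividing the resulting comparison count by TIME_FOR_COMPARE_ROWID, as get_use_cost does, converts comparisons into the same row-read units used by the rest of the cost model.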