Commit 7fdf6740 authored by konstantin@oak.local's avatar konstantin@oak.local

Implementation of SUM(DISTINCT), tests cases

parent 89b2ddcf
DROP TABLE IF EXISTS t1;
CREATE TABLE t1 (
id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
gender CHAR(1),
name VARCHAR(20)
);
SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
s1
NULL
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
s1
NULL
INSERT INTO t1 (gender, name) VALUES ('F', 'Helen'), ('F', 'Anastasia'),
('F', 'Katherine'), ('F', 'Margo'), ('F', 'Magdalene'), ('F', 'Mary');
CREATE TABLE t2 SELECT name FROM t1;
SELECT (SELECT SUM(DISTINCT LENGTH(name)) FROM t1) FROM t2;
(SELECT SUM(DISTINCT LENGTH(name)) FROM t1)
18
18
18
18
18
18
18
18
18
DROP TABLE t2;
INSERT INTO t1 (gender, name) VALUES ('F', 'Eva'), ('F', 'Sofia'),
('F', 'Sara'), ('F', 'Golda'), ('F', 'Toba'), ('F', 'Victory'),
('F', 'Faina'), ('F', 'Miriam'), ('F', 'Beki'), ('F', 'America'),
('F', 'Susan'), ('F', 'Glory'), ('F', 'Priscilla'), ('F', 'Rosmary'),
('F', 'Rose'), ('F', 'Margareth'), ('F', 'Elizabeth'), ('F', 'Meredith'),
('F', 'Julie'), ('F', 'Xenia'), ('F', 'Zena'), ('F', 'Olga'),
('F', 'Brunhilda'), ('F', 'Nataly'), ('F', 'Lara'), ('F', 'Svetlana'),
('F', 'Grethem'), ('F', 'Irene');
SELECT
SUM(DISTINCT LENGTH(name)) s1,
SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
FROM t1;
s1 s2 s3
42 0 7
SELECT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3;
s1 s2 s3
42 0 7
SELECT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
s1 s2 s3
42 0 NULL
42 0 7
42 0 4
42 0 4
42 0 4
42 0 4
42 0 4
SELECT SQL_BUFFER_RESULT
SUM(DISTINCT LENGTH(name)) s1,
SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
FROM t1;
s1 s2 s3
42 0 7
SELECT SQL_BUFFER_RESULT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
s1 s2 s3
42 0 NULL
42 0 7
42 0 4
42 0 4
42 0 4
42 0 4
42 0 4
SET @l=1;
UPDATE t1 SET name=CONCAT(name, @l:=@l+1);
SELECT SUM(DISTINCT RIGHT(name, 1)) FROM t1;
SUM(DISTINCT RIGHT(name, 1))
45
SELECT SUM(DISTINCT id) FROM t1;
SUM(DISTINCT id)
703
SELECT SUM(DISTINCT id % 11) FROM t1;
SUM(DISTINCT id % 11)
55
DROP TABLE t1;
CREATE TABLE t1 (id INTEGER);
CREATE TABLE t2 (id INTEGER);
INSERT INTO t1 (id) VALUES (1), (1), (1),(1);
INSERT INTO t2 (id) SELECT id FROM t1;
INSERT INTO t1 (id) SELECT id FROM t2;
/* 8 */
INSERT INTO t1 (id) SELECT id FROM t2;
/* 12 */
INSERT INTO t1 (id) SELECT id FROM t2;
/* 16 */
INSERT INTO t1 (id) SELECT id FROM t2;
/* 20 */
INSERT INTO t1 (id) SELECT id FROM t2;
/* 24 */
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+1 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+2 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+4 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+8 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+16 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+32 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+64 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+128 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+256 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+512 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+1024 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+2048 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+4096 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+8192 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 SELECT id FROM t1 ORDER BY id*rand();
SELECT SUM(DISTINCT id) sm FROM t1;
sm
134225920
SELECT SUM(DISTINCT id) sm FROM t2;
sm
134225920
SELECT SUM(DISTINCT id) sm FROM t1 group by id % 13;
sm
10327590
10328851
10330112
10331373
10332634
10317510
10318770
10320030
10321290
10322550
10323810
10325070
10326330
SET max_heap_table_size=16384;
SHOW variables LIKE 'max_heap_table_size';
Variable_name Value
max_heap_table_size 16384
SELECT SUM(DISTINCT id) sm FROM t1;
sm
134225920
SELECT SUM(DISTINCT id) sm FROM t2;
sm
134225920
SELECT SUM(DISTINCT id) sm FROM t1 GROUP BY id % 13;
sm
10327590
10328851
10330112
10331373
10332634
10317510
10318770
10320030
10321290
10322550
10323810
10325070
10326330
DROP TABLE t1;
DROP TABLE t2;
#
# Various tests for SUM(DISTINCT ...)
#
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
CREATE TABLE t1 (
id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
gender CHAR(1),
name VARCHAR(20)
);
# According to ANSI SQL, SUM(DISTINCT ...) should return NULL for empty
# record set
SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
# According to ANSI SQL, SUM(DISTINCT ...) should return NULL for records sets
# entirely consisting of NULLs
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
# Filling table with t1
INSERT INTO t1 (gender, name) VALUES ('F', 'Helen'), ('F', 'Anastasia'),
('F', 'Katherine'), ('F', 'Margo'), ('F', 'Magdalene'), ('F', 'Mary');
CREATE TABLE t2 SELECT name FROM t1;
SELECT (SELECT SUM(DISTINCT LENGTH(name)) FROM t1) FROM t2;
DROP TABLE t2;
INSERT INTO t1 (gender, name) VALUES ('F', 'Eva'), ('F', 'Sofia'),
('F', 'Sara'), ('F', 'Golda'), ('F', 'Toba'), ('F', 'Victory'),
('F', 'Faina'), ('F', 'Miriam'), ('F', 'Beki'), ('F', 'America'),
('F', 'Susan'), ('F', 'Glory'), ('F', 'Priscilla'), ('F', 'Rosmary'),
('F', 'Rose'), ('F', 'Margareth'), ('F', 'Elizabeth'), ('F', 'Meredith'),
('F', 'Julie'), ('F', 'Xenia'), ('F', 'Zena'), ('F', 'Olga'),
('F', 'Brunhilda'), ('F', 'Nataly'), ('F', 'Lara'), ('F', 'Svetlana'),
('F', 'Grethem'), ('F', 'Irene');
SELECT
SUM(DISTINCT LENGTH(name)) s1,
SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
FROM t1;
SELECT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3;
SELECT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
# here we explicitly request summing through temporary table (so
# Item_sum_sum_distinct::copy_or_same() is called)
SELECT SQL_BUFFER_RESULT
SUM(DISTINCT LENGTH(name)) s1,
SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
FROM t1;
SELECT SQL_BUFFER_RESULT
SUM(DISTINCT LENGTH(g1.name)) s1,
SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
# this test demonstrates that strings are automatically converted to numbers
# before summing
SET @l=1;
UPDATE t1 SET name=CONCAT(name, @l:=@l+1);
SELECT SUM(DISTINCT RIGHT(name, 1)) FROM t1;
# this is a test case for ordinary t1
SELECT SUM(DISTINCT id) FROM t1;
SELECT SUM(DISTINCT id % 11) FROM t1;
DROP TABLE t1;
#
# Test the case when distinct values doesn't fit in memory and
# filesort is used (see uniques.cc:merge_walk)
#
CREATE TABLE t1 (id INTEGER);
CREATE TABLE t2 (id INTEGER);
INSERT INTO t1 (id) VALUES (1), (1), (1),(1);
INSERT INTO t2 (id) SELECT id FROM t1;
INSERT INTO t1 (id) SELECT id FROM t2; /* 8 */
INSERT INTO t1 (id) SELECT id FROM t2; /* 12 */
INSERT INTO t1 (id) SELECT id FROM t2; /* 16 */
INSERT INTO t1 (id) SELECT id FROM t2; /* 20 */
INSERT INTO t1 (id) SELECT id FROM t2; /* 24 */
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+1 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+2 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+4 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+8 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+16 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+32 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+64 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+128 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+256 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+512 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+1024 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+2048 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+4096 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
INSERT INTO t2 (id) SELECT id+8192 FROM t1;
INSERT INTO t1 SELECT id FROM t2;
DELETE FROM t2;
#INSERT INTO t2 (id) SELECT id+16384 FROM t1;
#INSERT INTO t1 SELECT id FROM t2;
#DELETE FROM t2;
#INSERT INTO t2 (id) SELECT id+32768 FROM t1;
#INSERT INTO t1 SELECT id FROM t2;
#DELETE FROM t2;
#INSERT INTO t2 (id) SELECT id+65536 FROM t1;
#INSERT INTO t1 SELECT id FROM t2;
#DELETE FROM t2;
INSERT INTO t2 SELECT id FROM t1 ORDER BY id*rand();
# SELECT '++++++++++++++++++++++++++++++++++++++++++++++++++';
SELECT SUM(DISTINCT id) sm FROM t1;
SELECT SUM(DISTINCT id) sm FROM t2;
SELECT SUM(DISTINCT id) sm FROM t1 group by id % 13;
# this limit for max_heap_table_size is set to force testing the case, when
# all distinct sum values can not fit in memory and must be stored in a
# temporary table
SET max_heap_table_size=16384;
# to check that max_heap_table_size was actually set (hard limit for minimum
# max_heap_table_size is set in mysqld.cc):
SHOW variables LIKE 'max_heap_table_size';
SELECT SUM(DISTINCT id) sm FROM t1;
SELECT SUM(DISTINCT id) sm FROM t2;
SELECT SUM(DISTINCT id) sm FROM t1 GROUP BY id % 13;
DROP TABLE t1;
DROP TABLE t2;
...@@ -756,6 +756,39 @@ uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek, ...@@ -756,6 +756,39 @@ uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
} /* read_to_buffer */ } /* read_to_buffer */
/*
Put all room used by freed buffer to use in adjacent buffer. Note, that
we can't simply distribute memory evenly between all buffers, because
new areas must not overlap with old ones.
SYNOPSYS
reuse_freed_buff()
queue IN list of non-empty buffers, without freed buffer
reuse IN empty buffer
key_length IN key length
*/
void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length)
{
uchar *reuse_end= reuse->base + reuse->max_keys * key_length;
for (uint i= 0; i < queue->elements; ++i)
{
BUFFPEK *bp= (BUFFPEK *) queue_element(queue, i);
if (bp->base + bp->max_keys * key_length == reuse->base)
{
bp->max_keys+= reuse->max_keys;
return;
}
else if (bp->base == reuse_end)
{
bp->base= reuse->base;
bp->max_keys+= reuse->max_keys;
return;
}
}
DBUG_ASSERT(0);
}
/* /*
Merge buffers to one buffer Merge buffers to one buffer
*/ */
...@@ -881,29 +914,8 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file, ...@@ -881,29 +914,8 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
if (!(error= (int) read_to_buffer(from_file,buffpek, if (!(error= (int) read_to_buffer(from_file,buffpek,
rec_length))) rec_length)))
{ {
uchar *base= buffpek->base;
ulong max_keys= buffpek->max_keys;
VOID(queue_remove(&queue,0)); VOID(queue_remove(&queue,0));
reuse_freed_buff(&queue, buffpek, rec_length);
/* Put room used by buffer to use in other buffer */
for (refpek= (BUFFPEK**) &queue_top(&queue);
refpek <= (BUFFPEK**) &queue_end(&queue);
refpek++)
{
buffpek= *refpek;
if (buffpek->base+buffpek->max_keys*rec_length == base)
{
buffpek->max_keys+= max_keys;
break;
}
else if (base+max_keys*rec_length == buffpek->base)
{
buffpek->base= base;
buffpek->max_keys+= max_keys;
break;
}
}
break; /* One buffer have been removed */ break; /* One buffer have been removed */
} }
else if (error == -1) else if (error == -1)
......
...@@ -262,6 +262,122 @@ double Item_sum_sum::val() ...@@ -262,6 +262,122 @@ double Item_sum_sum::val()
} }
/* Item_sum_sum_distinct */
Item_sum_sum_distinct::Item_sum_sum_distinct(Item *item)
:Item_sum_num(item), sum(0.0), tree(0)
{
/*
quick_group is an optimizer hint, which means that GROUP BY can be
handled with help of index on grouped columns.
By setting quick_group to zero we force creation of temporary table
to perform GROUP BY.
*/
quick_group= 0;
}
Item_sum_sum_distinct::Item_sum_sum_distinct(THD *thd,
Item_sum_sum_distinct &original)
:Item_sum_num(thd, original), sum(0.0), tree(0)
{
quick_group= 0;
}
Item_sum_sum_distinct::~Item_sum_sum_distinct()
{
delete tree;
}
Item *
Item_sum_sum_distinct::copy_or_same(THD *thd)
{
return new (&thd->mem_root) Item_sum_sum_distinct(thd, *this);
}
C_MODE_START
static int simple_raw_key_cmp(void* arg, const void* key1, const void* key2)
{
return memcmp(key1, key2, *(uint *) arg);
}
C_MODE_END
bool Item_sum_sum_distinct::setup(THD *thd)
{
SELECT_LEX *select_lex= thd->lex->current_select;
/* what does it mean??? */
if (select_lex->linkage == GLOBAL_OPTIONS_TYPE)
return 1;
DBUG_ASSERT(tree == 0); /* setup can not be called twice */
/*
Uniques handles all unique elements in a tree until they can't fit in.
Then thee tree is dumped to the temporary file.
See class Unique for details.
*/
null_value= maybe_null= 1;
/*
TODO: if underlying item result fits in 4 bytes we can take advantage
of it and have tree of long/ulong. It gives 10% performance boost
*/
static uint key_length= sizeof(double);
tree= new Unique(simple_raw_key_cmp, &key_length, key_length,
thd->variables.max_heap_table_size);
return tree == 0;
}
void Item_sum_sum_distinct::clear()
{
DBUG_ASSERT(tree); /* we always have a tree */
null_value= 1;
tree->reset();
}
bool Item_sum_sum_distinct::add()
{
/* args[0]->val() may reset args[0]->null_value */
double val= args[0]->val();
if (!args[0]->null_value)
{
DBUG_ASSERT(tree);
null_value= 0;
if (val)
return tree->unique_add(&val);
}
return 0;
}
C_MODE_START
static int sum_sum_distinct(void *element, element_count num_of_dups,
void *item_sum_sum_distinct)
{
((Item_sum_sum_distinct *)
(item_sum_sum_distinct))->add(* (double *) element);
return 0;
}
C_MODE_END
double Item_sum_sum_distinct::val()
{
/*
We don't have a tree only if 'setup()' hasn't been called;
this is the case of sql_select.cc:return_zero_rows.
*/
sum= 0.0;
if (tree)
tree->walk(sum_sum_distinct, (void *) this);
return sum;
}
/* end of Item_sum_sum_distinct */
Item *Item_sum_count::copy_or_same(THD* thd) Item *Item_sum_count::copy_or_same(THD* thd)
{ {
return new (&thd->mem_root) Item_sum_count(thd, *this); return new (&thd->mem_root) Item_sum_count(thd, *this);
...@@ -1036,11 +1152,6 @@ String *Item_variance_field::val_str(String *str) ...@@ -1036,11 +1152,6 @@ String *Item_variance_field::val_str(String *str)
#include "sql_select.h" #include "sql_select.h"
int simple_raw_key_cmp(void* arg, byte* key1, byte* key2)
{
return memcmp(key1, key2, *(uint*) arg);
}
int simple_str_key_cmp(void* arg, byte* key1, byte* key2) int simple_str_key_cmp(void* arg, byte* key1, byte* key2)
{ {
Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg; Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg;
......
...@@ -27,9 +27,9 @@ class Item_sum :public Item_result_field ...@@ -27,9 +27,9 @@ class Item_sum :public Item_result_field
{ {
public: public:
enum Sumfunctype enum Sumfunctype
{ COUNT_FUNC,COUNT_DISTINCT_FUNC,SUM_FUNC,AVG_FUNC,MIN_FUNC, { COUNT_FUNC, COUNT_DISTINCT_FUNC, SUM_FUNC, SUM_DISTINCT_FUNC, AVG_FUNC,
MAX_FUNC, UNIQUE_USERS_FUNC,STD_FUNC,VARIANCE_FUNC,SUM_BIT_FUNC, MIN_FUNC, MAX_FUNC, UNIQUE_USERS_FUNC, STD_FUNC, VARIANCE_FUNC,
UDF_SUM_FUNC, GROUP_CONCAT_FUNC SUM_BIT_FUNC, UDF_SUM_FUNC, GROUP_CONCAT_FUNC
}; };
Item **args,*tmp_args[2]; Item **args,*tmp_args[2];
...@@ -138,6 +138,39 @@ class Item_sum_sum :public Item_sum_num ...@@ -138,6 +138,39 @@ class Item_sum_sum :public Item_sum_num
}; };
/*
Item_sum_sum_distinct - SELECT SUM(DISTINCT expr) FROM ...
support. See also: MySQL manual, chapter 'Adding New Functions To MySQL'
and comments in item_sum.cc.
*/
class Unique;
class Item_sum_sum_distinct :public Item_sum_num
{
double sum;
Unique *tree;
private:
Item_sum_sum_distinct(THD *thd, Item_sum_sum_distinct &item);
public:
Item_sum_sum_distinct(Item *item_par);
~Item_sum_sum_distinct();
bool setup(THD *thd);
void clear();
bool add();
double val();
inline void add(double val) { sum+= val; }
enum Sumfunctype sum_func () const { return SUM_DISTINCT_FUNC; }
void reset_field() {} // not used
void update_field() {} // not used
const char *func_name() const { return "sum_distinct"; }
Item *copy_or_same(THD* thd);
virtual void no_rows_in_result() {}
};
class Item_sum_count :public Item_sum_int class Item_sum_count :public Item_sum_int
{ {
longlong count; longlong count;
......
...@@ -1207,8 +1207,13 @@ class user_var_entry ...@@ -1207,8 +1207,13 @@ class user_var_entry
DTCollation collation; DTCollation collation;
}; };
/*
/* Class for unique (removing of duplicates) */ Unique -- class for unique (removing of duplicates).
Puts all values to the TREE. If the tree becomes too big,
it's dumped to the file. User can request sorted values, or
just iterate through them. In the last case tree merging is performed in
memory simultaneously with iteration, so it should be ~2-3x faster.
*/
class Unique :public Sql_alloc class Unique :public Sql_alloc
{ {
...@@ -1222,10 +1227,10 @@ class Unique :public Sql_alloc ...@@ -1222,10 +1227,10 @@ class Unique :public Sql_alloc
public: public:
ulong elements; ulong elements;
Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg, Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,
uint size_arg, ulong max_in_memory_size_arg); uint size_arg, ulong max_in_memory_size_arg);
~Unique(); ~Unique();
inline bool unique_add(gptr ptr) inline bool unique_add(void *ptr)
{ {
if (tree.elements_in_tree > max_elements && flush()) if (tree.elements_in_tree > max_elements && flush())
return 1; return 1;
...@@ -1234,6 +1239,9 @@ public: ...@@ -1234,6 +1239,9 @@ public:
bool get(TABLE *table); bool get(TABLE *table);
void reset();
bool walk(tree_walk_action action, void *walk_action_arg);
friend int unique_write_to_file(gptr key, element_count count, Unique *unique); friend int unique_write_to_file(gptr key, element_count count, Unique *unique);
friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique); friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique);
}; };
......
...@@ -78,3 +78,4 @@ int merge_buffers(SORTPARAM *param,IO_CACHE *from_file, ...@@ -78,3 +78,4 @@ int merge_buffers(SORTPARAM *param,IO_CACHE *from_file,
IO_CACHE *to_file, uchar *sort_buffer, IO_CACHE *to_file, uchar *sort_buffer,
BUFFPEK *lastbuff,BUFFPEK *Fb, BUFFPEK *lastbuff,BUFFPEK *Fb,
BUFFPEK *Tb,int flag); BUFFPEK *Tb,int flag);
void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length);
...@@ -4055,14 +4055,25 @@ sum_expr: ...@@ -4055,14 +4055,25 @@ sum_expr:
{ $$= new Item_sum_unique_users($3,atoi($5.str),atoi($7.str),$9); } { $$= new Item_sum_unique_users($3,atoi($5.str),atoi($7.str),$9); }
| MIN_SYM '(' in_sum_expr ')' | MIN_SYM '(' in_sum_expr ')'
{ $$=new Item_sum_min($3); } { $$=new Item_sum_min($3); }
/*
According to ANSI SQL, DISTINCT is allowed and has
no sence inside MIN and MAX grouping functions; so MIN|MAX(DISTINCT ...)
is processed like an ordinary MIN | MAX()
*/
| MIN_SYM '(' DISTINCT in_sum_expr ')'
{ $$=new Item_sum_min($4); }
| MAX_SYM '(' in_sum_expr ')' | MAX_SYM '(' in_sum_expr ')'
{ $$=new Item_sum_max($3); } { $$=new Item_sum_max($3); }
| MAX_SYM '(' DISTINCT in_sum_expr ')'
{ $$=new Item_sum_max($4); }
| STD_SYM '(' in_sum_expr ')' | STD_SYM '(' in_sum_expr ')'
{ $$=new Item_sum_std($3); } { $$=new Item_sum_std($3); }
| VARIANCE_SYM '(' in_sum_expr ')' | VARIANCE_SYM '(' in_sum_expr ')'
{ $$=new Item_sum_variance($3); } { $$=new Item_sum_variance($3); }
| SUM_SYM '(' in_sum_expr ')' | SUM_SYM '(' in_sum_expr ')'
{ $$=new Item_sum_sum($3); } { $$=new Item_sum_sum($3); }
| SUM_SYM '(' DISTINCT in_sum_expr ')'
{ $$=new Item_sum_sum_distinct($4); }
| GROUP_CONCAT_SYM '(' opt_distinct expr_list opt_gorder_clause | GROUP_CONCAT_SYM '(' opt_distinct expr_list opt_gorder_clause
opt_gconcat_separator ')' opt_gconcat_separator ')'
{ {
......
...@@ -84,6 +84,7 @@ bool Unique::flush() ...@@ -84,6 +84,7 @@ bool Unique::flush()
elements+= tree.elements_in_tree; elements+= tree.elements_in_tree;
file_ptr.count=tree.elements_in_tree; file_ptr.count=tree.elements_in_tree;
file_ptr.file_pos=my_b_tell(&file); file_ptr.file_pos=my_b_tell(&file);
if (tree_walk(&tree, (tree_walk_action) unique_write_to_file, if (tree_walk(&tree, (tree_walk_action) unique_write_to_file,
(void*) this, left_root_right) || (void*) this, left_root_right) ||
insert_dynamic(&file_ptrs, (gptr) &file_ptr)) insert_dynamic(&file_ptrs, (gptr) &file_ptr))
...@@ -93,6 +94,237 @@ bool Unique::flush() ...@@ -93,6 +94,237 @@ bool Unique::flush()
} }
/*
Clear the tree and the file.
You must call reset() if you want to reuse Unique after walk().
*/
void
Unique::reset()
{
reset_tree(&tree);
/*
If elements != 0, some trees were stored in the file (see how
flush() works). Note, that we can not count on my_b_tell(&file) == 0
here, because it can return 0 right after walk(), and walk() does not
reset any Unique member.
*/
if (elements)
{
reset_dynamic(&file_ptrs);
reinit_io_cache(&file, WRITE_CACHE, 0L, 0, 1);
}
elements= 0;
}
/*
The comparison function, passed to queue_init() in merge_walk() must
use comparison function of Uniques::tree, but compare members of struct
BUFFPEK.
*/
struct BUFFPEK_COMPARE_CONTEXT
{
qsort_cmp2 key_compare;
void *key_compare_arg;
};
C_MODE_START
static int buffpek_compare(void *arg, byte *key_ptr1, byte *key_ptr2)
{
BUFFPEK_COMPARE_CONTEXT *ctx= (BUFFPEK_COMPARE_CONTEXT *) arg;
return ctx->key_compare(ctx->key_compare_arg,
*((byte **) key_ptr1), *((byte **)key_ptr2));
}
C_MODE_END
/*
DESCRIPTION
Function is very similar to merge_buffers, but instead of writing sorted
unique keys to the output file, it invokes walk_action for each key.
This saves I/O if you need to pass through all unique keys only once.
SYNOPSIS
merge_walk()
All params are 'IN' (but see comment for begin, end):
merge_buffer buffer to perform cached piece-by-piece loading
of trees; initially the buffer is empty
merge_buffer_size size of merge_buffer. Must be aligned with
key_length
key_length size of tree element; key_length * (end - begin)
must be less or equal than merge_buffer_size.
begin pointer to BUFFPEK struct for the first tree.
end pointer to BUFFPEK struct for the last tree;
end > begin and [begin, end) form a consecutive
range. BUFFPEKs structs in that range are used and
overwritten in merge_walk().
walk_action element visitor. Action is called for each unique
key.
walk_action_arg argument to walk action. Passed to it on each call.
compare elements comparison function
compare_arg comparison function argument
file file with all trees dumped. Trees in the file
must contain sorted unique values. Cache must be
initialized in read mode.
RETURN VALUE
0 ok
<> 0 error
*/
static bool merge_walk(uchar *merge_buffer, uint merge_buffer_size,
uint key_length, BUFFPEK *begin, BUFFPEK *end,
tree_walk_action walk_action, void *walk_action_arg,
qsort_cmp2 compare, void *compare_arg,
IO_CACHE *file)
{
BUFFPEK_COMPARE_CONTEXT compare_context = { compare, compare_arg };
QUEUE queue;
if (end <= begin ||
merge_buffer_size < key_length * (end - begin + 1) ||
init_queue(&queue, end - begin, offsetof(BUFFPEK, key), 0,
buffpek_compare, &compare_context))
return 1;
/* we need space for one key when a piece of merge buffer is re-read */
merge_buffer_size-= key_length;
uchar *save_key_buff= merge_buffer + merge_buffer_size;
uint max_key_count_per_piece= merge_buffer_size/(end-begin)/key_length;
/* if piece_size is aligned reuse_freed_buffer will always hit */
uint piece_size= max_key_count_per_piece * key_length;
uint bytes_read; /* to hold return value of read_to_buffer */
BUFFPEK *top;
int res= 1;
/*
Invariant: queue must contain top element from each tree, until a tree
is not completely walked through.
Here we're forcing the invariant, inserting one element from each tree
to the queue.
*/
for (top= begin; top != end; ++top)
{
top->base= merge_buffer + (top - begin) * piece_size;
top->max_keys= max_key_count_per_piece;
bytes_read= read_to_buffer(file, top, key_length);
if (bytes_read == (uint) (-1))
goto end;
DBUG_ASSERT(bytes_read);
queue_insert(&queue, (byte *) top);
}
top= (BUFFPEK *) queue_top(&queue);
while (queue.elements > 1)
{
/*
Every iteration one element is removed from the queue, and one is
inserted by the rules of the invariant. If two adjacent elements on
the top of the queue are not equal, biggest one is unique, because all
elements in each tree are unique. Action is applied only to unique
elements.
*/
void *old_key= top->key;
/*
read next key from the cache or from the file and push it to the
queue; this gives new top.
*/
top->key+= key_length;
if (--top->mem_count)
queue_replaced(&queue);
else /* next piece should be read */
{
/* save old_key not to overwrite it in read_to_buffer */
memcpy(save_key_buff, old_key, key_length);
old_key= save_key_buff;
bytes_read= read_to_buffer(file, top, key_length);
if (bytes_read == (uint) (-1))
goto end;
else if (bytes_read > 0) /* top->key, top->mem_count are reset */
queue_replaced(&queue); /* in read_to_buffer */
else
{
/*
Tree for old 'top' element is empty: remove it from the queue and
give all its memory to the nearest tree.
*/
queue_remove(&queue, 0);
reuse_freed_buff(&queue, top, key_length);
}
}
top= (BUFFPEK *) queue_top(&queue);
/* new top has been obtained; if old top is unique, apply the action */
if (compare(compare_arg, old_key, top->key))
{
if (walk_action(old_key, 1, walk_action_arg))
goto end;
}
}
/*
Applying walk_action to the tail of the last tree: this is safe because
either we had only one tree in the beginning, either we work with the
last tree in the queue.
*/
do
{
do
{
if (walk_action(top->key, 1, walk_action_arg))
goto end;
top->key+= key_length;
}
while (--top->mem_count);
bytes_read= read_to_buffer(file, top, key_length);
if (bytes_read == (uint) (-1))
goto end;
}
while (bytes_read);
res= 0;
end:
delete_queue(&queue);
return res;
}
/*
DESCRIPTION
Walks consecutively through all unique elements:
if all elements are in memory, then it simply invokes 'tree_walk', else
all flushed trees are loaded to memory piece-by-piece, pieces are
sorted, and action is called for each unique value.
Note: so as merging resets file_ptrs state, this method can change
internal Unique state to undefined: if you want to reuse Unique after
walk() you must call reset() first!
SYNOPSIS
Unique:walk()
All params are 'IN':
action function-visitor, typed in include/my_tree.h
function is called for each unique element
arg argument for visitor, which is passed to it on each call
RETURN VALUE
0 OK
<> 0 error
*/
bool Unique::walk(tree_walk_action action, void *walk_action_arg)
{
if (elements == 0) /* the whole tree is in memory */
return tree_walk(&tree, action, walk_action_arg, left_root_right);
/* flush current tree to the file to have some memory for merge buffer */
if (flush())
return 1;
if (flush_io_cache(&file) || reinit_io_cache(&file, READ_CACHE, 0L, 0, 0))
return 1;
uchar *merge_buffer= (uchar *) my_malloc(max_in_memory_size, MYF(0));
if (merge_buffer == 0)
return 1;
int res= merge_walk(merge_buffer, max_in_memory_size, size,
(BUFFPEK *) file_ptrs.buffer,
(BUFFPEK *) file_ptrs.buffer + file_ptrs.elements,
action, walk_action_arg,
tree.compare, tree.custom_arg, &file);
x_free(merge_buffer);
return res;
}
/* /*
Modify the TABLE element so that when one calls init_records() Modify the TABLE element so that when one calls init_records()
the rows will be read in priority order. the rows will be read in priority order.
...@@ -114,7 +346,7 @@ bool Unique::get(TABLE *table) ...@@ -114,7 +346,7 @@ bool Unique::get(TABLE *table)
return 0; return 0;
} }
} }
/* Not enough memory; Save the result to file */ /* Not enough memory; Save the result to file && free memory used by tree */
if (flush()) if (flush())
return 1; return 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment