Commit 398dbceb authored by unknown's avatar unknown

Bug#22555: STDDEV yields positive result for groups with only one row

When only one row was present, the subtraction of nearly the same number 
resulted in catastropic cancellation, introducing an error in the 
VARIANCE calculation near 1e-15.  That was sqrt()ed to get STDDEV, the 
error was escallated to near 1e-8.  

The simple fix of testing for a row count of 1 and forcing that to yield 
0.0 is insufficient, as two rows of the same value should also have a
variance of 0.0, yet the error would be about the same.

So, this patch changes the formula that computes the VARIANCE to be one
that is not subject to catastrophic cancellation.

In addition, it now uses only (faster-than-decimal) floating point numbers
to calculate, and renders that to other types on demand.


mysql-test/r/func_group.result:
  Test that the bug is fixed, and that no unexpected behavior arises from the 
  changes.
mysql-test/t/func_group.test:
  Test that the bug is fixed, and that no unexpected behavior arises from the 
  changes.
sql/item_sum.cc:
  Serg's suggestion: Force all VARIANCE calculations to be done with floating-
  point types.  It's faster, and the SQL standard says we may implement these
  functions any way we want.
  
  Additionally, use a form of variance calculation that is not subject to 
  catastrophic cancellation.   
  http://static.flickr.com/108/311308512_5c4e1c0c3d_b.jpg
sql/item_sum.h:
  Remove unused members and add a comment describing the recurrence relation.
parent 9bd212fd
......@@ -19,3 +19,4 @@
454f8960jsVT_kMKJtZ9OCgXoba0xQ
4554a95d7txO1DuO9G3nAizI3SkFAA
4554b3722d71SbPiI2Gx-RhbZjmuIQ
45771031yRCoM_ZfONdYchPvVEgLRg
......@@ -1029,3 +1029,271 @@ t1 CREATE TABLE `t1` (
`stddev(0)` double(8,4) default NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
drop table t1;
create table bug22555 (i smallint primary key auto_increment, s1 smallint, s2 smallint, e decimal(30,10), o double);
insert into bug22555 (s1, s2, e, o) values (53, 78, 11.4276528, 6.828112), (17, 78, 5.916793, 1.8502951), (18, 76, 2.679231, 9.17975591), (31, 62, 6.07831, 0.1), (19, 41, 5.37463, 15.1), (83, 73, 14.567426, 7.959222), (92, 53, 6.10151, 13.1856852), (7, 12, 13.92272, 3.442007), (92, 35, 11.95358909, 6.01376678), (38, 84, 2.572, 7.904571);
select std(s1/s2) from bug22555 group by i;
std(s1/s2)
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
0.00000000
select std(e) from bug22555 group by i;
std(e)
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
0.00000000000000
select std(o) from bug22555 group by i;
std(o)
0
0
0
0
0
0
0
0
0
0
drop table bug22555;
create table bug22555 (i smallint, s1 smallint, s2 smallint, o1 double, o2 double, e1 decimal, e2 decimal);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
select i, count(*) from bug22555 group by i;
i count(*)
1 1
2 1
3 1
select std(s1/s2) from bug22555 where i=1;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 where i=2;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 where i=3;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 where i=1 group by i;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 where i=2 group by i;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 where i=3 group by i;
std(s1/s2)
0.00000000
select std(s1/s2) from bug22555 group by i order by i;
std(s1/s2)
0.00000000
0.00000000
0.00000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 1 0
2 1 0
3 1 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 1 0.00000000
2 1 0.00000000
3 1 0.00000000
set @saved_div_precision_increment=@@div_precision_increment;
set div_precision_increment=19;
select i, count(*), variance(s1/s2) from bug22555 group by i order by i;
i count(*) variance(s1/s2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), variance(o1/o2) from bug22555 group by i order by i;
i count(*) variance(o1/o2)
1 1 0
2 1 0
3 1 0
select i, count(*), variance(e1/e2) from bug22555 group by i order by i;
i count(*) variance(e1/e2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
i count(*) std(s1/s2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 1 0
2 1 0
3 1 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
set div_precision_increment=20;
select i, count(*), variance(s1/s2) from bug22555 group by i order by i;
i count(*) variance(s1/s2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), variance(o1/o2) from bug22555 group by i order by i;
i count(*) variance(o1/o2)
1 1 0
2 1 0
3 1 0
select i, count(*), variance(e1/e2) from bug22555 group by i order by i;
i count(*) variance(e1/e2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
i count(*) std(s1/s2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 1 0
2 1 0
3 1 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 1 0.000000000000000000000000000000
2 1 0.000000000000000000000000000000
3 1 0.000000000000000000000000000000
set @@div_precision_increment=@saved_div_precision_increment;
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
i count(*) std(s1/s2)
1 4 0.00000000
2 4 0.00000000
3 4 0.00000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 4 0
2 4 0
3 4 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 4 0.00000000
2 4 0.00000000
3 4 0.00000000
select std(s1/s2) from bug22555;
std(s1/s2)
0.21325764
select std(o1/o2) from bug22555;
std(o1/o2)
0.21325763586649
select std(e1/e2) from bug22555;
std(e1/e2)
0.21325764
set @saved_div_precision_increment=@@div_precision_increment;
set div_precision_increment=19;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
i count(*) std(s1/s2)
1 4 0.000000000000000000000000000000
2 4 0.000000000000000000000000000000
3 4 0.000000000000000000000000000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 4 0
2 4 0
3 4 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 4 0.000000000000000000000000000000
2 4 0.000000000000000000000000000000
3 4 0.000000000000000000000000000000
select std(s1/s2) from bug22555;
std(s1/s2)
0.213257635866493405751853629226
select std(o1/o2) from bug22555;
std(o1/o2)
0.21325763586649
select std(e1/e2) from bug22555;
std(e1/e2)
0.213257635866493405751853629226
set div_precision_increment=20;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
i count(*) std(s1/s2)
1 4 0.000000000000000000000000000000
2 4 0.000000000000000000000000000000
3 4 0.000000000000000000000000000000
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
i count(*) std(o1/o2)
1 4 0
2 4 0
3 4 0
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
i count(*) std(e1/e2)
1 4 0.000000000000000000000000000000
2 4 0.000000000000000000000000000000
3 4 0.000000000000000000000000000000
select std(s1/s2) from bug22555;
std(s1/s2)
0.213257635866493405751853629226
select std(o1/o2) from bug22555;
std(o1/o2)
0.21325763586649
select std(e1/e2) from bug22555;
std(e1/e2)
0.213257635866493405751853629226
set @@div_precision_increment=@saved_div_precision_increment;
drop table bug22555;
create table bug22555 (s smallint, o double, e decimal);
insert into bug22555 values (1,1,1),(2,2,2),(3,3,3),(6,6,6),(7,7,7);
select var_samp(s), var_pop(s) from bug22555;
var_samp(s) var_pop(s)
6.7000 5.3600
select var_samp(o), var_pop(o) from bug22555;
var_samp(o) var_pop(o)
6.7 5.36
select var_samp(e), var_pop(e) from bug22555;
var_samp(e) var_pop(e)
6.7000 5.3600
drop table bug22555;
create table bug22555 (s smallint, o double, e decimal);
insert into bug22555 values (null,null,null),(null,null,null);
select var_samp(s) as 'null', var_pop(s) as 'null' from bug22555;
null null
NULL NULL
select var_samp(o) as 'null', var_pop(o) as 'null' from bug22555;
null null
NULL NULL
select var_samp(e) as 'null', var_pop(e) as 'null' from bug22555;
null null
NULL NULL
insert into bug22555 values (1,1,1);
select var_samp(s) as 'null', var_pop(s) as '0' from bug22555;
null 0
NULL 0.0000
select var_samp(o) as 'null', var_pop(o) as '0' from bug22555;
null 0
NULL 0
select var_samp(e) as 'null', var_pop(e) as '0' from bug22555;
null 0
NULL 0.0000
insert into bug22555 values (2,2,2);
select var_samp(s) as '0.5', var_pop(s) as '0.25' from bug22555;
0.5 0.25
0.5000 0.2500
select var_samp(o) as '0.5', var_pop(o) as '0.25' from bug22555;
0.5 0.25
0.5 0.25
select var_samp(e) as '0.5', var_pop(e) as '0.25' from bug22555;
0.5 0.25
0.5000 0.2500
drop table bug22555;
End 5.0 tests.
......@@ -700,3 +700,95 @@ create table t1 select stddev(0);
show create table t1;
drop table t1;
#
# Bug#22555: STDDEV yields positive result for groups with only one row
#
create table bug22555 (i smallint primary key auto_increment, s1 smallint, s2 smallint, e decimal(30,10), o double);
insert into bug22555 (s1, s2, e, o) values (53, 78, 11.4276528, 6.828112), (17, 78, 5.916793, 1.8502951), (18, 76, 2.679231, 9.17975591), (31, 62, 6.07831, 0.1), (19, 41, 5.37463, 15.1), (83, 73, 14.567426, 7.959222), (92, 53, 6.10151, 13.1856852), (7, 12, 13.92272, 3.442007), (92, 35, 11.95358909, 6.01376678), (38, 84, 2.572, 7.904571);
select std(s1/s2) from bug22555 group by i;
select std(e) from bug22555 group by i;
select std(o) from bug22555 group by i;
drop table bug22555;
create table bug22555 (i smallint, s1 smallint, s2 smallint, o1 double, o2 double, e1 decimal, e2 decimal);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
select i, count(*) from bug22555 group by i;
select std(s1/s2) from bug22555 where i=1;
select std(s1/s2) from bug22555 where i=2;
select std(s1/s2) from bug22555 where i=3;
select std(s1/s2) from bug22555 where i=1 group by i;
select std(s1/s2) from bug22555 where i=2 group by i;
select std(s1/s2) from bug22555 where i=3 group by i;
select std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
set @saved_div_precision_increment=@@div_precision_increment;
set div_precision_increment=19;
select i, count(*), variance(s1/s2) from bug22555 group by i order by i;
select i, count(*), variance(o1/o2) from bug22555 group by i order by i;
select i, count(*), variance(e1/e2) from bug22555 group by i order by i;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
set div_precision_increment=20;
select i, count(*), variance(s1/s2) from bug22555 group by i order by i;
select i, count(*), variance(o1/o2) from bug22555 group by i order by i;
select i, count(*), variance(e1/e2) from bug22555 group by i order by i;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
set @@div_precision_increment=@saved_div_precision_increment;
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
insert into bug22555 values (1,53,78,53,78,53,78),(2,17,78,17,78,17,78),(3,18,76,18,76,18,76);
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
select std(s1/s2) from bug22555;
select std(o1/o2) from bug22555;
select std(e1/e2) from bug22555;
set @saved_div_precision_increment=@@div_precision_increment;
set div_precision_increment=19;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
select std(s1/s2) from bug22555;
select std(o1/o2) from bug22555;
select std(e1/e2) from bug22555;
set div_precision_increment=20;
select i, count(*), std(s1/s2) from bug22555 group by i order by i;
select i, count(*), std(o1/o2) from bug22555 group by i order by i;
select i, count(*), std(e1/e2) from bug22555 group by i order by i;
select std(s1/s2) from bug22555;
select std(o1/o2) from bug22555;
select std(e1/e2) from bug22555;
set @@div_precision_increment=@saved_div_precision_increment;
drop table bug22555;
create table bug22555 (s smallint, o double, e decimal);
insert into bug22555 values (1,1,1),(2,2,2),(3,3,3),(6,6,6),(7,7,7);
select var_samp(s), var_pop(s) from bug22555;
select var_samp(o), var_pop(o) from bug22555;
select var_samp(e), var_pop(e) from bug22555;
drop table bug22555;
create table bug22555 (s smallint, o double, e decimal);
insert into bug22555 values (null,null,null),(null,null,null);
select var_samp(s) as 'null', var_pop(s) as 'null' from bug22555;
select var_samp(o) as 'null', var_pop(o) as 'null' from bug22555;
select var_samp(e) as 'null', var_pop(e) as 'null' from bug22555;
insert into bug22555 values (1,1,1);
select var_samp(s) as 'null', var_pop(s) as '0' from bug22555;
select var_samp(o) as 'null', var_pop(o) as '0' from bug22555;
select var_samp(e) as 'null', var_pop(e) as '0' from bug22555;
insert into bug22555 values (2,2,2);
select var_samp(s) as '0.5', var_pop(s) as '0.25' from bug22555;
select var_samp(o) as '0.5', var_pop(o) as '0.25' from bug22555;
select var_samp(e) as '0.5', var_pop(e) as '0.25' from bug22555;
drop table bug22555;
###
--echo End 5.0 tests.
This diff is collapsed.
......@@ -665,8 +665,10 @@ class Item_variance_field :public Item_result_field
double val_real();
longlong val_int()
{ /* can't be fix_fields()ed */ return (longlong) rint(val_real()); }
String *val_str(String*);
my_decimal *val_decimal(my_decimal *);
String *val_str(String *str)
{ return val_string_from_real(str); }
my_decimal *val_decimal(my_decimal *dec_buf)
{ return val_decimal_from_real(dec_buf); }
bool is_null() { (void) val_int(); return null_value; }
enum_field_types field_type() const
{
......@@ -688,6 +690,14 @@ class Item_variance_field :public Item_result_field
= (sum(ai^2) - 2*sum(a)*sum(a)/count(a) + count(a)*sum(a)^2/count(a)^2 )/count(a) =
= (sum(ai^2) - 2*sum(a)^2/count(a) + sum(a)^2/count(a) )/count(a) =
= (sum(ai^2) - sum(a)^2/count(a))/count(a)
But, this falls prey to catastrophic cancellation. Instead, use the recurrence formulas
M_{1} = x_{1}, ~ M_{k} = M_{k-1} + (x_{k} - M_{k-1}) / k newline
S_{1} = 0, ~ S_{k} = S_{k-1} + (x_{k} - M_{k-1}) times (x_{k} - M_{k}) newline
for 2 <= k <= n newline
ital variance = S_{n} / (n-1)
*/
class Item_sum_variance : public Item_sum_num
......@@ -696,9 +706,8 @@ class Item_sum_variance : public Item_sum_num
public:
Item_result hybrid_type;
double sum, sum_sqr;
my_decimal dec_sum[2], dec_sqr[2];
int cur_dec;
double recurrence_m, recurrence_s; /* Used in recurrence relation. */
ulonglong count;
uint f_precision0, f_scale0;
uint f_precision1, f_scale1;
......@@ -707,7 +716,7 @@ class Item_sum_variance : public Item_sum_num
uint prec_increment;
Item_sum_variance(Item *item_par, uint sample_arg) :Item_sum_num(item_par),
hybrid_type(REAL_RESULT), cur_dec(0), count(0), sample(sample_arg)
hybrid_type(REAL_RESULT), count(0), sample(sample_arg)
{}
Item_sum_variance(THD *thd, Item_sum_variance *item);
enum Sumfunctype sum_func () const { return VARIANCE_FUNC; }
......@@ -727,7 +736,6 @@ class Item_sum_variance : public Item_sum_num
enum Item_result result_type () const { return REAL_RESULT; }
void cleanup()
{
cur_dec= 0;
count= 0;
Item_sum_num::cleanup();
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment