Commit d8d57d2c authored by Sergei Petrunia

MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters

Encode such characters in hex.
parent 748b293c
...@@ -7896,16 +7896,41 @@ a ...@@ -7896,16 +7896,41 @@ a
drop table t1; drop table t1;
# #
# Another testcase: use a character that cannot be represented in utf8: # Another testcase: use a character that cannot be represented in utf8:
# Also, now it's testcase for:
# MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
# #
create table t1 ( a varchar(100) character set cp1251); create table t1 ( a varchar(100) character set cp1251);
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98'); insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
analyze table t1 persistent for all; analyze table t1 persistent for all;
Table Op Msg_type Msg_text Table Op Msg_type Msg_text
test.t1 analyze status Operation failed test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select hist_type, histogram select hist_type, histogram
from mysql.column_stats from mysql.column_stats
where db_name=database() and table_name='t1'; where db_name=database() and table_name='t1';
hist_type histogram hist_type histogram
JSON_HB {
"target_histogram_size": 10,
"collected_at": "REPLACED",
"collected_by": "REPLACED",
"histogram_hb": [
{
"start": "€",
"size": 0.6,
"ndv": 1
},
{
"start_hex": "98",
"end_hex": "98",
"size": 0.4,
"ndv": 1
}
]
}
analyze select * from t1 where a=_cp1251 x'88';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 5.00 60.00 60.00 Using where
drop table t1; drop table t1;
# #
# ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ... # ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ...
...@@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1; ...@@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1;
insert into t1 values ('foo'),(unhex('9C')); insert into t1 values ('foo'),(unhex('9C'));
analyze table t1 persistent for all; analyze table t1 persistent for all;
Table Op Msg_type Msg_text Table Op Msg_type Msg_text
test.t1 analyze status Operation failed test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
select * from t1; select * from t1;
a a
foo foo
......
...@@ -227,9 +227,12 @@ drop table t1; ...@@ -227,9 +227,12 @@ drop table t1;
--echo # --echo #
--echo # Another testcase: use a character that cannot be represented in utf8: --echo # Another testcase: use a character that cannot be represented in utf8:
--echo # Also, now it's testcase for:
--echo # MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
--echo # --echo #
create table t1 ( a varchar(100) character set cp1251); create table t1 ( a varchar(100) character set cp1251);
insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98'); insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88');
insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98');
analyze table t1 persistent for all; analyze table t1 persistent for all;
--source include/histogram_replaces.inc --source include/histogram_replaces.inc
...@@ -237,6 +240,8 @@ select hist_type, histogram ...@@ -237,6 +240,8 @@ select hist_type, histogram
from mysql.column_stats from mysql.column_stats
where db_name=database() and table_name='t1'; where db_name=database() and table_name='t1';
analyze select * from t1 where a=_cp1251 x'88';
drop table t1; drop table t1;
--echo # --echo #
......
...@@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out) ...@@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out)
succeeds. succeeds.
*/ */
static bool json_escape_to_string(const String *str, String* out) static int json_escape_to_string(const String *str, String* out)
{ {
// Make sure 'out' has some memory allocated. // Make sure 'out' has some memory allocated.
if (!out->alloced_length() && out->alloc(128)) if (!out->alloced_length() && out->alloc(128))
return true; return JSON_ERROR_OUT_OF_SPACE;
while (1) while (1)
{ {
...@@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out) ...@@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out)
if (res >= 0) if (res >= 0)
{ {
out->length(res); out->length(res);
return false; // Ok return 0; // Ok
} }
if (res != JSON_ERROR_OUT_OF_SPACE) if (res != JSON_ERROR_OUT_OF_SPACE)
return true; // Some conversion error return res; // Some conversion error
// Out of space error. Try with a bigger buffer // Out of space error. Try with a bigger buffer
if (out->alloc(out->alloced_length()*2)) if (out->alloc(out->alloced_length()*2))
return true; return JSON_ERROR_OUT_OF_SPACE;
} }
} }
...@@ -208,8 +208,7 @@ class Histogram_json_builder : public Histogram_builder ...@@ -208,8 +208,7 @@ class Histogram_json_builder : public Histogram_builder
*/ */
bool finalize_bucket_with_end_value(void *elem) bool finalize_bucket_with_end_value(void *elem)
{ {
writer.add_member("end"); if (append_column_value(elem, false))
if (append_column_value(elem))
return true; return true;
finalize_bucket(); finalize_bucket();
return false; return false;
...@@ -224,19 +223,18 @@ class Histogram_json_builder : public Histogram_builder ...@@ -224,19 +223,18 @@ class Histogram_json_builder : public Histogram_builder
{ {
DBUG_ASSERT(bucket.size == 0); DBUG_ASSERT(bucket.size == 0);
writer.start_object(); writer.start_object();
writer.add_member("start"); if (append_column_value(elem, true))
if (append_column_value(elem))
return true; return true;
bucket.ndv= 1; bucket.ndv= 1;
bucket.size= cnt; bucket.size= cnt;
return false; return false;
} }
/* /*
Append the passed value into the JSON writer as string value Append the passed value into the JSON writer as string value
*/ */
bool append_column_value(void *elem) bool append_column_value(void *elem, bool is_start)
{ {
StringBuffer<MAX_FIELD_WIDTH> val; StringBuffer<MAX_FIELD_WIDTH> val;
...@@ -246,12 +244,21 @@ class Histogram_json_builder : public Histogram_builder ...@@ -246,12 +244,21 @@ class Histogram_json_builder : public Histogram_builder
// Escape the value for JSON // Escape the value for JSON
StringBuffer<MAX_FIELD_WIDTH> escaped_val; StringBuffer<MAX_FIELD_WIDTH> escaped_val;
if (json_escape_to_string(str, &escaped_val)) int rc= json_escape_to_string(str, &escaped_val);
return true; if (!rc)
{
// Note: The Json_writer does NOT do escapes (perhaps this should change?) writer.add_member(is_start? "start": "end");
writer.add_str(escaped_val.c_ptr_safe()); writer.add_str(escaped_val.c_ptr_safe());
return false; return false;
}
if (rc == JSON_ERROR_ILLEGAL_SYMBOL)
{
escaped_val.set_hex(val.ptr(), val.length());
writer.add_member(is_start? "start_hex": "end_hex");
writer.add_str(escaped_val.c_ptr_safe());
return false;
}
return true;
} }
/* /*
...@@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out, ...@@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out,
} }
/*
  @brief Read a hex-encoded bucket endpoint ("start_hex"/"end_hex" member)

  @param je     JSON engine; the member's value is read from it with
                json_read_value()
  @param field  Field that the decoded bytes are stored into and whose key
                image is then produced
  @param out    OUT  Receives the key image of the decoded value
  @param err    OUT  Set to an error message when the value is not a
                     well-formed hex string

  @return false  OK
  @return true   Error: the JSON value could not be read, it is not a
                 well-formed hex string, or the output buffer could not be
                 allocated. *err is only assigned in the malformed-hex case.
*/
bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out,
                              const char **err)
{
  if (json_read_value(je))
    return true;

  /*
    The value must be a plain string with no JSON escapes, and it must have
    an even length, as the loop below consumes two hex digits per byte.
  */
  if (je->value_type != JSON_VALUE_STRING || je->value_escaped ||
      (je->value_len & 1))
  {
    *err= "Expected a hex string";
    return true;
  }

  StringBuffer<128> buf;

  // Decode pairs of hex digits into bytes
  for (auto pc= je->value; pc < je->value + je->value_len; pc+=2)
  {
    int hex_char1= hexchar_to_int(pc[0]);
    int hex_char2= hexchar_to_int(pc[1]);
    // -1 is treated as "not a hex digit"
    if (hex_char1 == -1 || hex_char2 == -1)
    {
      *err= "Expected a hex string";
      return true;
    }
    buf.append((hex_char1 << 4) | hex_char2);
  }

  // Store the decoded bytes in the field, then fetch them as a key image
  field->store_text(buf.ptr(), buf.length(), field->charset());

  /*
    alloc() returns non-zero on failure. Do not let get_key_image() write
    through a buffer that we have failed to obtain.
  */
  if (out->alloc(field->pack_length()))
    return true;

  uint bytes= field->get_key_image((uchar*)out->ptr(),
                                   field->key_length(), Field::itRAW);
  out->length(bytes);
  return false;
}
/* /*
@brief Parse a JSON representation for one histogram bucket @brief Parse a JSON representation for one histogram bucket
...@@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field, ...@@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field,
} }
save1.restore_to(je); save1.restore_to(je);
// Less common endpoints:
Json_string start_hex_str("start_hex");
if (json_key_matches(je, start_hex_str.get()))
{
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
return 1;
have_start= true;
continue;
}
save1.restore_to(je);
Json_string end_hex_str("end_hex");
if (json_key_matches(je, end_hex_str.get()))
{
if (read_hex_bucket_endpoint(je, field, &value_buf, err))
return 1;
last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length());
*assigned_last_end= true;
continue;
}
save1.restore_to(je);
// Some unknown member. Skip it. // Some unknown member. Skip it.
if (json_skip_key(je)) if (json_skip_key(je))
return 1; return 1;
......
...@@ -32,12 +32,18 @@ ...@@ -32,12 +32,18 @@
"histogram_hb": [ "histogram_hb": [
{ "start": "value", "size":nnn.nn, "ndv": nnn }, { "start": "value", "size":nnn.nn, "ndv": nnn },
... ...
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}
// Optionally, start and/or end can be replaced with _hex variant
{ "start_hex": "value", "size":nnn.nn, "ndv":nnn},
...
{ "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"},
] ]
} }
The histogram is an object with single member named Histogram_json_hb:: The histogram is an object with single member named Histogram_json_hb::
JSON_NAME. The value of that member is an array of buckets. JSON_NAME. The value of that member is an array of buckets.
Each bucket is an object with these members: Each bucket is an object with these members:
"start" - the first value in the bucket. "start" - the first value in the bucket.
"size" - fraction of table rows that is contained in the bucket. "size" - fraction of table rows that is contained in the bucket.
...@@ -51,6 +57,11 @@ ...@@ -51,6 +57,11 @@
The exception is single-point buckets where last value is the same as the The exception is single-point buckets where last value is the same as the
first value. first value.
start/end can be replaced with start_hex/end_hex. In the _hex variant, the
constant is encoded in hex. This encoding is used to handle so-called
"unassigned characters": some non-UTF8 charsets have byte combinations that
are not mapped to any UTF8 character.
*/ */
class Histogram_json_hb : public Histogram_base class Histogram_json_hb : public Histogram_base
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment