Commit 2a1cdbab authored by Sergei Petrunia

Fix JSON parsing: future-proof data representation in JSON, code cleanup

parent a0b4a868
@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k);
 int json_read_value(json_engine_t *j);
 
 /*
-* json_smart_read_value() reads parses a scalar value and value length from the json engine,
-* and copies them into `value` and `value_length` respectively.
-* It should only be called when the json_engine state is JST_VALUE.
-* If it encounters a non-scalar value (say object or array) before getting to value_len,
-* such value is also read and copied into value.
+  json_smart_read_value() reads a JSON value. Pointer to value is stored in
+  *value and its length in *value_len.
+
+  If the value is not a scalar, it returns pointers to its JSON
+  representation.
+  The function should only be called when je->state==JST_VALUE.
 */
 enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len);
 
 /*
...
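To make the new comment concrete, here is a minimal usage sketch (not part of the commit) that walks a JSON array of scalars with json_smart_read_value(), following the same scan pattern that Histogram_json::parse() uses later in this diff. The include files, the uchar typedef and my_charset_utf8mb4_bin are assumed to come from the server build environment.

#include <stdio.h>
#include "json_lib.h"   /* json_engine_t, json_scan_start(), json_smart_read_value() */
#include "m_ctype.h"    /* my_charset_utf8mb4_bin (assumption about the header) */

static void dump_array_items(const char *json, size_t len)
{
  json_engine_t je;
  json_scan_start(&je, &my_charset_utf8mb4_bin,
                  (const uchar*) json, (const uchar*) json + len);

  /* The root value must be a JSON array for this walk to make sense */
  if (json_read_value(&je) || je.value_type != JSON_VALUE_ARRAY)
    return;

  while (!json_scan_next(&je))
  {
    if (je.state == JST_VALUE)
    {
      const char *val;
      int val_len;
      /* Only legal here because je.state == JST_VALUE, as the comment requires */
      json_smart_read_value(&je, &val, &val_len);
      printf("item: %.*s\n", val_len, val);
    }
    else if (je.state == JST_ARRAY_END)
      break;
  }
}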
@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 explain extended select * from t1_json where a < 'b-1a';
 analyze select * from t1_json where a > 'zzzzzzzzz';
 
+drop table ten;
+
 # test different valid JSON strings that are invalid histograms.
 UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json';
 FLUSH TABLES;
@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
 --source include/have_sequence.inc
-create table users (
+create table t2 (
   city varchar(100)
 );
 set histogram_size=50;
-insert into users select 'Moscow' from seq_1_to_99;
-insert into users select 'Helsinki' from seq_1_to_2;
+insert into t2 select 'Moscow' from seq_1_to_99;
+insert into t2 select 'Helsinki' from seq_1_to_2;
 set histogram_type=json_hb;
-analyze table users persistent for all;
-explain extended select * from users where city = 'Moscow';
-analyze select * from users where city = 'Moscow';
-explain extended select * from users where city = 'Helsinki';
-analyze select * from users where city = 'helsinki';
-explain extended select * from users where city < 'Lagos';
+analyze table t2 persistent for all;
+explain extended select * from t2 where city = 'Moscow';
+analyze select * from t2 where city = 'Moscow';
+explain extended select * from t2 where city = 'Helsinki';
+analyze select * from t2 where city = 'helsinki';
+explain extended select * from t2 where city < 'Lagos';
 
 drop table t1_bin;
 drop table t1_json;
-drop table users;
+drop table t2;
 DELETE FROM mysql.column_stats;
...
@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED
 ER_PROVIDER_NOT_LOADED
         eng "MariaDB tried to use the %s, but its provider plugin is not loaded"
 ER_JSON_HISTOGRAM_PARSE_FAILED
-        eng "Failed to parse histogram, encountered JSON_TYPE '%d'."
+        eng "Failed to parse histogram: %s at offset %d."
@@ -1123,6 +1123,7 @@ class Column_stat: public Stat_table
   void get_stat_values()
   {
     table_field->read_stats->set_all_nulls();
+    // default: hist_type=NULL means there's no histogram
     table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
 
     if (table_field->read_stats->min_value)
@@ -1196,7 +1197,10 @@ class Column_stat: public Stat_table
           break;
         }
         case COLUMN_STAT_HISTOGRAM:
-          //TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM
+          /*
+            Do nothing here: we take the histogram length from the 'histogram'
+            column itself
+          */
           break;
       }
     }
@@ -1245,7 +1249,7 @@ class Column_stat: public Stat_table
     }
     if (!hist->parse(mem_root, table_field,
                      table_field->read_stats->histogram_type_on_disk,
-                     (const uchar*)val.ptr(), val.length()))
+                     val.ptr(), val.length()))
     {
       table_field->read_stats->histogram_= hist;
       return hist;
@@ -1255,19 +1259,19 @@ class Column_stat: public Stat_table
   }
 };
 
-bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *,
-                             Histogram_type type_arg,
-                             const uchar *ptr_arg, uint size_arg)
+bool Histogram_binary::parse(MEM_ROOT *mem_root, Field*,
+                             Histogram_type type_arg, const char *hist_data,
+                             size_t hist_data_len)
 {
-  // Just copy the data
-  size = (uint8) size_arg;
-  type = type_arg;
-  if ((values = (uchar*)alloc_root(mem_root, size_arg)))
-  {
-    memcpy(values, ptr_arg, size_arg);
-    return false;
-  }
-  return true;
+  /* On-disk and in-memory formats are the same. Just copy the data. */
+  type= type_arg;
+  size= (uint8) hist_data_len; // 'size' holds the size of histogram in bytes
+  if (!(values= (uchar*)alloc_root(mem_root, hist_data_len)))
+    return true;
+  memcpy(values, hist_data, hist_data_len);
+  return false;
 }
 
 /*
@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root,
 */
 bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
-                           Histogram_type type_arg, const uchar *ptr,
-                           uint size_arg)
+                           Histogram_type type_arg, const char *hist_data,
+                           size_t hist_data_len)
 {
   DBUG_ENTER("Histogram_json::parse");
   DBUG_ASSERT(type_arg == JSON_HB);
-  size = (uint8) size_arg;
-  const char *json = (char *)ptr;
-  int vt;
-  std::vector<std::string> hist_buckets_text;
-  bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
-  if (!result)
-  {
-    my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
-    DBUG_RETURN(true);
-  }
-  size= hist_buckets_text.size();
-
-  /*
-    Convert the text based array into a data structure that allows lookups and
-    estimates
-  */
-  for (auto &s : hist_buckets_text)
-  {
-    field->store_text(s.data(), s.size(), &my_charset_bin);
-
-    // Get the value in "truncated key tuple format" here:
-    uchar buf[MAX_KEY_LENGTH];
-    uint len_to_copy= field->key_length();
-    uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
-    histogram_bounds.push_back(std::string((char*)buf, bytes));
-  }
+  const char *err;
+  json_engine_t je;
+  json_string_t key_name;
+
+  json_scan_start(&je, &my_charset_utf8mb4_bin,
+                  (const uchar*)hist_data,
+                  (const uchar*)hist_data+hist_data_len);
+
+  if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
+  {
+    err= "Root JSON element must be a JSON object";
+    goto error;
+  }
+
+  json_string_set_str(&key_name, (const uchar*)JSON_NAME,
+                      (const uchar*)JSON_NAME + strlen(JSON_NAME));
+  json_string_set_cs(&key_name, system_charset_info);
+
+  if (json_scan_next(&je) || je.state != JST_KEY ||
+      !json_key_matches(&je, &key_name))
+  {
+    err= "The first key in the object must be histogram_hb_v1";
+    goto error;
+  }
+
+  // The value must be a JSON array
+  if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
+  {
+    err= "A JSON array expected";
+    goto error;
+  }
+
+  // Read the array
+  while (!json_scan_next(&je))
+  {
+    switch(je.state)
+    {
+      case JST_VALUE:
+      {
+        const char *val;
+        int val_len;
+        json_smart_read_value(&je, &val, &val_len);
+        if (je.value_type != JSON_VALUE_STRING &&
+            je.value_type != JSON_VALUE_NUMBER &&
+            je.value_type != JSON_VALUE_TRUE &&
+            je.value_type != JSON_VALUE_FALSE)
+        {
+          err= "Scalar value expected";
+          goto error;
+        }
+        uchar buf[MAX_KEY_LENGTH];
+        uint len_to_copy= field->key_length();
+        field->store_text(val, val_len, &my_charset_bin);
+        uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
+        histogram_bounds.push_back(std::string((char*)buf, bytes));
+        // TODO: Should we also compare this endpoint with the previous
+        // to verify that the ordering is right?
+        break;
+      }
+      case JST_ARRAY_END:
+        break;
+    }
+  }
+  size= histogram_bounds.size();
   DBUG_RETURN(false);
+
+error:
+  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
+           je.s.c_str - (const uchar*)hist_data);
+  DBUG_RETURN(true);
 }
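For illustration only (not part of the commit): before this change the histogram column held a bare JSON array, as in the test update above ('["a-1", "a-2", ...]'). With the checks in this parser and the writer changes below, the stored histogram is a JSON object whose first key is histogram_hb_v1 and whose value is the array of bucket endpoints, along the lines of (bucket values made up, loosely following the test data):

{"histogram_hb_v1": ["Helsinki", "Moscow", "Moscow"]}

Wrapping the array in a keyed object appears to be what the commit title means by a future-proof representation: other keys could later be stored alongside histogram_hb_v1 without changing how this array is located and read.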
@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
 
 void Histogram_json::serialize(Field *field)
 {
-  field->store((char*)json_text, strlen((char*)json_text), &my_charset_bin);
+  field->store(json_text.data(), json_text.size(), &my_charset_bin);
 }
@@ -2052,13 +2098,16 @@ class Histogram_builder_json : public Histogram_builder
   }
 
   void build_json_from_histogram() {
-    Json_writer *writer = new Json_writer();
-    writer->start_array();
+    Json_writer writer;
+    writer.start_object();
+    writer.add_member(Histogram_json::JSON_NAME).start_array();
     for(auto& value: bucket_bounds) {
-      writer->add_str(value.c_str());
+      writer.add_str(value.c_str());
     }
-    writer->end_array();
-    Binary_string *json_string = (Binary_string *) writer->output.get_string();
+    writer.end_array();
+    writer.end_object();
+    Binary_string *json_string = (Binary_string *) writer.output.get_string();
     Histogram_json *hist= (Histogram_json*)histogram;
     hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr());
   }
@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type)
 }
 
-bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container) {
-  json_engine_t je;
-  int vl;
-  const char *v;
-
-  json_scan_start(&je, &my_charset_utf8mb4_bin, (const uchar *)json, (const uchar *)json_end);
-
-  if (json_read_value(&je) || (*value_type = je.value_type) != JSON_VALUE_ARRAY)
-  {
-    return false;
-  }
-
-  std::string val;
-  while(!json_scan_next(&je))
-  {
-    switch(je.state)
-    {
-      case JST_VALUE:
-        *value_type = json_smart_read_value(&je, &v, &vl);
-        if (je.value_type != JSON_VALUE_STRING &&
-            je.value_type != JSON_VALUE_NUMBER &&
-            je.value_type != JSON_VALUE_TRUE &&
-            je.value_type != JSON_VALUE_FALSE)
-        {
-          return false;
-        }
-        val = std::string(v, vl);
-        container.emplace_back(val);
-        break;
-      case JST_ARRAY_END:
-        break;
-    }
-  }
-  return true;
-}
-
 C_MODE_START
 
 int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
...
@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc
 {
 public:
   virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-                     const uchar *ptr, uint size)= 0;
+                     const char *hist_data, size_t hist_data_len)= 0;
   virtual void serialize(Field *to_field)= 0;
   virtual Histogram_type get_type()=0;
@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base
 {
 public:
   bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
-             const uchar *ptr_arg, uint size_arg) override;
+             const char *hist_data, size_t hist_data_len) override;
   void serialize(Field *to_field) override;
   Histogram_type get_type() override { return type; }
@@ -350,14 +350,16 @@ class Histogram_json : public Histogram_base
   uint8 size; /* Number of elements in the histogram */
 
   /* Collection-time only: collected histogram in the JSON form. */
-  uchar *json_text;
+  std::string json_text;
 
   // Array of histogram bucket endpoints in KeyTupleFormat.
   std::vector<std::string> histogram_bounds;
 
 public:
+  static constexpr const char* JSON_NAME="histogram_hb_v1";
+
   bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-             const uchar *ptr, uint size) override;
+             const char *hist_data, size_t hist_data_len) override;
   void serialize(Field *field) override;
@@ -375,7 +377,8 @@ class Histogram_json : public Histogram_base
   void set_json_text(ulonglong sz, uchar *json_text_arg)
   {
     size = (uint8) sz;
-    json_text= json_text_arg;
+    json_text.assign((const char*)json_text_arg,
+                     strlen((const char*)json_text_arg));
   }
 
   uint get_size() override
@@ -481,8 +484,9 @@ class Column_statistics
   ulonglong avg_frequency;
 
 public:
+  /* Histogram type as specified in mysql.column_stats.hist_type */
   Histogram_type histogram_type_on_disk;
   Histogram_base *histogram_;
 
   uint32 no_values_provided_bitmap()
...