Commit 2a1cdbab authored by Sergei Petrunia's avatar Sergei Petrunia

Fix JSON parsing: future-proof data representation in JSON, code cleanup

parent a0b4a868
...@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k); ...@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k);
int json_read_value(json_engine_t *j); int json_read_value(json_engine_t *j);
/* /*
* json_smart_read_value() reads parses a scalar value and value length from the json engine, json_smart_read_value() reads a JSON value. Pointer to value is stored in
* and copies them into `value` and `value_length` respectively. *value and its length in *value_len.
* It should only be called when the json_engine state is JST_VALUE.
* If it encounters a non-scalar value (say object or array) before getting to value_len, if the value is non a scalar, it returns pointers to its JSON
* such value is also read and copied into value. representation.
*/ The function should only be called when je->state==JST_VALUE.
*/
enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len); enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len);
/* /*
......
This diff is collapsed.
...@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; ...@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
explain extended select * from t1_json where a < 'b-1a'; explain extended select * from t1_json where a < 'b-1a';
analyze select * from t1_json where a > 'zzzzzzzzz'; analyze select * from t1_json where a > 'zzzzzzzzz';
drop table ten;
# test different valid JSON strings that are invalid histograms. # test different valid JSON strings that are invalid histograms.
UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json'; UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json';
FLUSH TABLES; FLUSH TABLES;
...@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz'; ...@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
--source include/have_sequence.inc --source include/have_sequence.inc
create table users ( create table t2 (
city varchar(100) city varchar(100)
); );
set histogram_size=50; set histogram_size=50;
insert into users select 'Moscow' from seq_1_to_99; insert into t2 select 'Moscow' from seq_1_to_99;
insert into users select 'Helsinki' from seq_1_to_2; insert into t2 select 'Helsinki' from seq_1_to_2;
set histogram_type=json_hb; set histogram_type=json_hb;
analyze table users persistent for all; analyze table t2 persistent for all;
explain extended select * from users where city = 'Moscow'; explain extended select * from t2 where city = 'Moscow';
analyze select * from users where city = 'Moscow'; analyze select * from t2 where city = 'Moscow';
explain extended select * from users where city = 'Helsinki'; explain extended select * from t2 where city = 'Helsinki';
analyze select * from users where city = 'helsinki'; analyze select * from t2 where city = 'helsinki';
explain extended select * from users where city < 'Lagos'; explain extended select * from t2 where city < 'Lagos';
drop table t1_bin; drop table t1_bin;
drop table t1_json; drop table t1_json;
drop table users; drop table t2;
DELETE FROM mysql.column_stats; DELETE FROM mysql.column_stats;
......
...@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED ...@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED
ER_PROVIDER_NOT_LOADED ER_PROVIDER_NOT_LOADED
eng "MariaDB tried to use the %s, but its provider plugin is not loaded" eng "MariaDB tried to use the %s, but its provider plugin is not loaded"
ER_JSON_HISTOGRAM_PARSE_FAILED ER_JSON_HISTOGRAM_PARSE_FAILED
eng "Failed to parse histogram, encountered JSON_TYPE '%d'." eng "Failed to parse histogram: %s at offset %d."
...@@ -1123,6 +1123,7 @@ class Column_stat: public Stat_table ...@@ -1123,6 +1123,7 @@ class Column_stat: public Stat_table
void get_stat_values() void get_stat_values()
{ {
table_field->read_stats->set_all_nulls(); table_field->read_stats->set_all_nulls();
// default: hist_type=NULL means there's no histogram
table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM; table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
if (table_field->read_stats->min_value) if (table_field->read_stats->min_value)
...@@ -1196,7 +1197,10 @@ class Column_stat: public Stat_table ...@@ -1196,7 +1197,10 @@ class Column_stat: public Stat_table
break; break;
} }
case COLUMN_STAT_HISTOGRAM: case COLUMN_STAT_HISTOGRAM:
//TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM /*
Do nothing here: we take the histogram length from the 'histogram'
column itself
*/
break; break;
} }
} }
...@@ -1245,7 +1249,7 @@ class Column_stat: public Stat_table ...@@ -1245,7 +1249,7 @@ class Column_stat: public Stat_table
} }
if (!hist->parse(mem_root, table_field, if (!hist->parse(mem_root, table_field,
table_field->read_stats->histogram_type_on_disk, table_field->read_stats->histogram_type_on_disk,
(const uchar*)val.ptr(), val.length())) val.ptr(), val.length()))
{ {
table_field->read_stats->histogram_= hist; table_field->read_stats->histogram_= hist;
return hist; return hist;
...@@ -1255,19 +1259,19 @@ class Column_stat: public Stat_table ...@@ -1255,19 +1259,19 @@ class Column_stat: public Stat_table
} }
}; };
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *,
Histogram_type type_arg, bool Histogram_binary::parse(MEM_ROOT *mem_root, Field*,
const uchar *ptr_arg, uint size_arg) Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{ {
// Just copy the data /* On-disk an in-memory formats are the same. Just copy the data. */
size = (uint8) size_arg; type= type_arg;
type = type_arg; size= (uint8) hist_data_len; // 'size' holds the size of histogram in bytes
if ((values = (uchar*)alloc_root(mem_root, size_arg))) if (!(values= (uchar*)alloc_root(mem_root, hist_data_len)))
{ return true;
memcpy(values, ptr_arg, size_arg);
return false; memcpy(values, hist_data, hist_data_len);
} return false;
return true;
} }
/* /*
...@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root, ...@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root,
*/ */
bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field, bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
Histogram_type type_arg, const uchar *ptr, Histogram_type type_arg, const char *hist_data,
uint size_arg) size_t hist_data_len)
{ {
DBUG_ENTER("Histogram_json::parse"); DBUG_ENTER("Histogram_json::parse");
DBUG_ASSERT(type_arg == JSON_HB); DBUG_ASSERT(type_arg == JSON_HB);
size = (uint8) size_arg; const char *err;
const char *json = (char *)ptr; json_engine_t je;
int vt; json_string_t key_name;
std::vector<std::string> hist_buckets_text;
bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text); json_scan_start(&je, &my_charset_utf8mb4_bin,
if (!result) (const uchar*)hist_data,
{ (const uchar*)hist_data+hist_data_len);
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
DBUG_RETURN(true); if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
{
err= "Root JSON element must be a JSON object";
goto error;
} }
size= hist_buckets_text.size();
/* json_string_set_str(&key_name, (const uchar*)JSON_NAME,
Convert the text based array into a data structure that allows lookups and (const uchar*)JSON_NAME + strlen(JSON_NAME));
estimates json_string_set_cs(&key_name, system_charset_info);
*/
for (auto &s : hist_buckets_text) if (json_scan_next(&je) || je.state != JST_KEY ||
!json_key_matches(&je, &key_name))
{ {
field->store_text(s.data(), s.size(), &my_charset_bin); err= "The first key in the object must be histogram_hb_v1";
goto error;
}
// Get the value in "truncated key tuple format" here: // The value must be a JSON array
uchar buf[MAX_KEY_LENGTH]; if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
uint len_to_copy= field->key_length(); {
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW); err= "A JSON array expected";
histogram_bounds.push_back(std::string((char*)buf, bytes)); goto error;
} }
// Read the array
while (!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
{
const char *val;
int val_len;
json_smart_read_value(&je, &val, &val_len);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
err= "Scalar value expected";
goto error;
}
uchar buf[MAX_KEY_LENGTH];
uint len_to_copy= field->key_length();
field->store_text(val, val_len, &my_charset_bin);
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
histogram_bounds.push_back(std::string((char*)buf, bytes));
// TODO: Should we also compare this endpoint with the previous
// to verify that the ordering is right?
break;
}
case JST_ARRAY_END:
break;
}
}
size= histogram_bounds.size();
DBUG_RETURN(false); DBUG_RETURN(false);
error:
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
je.s.c_str - (const uchar*)hist_data);
DBUG_RETURN(true);
} }
...@@ -1347,7 +1393,7 @@ static ...@@ -1347,7 +1393,7 @@ static
void store_key_image_to_rec_no_null(Field *field, uchar *ptr) { void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table, MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
&field->table->write_set); &field->table->write_set);
field->set_key_image(ptr, field->key_length()); field->set_key_image(ptr, field->key_length());
dbug_tmp_restore_column_map(&field->table->write_set, old_map); dbug_tmp_restore_column_map(&field->table->write_set, old_map);
} }
...@@ -1506,9 +1552,9 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, doub ...@@ -1506,9 +1552,9 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, doub
/* /*
@param field The table field histogram is for. We don't care about the @param field The table field histogram is for. We don't care about the
field's current value, we only need its virtual functions to field's current value, we only need its virtual functions to
perform various operations perform various operations
@param min_endp, max_endp - this specifies the range. @param min_endp, max_endp - this specifies the range.
*/ */
double Histogram_json::range_selectivity(Field *field, key_range *min_endp, double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
...@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp, ...@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
void Histogram_json::serialize(Field *field) void Histogram_json::serialize(Field *field)
{ {
field->store((char*)json_text, strlen((char*)json_text), &my_charset_bin); field->store(json_text.data(), json_text.size(), &my_charset_bin);
} }
...@@ -2052,13 +2098,16 @@ class Histogram_builder_json : public Histogram_builder ...@@ -2052,13 +2098,16 @@ class Histogram_builder_json : public Histogram_builder
} }
void build_json_from_histogram() { void build_json_from_histogram() {
Json_writer *writer = new Json_writer(); Json_writer writer;
writer->start_array(); writer.start_object();
writer.add_member(Histogram_json::JSON_NAME).start_array();
for(auto& value: bucket_bounds) { for(auto& value: bucket_bounds) {
writer->add_str(value.c_str()); writer.add_str(value.c_str());
} }
writer->end_array(); writer.end_array();
Binary_string *json_string = (Binary_string *) writer->output.get_string(); writer.end_object();
Binary_string *json_string = (Binary_string *) writer.output.get_string();
Histogram_json *hist= (Histogram_json*)histogram; Histogram_json *hist= (Histogram_json*)histogram;
hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr()); hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr());
} }
...@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type) ...@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type)
} }
bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container) {
json_engine_t je;
int vl;
const char *v;
json_scan_start(&je, &my_charset_utf8mb4_bin, (const uchar *)json, (const uchar *)json_end);
if (json_read_value(&je) || (*value_type = je.value_type) != JSON_VALUE_ARRAY)
{
return false;
}
std::string val;
while(!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
*value_type = json_smart_read_value(&je, &v, &vl);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
return false;
}
val = std::string(v, vl);
container.emplace_back(val);
break;
case JST_ARRAY_END:
break;
}
}
return true;
}
C_MODE_START C_MODE_START
int histogram_build_walk(void *elem, element_count elem_cnt, void *arg) int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
......
...@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc ...@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc
{ {
public: public:
virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg, virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size)= 0; const char *hist_data, size_t hist_data_len)= 0;
virtual void serialize(Field *to_field)= 0; virtual void serialize(Field *to_field)= 0;
virtual Histogram_type get_type()=0; virtual Histogram_type get_type()=0;
...@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base ...@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base
{ {
public: public:
bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg, bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg) override; const char *hist_data, size_t hist_data_len) override;
void serialize(Field *to_field) override; void serialize(Field *to_field) override;
Histogram_type get_type() override { return type; } Histogram_type get_type() override { return type; }
...@@ -350,14 +350,16 @@ class Histogram_json : public Histogram_base ...@@ -350,14 +350,16 @@ class Histogram_json : public Histogram_base
uint8 size; /* Number of elements in the histogram */ uint8 size; /* Number of elements in the histogram */
/* Collection-time only: collected histogram in the JSON form. */ /* Collection-time only: collected histogram in the JSON form. */
uchar *json_text; std::string json_text;
// Array of histogram bucket endpoints in KeyTupleFormat. // Array of histogram bucket endpoints in KeyTupleFormat.
std::vector<std::string> histogram_bounds; std::vector<std::string> histogram_bounds;
public: public:
static constexpr const char* JSON_NAME="histogram_hb_v1";
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg, bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size) override; const char *hist_data, size_t hist_data_len) override;
void serialize(Field *field) override; void serialize(Field *field) override;
...@@ -375,7 +377,8 @@ class Histogram_json : public Histogram_base ...@@ -375,7 +377,8 @@ class Histogram_json : public Histogram_base
void set_json_text(ulonglong sz, uchar *json_text_arg) void set_json_text(ulonglong sz, uchar *json_text_arg)
{ {
size = (uint8) sz; size = (uint8) sz;
json_text= json_text_arg; json_text.assign((const char*)json_text_arg,
strlen((const char*)json_text_arg));
} }
uint get_size() override uint get_size() override
...@@ -481,8 +484,9 @@ class Column_statistics ...@@ -481,8 +484,9 @@ class Column_statistics
ulonglong avg_frequency; ulonglong avg_frequency;
public: public:
/* Histogram type as specified in mysql.column_stats.hist_type */
Histogram_type histogram_type_on_disk; Histogram_type histogram_type_on_disk;
Histogram_base *histogram_; Histogram_base *histogram_;
uint32 no_values_provided_bitmap() uint32 no_values_provided_bitmap()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment