Commit 2a1cdbab authored by Sergei Petrunia

Fix JSON parsing: future-proof data representation in JSON, code cleanup

parent a0b4a868
......@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k);
int json_read_value(json_engine_t *j);
/*
* json_smart_read_value() parses a scalar value and value length from the json engine,
* and copies them into `value` and `value_length` respectively.
* It should only be called when the json_engine state is JST_VALUE.
* If it encounters a non-scalar value (say, an object or array) before getting to value_len,
* such a value is also read and copied into value.
*/
json_smart_read_value() reads a JSON value. A pointer to the value is stored in
*value and its length in *value_len.
If the value is not a scalar, it returns pointers to its JSON
representation.
The function should only be called when je->state==JST_VALUE.
*/
enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len);
/*
......
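For reference, a minimal sketch (not part of this commit) of how the new json_smart_read_value() might be driven when walking a JSON array. It uses only the json_lib calls that appear elsewhere in this diff; the json_lib.h include path and the error handling are assumptions.

/* Sketch only: walk a JSON array and hand each element to the caller.
   Assumes the json_lib.h declarations used elsewhere in this commit. */
#include "json_lib.h"

static bool walk_array_elements(const char *json, size_t json_len)
{
  json_engine_t je;
  json_scan_start(&je, &my_charset_utf8mb4_bin,
                  (const uchar*) json, (const uchar*) json + json_len);

  /* The root value must be an array for this walk to make sense. */
  if (json_read_value(&je) || je.value_type != JSON_VALUE_ARRAY)
    return true;                                /* error */

  while (!json_scan_next(&je))
  {
    if (je.state == JST_VALUE)
    {
      const char *val;
      int val_len;
      /* For scalars this yields the scalar text; for nested objects or
         arrays it yields their whole JSON representation. */
      json_smart_read_value(&je, &val, &val_len);
      /* ... use (val, val_len) here ... */
    }
    else if (je.state == JST_ARRAY_END)
      break;
  }
  return false;                                 /* ok */
}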
......@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
explain extended select * from t1_json where a < 'b-1a';
analyze select * from t1_json where a > 'zzzzzzzzz';
drop table ten;
# test different valid JSON strings that are invalid histograms.
UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json';
FLUSH TABLES;
......@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
--source include/have_sequence.inc
create table users (
create table t2 (
city varchar(100)
);
set histogram_size=50;
insert into users select 'Moscow' from seq_1_to_99;
insert into users select 'Helsinki' from seq_1_to_2;
insert into t2 select 'Moscow' from seq_1_to_99;
insert into t2 select 'Helsinki' from seq_1_to_2;
set histogram_type=json_hb;
analyze table users persistent for all;
explain extended select * from users where city = 'Moscow';
analyze select * from users where city = 'Moscow';
explain extended select * from users where city = 'Helsinki';
analyze select * from users where city = 'helsinki';
explain extended select * from users where city < 'Lagos';
analyze table t2 persistent for all;
explain extended select * from t2 where city = 'Moscow';
analyze select * from t2 where city = 'Moscow';
explain extended select * from t2 where city = 'Helsinki';
analyze select * from t2 where city = 'helsinki';
explain extended select * from t2 where city < 'Lagos';
drop table t1_bin;
drop table t1_json;
drop table users;
drop table t2;
DELETE FROM mysql.column_stats;
......
......@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED
ER_PROVIDER_NOT_LOADED
eng "MariaDB tried to use the %s, but its provider plugin is not loaded"
ER_JSON_HISTOGRAM_PARSE_FAILED
eng "Failed to parse histogram, encountered JSON_TYPE '%d'."
eng "Failed to parse histogram: %s at offset %d."
......@@ -1123,6 +1123,7 @@ class Column_stat: public Stat_table
void get_stat_values()
{
table_field->read_stats->set_all_nulls();
// default: hist_type=NULL means there's no histogram
table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
if (table_field->read_stats->min_value)
......@@ -1196,7 +1197,10 @@ class Column_stat: public Stat_table
break;
}
case COLUMN_STAT_HISTOGRAM:
//TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM
/*
Do nothing here: we take the histogram length from the 'histogram'
column itself
*/
break;
}
}
......@@ -1245,7 +1249,7 @@ class Column_stat: public Stat_table
}
if (!hist->parse(mem_root, table_field,
table_field->read_stats->histogram_type_on_disk,
(const uchar*)val.ptr(), val.length()))
val.ptr(), val.length()))
{
table_field->read_stats->histogram_= hist;
return hist;
......@@ -1255,19 +1259,19 @@ class Column_stat: public Stat_table
}
};
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *,
Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg)
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field*,
Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{
// Just copy the data
size = (uint8) size_arg;
type = type_arg;
if ((values = (uchar*)alloc_root(mem_root, size_arg)))
{
memcpy(values, ptr_arg, size_arg);
return false;
}
/* On-disk and in-memory formats are the same. Just copy the data. */
type= type_arg;
size= (uint8) hist_data_len; // 'size' holds the size of histogram in bytes
if (!(values= (uchar*)alloc_root(mem_root, hist_data_len)))
return true;
memcpy(values, hist_data, hist_data_len);
return false;
}
/*
......@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root,
*/
bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
Histogram_type type_arg, const uchar *ptr,
uint size_arg)
Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{
DBUG_ENTER("Histogram_json::parse");
DBUG_ASSERT(type_arg == JSON_HB);
size = (uint8) size_arg;
const char *json = (char *)ptr;
int vt;
std::vector<std::string> hist_buckets_text;
bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
if (!result)
{
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
DBUG_RETURN(true);
const char *err;
json_engine_t je;
json_string_t key_name;
json_scan_start(&je, &my_charset_utf8mb4_bin,
(const uchar*)hist_data,
(const uchar*)hist_data+hist_data_len);
if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
{
err= "Root JSON element must be a JSON object";
goto error;
}
size= hist_buckets_text.size();
/*
Convert the text-based array into a data structure that allows lookups
and estimates
*/
for (auto &s : hist_buckets_text)
json_string_set_str(&key_name, (const uchar*)JSON_NAME,
(const uchar*)JSON_NAME + strlen(JSON_NAME));
json_string_set_cs(&key_name, system_charset_info);
if (json_scan_next(&je) || je.state != JST_KEY ||
!json_key_matches(&je, &key_name))
{
field->store_text(s.data(), s.size(), &my_charset_bin);
err= "The first key in the object must be histogram_hb_v1";
goto error;
}
// Get the value in "truncated key tuple format" here:
// The value must be a JSON array
if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
{
err= "A JSON array expected";
goto error;
}
// Read the array
while (!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
{
const char *val;
int val_len;
json_smart_read_value(&je, &val, &val_len);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
err= "Scalar value expected";
goto error;
}
uchar buf[MAX_KEY_LENGTH];
uint len_to_copy= field->key_length();
field->store_text(val, val_len, &my_charset_bin);
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
histogram_bounds.push_back(std::string((char*)buf, bytes));
// TODO: Should we also compare this endpoint with the previous
// to verify that the ordering is right?
break;
}
case JST_ARRAY_END:
break;
}
}
size= histogram_bounds.size();
DBUG_RETURN(false);
error:
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
je.s.c_str - (const uchar*)hist_data);
DBUG_RETURN(true);
}
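On the TODO in Histogram_json::parse() about comparing each endpoint with the previous one: a minimal, self-contained sketch of such a check could look like the following. It assumes that byte-wise comparison of the KeyTupleFormat images matches the column's ordering, which this commit does not establish; the helper name is hypothetical.

// Sketch only: verify that bucket endpoints (already converted to
// KeyTupleFormat) are in non-decreasing order. std::string comparison is
// byte-wise (memcmp-like); whether that matches the column's collation is
// an assumption.
#include <string>
#include <vector>

static bool endpoints_are_ordered(const std::vector<std::string> &bounds)
{
  for (size_t i= 1; i < bounds.size(); i++)
  {
    if (bounds[i - 1] > bounds[i])
      return false;                 // out-of-order endpoint found
  }
  return true;
}

parse() could run such a check over histogram_bounds after the read loop and report ER_JSON_HISTOGRAM_PARSE_FAILED when it fails.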
......@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
void Histogram_json::serialize(Field *field)
{
field->store((char*)json_text, strlen((char*)json_text), &my_charset_bin);
field->store(json_text.data(), json_text.size(), &my_charset_bin);
}
......@@ -2052,13 +2098,16 @@ class Histogram_builder_json : public Histogram_builder
}
void build_json_from_histogram() {
Json_writer *writer = new Json_writer();
writer->start_array();
Json_writer writer;
writer.start_object();
writer.add_member(Histogram_json::JSON_NAME).start_array();
for(auto& value: bucket_bounds) {
writer->add_str(value.c_str());
writer.add_str(value.c_str());
}
writer->end_array();
Binary_string *json_string = (Binary_string *) writer->output.get_string();
writer.end_array();
writer.end_object();
Binary_string *json_string = (Binary_string *) writer.output.get_string();
Histogram_json *hist= (Histogram_json*)histogram;
hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr());
}
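Taken together with Histogram_json::parse() above, the writer calls here imply the following on-disk shape: a root object whose first key is Histogram_json::JSON_NAME ("histogram_hb_v1"), holding an array of scalar bucket endpoints. A hypothetical example (the city values are borrowed from the test file in this commit; the exact bucket list is illustrative):

// Sketch only: the serialized form implied by build_json_from_histogram()
// and accepted by Histogram_json::parse(). Bucket values are illustrative.
static const char sample_histogram_json[]=
  "{\"histogram_hb_v1\": [\"Helsinki\", \"Moscow\"]}";

Wrapping the array in a named, versioned object is presumably what the commit message means by "future-proof": a later format revision can use a different key or add members without being mistaken for a v1 histogram.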
......@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type)
}
bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container) {
json_engine_t je;
int vl;
const char *v;
json_scan_start(&je, &my_charset_utf8mb4_bin, (const uchar *)json, (const uchar *)json_end);
if (json_read_value(&je) || (*value_type = je.value_type) != JSON_VALUE_ARRAY)
{
return false;
}
std::string val;
while(!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
*value_type = json_smart_read_value(&je, &v, &vl);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
return false;
}
val = std::string(v, vl);
container.emplace_back(val);
break;
case JST_ARRAY_END:
break;
}
}
return true;
}
C_MODE_START
int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
......
......@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc
{
public:
virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size)= 0;
const char *hist_data, size_t hist_data_len)= 0;
virtual void serialize(Field *to_field)= 0;
virtual Histogram_type get_type()=0;
......@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base
{
public:
bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg) override;
const char *hist_data, size_t hist_data_len) override;
void serialize(Field *to_field) override;
Histogram_type get_type() override { return type; }
......@@ -350,14 +350,16 @@ class Histogram_json : public Histogram_base
uint8 size; /* Number of elements in the histogram */
/* Collection-time only: collected histogram in the JSON form. */
uchar *json_text;
std::string json_text;
// Array of histogram bucket endpoints in KeyTupleFormat.
std::vector<std::string> histogram_bounds;
public:
static constexpr const char* JSON_NAME="histogram_hb_v1";
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size) override;
const char *hist_data, size_t hist_data_len) override;
void serialize(Field *field) override;
......@@ -375,7 +377,8 @@ class Histogram_json : public Histogram_base
void set_json_text(ulonglong sz, uchar *json_text_arg)
{
size = (uint8) sz;
json_text= json_text_arg;
json_text.assign((const char*)json_text_arg,
strlen((const char*)json_text_arg));
}
uint get_size() override
......@@ -481,8 +484,9 @@ class Column_statistics
ulonglong avg_frequency;
public:
/* Histogram type as specified in mysql.column_stats.hist_type */
Histogram_type histogram_type_on_disk;
Histogram_base *histogram_;
uint32 no_values_provided_bitmap()
......