Commit 2a1cdbab authored by Sergei Petrunia

Fix JSON parsing: future-proof data representation in JSON, code cleanup

parent a0b4a868
......@@ -283,12 +283,13 @@ int json_key_matches(json_engine_t *je, json_string_t *k);
int json_read_value(json_engine_t *j);
/*
* json_smart_read_value() reads and parses a scalar value and value length from the json engine,
* and copies them into `value` and `value_length` respectively.
* It should only be called when the json_engine state is JST_VALUE.
* If it encounters a non-scalar value (say object or array) before getting to value_len,
* such value is also read and copied into value.
*/
json_smart_read_value() reads a JSON value. Pointer to value is stored in
*value and its length in *value_len.
If the value is not a scalar, it returns pointers to its JSON
representation.
The function should only be called when je->state==JST_VALUE.
*/
enum json_types json_smart_read_value(json_engine_t *je, const char **value, int *value_len);
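For illustration, a minimal sketch of the calling pattern this comment describes, using only json_lib calls that appear in this patch; the input document and variable names are invented:

  json_engine_t je;
  const char *doc= "[\"a-1\", \"a-2\", \"a-3\"]";   /* made-up input */
  json_scan_start(&je, &my_charset_utf8mb4_bin,
                  (const uchar*) doc, (const uchar*) doc + strlen(doc));
  if (!json_read_value(&je) && je.value_type == JSON_VALUE_ARRAY)
  {
    while (!json_scan_next(&je))
    {
      if (je.state == JST_VALUE)                    /* the required state */
      {
        const char *value;
        int value_len;
        json_smart_read_value(&je, &value, &value_len);
        /* For scalars, [value, value + value_len) is the scalar text;
           for objects/arrays it is their whole JSON representation. */
      }
    }
  }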
/*
......
......@@ -37,6 +37,8 @@ analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
explain extended select * from t1_json where a < 'b-1a';
analyze select * from t1_json where a > 'zzzzzzzzz';
drop table ten;
# test different valid JSON strings that are invalid histograms.
UPDATE mysql.column_stats SET histogram='["a-1", "a-2", {"a": "b"}, "a-3"]' WHERE table_name='t1_json';
FLUSH TABLES;
......@@ -45,23 +47,23 @@ explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
--source include/have_sequence.inc
create table users (
create table t2 (
city varchar(100)
);
set histogram_size=50;
insert into users select 'Moscow' from seq_1_to_99;
insert into users select 'Helsinki' from seq_1_to_2;
insert into t2 select 'Moscow' from seq_1_to_99;
insert into t2 select 'Helsinki' from seq_1_to_2;
set histogram_type=json_hb;
analyze table users persistent for all;
explain extended select * from users where city = 'Moscow';
analyze select * from users where city = 'Moscow';
explain extended select * from users where city = 'Helsinki';
analyze select * from users where city = 'helsinki';
explain extended select * from users where city < 'Lagos';
analyze table t2 persistent for all;
explain extended select * from t2 where city = 'Moscow';
analyze select * from t2 where city = 'Moscow';
explain extended select * from t2 where city = 'Helsinki';
analyze select * from t2 where city = 'helsinki';
explain extended select * from t2 where city < 'Lagos';
drop table t1_bin;
drop table t1_json;
drop table users;
drop table t2;
DELETE FROM mysql.column_stats;
......
......@@ -8914,4 +8914,4 @@ ER_PARTITION_CONVERT_SUBPARTITIONED
ER_PROVIDER_NOT_LOADED
eng "MariaDB tried to use the %s, but its provider plugin is not loaded"
ER_JSON_HISTOGRAM_PARSE_FAILED
eng "Failed to parse histogram, encountered JSON_TYPE '%d'."
eng "Failed to parse histogram: %s at offset %d."
......@@ -1123,6 +1123,7 @@ class Column_stat: public Stat_table
void get_stat_values()
{
table_field->read_stats->set_all_nulls();
// default: hist_type=NULL means there's no histogram
table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM;
if (table_field->read_stats->min_value)
......@@ -1196,7 +1197,10 @@ class Column_stat: public Stat_table
break;
}
case COLUMN_STAT_HISTOGRAM:
//TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM
/*
Do nothing here: we take the histogram length from the 'histogram'
column itself
*/
break;
}
}
......@@ -1245,7 +1249,7 @@ class Column_stat: public Stat_table
}
if (!hist->parse(mem_root, table_field,
table_field->read_stats->histogram_type_on_disk,
(const uchar*)val.ptr(), val.length()))
val.ptr(), val.length()))
{
table_field->read_stats->histogram_= hist;
return hist;
......@@ -1255,19 +1259,19 @@ class Column_stat: public Stat_table
}
};
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *,
Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg)
bool Histogram_binary::parse(MEM_ROOT *mem_root, Field*,
Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{
// Just copy the data
size = (uint8) size_arg;
type = type_arg;
if ((values = (uchar*)alloc_root(mem_root, size_arg)))
{
memcpy(values, ptr_arg, size_arg);
return false;
}
return true;
/* On-disk and in-memory formats are the same. Just copy the data. */
type= type_arg;
size= (uint8) hist_data_len; // 'size' holds the size of histogram in bytes
if (!(values= (uchar*)alloc_root(mem_root, hist_data_len)))
return true;
memcpy(values, hist_data, hist_data_len);
return false;
}
/*
......@@ -1307,39 +1311,81 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root,
*/
bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field,
Histogram_type type_arg, const uchar *ptr,
uint size_arg)
Histogram_type type_arg, const char *hist_data,
size_t hist_data_len)
{
DBUG_ENTER("Histogram_json::parse");
DBUG_ASSERT(type_arg == JSON_HB);
size = (uint8) size_arg;
const char *json = (char *)ptr;
int vt;
std::vector<std::string> hist_buckets_text;
bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
if (!result)
{
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
DBUG_RETURN(true);
const char *err;
json_engine_t je;
json_string_t key_name;
json_scan_start(&je, &my_charset_utf8mb4_bin,
(const uchar*)hist_data,
(const uchar*)hist_data+hist_data_len);
if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
{
err= "Root JSON element must be a JSON object";
goto error;
}
size= hist_buckets_text.size();
/*
Convert the text based array into a data structure that allows lookups and
estimates
*/
for (auto &s : hist_buckets_text)
json_string_set_str(&key_name, (const uchar*)JSON_NAME,
(const uchar*)JSON_NAME + strlen(JSON_NAME));
json_string_set_cs(&key_name, system_charset_info);
if (json_scan_next(&je) || je.state != JST_KEY ||
!json_key_matches(&je, &key_name))
{
field->store_text(s.data(), s.size(), &my_charset_bin);
err= "The first key in the object must be histogram_hb_v1";
goto error;
}
// Get the value in "truncated key tuple format" here:
uchar buf[MAX_KEY_LENGTH];
uint len_to_copy= field->key_length();
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
histogram_bounds.push_back(std::string((char*)buf, bytes));
// The value must be a JSON array
if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
{
err= "A JSON array expected";
goto error;
}
// Read the array
while (!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
{
const char *val;
int val_len;
json_smart_read_value(&je, &val, &val_len);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
err= "Scalar value expected";
goto error;
}
uchar buf[MAX_KEY_LENGTH];
uint len_to_copy= field->key_length();
field->store_text(val, val_len, &my_charset_bin);
uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
histogram_bounds.push_back(std::string((char*)buf, bytes));
// TODO: Should we also compare this endpoint with the previous
// to verify that the ordering is right?
break;
}
case JST_ARRAY_END:
break;
}
}
size= histogram_bounds.size();
DBUG_RETURN(false);
error:
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
je.s.c_str - (const uchar*)hist_data);
DBUG_RETURN(true);
}
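For reference, a sketch of a histogram value that this parser accepts (the bucket values are invented; only the structure matters): the root must be a JSON object, its first key must be histogram_hb_v1, and its value must be an array of scalar bucket endpoints. Anything else fails with ER_JSON_HISTOGRAM_PARSE_FAILED, which now reports the error text and the byte offset of the offending token:

  /* Hypothetical value of mysql.column_stats.histogram for a JSON_HB histogram: */
  const char *sample_hist=
    "{\"histogram_hb_v1\": [\"Helsinki\", \"Moscow\", \"Moscow\"]}";
  /* A bare array such as ["a-1", "a-2", ...] (the previous representation)
     is rejected because the root element is not an object. */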
......@@ -1347,7 +1393,7 @@ static
void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
&field->table->write_set);
field->set_key_image(ptr, field->key_length());
field->set_key_image(ptr, field->key_length());
dbug_tmp_restore_column_map(&field->table->write_set, old_map);
}
......@@ -1506,9 +1552,9 @@ double Histogram_json::point_selectivity(Field *field, key_range *endpoint, doub
/*
@param field The table field histogram is for. We don't care about the
field's current value, we only need its virtual functions to
field's current value, we only need its virtual functions to
perform various operations
@param min_endp, max_endp - this specifies the range.
*/
double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
......@@ -1594,7 +1640,7 @@ double Histogram_json::range_selectivity(Field *field, key_range *min_endp,
void Histogram_json::serialize(Field *field)
{
field->store((char*)json_text, strlen((char*)json_text), &my_charset_bin);
field->store(json_text.data(), json_text.size(), &my_charset_bin);
}
......@@ -2052,13 +2098,16 @@ class Histogram_builder_json : public Histogram_builder
}
void build_json_from_histogram() {
Json_writer *writer = new Json_writer();
writer->start_array();
Json_writer writer;
writer.start_object();
writer.add_member(Histogram_json::JSON_NAME).start_array();
for(auto& value: bucket_bounds) {
writer->add_str(value.c_str());
writer.add_str(value.c_str());
}
writer->end_array();
Binary_string *json_string = (Binary_string *) writer->output.get_string();
writer.end_array();
writer.end_object();
Binary_string *json_string = (Binary_string *) writer.output.get_string();
Histogram_json *hist= (Histogram_json*)histogram;
hist->set_json_text(bucket_bounds.size(), (uchar *) json_string->c_ptr());
}
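The builder therefore produces exactly the representation that Histogram_json::parse() consumes; a short sketch of the round trip, with invented bucket values:

  /* build_json_from_histogram() produces roughly
         {"histogram_hb_v1": ["a-1", "b-2"]}
     hist->set_json_text() stores that text in the std::string json_text member,
     Histogram_json::serialize() writes json_text into the histogram column,
     and Histogram_json::parse() above reads it back from there. */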
......@@ -2080,42 +2129,6 @@ Histogram_base *create_histogram(Histogram_type hist_type)
}
bool json_get_array_items(const char *json, const char *json_end, int *value_type, std::vector<std::string> &container) {
json_engine_t je;
int vl;
const char *v;
json_scan_start(&je, &my_charset_utf8mb4_bin, (const uchar *)json, (const uchar *)json_end);
if (json_read_value(&je) || (*value_type = je.value_type) != JSON_VALUE_ARRAY)
{
return false;
}
std::string val;
while(!json_scan_next(&je))
{
switch(je.state)
{
case JST_VALUE:
*value_type = json_smart_read_value(&je, &v, &vl);
if (je.value_type != JSON_VALUE_STRING &&
je.value_type != JSON_VALUE_NUMBER &&
je.value_type != JSON_VALUE_TRUE &&
je.value_type != JSON_VALUE_FALSE)
{
return false;
}
val = std::string(v, vl);
container.emplace_back(val);
break;
case JST_ARRAY_END:
break;
}
}
return true;
}
C_MODE_START
int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
......
......@@ -152,7 +152,7 @@ class Histogram_base : public Sql_alloc
{
public:
virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size)= 0;
const char *hist_data, size_t hist_data_len)= 0;
virtual void serialize(Field *to_field)= 0;
virtual Histogram_type get_type()=0;
......@@ -187,7 +187,7 @@ class Histogram_binary : public Histogram_base
{
public:
bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg) override;
const char *hist_data, size_t hist_data_len) override;
void serialize(Field *to_field) override;
Histogram_type get_type() override { return type; }
......@@ -350,14 +350,16 @@ class Histogram_json : public Histogram_base
uint8 size; /* Number of elements in the histogram */
/* Collection-time only: collected histogram in the JSON form. */
uchar *json_text;
std::string json_text;
// Array of histogram bucket endpoints in KeyTupleFormat.
std::vector<std::string> histogram_bounds;
public:
static constexpr const char* JSON_NAME="histogram_hb_v1";
bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size) override;
const char *hist_data, size_t hist_data_len) override;
void serialize(Field *field) override;
......@@ -375,7 +377,8 @@ class Histogram_json : public Histogram_base
void set_json_text(ulonglong sz, uchar *json_text_arg)
{
size = (uint8) sz;
json_text= json_text_arg;
json_text.assign((const char*)json_text_arg,
strlen((const char*)json_text_arg));
}
uint get_size() override
......@@ -481,8 +484,9 @@ class Column_statistics
ulonglong avg_frequency;
public:
/* Histogram type as specified in mysql.column_stats.hist_type */
Histogram_type histogram_type_on_disk;
Histogram_base *histogram_;
uint32 no_values_provided_bitmap()
......