Commit a6430db2 authored by unknown's avatar unknown

Bug#28875 Conversion between ASCII and LATIN1 charsets does not function

(Regression, caused by a patch for the bug 22646).
Problem: when result type of date_format() was changed from
binary string to character string, mixing date_format()
with an ASCII column in CONCAT() stopped working.
Fix:
- adding "repertoire" flag into DTCollation class,
to mark items which can return only pure ASCII strings.
- allow character set conversion from pure ASCII to other character sets.


include/m_ctype.h:
  Defining new flags.
  Adding new function prototypes.
mysql-test/r/ctype_ucs.result:
  Adding tests.
mysql-test/r/ctype_utf8.result:
  Adding tests.
mysql-test/r/func_time.result:
  Adding tests.
mysql-test/t/ctype_ucs.test:
  Adding tests.
mysql-test/t/ctype_utf8.test:
  Adding tests.
mysql-test/t/func_time.test:
  Adding test.
mysys/charset.c:
  Adding pure ASCII detection when loading a dynamic character set.
sql/item.cc:
  - Moving detection of a Unicode superset into function.
  - Adding detection of an ASCII subset.
  - Adding creation of to-ASCII character set converter when
    safe_charset_converter() failed and the argument's
    repertoire is known to be pure ASCII.
sql/item.h:
  - Adding "repertoire" member into DTCollation class.
  - Adding "repertoire" argument to constructors.
  - Adding new methods:
    set_repertoire_from_charset()
    set_repertoire_from_value()
sql/item_func.cc:
  Adding "repertoire" argument.
sql/item_strfunc.cc:
  Adding "repertoire" argument.
sql/item_timefunc.cc:
  Initializing the result repertoire taking into account the "is_ascii"
  flag of the current locale.
sql/sql_lex.cc:
  Detect 7bit strings, return in Lex->text_string_is_7bit.
sql/sql_lex.h:
  Adding new member into LEX structure.
  Adding new member into Lex_input_stream
sql/sql_string.cc:
  Allow simple copy from pure ASCII to an ASCII-based character set.
sql/sql_yacc.yy:
  Depending on Lex->text_string_is_7bit and character set features,
  create Item_string with MY_REPERTOIRE_ASCII when it is possible.
strings/conf_to_src.c:
  - Adding printing of the "MY_CS_PUREASCII" flag
  - Adding printing of copyright
strings/ctype-extra.c:
  Recreating ctype-extra.c: ascii_general_ci and ascii_bin
  are now marked with MY_CS_PUREASCII flag.
strings/ctype.c:
  Adding new functions.
parent 77a2ca55
......@@ -78,8 +78,14 @@ extern MY_UNICASE_INFO *my_unicase_turkish[256];
#define MY_CS_READY 256 /* if a charset is initialized */
#define MY_CS_AVAILABLE 512 /* If either compiled-in or loaded*/
#define MY_CS_CSSORT 1024 /* if case sensitive sort order */
#define MY_CS_PUREASCII 2048 /* if a charset is pure ascii */
#define MY_CHARSET_UNDEFINED 0
/* Character repertoire flags */
#define MY_REPERTOIRE_ASCII 1 /* Pure ASCII U+0000..U+007F */
#define MY_REPERTOIRE_EXTENDED 2 /* Extended characters: U+0080..U+FFFF */
#define MY_REPERTOIRE_UNICODE30 3 /* ASCII | EXTENDED: U+0000..U+FFFF */
typedef struct my_uni_idx_st
{
......@@ -436,6 +442,11 @@ my_bool my_propagate_simple(CHARSET_INFO *cs, const uchar *str, uint len);
my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, uint len);
uint my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong len);
my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */
......
......@@ -865,4 +865,30 @@ blob 65535 65535
text 65535 65535
text 65535 32767
drop table t1;
create table t1 (a varchar(15) character set ascii not null, b int);
insert into t1 values ('a',1);
select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062))
aa
select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062))
ab
select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
a b
a 1
select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
a b
select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation 'concat'
select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (ucs2_general_ci,COERCIBLE) for operation '='
drop table t1;
End of 5.0 tests
......@@ -1639,6 +1639,42 @@ coercibility(col1) collation(col1)
0 utf8_swedish_ci
drop view v1, v2;
drop table t1;
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, N'x', N'y')) from t1;
concat(a, if(b>10, N'x', N'y'))
ay
select concat(a, if(b>10, N'æ', N'ß')) from t1;
ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
drop table t1;
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
concat(a, if(b>10, _utf8'x', _utf8'y'))
ay
select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
drop table t1;
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
concat(a, if(b>10, _utf8 0x78, _utf8 0x79))
ay
select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
drop table t1;
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
concat(a, if(b>10, 'x' 'x', 'y' 'y'))
ayy
select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
ERROR HY000: Illegal mix of collations (latin1_swedish_ci,IMPLICIT) and (utf8_general_ci,COERCIBLE) for operation 'concat'
drop table t1;
CREATE TABLE t1 (
colA int(11) NOT NULL,
colB varchar(255) character set utf8 NOT NULL,
......
......@@ -1246,3 +1246,19 @@ SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SE
TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s")
838:59:58
838:59:59
set names latin1;
create table t1 (a varchar(15) character set ascii not null);
insert into t1 values ('070514-000000');
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull'))
#
set names swe7;
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (swe7_swedish_ci,COERCIBLE) for operation 'concat'
set names latin1;
set lc_time_names=fr_FR;
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
ERROR HY000: Illegal mix of collations (ascii_general_ci,IMPLICIT) and (latin1_swedish_ci,COERCIBLE) for operation 'concat'
set lc_time_names=en_US;
drop table t1;
End of 5.0 tests
......@@ -594,4 +594,34 @@ select data_type, character_octet_length, character_maximum_length
from information_schema.columns where table_name='t1';
drop table t1;
#
# Conversion from UCS2 to ASCII is possible
# if the UCS2 string consists of only ASCII characters
#
create table t1 (a varchar(15) character set ascii not null, b int);
insert into t1 values ('a',1);
select concat(a,if(b<10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
select concat(a,if(b>10,_ucs2 0x0061,_ucs2 0x0062)) from t1;
select * from t1 where a=if(b<10,_ucs2 0x0061,_ucs2 0x0062);
select * from t1 where a=if(b>10,_ucs2 0x0061,_ucs2 0x0062);
#
# Conversion from UCS2 to ASCII is not possible if
# the UCS2 string has non-ASCII characters
#
--error 1267
select concat(a,if(b<10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
--error 1267
select concat(a,if(b>10,_ucs2 0x00C0,_ucs2 0x0062)) from t1;
--error 1267
select concat(a,if(b<10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
--error 1267
select concat(a,if(b>10,_ucs2 0x0062,_ucs2 0x00C0)) from t1;
--error 1267
select * from t1 where a=if(b<10,_ucs2 0x00C0,_ucs2 0x0062);
--error 1267
select * from t1 where a=if(b<10,_ucs2 0x0062,_ucs2 0x00C0);
drop table t1;
--echo End of 5.0 tests
......@@ -1314,6 +1314,46 @@ select coercibility(col1), collation(col1) from v2;
drop view v1, v2;
drop table t1;
#
# Check conversion of NCHAR strings to subset (e.g. latin1).
# Conversion is possible if string repertoire is ASCII.
# Conversion is not possible if the string has extended characters
#
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, N'x', N'y')) from t1;
--error 1267
select concat(a, if(b>10, N'æ', N'ß')) from t1;
drop table t1;
# Conversion tests for character set introducers
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, _utf8'x', _utf8'y')) from t1;
--error 1267
select concat(a, if(b>10, _utf8'æ', _utf8'ß')) from t1;
drop table t1;
# Conversion tests for introducer + HEX string
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, _utf8 0x78, _utf8 0x79)) from t1;
--error 1267
select concat(a, if(b>10, _utf8 0xC3A6, _utf8 0xC3AF)) from t1;
drop table t1;
# Conversion tests for "text_literal TEXT_STRING_literal" syntax structure
set names utf8;
create table t1 (a varchar(10) character set latin1, b int);
insert into t1 values ('a',1);
select concat(a, if(b>10, 'x' 'x', 'y' 'y')) from t1;
--error 1267
select concat(a, if(b>10, 'x' 'æ', 'y' 'ß')) from t1;
drop table t1;
#
# Bug#19960: Inconsistent results when joining
......
......@@ -752,3 +752,29 @@ DROP TABLE t1;
# Check if using GROUP BY with TIME_FORMAT() produces correct results
SELECT TIME_FORMAT(SEC_TO_TIME(a),"%H:%i:%s") FROM (SELECT 3020399 AS a UNION SELECT 3020398 ) x GROUP BY 1;
#
# Bug#28875 Conversion between ASCII and LATIN1 charsets does not function
#
set names latin1;
create table t1 (a varchar(15) character set ascii not null);
insert into t1 values ('070514-000000');
# Conversion of date_format() result to ASCII
# is safe with the default locale en_US
--replace_column 1 #
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
# Error for swe7: it is not ASCII compatible
set names swe7;
--error 1267
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
set names latin1;
# Conversion of date_format() result to ASCII
# is not safe with the non-default locale fr_FR
# because month and day names can have accented characters
set lc_time_names=fr_FR;
--error 1267
select concat(a,ifnull(min(date_format(now(), '%Y-%m-%d')),' ull')) from t1;
set lc_time_names=en_US;
drop table t1;
--echo End of 5.0 tests
......@@ -277,6 +277,9 @@ static int add_collation(CHARSET_INFO *cs)
if (sort_order && sort_order['A'] < sort_order['a'] &&
sort_order['a'] < sort_order['B'])
all_charsets[cs->number]->state|= MY_CS_CSSORT;
if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
all_charsets[cs->number]->state|= MY_CS_PUREASCII;
}
}
else
......
......@@ -1296,6 +1296,25 @@ void Item::split_sum_func2(THD *thd, Item **ref_pointer_array,
}
/*
  Decide whether "right" may be converted to the character set of "left".

  Returns TRUE in two situations, provided that left's derivation value
  is lower than right's, or the two are equal and the stated property
  does not hold for the other side as well:
  - left's collation has the MY_CS_UNICODE flag
    (on a derivation tie: right's collation does not have it);
  - right's repertoire is MY_REPERTOIRE_ASCII
    (on a derivation tie: left's repertoire is not MY_REPERTOIRE_ASCII).

  Returns FALSE otherwise.
*/
static bool
left_is_superset(DTCollation *left, DTCollation *right)
{
  const bool lower_derivation= left->derivation < right->derivation;
  const bool equal_derivation= left->derivation == right->derivation;

  /* Conversion to Unicode */
  if (left->collation->state & MY_CS_UNICODE)
  {
    if (lower_derivation)
      return TRUE;
    /* right->collation is looked at only on a derivation tie */
    if (equal_derivation && !(right->collation->state & MY_CS_UNICODE))
      return TRUE;
  }

  /* Conversion from ASCII */
  if (right->repertoire == MY_REPERTOIRE_ASCII)
  {
    if (lower_derivation)
      return TRUE;
    if (equal_derivation && left->repertoire != MY_REPERTOIRE_ASCII)
      return TRUE;
  }

  /* No other conversion is allowed */
  return FALSE;
}
/*
Aggregate two collations together taking
into account their coercibility (aka derivation):
......@@ -1360,18 +1379,12 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
; // Do nothing
}
else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
collation->state & MY_CS_UNICODE &&
(derivation < dt.derivation ||
(derivation == dt.derivation &&
!(dt.collation->state & MY_CS_UNICODE))))
left_is_superset(this, &dt))
{
// Do nothing
}
else if ((flags & MY_COLL_ALLOW_SUPERSET_CONV) &&
dt.collation->state & MY_CS_UNICODE &&
(dt.derivation < derivation ||
(dt.derivation == derivation &&
!(collation->state & MY_CS_UNICODE))))
left_is_superset(&dt, this))
{
set(dt);
}
......@@ -1390,7 +1403,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
else
{
// Cannot apply conversion
set(0, DERIVATION_NONE);
set(0, DERIVATION_NONE, 0);
return 1;
}
}
......@@ -1412,8 +1425,8 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
{
if (derivation == DERIVATION_EXPLICIT)
{
set(0, DERIVATION_NONE);
return 1;
set(0, DERIVATION_NONE, 0);
return 1;
}
if (collation->state & MY_CS_BINSORT)
return 0;
......@@ -1427,6 +1440,7 @@ bool DTCollation::aggregate(DTCollation &dt, uint flags)
set(bin, DERIVATION_NONE);
}
}
repertoire|= dt.repertoire;
return 0;
}
......@@ -1566,12 +1580,16 @@ bool agg_item_charsets(DTCollation &coll, const char *fname,
{
Item* conv;
uint32 dummy_offset;
if (!String::needs_conversion(0, coll.collation,
(*arg)->collation.collation,
if (!String::needs_conversion(0, (*arg)->collation.collation,
coll.collation,
&dummy_offset))
continue;
if (!(conv= (*arg)->safe_charset_converter(coll.collation)))
if (!(conv= (*arg)->safe_charset_converter(coll.collation)) &&
((*arg)->collation.repertoire == MY_REPERTOIRE_ASCII))
conv= new Item_func_conv_charset(*arg, coll.collation, 1);
if (!conv)
{
if (nargs >=2 && nargs <= 3)
{
......
......@@ -49,29 +49,50 @@ class DTCollation {
public:
CHARSET_INFO *collation;
enum Derivation derivation;
uint repertoire;
/*
  Derive the repertoire from the character set flags:
  MY_REPERTOIRE_ASCII when the MY_CS_PUREASCII flag is set in cs->state,
  MY_REPERTOIRE_UNICODE30 otherwise.
*/
void set_repertoire_from_charset(CHARSET_INFO *cs)
{
  if (cs->state & MY_CS_PUREASCII)
    repertoire= MY_REPERTOIRE_ASCII;
  else
    repertoire= MY_REPERTOIRE_UNICODE30;
}
/*
  Default state: binary collation, DERIVATION_NONE,
  and the full MY_REPERTOIRE_UNICODE30 repertoire.
*/
DTCollation()
{
  repertoire= MY_REPERTOIRE_UNICODE30;
  derivation= DERIVATION_NONE;
  collation= &my_charset_bin;
}
/*
  Construct from a collation and a derivation.
  The repertoire is derived from the flags of collation_arg.
*/
DTCollation(CHARSET_INFO *collation_arg, Derivation derivation_arg)
{
  set_repertoire_from_charset(collation_arg);
  derivation= derivation_arg;
  collation= collation_arg;
}
/* Copy collation, derivation and repertoire from another DTCollation. */
void set(DTCollation &dt)
{
  repertoire= dt.repertoire;
  derivation= dt.derivation;
  collation= dt.collation;
}
/*
  Set collation and derivation.
  The repertoire is derived from the flags of collation_arg,
  which is dereferenced for that purpose.
*/
void set(CHARSET_INFO *collation_arg, Derivation derivation_arg)
{
  set_repertoire_from_charset(collation_arg);
  derivation= derivation_arg;
  collation= collation_arg;
}
/*
  Set collation, derivation and repertoire explicitly.
  Unlike the two-argument form, collation_arg is only stored here,
  it is not dereferenced.
*/
void set(CHARSET_INFO *collation_arg,
         Derivation derivation_arg,
         uint repertoire_arg)
{
  repertoire= repertoire_arg;
  derivation= derivation_arg;
  collation= collation_arg;
}
void set(CHARSET_INFO *collation_arg)
{ collation= collation_arg; }
{
collation= collation_arg;
set_repertoire_from_charset(collation_arg);
}
/* Change the derivation only; collation and repertoire stay as they are. */
void set(Derivation derivation_arg)
{
  derivation= derivation_arg;
}
bool aggregate(DTCollation &dt, uint flags= 0);
......@@ -1650,10 +1671,11 @@ class Item_string :public Item
{
public:
Item_string(const char *str,uint length,
CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
uint repertoire= MY_REPERTOIRE_UNICODE30)
{
collation.set(cs, dv);
str_value.set_or_copy_aligned(str,length,cs);
str_value.set_or_copy_aligned(str, length, cs);
collation.set(cs, dv, repertoire);
/*
We have to have a different max_length than 'length' here to
ensure that we get the right length if we do use the item
......@@ -1677,10 +1699,11 @@ class Item_string :public Item
fixed= 1;
}
Item_string(const char *name_par, const char *str, uint length,
CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE)
CHARSET_INFO *cs, Derivation dv= DERIVATION_COERCIBLE,
uint repertoire= MY_REPERTOIRE_UNICODE30)
{
collation.set(cs, dv);
str_value.set_or_copy_aligned(str,length,cs);
str_value.set_or_copy_aligned(str, length, cs);
collation.set(cs, dv, repertoire);
max_length= str_value.numchars()*cs->mbmaxlen;
set_name(name_par, 0, cs);
decimals=NOT_FIXED_DEC;
......@@ -1696,6 +1719,12 @@ class Item_string :public Item
str_value.copy(str_arg, length_arg, collation.collation);
max_length= str_value.numchars() * collation.collation->mbmaxlen;
}
void set_repertoire_from_value()
{
collation.repertoire= my_string_repertoire(str_value.charset(),
str_value.ptr(),
str_value.length());
}
enum Type type() const { return STRING_ITEM; }
double val_real();
longlong val_int();
......
......@@ -3751,7 +3751,7 @@ static user_var_entry *get_variable(HASH *hash, LEX_STRING &name,
entry->value=0;
entry->length=0;
entry->update_query_id=0;
entry->collation.set(NULL, DERIVATION_IMPLICIT);
entry->collation.set(NULL, DERIVATION_IMPLICIT, 0);
entry->unsigned_flag= 0;
/*
If we are here, we were called from a SET or a query which sets a
......
......@@ -2672,7 +2672,8 @@ void Item_func_set_collation::fix_length_and_dec()
colname, args[0]->collation.collation->csname);
return;
}
collation.set(set_collation, DERIVATION_EXPLICIT);
collation.set(set_collation, DERIVATION_EXPLICIT,
args[0]->collation.repertoire);
max_length= args[0]->max_length;
}
......
......@@ -1717,7 +1717,11 @@ void Item_func_date_format::fix_length_and_dec()
Item *arg1= args[1]->this_item();
decimals=0;
collation.set(thd->variables.collation_connection);
CHARSET_INFO *cs= thd->variables.collation_connection;
uint32 repertoire= arg1->collation.repertoire;
if (!thd->variables.lc_time_names->is_ascii)
repertoire|= MY_REPERTOIRE_EXTENDED;
collation.set(cs, arg1->collation.derivation, repertoire);
if (arg1->type() == STRING_ITEM)
{ // Optimize the normal case
fixed_length=1;
......
......@@ -311,10 +311,12 @@ static char *get_text(Lex_input_stream *lip)
uint found_escape=0;
CHARSET_INFO *cs= lip->m_thd->charset();
lip->tok_bitmap= 0;
sep= yyGetLast(); // String should end with this
while (lip->ptr != lip->end_of_query)
{
c = yyGet();
c= yyGet();
lip->tok_bitmap|= c;
#ifdef USE_MB
{
int l;
......@@ -605,6 +607,7 @@ int MYSQLlex(void *arg, void *yythd)
break;
}
yylval->lex_str.length= lip->yytoklen;
lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
return(NCHAR_STRING);
case MY_LEX_IDENT_OR_HEX:
......@@ -926,6 +929,7 @@ int MYSQLlex(void *arg, void *yythd)
break;
}
yylval->lex_str.length=lip->yytoklen;
lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1;
return(TEXT_STRING);
case MY_LEX_COMMENT: // Comment
......
......@@ -957,6 +957,9 @@ class Lex_input_stream
/** Position of ';' in the stream, to delimit multiple queries. */
const char* found_semicolon;
/** Token character bitmaps, to detect 7bit strings. */
uchar tok_bitmap;
/** SQL_MODE = IGNORE_SPACE. */
bool ignore_space;
......@@ -994,6 +997,7 @@ typedef struct st_lex : public Query_tables_list
gptr yacc_yyss,yacc_yyvs;
THD *thd;
CHARSET_INFO *charset, *underscore_charset;
bool text_string_is_7bit;
/* store original leaf_tables for INSERT SELECT and PS/SP */
TABLE_LIST *leaf_tables_insert;
/* Position (first character index) of SELECT of CREATE VIEW statement */
......
......@@ -263,6 +263,8 @@ bool String::needs_conversion(uint32 arg_length,
(to_cs == &my_charset_bin) ||
(to_cs == from_cs) ||
my_charset_same(from_cs, to_cs) ||
(my_charset_is_ascii_based(to_cs) &&
my_charset_is_8bit_pure_ascii(from_cs)) ||
((from_cs == &my_charset_bin) &&
(!(*offset=(arg_length % to_cs->mbminlen)))))
return FALSE;
......
......@@ -7509,18 +7509,54 @@ opt_load_data_set_spec:
/* Common definitions */
text_literal:
TEXT_STRING_literal
{
THD *thd= YYTHD;
$$ = new Item_string($1.str,$1.length,thd->variables.collation_connection);
}
| NCHAR_STRING
{ $$= new Item_string($1.str,$1.length,national_charset_info); }
| UNDERSCORE_CHARSET TEXT_STRING
{ $$ = new Item_string($2.str,$2.length,Lex->underscore_charset); }
| text_literal TEXT_STRING_literal
{ ((Item_string*) $1)->append($2.str,$2.length); }
;
TEXT_STRING
{
LEX_STRING tmp;
THD *thd= YYTHD;
CHARSET_INFO *cs_con= thd->variables.collation_connection;
CHARSET_INFO *cs_cli= thd->variables.character_set_client;
uint repertoire= thd->lex->text_string_is_7bit &&
my_charset_is_ascii_based(cs_cli) ?
MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
if (thd->charset_is_collation_connection ||
(repertoire == MY_REPERTOIRE_ASCII &&
my_charset_is_ascii_based(cs_con)))
tmp= $1;
else
thd->convert_string(&tmp, cs_con, $1.str, $1.length, cs_cli);
$$= new Item_string(tmp.str, tmp.length, cs_con,
DERIVATION_COERCIBLE, repertoire);
}
| NCHAR_STRING
{
uint repertoire= Lex->text_string_is_7bit ?
MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info));
$$= new Item_string($1.str, $1.length, national_charset_info,
DERIVATION_COERCIBLE, repertoire);
}
| UNDERSCORE_CHARSET TEXT_STRING
{
$$= new Item_string($2.str, $2.length, Lex->underscore_charset);
((Item_string*) $$)->set_repertoire_from_value();
}
| text_literal TEXT_STRING_literal
{
Item_string* item= (Item_string*) $1;
item->append($2.str, $2.length);
if (!(item->collation.repertoire & MY_REPERTOIRE_EXTENDED))
{
/*
If the string has been pure ASCII so far,
check the new part.
*/
CHARSET_INFO *cs= YYTHD->variables.collation_connection;
item->collation.repertoire|= my_string_repertoire(cs,
$2.str,
$2.length);
}
}
;
text_string:
TEXT_STRING_literal
......@@ -7592,20 +7628,22 @@ literal:
| TRUE_SYM { $$= new Item_int((char*) "TRUE",1,1); }
| HEX_NUM { $$ = new Item_hex_string($1.str, $1.length);}
| BIN_NUM { $$= new Item_bin_string($1.str, $1.length); }
| UNDERSCORE_CHARSET HEX_NUM
{
Item *tmp= new Item_hex_string($2.str, $2.length);
/*
it is OK only emulate fix_fieds, because we need only
| UNDERSCORE_CHARSET HEX_NUM
{
Item *tmp= new Item_hex_string($2.str, $2.length);
/*
it is OK only emulate fix_fieds, because we need only
value of constant
*/
String *str= tmp ?
tmp->quick_fix_field(), tmp->val_str((String*) 0) :
(String*) 0;
$$= new Item_string(str ? str->ptr() : "",
str ? str->length() : 0,
Lex->underscore_charset);
}
*/
String *str= tmp ?
tmp->quick_fix_field(), tmp->val_str((String*) 0) :
(String*) 0;
$$= new Item_string(str ? str->ptr() : "",
str ? str->length() : 0,
Lex->underscore_charset);
if ($$)
((Item_string *) $$)->set_repertoire_from_value();
}
| UNDERSCORE_CHARSET BIN_NUM
{
Item *tmp= new Item_bin_string($2.str, $2.length);
......
......@@ -179,14 +179,16 @@ is_case_sensitive(CHARSET_INFO *cs)
cs->sort_order['a'] < cs->sort_order['B']) ? 1 : 0;
}
void dispcset(FILE *f,CHARSET_INFO *cs)
{
fprintf(f,"{\n");
fprintf(f," %d,%d,%d,\n",cs->number,0,0);
fprintf(f," MY_CS_COMPILED%s%s%s,\n",
cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "");
fprintf(f," MY_CS_COMPILED%s%s%s%s,\n",
cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "",
my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : "");
if (cs->name)
{
......@@ -243,6 +245,28 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
}
/*
  Write the MySQL AB GPL (version 2) copyright banner to "file".
  The banner is emitted as one C comment followed by an empty line.
*/
static void
fprint_copyright(FILE *file)
{
  static const char banner[]=
    "/* Copyright (C) 2000-2007 MySQL AB\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify\n"
    "   it under the terms of the GNU General Public License as published by\n"
    "   the Free Software Foundation; version 2 of the License.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; if not, write to the Free Software\n"
    "   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA */\n"
    "\n";

  /* The text has no conversion specifications, so it is written verbatim */
  fputs(banner, file);
}
int
main(int argc, char **argv __attribute__((unused)))
{
......@@ -283,6 +307,7 @@ main(int argc, char **argv __attribute__((unused)))
"directory:\n");
fprintf(f, " ./conf_to_src ../sql/share/charsets/ > FILE\n");
fprintf(f, "*/\n\n");
fprint_copyright(f);
fprintf(f,"#include <my_global.h>\n");
fprintf(f,"#include <m_ctype.h>\n\n");
......
......@@ -5,7 +5,8 @@
To re-generate, run the following in the strings/ directory:
./conf_to_src ../sql/share/charsets/ > FILE
*/
/* Copyright (C) 2000-2003 MySQL AB
/* Copyright (C) 2000-2007 MySQL AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -6721,7 +6722,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_ascii
{
11,0,0,
MY_CS_COMPILED|MY_CS_PRIMARY,
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_PUREASCII,
"ascii", /* cset name */
"ascii_general_ci", /* coll name */
"", /* comment */
......@@ -7810,7 +7811,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_ascii
{
65,0,0,
MY_CS_COMPILED|MY_CS_BINSORT,
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_PUREASCII,
"ascii", /* cset name */
"ascii_bin", /* coll name */
"", /* comment */
......
......@@ -306,3 +306,89 @@ my_bool my_parse_charset_xml(const char *buf, uint len,
my_xml_parser_free(&p);
return rc;
}
/*
  Detect the repertoire of a string.

  Returns MY_REPERTOIRE_UNICODE30 as soon as a character above 0x7F
  is met, and MY_REPERTOIRE_ASCII when the scan ends without meeting one.

  For character sets with mbminlen == 1 every byte is tested directly.
  For the other character sets the string is decoded with
  cs->cset->mb_wc(); the scan ends at the first position where mb_wc()
  returns a non-positive value, whatever follows is not examined.
*/
uint
my_string_repertoire(CHARSET_INFO *cs, const char *str, ulong length)
{
  const char *cursor= str;
  const char *limit= str + length;

  if (cs->mbminlen != 1)
  {
    my_wc_t code_point;
    int consumed;
    while ((consumed= cs->cset->mb_wc(cs, &code_point, cursor, limit)) > 0)
    {
      if (code_point > 0x7F)
        return MY_REPERTOIRE_UNICODE30;
      cursor+= consumed;
    }
    return MY_REPERTOIRE_ASCII;
  }

  while (cursor < limit)
  {
    if (((uchar) *cursor) > 0x7F)
      return MY_REPERTOIRE_UNICODE30;
    cursor++;
  }
  return MY_REPERTOIRE_ASCII;
}
/*
  Tell whether a character set is ASCII compatible.

  Returns TRUE for:
  - 8bit character sets (mbmaxlen == 1) which have a tab_to_uni table
    mapping 0x7B to '{'
    (this excludes swe7, which maps 0x7B to
    "LATIN LETTER A WITH DIAERESIS")
  - multi-byte character sets with mbminlen == 1
    (this excludes ucs2, whose mbminlen is 2)

  TODO:
  When merging to 5.2, this function should be changed
  to check the new flag MY_CS_NONASCII:
    return (cs->flag & MY_CS_NONASCII) ? 0 : 1;
  That flag was added into 5.2 under terms of WL#3759
  "Optimize identifier conversion in client-server protocol",
  especially to mark character sets not compatible with ASCII.
  The flag won't be backported to 5.0 or 5.1.
  This function is Ok for 5.0 and 5.1, because no new tricky
  character sets are going to be introduced between 5.0 and 5.2.
*/
my_bool
my_charset_is_ascii_based(CHARSET_INFO *cs)
{
  if (cs->mbmaxlen == 1)
  {
    /* 8bit character set: check how '{' maps to Unicode */
    return cs->tab_to_uni && cs->tab_to_uni['{'] == '{';
  }
  /* Multi-byte character set: its shortest character must be one byte */
  return cs->mbminlen == 1 && cs->mbmaxlen > 1;
}
/*
  Detect whether a character set is 8bit and pure ASCII,
  i.e. has no characters outside U+0000..U+007F.

  Returns 0 when the character set has no tab_to_uni table,
  or when any of its 256 entries is above 0x7F; returns 1 otherwise.

  This function is shared between "conf_to_src"
  and the dynamic charsets loader in "mysqld".
*/
my_bool
my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
{
  size_t code= 0;

  if (!cs->tab_to_uni)
    return 0;

  /* Advance while the Unicode mapping stays inside the ASCII range */
  while (code < 256 && cs->tab_to_uni[code] <= 0x7F)
    code++;

  return code == 256;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment