Commit 1a260c78 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] knfsd: Improve UTF8 checking.

From: NeilBrown <neilb@cse.unsw.edu.au>

From: Fred.  We don't do all the utf8 checking we could in the kernel, but we
do some simple checks.  Implement slightly stricter, and probably more
efficient, checking.
parent c02c0886
......@@ -58,93 +58,94 @@
#define NFSDDBG_FACILITY NFSDDBG_XDR
/*
* From Peter Astrand <peter@cendio.se>: The following routines check
* whether a filename supplied by the client is valid.
*/
static const char trailing_bytes_for_utf8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
static const char utf8_byte_len[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
};
static inline int
is_legal_iso_utf8_sequence(unsigned char *source, int length)
is_legal_utf8_sequence(unsigned char *source, int length)
{
unsigned char a;
unsigned char *srcptr;
unsigned char *ptr;
unsigned char c;
srcptr = source + length;
if (length==1) return 1;
switch (length) {
/* Everything else falls through when "1"... */
/* Check for overlong sequence, and check second byte */
c = *(source + 1);
switch (*source) {
case 0xE0: /* 3 bytes */
if ( c < 0xA0 ) return 0;
break;
case 0xF0: /* 4 bytes */
if ( c < 0x90 ) return 0;
break;
case 0xF8: /* 5 bytes */
if ( c < 0xC8 ) return 0;
break;
case 0xFC: /* 6 bytes */
if ( c < 0x84 ) return 0;
break;
default:
/* Sequences with more than 6 bytes are invalid */
return 0;
/*
Byte 3-6 must be 80..BF
*/
case 6:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 5:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 2:
a = *--srcptr;
/* Upper limit */
if (a > 0xBF)
/* 2nd byte may never be > 0xBF */
return 0;
if ( (c & 0xC0) != 0x80) return 0;
}
/*
Lower limits checks, to detect non-shortest forms.
No fall-through in this inner switch.
*/
switch (*source) {
case 0xE0: /* 3 bytes */
if (a < 0xA0) return 0;
break;
case 0xF0: /* 4 bytes */
if (a < 0x90) return 0;
break;
case 0xF8: /* 5 bytes */
if (a < 0xC8) return 0;
break;
case 0xFC: /* 6 bytes */
if (a < 0x84) return 0;
break;
default:
/* In all cases, 2nd byte must be >= 0x80 (because leading
10...) */
if (a < 0x80) return 0;
}
/* Check that trailing bytes look like 10xxxxxx */
for (ptr = source++ + length - 1; ptr>source; ptr--)
if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
return 1;
}
case 1:
/* Invalid ranges */
if (*source >= 0x80 && *source < 0xC2)
/* Multibyte char with value < 0xC2, non-shortest */
return 0;
if (*source > 0xFD)
/* Leading byte starting with 11111110 is illegal */
return 0;
if (!*source)
return 0;
/*
 * Screen one UTF-8 sequence for code points that are refused.  This is
 * NOT a comprehensive check.
 *
 * source: points at the lead byte of the sequence.
 * length: byte length of the sequence.  It is not consulted here; the
 *         lead byte alone decides which further bytes are examined.
 *
 * The caller must already have established that source/length describe a
 * well-formed UTF-8 sequence.
 *
 * Returns 1 if the character is acceptable, 0 if it falls in one of the
 * rejected ranges: U+D800-U+F8FF, U+FFF9-U+FFFF, or U+F0000 and above.
 */
static int
is_allowed_utf8_char(unsigned char *source, int length)
{
	/* Disallow F0000 and up (in utf8, F3B08080): any lead byte above F3 */
	if (*source > 0xF3)
		return 0;

	/*
	 * The second and third bytes are fetched only under lead bytes that
	 * imply they exist (ED/EE/EF start 3-byte sequences, F3 a 4-byte
	 * one).  Reading source[1] unconditionally would touch memory past a
	 * single-byte character that sits at the very end of the buffer.
	 */
	switch (*source) {
	case 0xF3:
		/* F3 B0 80 80 is U+F0000; everything from there up is refused */
		if (source[1] >= 0xB0)
			return 0;
		break;
	/* Disallow D800-F8FF (in utf8, EDA080-EFA3BF) */
	case 0xED:
		if (source[1] >= 0xA0)
			return 0;
		break;
	case 0xEE:
		/* Every EE xx xx sequence (U+E000-U+EFFF) is inside the range */
		return 0;
	case 0xEF:
		if (source[1] <= 0xA3)
			return 0;
		/* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
		if (source[1] == 0xBF) {
			/* Don't need to check <=0xBF, since valid utf8 */
			if (source[2] >= 0xB9)
				return 0;
		}
		break;
	default:
		/* No screening applies to any other lead byte */
		break;
	}
	return 1;
}
/* This routine should really check to see that the proper stringprep
* mappings have been applied. Instead, we do a simple screen of some
* of the more obvious illegal values by calling is_allowed_utf8_char.
* This will allow many illegal strings through, but if a client behaves,
* it will get full functionality. The other option (apart from full
* stringprep checking) is to limit everything to an easily handled subset,
* such as 7-bit ascii.
*
* Note - currently calling routines ignore return value except as boolean.
*/
static int
check_utf8(char *str, int len)
{
......@@ -155,11 +156,17 @@ check_utf8(char *str, int len)
sourceend = str + len;
while (chunk < sourceend) {
chunklen = trailing_bytes_for_utf8[*chunk]+1;
chunklen = utf8_byte_len[*chunk];
if (!chunklen)
return nfserr_inval;
if (chunk + chunklen > sourceend)
return nfserr_inval;
if (!is_legal_iso_utf8_sequence(chunk, chunklen))
if (!is_legal_utf8_sequence(chunk, chunklen))
return nfserr_inval;
if (!is_allowed_utf8_char(chunk, chunklen))
return nfserr_inval;
if ( (chunklen==1) && (!*chunk) )
return nfserr_inval; /* Disallow embedded nulls */
chunk += chunklen;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment