Browse Source

Changes in the validation of UTF-8

All UTF-8 encoding functionality (including the escape
sequence '\u') accepts all values from the original UTF-8
specification (with sequences of up to six bytes).

By default, the decoding functions in the UTF-8 library do not
accept invalid Unicode code points, such as surrogates. A new
parameter 'nonstrict' makes them accept all code points up to
(2^31)-1, as in the original UTF-8 specification.
pull/22/head
Roberto Ierusalimschy 6 years ago
parent
commit
1e0c73d5b6
  1. 2
      llex.c
  2. 6
      lobject.c
  3. 76
      lutf8lib.c
  4. 43
      manual/manual.of
  5. 17
      testes/literals.lua
  6. 90
      testes/utf8.lua

2
llex.c

@ -335,7 +335,7 @@ static unsigned long readutf8esc (LexState *ls) {
while ((save_and_next(ls), lisxdigit(ls->current))) {
i++;
r = (r << 4) + luaO_hexavalue(ls->current);
esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
esccheck(ls, r <= 0x7FFFFFFFu, "UTF-8 value too large");
}
esccheck(ls, ls->current == '}', "missing '}'");
next(ls); /* skip '}' */

6
lobject.c

@ -343,7 +343,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
int luaO_utf8esc (char *buff, unsigned long x) {
int n = 1; /* number of bytes put in buffer (backwards) */
lua_assert(x <= 0x10FFFF);
lua_assert(x <= 0x7FFFFFFFu);
if (x < 0x80) /* ascii? */
buff[UTF8BUFFSZ - 1] = cast_char(x);
else { /* need continuation bytes */
@ -435,9 +435,9 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
pushstr(L, buff, l);
break;
}
case 'U': { /* an 'int' as a UTF-8 sequence */
case 'U': { /* a 'long' as a UTF-8 sequence */
char buff[UTF8BUFFSZ];
int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long)));
int l = luaO_utf8esc(buff, va_arg(argp, long));
pushstr(L, buff + UTF8BUFFSZ - l, l);
break;
}

76
lutf8lib.c

@ -21,12 +21,14 @@
#include "lualib.h"
#define MAXUNICODE 0x10FFFF
#define MAXUNICODE 0x10FFFFu
#define MAXUTF 0x7FFFFFFFu
/*
** Integer type for decoded UTF-8 values; MAXUNICODE needs 21 bits.
** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
*/
#if LUAI_BITSINT >= 21
#if LUAI_BITSINT >= 31
typedef unsigned int utfint;
#else
typedef unsigned long utfint;
@ -46,38 +48,46 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
/*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
** Decode one UTF-8 sequence, returning NULL if byte sequence is
** invalid. The array 'limits' stores the minimum value for each
** sequence length, to check for overlong representations. Its first
** entry forces an error for non-ascii bytes with no continuation
** bytes (count == 0).
*/
static const char *utf8_decode (const char *o, utfint *val) {
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
const unsigned char *s = (const unsigned char *)o;
unsigned int c = s[0];
static const char *utf8_decode (const char *s, utfint *val, int strict) {
static const utfint limits[] =
{~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
unsigned int c = (unsigned char)s[0];
utfint res = 0; /* final result */
if (c < 0x80) /* ascii? */
res = c;
else {
int count = 0; /* to count number of continuation bytes */
while (c & 0x40) { /* still have continuation bytes? */
int cc = s[++count]; /* read next byte */
for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
unsigned int cc = (unsigned char)s[++count]; /* read next byte */
if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
return NULL; /* invalid byte sequence */
res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
c <<= 1; /* to test next bit */
}
res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
if (count > 3 || res > MAXUNICODE || res <= limits[count])
if (count > 5 || res > MAXUTF || res < limits[count])
return NULL; /* invalid byte sequence */
s += count; /* skip continuation bytes read */
}
if (strict) {
/* check for invalid code points; too large or surrogates */
if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
return NULL;
}
if (val) *val = res;
return (const char *)s + 1; /* +1 to include first byte */
return s + 1; /* +1 to include first byte */
}
/*
** utf8len(s [, i [, j]]) --> number of characters that start in the
** range [i,j], or nil + current position if 's' is not well formed in
** that interval
** utf8len(s [, i [, j [, nonstrict]]]) --> number of characters that
** start in the range [i,j], or nil + current position if 's' is not
** well formed in that interval
*/
static int utflen (lua_State *L) {
lua_Integer n = 0; /* counter for the number of characters */
@ -85,12 +95,13 @@ static int utflen (lua_State *L) {
const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
int nonstrict = lua_toboolean(L, 4);
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
"initial position out of string");
luaL_argcheck(L, --posj < (lua_Integer)len, 3,
"final position out of string");
while (posi <= posj) {
const char *s1 = utf8_decode(s + posi, NULL);
const char *s1 = utf8_decode(s + posi, NULL, !nonstrict);
if (s1 == NULL) { /* conversion error? */
lua_pushnil(L); /* return nil ... */
lua_pushinteger(L, posi + 1); /* ... and current position */
@ -105,14 +116,15 @@ static int utflen (lua_State *L) {
/*
** codepoint(s, [i, [j]]) -> returns codepoints for all characters
** that start in the range [i,j]
** codepoint(s, [i, [j [, nonstrict]]]) -> returns codepoints for all
** characters that start in the range [i,j]
*/
static int codepoint (lua_State *L) {
size_t len;
const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
int nonstrict = lua_toboolean(L, 4);
int n;
const char *se;
luaL_argcheck(L, posi >= 1, 2, "out of range");
@ -126,7 +138,7 @@ static int codepoint (lua_State *L) {
se = s + pose; /* string end */
for (s += posi - 1; s < se;) {
utfint code;
s = utf8_decode(s, &code);
s = utf8_decode(s, &code, !nonstrict);
if (s == NULL)
return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, code);
@ -137,8 +149,8 @@ static int codepoint (lua_State *L) {
static void pushutfchar (lua_State *L, int arg) {
lua_Integer code = luaL_checkinteger(L, arg);
luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
lua_pushfstring(L, "%U", (long)code);
}
@ -209,7 +221,7 @@ static int byteoffset (lua_State *L) {
}
static int iter_aux (lua_State *L) {
static int iter_aux (lua_State *L, int strict) {
size_t len;
const char *s = luaL_checklstring(L, 1, &len);
lua_Integer n = lua_tointeger(L, 2) - 1;
@ -223,8 +235,8 @@ static int iter_aux (lua_State *L) {
return 0; /* no more codepoints */
else {
utfint code;
const char *next = utf8_decode(s + n, &code);
if (next == NULL || iscont(next))
const char *next = utf8_decode(s + n, &code, strict);
if (next == NULL)
return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, n + 1);
lua_pushinteger(L, code);
@ -233,9 +245,19 @@ static int iter_aux (lua_State *L) {
}
/*
** Iteration step pushed by 'iter_codes' when 'nonstrict' is false.
** Delegates to 'iter_aux' with 'strict' set, so that decoding rejects
** surrogates and values larger than MAXUNICODE.
*/
static int iter_auxstrict (lua_State *L) {
return iter_aux(L, 1);
}
/*
** Iteration step pushed by 'iter_codes' when 'nonstrict' is true.
** Delegates to 'iter_aux' with 'strict' cleared, so that decoding skips
** the extra check for surrogates and values larger than MAXUNICODE.
*/
static int iter_auxnostrict (lua_State *L) {
return iter_aux(L, 0);
}
static int iter_codes (lua_State *L) {
int nonstrict = lua_toboolean(L, 2);
luaL_checkstring(L, 1);
lua_pushcfunction(L, iter_aux);
lua_pushcfunction(L, nonstrict ? iter_auxnostrict : iter_auxstrict);
lua_pushvalue(L, 1);
lua_pushinteger(L, 0);
return 3;
@ -243,7 +265,7 @@ static int iter_codes (lua_State *L) {
/* pattern to match a single UTF-8 character */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
static const luaL_Reg funcs[] = {

43
manual/manual.of

@ -1004,6 +1004,8 @@ the escape sequence @T{\u{@rep{XXX}}}
(note the mandatory enclosing brackets),
where @rep{XXX} is a sequence of one or more hexadecimal digits
representing the character code point.
This code point can be any value smaller than @M{2@sp{31}}.
(Lua uses the original UTF-8 specification here.)
Literal strings can also be defined using a long format
enclosed by @def{long brackets}.
@ -6899,6 +6901,7 @@ x = string.gsub("$name-$version.tar.gz", "%$(%w+)", t)
}
@LibEntry{string.len (s)|
Receives a string and returns its length.
The empty string @T{""} has length 0.
Embedded zeros are counted,
@ -6907,6 +6910,7 @@ so @T{"a\000bc\000"} has length 5.
}
@LibEntry{string.lower (s)|
Receives a string and returns a copy of this string with all
uppercase letters changed to lowercase.
All other characters are left unchanged.
@ -6915,6 +6919,7 @@ The definition of what an uppercase letter is depends on the current locale.
}
@LibEntry{string.match (s, pattern [, init])|
Looks for the first @emph{match} of
@id{pattern} @see{pm} in the string @id{s}.
If it finds one, then @id{match} returns
@ -6946,6 +6951,7 @@ The format string cannot have the variable-length options
}
@LibEntry{string.rep (s, n [, sep])|
Returns a string that is the concatenation of @id{n} copies of
the string @id{s} separated by the string @id{sep}.
The default value for @id{sep} is the empty string
@ -6958,11 +6964,13 @@ with a single call to this function.)
}
@LibEntry{string.reverse (s)|
Returns a string that is the string @id{s} reversed.
}
@LibEntry{string.sub (s, i [, j])|
Returns the substring of @id{s} that
starts at @id{i} and continues until @id{j};
@id{i} and @id{j} can be negative.
@ -6998,6 +7006,7 @@ this function also returns the index of the first unread byte in @id{s}.
}
@LibEntry{string.upper (s)|
Receives a string and returns a copy of this string with all
lowercase letters changed to uppercase.
All other characters are left unchanged.
@ -7318,8 +7327,24 @@ or one plus the length of the subject string.
As in the string library,
negative indices count from the end of the string.
Functions that create byte sequences
accept all values up to @T{0x7FFFFFFF},
as defined in the original UTF-8 specification;
that implies byte sequences of up to six bytes.
Functions that interpret byte sequences only accept
valid sequences (well formed and not overlong).
By default, they only accept byte sequences
that result in valid Unicode code points,
rejecting values larger than @T{0x10FFFF} and surrogates.
A boolean argument @id{nonstrict}, when available,
lifts these checks,
so that all values up to @T{0x7FFFFFFF} are accepted.
(Ill-formed sequences and overlong sequences are still rejected.)
@LibEntry{utf8.char (@Cdots)|
Receives zero or more integers,
converts each one to its corresponding UTF-8 byte sequence
and returns a string with the concatenation of all these sequences.
@ -7327,14 +7352,15 @@ and returns a string with the concatenation of all these sequences.
}
@LibEntry{utf8.charpattern|
The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xF4][\x80-\xBF]*}
The pattern (a string, not a function) @St{[\0-\x7F\xC2-\xFD][\x80-\xBF]*}
@see{pm},
which matches exactly one UTF-8 byte sequence,
assuming that the subject is a valid UTF-8 string.
}
@LibEntry{utf8.codes (s)|
@LibEntry{utf8.codes (s [, nonstrict])|
Returns values so that the construction
@verbatim{
@ -7347,7 +7373,8 @@ It raises an error if it meets any invalid byte sequence.
}
@LibEntry{utf8.codepoint (s [, i [, j]])|
@LibEntry{utf8.codepoint (s [, i [, j [, nonstrict]]])|
Returns the codepoints (as integers) from all characters in @id{s}
that start between byte position @id{i} and @id{j} (both included).
The default for @id{i} is 1 and for @id{j} is @id{i}.
@ -7355,7 +7382,8 @@ It raises an error if it meets any invalid byte sequence.
}
@LibEntry{utf8.len (s [, i [, j]])|
@LibEntry{utf8.len (s [, i [, j [, nonstrict]]])|
Returns the number of UTF-8 characters in string @id{s}
that start between positions @id{i} and @id{j} (both inclusive).
The default for @id{i} is @num{1} and for @id{j} is @num{-1}.
@ -7365,6 +7393,7 @@ returns a false value plus the position of the first invalid byte.
}
@LibEntry{utf8.offset (s, n [, i])|
Returns the position (in bytes) where the encoding of the
@id{n}-th character of @id{s}
(counting from position @id{i}) starts.
@ -8755,6 +8784,12 @@ You can enclose the call in parentheses if you need to
discard these extra results.
}
@item{
By default, the decoding functions in the @Lid{utf8} library
do not accept surrogates as valid code points.
An extra parameter in these functions makes them more permissive.
}
}
}

17
testes/literals.lua

@ -56,16 +56,23 @@ assert("abc\z
assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
-- limits for 1-byte sequences
assert("\u{0}\u{7F}" == "\x00\z\x7F")
assert("\u{0}\u{7F}" == "\x00\x7F")
-- limits for 2-byte sequences
assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF")
assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
-- limits for 3-byte sequences
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF")
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF")
-- limits for 4-byte sequences
assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF")
assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
-- limits for 5-byte sequences
assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
-- limits for 6-byte sequences
assert("\u{4000000}\u{7FFFFFFF}" ==
"\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
-- Error in escape sequences
@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
lexerror([[" \256"]], [[\256"]])
-- errors in UTF-8 sequences
lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large
lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large
lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{'
lexerror([["abc\u"]], [[abc\u"]]) -- missing '{'
lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}'

90
testes/utf8.lua

@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
-- 't' is the list of codepoints of 's'
local function checksyntax (s, t)
  -- collect one "\u{XXX}" escape per code point in 't'
  local escapes = {}
  for idx = 1, #t do
    escapes[idx] = string.format("\\u{%x}", t[idx])
  end
  -- source of a chunk of the form "return '\u{t[1]}...\u{t[n]}'"
  local source = "return '" .. table.concat(escapes) .. "'"
  -- the chunk must compile, and running it must produce exactly 's'
  local chunk = assert(load(source))
  assert(chunk() == s)
end
assert(utf8.offset("alo", 5) == nil)
assert(utf8.offset("alo", -4) == nil)
-- 't' is the list of codepoints of 's'
local function check (s, t)
local l = utf8.len(s)
-- 'check' makes several tests over the validity of string 's'.
-- 't' is the list of codepoints of 's'.
local function check (s, t, nonstrict)
local l = utf8.len(s, 1, -1, nonstrict)
assert(#t == l and len(s) == l)
assert(utf8.char(table.unpack(t)) == s)
assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent
assert(utf8.offset(s, 0) == 1)
checksyntax(s, t)
local t1 = {utf8.codepoint(s, 1, -1)}
-- creates new table with all codepoints of 's'
local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
assert(#t == #t1)
for i = 1, #t do assert(t[i] == t1[i]) end
for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
for i = 1, l do
for i = 1, l do -- for all codepoints
local pi = utf8.offset(s, i) -- position of i-th char
local pi1 = utf8.offset(s, 2, pi) -- position of next char
assert(string.find(string.sub(s, pi, pi1 - 1), justone))
assert(utf8.offset(s, -1, pi1) == pi)
assert(utf8.offset(s, i - l - 1) == pi)
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi)))
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
for j = pi, pi1 - 1 do
assert(utf8.offset(s, 0, j) == pi)
end
for j = pi + 1, pi1 - 1 do
assert(not utf8.len(s, j))
end
assert(utf8.len(s, pi, pi) == 1)
assert(utf8.len(s, pi, pi1 - 1) == 1)
assert(utf8.len(s, pi) == l - i + 1)
assert(utf8.len(s, pi1) == l - i)
assert(utf8.len(s, 1, pi) == i)
assert(utf8.len(s, pi, pi, nonstrict) == 1)
assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
-- 'nonstrict' must be the 4th argument of utf8.len; a stray '-1' in that
-- slot is truthy and would always select non-strict decoding
assert(utf8.len(s, 1, pi, nonstrict) == i)
end
local i = 0
for p, c in utf8.codes(s) do
i = i + 1
assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p) == c)
end
assert(i == #t)
i = 0
for p, c in utf8.codes(s) do
for p, c in utf8.codes(s, nonstrict) do
i = i + 1
assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p, p, nonstrict) == c)
end
assert(i == #t)
@ -105,13 +102,17 @@ do -- error indication in utf8.len
check("\xF4\x9F\xBF\xBF", 1)
end
-- error in utf8.codes
checkerror("invalid UTF%-8 code",
function ()
local s = "ab\xff"
for c in utf8.codes(s) do assert(c) end
end)
-- errors in utf8.codes
do
  -- traversing 's' with utf8.codes must raise an "invalid UTF-8 code" error
  local function errorcodes (s)
    local function traverse ()
      for c in utf8.codes(s) do assert(c) end
    end
    checkerror("invalid UTF%-8 code", traverse)
  end

  errorcodes("ab\xff")
  errorcodes("\u{110000}")
end
-- error in initial position for offset
checkerror("position out of range", utf8.offset, "abc", 1, 5)
@ -141,14 +142,22 @@ do
assert(#t == 0)
checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1)
checkerror("out of range", utf8.codepoint, s, 1, #s + 1)
-- surrogates
assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
end
assert(utf8.char() == "")
assert(utf8.char(97, 98, 99) == "abc")
assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
checkerror("value out of range", utf8.char, 0x10FFFF + 1)
checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
checkerror("value out of range", utf8.char, -1)
local function invalid (s)
checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@ -158,6 +167,10 @@ end
-- UTF-8 representation for 0x11ffff (value out of valid range)
invalid("\xF4\x9F\xBF\xBF")
-- surrogates
invalid("\u{D800}")
invalid("\u{DFFF}")
-- overlong sequences
invalid("\xC0\x80") -- zero
invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte)
@ -183,6 +196,21 @@ s = "\0 \x7F\z
s = string.gsub(s, " ", "")
check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
do
  -- original UTF-8 values: each entry holds a string, its expected size
  -- in bytes, and the list of its code points
  local samples = {
    {"\u{4000000}\u{7FFFFFFF}", 12, {0x4000000, 0x7FFFFFFF}},
    {"\u{200000}\u{3FFFFFF}", 10, {0x200000, 0x3FFFFFF}},
    {"\u{10000}\u{1fffff}", 8, {0x10000, 0x1FFFFF}},
  }
  for _, sample in ipairs(samples) do
    local str, size, codes = sample[1], sample[2], sample[3]
    assert(#str == size)
    check(str, codes, true)   -- these strings need the non-strict mode
  end
end
x = "日本語a-4\0éó"
check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})

Loading…
Cancel
Save