From 814213b65fa4ab2b1a7216d06f68a6f3df89efcd Mon Sep 17 00:00:00 2001 From: Roberto Ierusalimschy Date: Mon, 27 May 2024 11:29:39 -0300 Subject: [PATCH] utf8.offset returns also final position of character 'utf8.offset' returns two values: the initial and the final position of the given character. --- lutf8lib.c | 20 ++++++++++++++------ manual/manual.of | 22 ++++++++++++++-------- testes/utf8.lua | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/lutf8lib.c b/lutf8lib.c index 3a5b9bc3..7b747937 100644 --- a/lutf8lib.c +++ b/lutf8lib.c @@ -181,8 +181,8 @@ static int utfchar (lua_State *L) { /* -** offset(s, n, [i]) -> index where n-th character counting from -** position 'i' starts; 0 means character at 'i'. +** offset(s, n, [i]) -> indices where n-th character counting from +** position 'i' starts and ends; 0 means character at 'i'. */ static int byteoffset (lua_State *L) { size_t len; @@ -217,11 +217,19 @@ static int byteoffset (lua_State *L) { } } } - if (n == 0) /* did it find given character? */ - lua_pushinteger(L, posi + 1); - else /* no such character */ + if (n != 0) { /* did not find given character? */ luaL_pushfail(L); - return 1; + return 1; + } + lua_pushinteger(L, posi + 1); /* initial position */ + if ((s[posi] & 0x80) != 0) { /* multi-byte character? */ + do { + posi++; + } while (iscontp(s + posi + 1)); /* skip to final byte */ + } + /* else one-byte character: final position is the initial one */ + lua_pushinteger(L, posi + 1); /* 'posi' now is the final position */ + return 2; } diff --git a/manual/manual.of b/manual/manual.of index f830b01c..359bd166 100644 --- a/manual/manual.of +++ b/manual/manual.of @@ -7958,21 +7958,27 @@ returns @fail plus the position of the first invalid byte. @LibEntry{utf8.offset (s, n [, i])| -Returns the position (in bytes) where the encoding of the -@id{n}-th character of @id{s} -(counting from position @id{i}) starts. 
+Returns the position of the @id{n}-th character of @id{s} +(counting from byte position @id{i}) as two integers: +The index (in bytes) where its encoding starts and the +index (in bytes) where it ends. + +If the specified character is right after the end of @id{s}, +the function behaves as if there was a @Char{\0} there. +If the specified character is neither in the subject +nor right after its end, +the function returns @fail. + A negative @id{n} gets characters before position @id{i}. The default for @id{i} is 1 when @id{n} is non-negative and @T{#s + 1} otherwise, so that @T{utf8.offset(s, -n)} gets the offset of the @id{n}-th character from the end of the string. -If the specified character is neither in the subject -nor right after its end, -the function returns @fail. As a special case, -when @id{n} is 0 the function returns the start of the encoding -of the character that contains the @id{i}-th byte of @id{s}. +when @id{n} is 0 the function returns the start and end +of the encoding of the character that contains the +@id{i}-th byte of @id{s}. This function assumes that @id{s} is a valid UTF-8 string. 
diff --git a/testes/utf8.lua b/testes/utf8.lua index efadbd5c..dc0f2f09 100644 --- a/testes/utf8.lua +++ b/testes/utf8.lua @@ -52,25 +52,35 @@ local function check (s, t, nonstrict) for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1' for i = 1, l do -- for all codepoints - local pi = utf8.offset(s, i) -- position of i-th char + local pi, pie = utf8.offset(s, i) -- position of i-th char local pi1 = utf8.offset(s, 2, pi) -- position of next char + assert(pi1 == pie + 1) assert(string.find(string.sub(s, pi, pi1 - 1), justone)) assert(utf8.offset(s, -1, pi1) == pi) assert(utf8.offset(s, i - l - 1) == pi) assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict))) for j = pi, pi1 - 1 do - assert(utf8.offset(s, 0, j) == pi) + local off1, off2 = utf8.offset(s, 0, j) + assert(off1 == pi and off2 == pi1 - 1) end for j = pi + 1, pi1 - 1 do assert(not utf8.len(s, j)) end - assert(utf8.len(s, pi, pi, nonstrict) == 1) - assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) - assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) - assert(utf8.len(s, pi1, -1, nonstrict) == l - i) - assert(utf8.len(s, 1, pi, nonstrict) == i) + assert(utf8.len(s, pi, pi, nonstrict) == 1) + assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1) + assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1) + assert(utf8.len(s, pi1, -1, nonstrict) == l - i) + assert(utf8.len(s, 1, pi, nonstrict) == i) end + local expected = 1 -- expected position of "current" character + for i = 1, l + 1 do + local p, e = utf8.offset(s, i) + assert(p == expected) + expected = e + 1 + end + assert(expected - 1 == #s + 1) + local i = 0 for p, c in utf8.codes(s, nonstrict) do i = i + 1 @@ -94,20 +104,20 @@ end do -- error indication in utf8.len - local function check (s, p) + local function checklen (s, p) local a, b = utf8.len(s) assert(not a and b == p) end - check("abc\xE3def", 4) - check("\xF4\x9F\xBF", 1) - check("\xF4\x9F\xBF\xBF", 1) + checklen("abc\xE3def", 4) + checklen("\xF4\x9F\xBF", 1) + 
checklen("\xF4\x9F\xBF\xBF", 1) -- spurious continuation bytes - check("汉字\x80", #("汉字") + 1) - check("\x80hello", 1) - check("hel\x80lo", 4) - check("汉字\xBF", #("汉字") + 1) - check("\xBFhello", 1) - check("hel\xBFlo", 4) + checklen("汉字\x80", #("汉字") + 1) + checklen("\x80hello", 1) + checklen("hel\x80lo", 4) + checklen("汉字\xBF", #("汉字") + 1) + checklen("\xBFhello", 1) + checklen("hel\xBFlo", 4) end -- errors in utf8.codes