Browse Source

add utf8proc_iterate_graphemes

pull/213/head
Nils Blomqvist 4 years ago
parent
commit
c530ec926e
  1. 29
      test/graphemetest.c
  2. 29
      utf8proc.c
  3. 19
      utf8proc.h

29
test/graphemetest.c

@ -81,6 +81,35 @@ void checkline(const char *_buf, bool verbose) {
} while (i < si);
}
/* Exercise utf8proc_iterate_graphemes on the same test string: strip the '/'
   break markers from src while remembering where they were, then verify that
   every cluster the iterator reports starts and ends on remembered offsets. */
if (si) { /* test calls to utf8proc_iterate_graphemes */
/* note and remove break indicators */
/* NOTE(review): breaks[] has 16 slots and j is never bounds-checked; a line
   with more than 15 markers would write past the array. */
int breaks[16];
int a = 0, i = 0, j = 0;
while (i < si) {
if (src[i] == '/') {
/* Earlier markers are already deleted, so i is the offset in the
   marker-free string. */
breaks[j++] = i;
a = i;
/* Shift the tail left by one byte to delete the marker. The last iteration
   reads src[si] -- presumably the terminator byte; confirm against the code
   that fills src. */
while (a < si) {
src[a] = src[a+1];
a++;
}
/* a == si here, so this shortens the string by the one removed byte. */
si = a-1;
}
/* NOTE(review): after a deletion the byte that moved into position i is
   skipped, so two adjacent '/' markers would not both be removed. */
i++;
}
/* The end of the string is the final expected boundary. */
breaks[j++] = si;
int k = 0;
int read_bytes = 0;
/* NOTE(review): the prototype takes utf8proc_int32_t* for read_bytes and
   utf8proc_uint32_t* for start/end, but plain int objects are passed here. */
int start, end;
/* NOTE(review): any non-zero return continues the loop, including the negative
   error codes the function is documented to return. */
while ( utf8proc_iterate_graphemes(src, &read_bytes, si, &start, &end) ) {
/* Cluster k must span exactly breaks[k] .. breaks[k+1]. */
check(breaks[k] == start, "expected grapheme start not found");
check(breaks[k+1] == end, "expected grapheme end not found");
k++;
}
}
if (verbose)
printf("passed grapheme test: \"%s\"\n", (char*) src);
}

29
utf8proc.c

@ -170,6 +170,35 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
return 4;
}
/*
 * Locate the next extended grapheme cluster of `str`, beginning at byte
 * offset `*read_bytes`.
 *
 * str        - UTF-8 input.
 * read_bytes - in/out cursor; set to 0 before the first call and pass the
 *              same object to every subsequent call.
 * strlen     - total length of `str` in bytes (must be non-negative).
 * start, end - receive the byte range [*start, *end) of the cluster.
 *
 * Returns 1 when a cluster was found (`*read_bytes` is advanced to `*end`),
 * 0 when the cursor is already at or past `strlen` (nothing is written), or
 * the negative value returned by utf8proc_iterate() when decoding fails. On
 * failure `*start` has been written, `*end` has not, and `*read_bytes` is left
 * at the offset of the sequence that could not be decoded.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
  const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
  utf8proc_uint32_t *start, utf8proc_uint32_t *end
) {
  utf8proc_int32_t codepoint = 0, prev_codepoint = 0;
  utf8proc_int32_t break_state = 0;
  utf8proc_int32_t pos = *read_bytes;
  /* Explicit flag rather than "prev_codepoint != 0": U+0000 is a legitimate
     codepoint, and using it as a sentinel glued it to the following character
     and looped forever when it was the last codepoint of the input. */
  int have_prev = 0;

  /* ">=" (not "==") so that a cursor past the end can never turn into a
     negative remaining length handed to utf8proc_iterate(). */
  if (pos >= strlen)
    return 0;
  *start = (utf8proc_uint32_t)pos;

  while (pos < strlen) {
    utf8proc_ssize_t n = utf8proc_iterate(str + pos, strlen - pos, &codepoint);
    if (codepoint == -1) {
      /* Decoding failed: report the decoder's error, cursor on the bad bytes. */
      *read_bytes = pos;
      return n;
    }
    /* A boundary before `codepoint` ends the cluster; the codepoint is left
       unconsumed so that the next call starts with it. */
    if (have_prev &&
        utf8proc_grapheme_break_stateful(prev_codepoint, codepoint, &break_state))
      break;
    prev_codepoint = codepoint;
    have_prev = 1;
    pos += (utf8proc_int32_t)n;
  }

  /* Reached either via a detected boundary or by running out of input; the
     end of the text always terminates the current cluster. */
  *read_bytes = pos;
  *end = (utf8proc_uint32_t)pos; /* one past the last byte of the cluster */
  return 1;
}
/* Report whether `uc` is a Unicode scalar value: below 0x110000 and outside
   the surrogate range 0xD800..0xDFFF. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  const utf8proc_uint32_t cp = (utf8proc_uint32_t)uc;

  /* Negative inputs become large unsigned values and are rejected here. */
  if (cp >= 0x110000)
    return 0;
  /* Unsigned wrap-around turns the two-sided surrogate range test into a
     single comparison. */
  if (cp - 0xd800 <= 0x07ff)
    return 0;
  return 1;
}

19
utf8proc.h

@ -613,6 +613,25 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
/**
 * Given the UTF-8 string `str`, produce the starting and ending byte indices
 * of the next extended grapheme cluster (as defined by UAX#29).
 *
 * Each call reports one cluster. To walk a whole string, call this function
 * repeatedly with the same `read_bytes` object until it stops returning 1.
 *
 * @param str        The UTF-8 string to read from.
 * @param read_bytes In/out cursor: the number of bytes of `str` consumed so
 *                   far. Must be set to 0 before the first call. After a
 *                   return value of 1 it equals `*end`.
 * @param strlen     The length of `str` in bytes; reading never goes beyond
 *                   this many bytes from the start of `str`.
 * @param start      Receives the index of the first byte of the cluster.
 * @param end        Receives the index one past the last byte of the cluster.
 *
 * @return 1 if a grapheme cluster was found; 0 once `strlen` bytes have been
 *         read (`*start` and `*end` are then left untouched); or the negative
 *         error code reported by utf8proc_iterate() if the input could not be
 *         decoded, in which case `*start` has been written but `*end` has not.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate_graphemes(
const utf8proc_uint8_t *str, utf8proc_int32_t *read_bytes, utf8proc_ssize_t strlen,
utf8proc_uint32_t *start, utf8proc_uint32_t *end);
/**
* Given a codepoint `c`, return the codepoint of the corresponding

Loading…
Cancel
Save