Browse Source

properties: add "ambiguous_width" property for ambiguous East Asian Width (#270)

Some characters have their width defined as "Ambiguous" in UAX#11.
These are typically rendered as single-width by modern monospace fonts,
and utf8proc correctly returns charwidth==1 for these.

However, some applications might need to support older CJK fonts where
characters which were two-byte in legacy encodings were rendered as
double-width. An example of this is the 'ambiwidth' option of vim
and neovim, which supports rendering in terminals using such wideness
rules.

Add an 'ambiguous_width' property to utf8proc_property_t for such characters.
master
bfredl 2 months ago
committed by GitHub
parent
commit
3de4596fbe
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 10
      data/data_generator.jl
  2. 14
      test/charwidth.c
  3. 4
      utf8proc.c
  4. 12
      utf8proc.h
  5. 25745
      utf8proc_data.c

10
data/data_generator.jl

@ -190,6 +190,7 @@ function read_east_asian_widths(filename)
for (rng,widthcode) in read_hex_ranges(filename)
w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
widthcode == "A" ? -1 : # ambiguous width
nothing
if !isnothing(w)
set_all!(ea_widths, rng, w)
@ -221,7 +222,7 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
# Widths from UAX #11: East Asian Width
eaw = get(ea_widths, code, nothing)
if !isnothing(eaw)
width = eaw
width = eaw < 0 ? 1 : eaw
end
# A few exceptional cases, found by manual comparison to other wcwidth
@ -242,6 +243,9 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
return width
end
# Report whether `code` is marked as ambiguous-width in the `ea_widths` table
# captured from the enclosing scope. A negative stored width is the marker for
# an ambiguous entry (presumably East Asian Width class "A" — confirm against
# the table builder); code points absent from the table are not ambiguous.
global function is_ambiguous_width(code)
    eaw = get(ea_widths, code, nothing)
    return !isnothing(eaw) && eaw < 0
end
end
#-------------------------------------------------------------------------------
@ -394,6 +398,7 @@ function char_table_properties!(sequences, char)
control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
!(char.code in (0x200C, 0x200D)),
charwidth = derive_char_width(code, char.category),
ambiguous_width = is_ambiguous_width(code),
boundclass = get_grapheme_boundclass(code),
indic_conjunct_break = get_indic_conjunct_break(code),
)
@ -479,7 +484,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
print(io, """
static const utf8proc_property_t utf8proc_properties[] = {
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
""")
for prop in deduplicated_props
print(io, " {",
@ -498,6 +503,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
prop.ignorable, ", ",
prop.control_boundary, ", ",
prop.charwidth, ", ",
prop.ambiguous_width, ", ",
"0, ", # bitfield padding
c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),

14
test/charwidth.c

@ -25,6 +25,7 @@ int main(int argc, char **argv)
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
int ambiguous = utf8proc_charwidth_ambiguous(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error += 1;
@ -42,6 +43,10 @@ int main(int argc, char **argv)
isprint(c) ? "printable" : "non-printable", c);
error += 1;
}
if (c <= 127 && utf8proc_charwidth_ambiguous(c)) {
fprintf(stderr, "ambiwith set for ASCII %x\n", c);
error += 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error += 1;
@ -50,11 +55,20 @@ int main(int argc, char **argv)
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
error += 1;
}
if (ambiguous && w >= 2) {
fprintf(stderr, "char %x is both doublewidth and ambiguous\n", c);
error += 1;
}
}
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
check(utf8proc_charwidth_ambiguous(0x00ad) , "incorrect ambiguous width for U+00AD (soft hyphen)");
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
check(utf8proc_charwidth_ambiguous(0xe000), "incorrect ambiguous width for U+e000 (PUA)");
check(utf8proc_charwidth_ambiguous(0x00A1), "incorrect ambiguous width for U+00A1 (inverted exclamation mark)");
check(!utf8proc_charwidth_ambiguous(0x00A2), "incorrect ambiguous width for U+00A2 (cent sign)");
/* print some other information by comparing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");

4
utf8proc.c

@ -432,6 +432,10 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
return utf8proc_get_property(c)->charwidth;
}
/* Return the ambiguous_width flag stored in the property record of codepoint c
 * (the flag marks East Asian width class A). */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
  utf8proc_bool ambiguous = utf8proc_get_property(c)->ambiguous_width;
  return ambiguous;
}
/* Return the category field of codepoint c's property record, converted to
 * the utf8proc_category_t enumeration. */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  utf8proc_category_t category = (utf8proc_category_t) utf8proc_get_property(c)->category;
  return category;
}

12
utf8proc.h

@ -268,7 +268,9 @@ typedef struct utf8proc_property_struct {
unsigned control_boundary:1;
/** The width of the codepoint. */
unsigned charwidth:2;
unsigned pad:2;
/** East Asian width class A */
unsigned ambiguous_width:1;
unsigned pad:1;
/**
* Boundclass.
* @see utf8proc_boundclass_t.
@ -667,6 +669,14 @@ UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);
* (analogous to `isprint` or `iscntrl`), use utf8proc_category(). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
/**
* Given a codepoint, return whether it has East Asian width class A (Ambiguous)
*
* Codepoints with this property are considered to have charwidth 1 (if they are printable)
* but some East Asian fonts render them as double width.
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t codepoint);
/**
* Return the Unicode category for the codepoint (one of the
* @ref utf8proc_category_t constants.)

25745
utf8proc_data.c

File diff suppressed because it is too large
Loading…
Cancel
Save