properties: add "ambiguous_width" property for ambiguous East Asian Width (#270)

Some characters have their width defined as "Ambiguous" in UAX#11. These are typically rendered as single-width by modern monospace fonts, and utf8proc correctly returns charwidth==1 for these. However, some applications might need to support older CJK fonts, in which characters that were two-byte in legacy encodings were rendered as double-width. An example of this is the 'ambiwidth' option of vim and neovim, which supports rendering in terminals that use such width rules. Add an 'ambiguous_width' property to utf8proc_property_t for such characters.
2 months ago · 3de4596fbe
5 changed files with 12923 additions and 12862 deletions
--- a/data/data_generator.jl
+++ b/data/data_generator.jl
@ -190,6 +190,7 @@ function read_east_asian_widths(filename)
    for (rng,widthcode) in read_hex_ranges(filename)
        w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
            widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
+            widthcode == "A"  ? -1 : # ambiguous width
            nothing
        if !isnothing(w)
            set_all!(ea_widths, rng, w)
@ -221,7 +222,7 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
        # Widths from UAX #11: East Asian Width
        eaw = get(ea_widths, code, nothing)
        if !isnothing(eaw)
-            width = eaw
+            width = eaw < 0 ? 1 : eaw
        end

        # A few exceptional cases, found by manual comparison to other wcwidth
@ -242,6 +243,9 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")

        return width
    end
+    global function is_ambiguous_width(code)
+        return get(ea_widths, code, 0) < 0
+    end
 end

 #-------------------------------------------------------------------------------
@ -394,6 +398,7 @@ function char_table_properties!(sequences, char)
        control_boundary     = char.category in ("Zl", "Zp", "Cc", "Cf") &&
                               !(char.code in (0x200C, 0x200D)),
        charwidth            = derive_char_width(code, char.category),
+        ambiguous_width      = is_ambiguous_width(code),
        boundclass           = get_grapheme_boundclass(code),
        indic_conjunct_break = get_indic_conjunct_break(code),
    )
@ -479,7 +484,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup

    print(io, """
        static const utf8proc_property_t utf8proc_properties[] = {
-          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
+          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
        """)
    for prop in deduplicated_props
        print(io, "  {",
@ -498,6 +503,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
              prop.ignorable, ", ",
              prop.control_boundary, ", ",
              prop.charwidth, ", ",
+              prop.ambiguous_width, ", ",
              "0, ", # bitfield padding
              c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
              c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
--- a/test/charwidth.c
+++ b/test/charwidth.c
@ -25,6 +25,7 @@ int main(int argc, char **argv)
    for (c = 0; c <= 0x110000; ++c) {
        int cat = utf8proc_get_property(c)->category;
        int w = utf8proc_charwidth(c);
+        int ambiguous = utf8proc_charwidth_ambiguous(c);
        if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
            fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
            error += 1;
@ -42,6 +43,10 @@ int main(int argc, char **argv)
            isprint(c) ? "printable" : "non-printable", c);
            error += 1;
        }
+        if (c <= 127 && utf8proc_charwidth_ambiguous(c)) {
+            fprintf(stderr, "ambiwith set for ASCII %x\n", c);
+            error += 1;
+        }
        if (!my_isprint(c) && w > 0) {
            fprintf(stderr, "non-printing %x had width %d\n", c, w);
            error += 1;
@ -50,11 +55,20 @@ int main(int argc, char **argv)
            fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
            error += 1;
        }
+        if (ambiguous && w >= 2) {
+            fprintf(stderr, "char %x is both doublewidth and ambiguous\n", c);
+            error += 1;
+        }
    }
    check(!error, "utf8proc_charwidth FAILED %d tests.", error);

    check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
+    check(utf8proc_charwidth_ambiguous(0x00ad) , "incorrect ambiguous width for U+00AD (soft hyphen)");
    check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
+    check(utf8proc_charwidth_ambiguous(0xe000), "incorrect ambiguous width for U+e000 (PUA)");
+
+    check(utf8proc_charwidth_ambiguous(0x00A1), "incorrect ambiguous width for U+00A1 (inverted exclamation mark)");
+    check(!utf8proc_charwidth_ambiguous(0x00A2), "incorrect ambiguous width for U+00A2 (cent sign)");

    /* print some other information by comparing with system wcwidth */
    printf("Mismatches with system wcwidth (not necessarily errors):\n");
--- a/utf8proc.c
+++ b/utf8proc.c
@ -432,6 +432,10 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
  return utf8proc_get_property(c)->charwidth;
 }

+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
+  return utf8proc_get_property(c)->ambiguous_width;
+}
+
 UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  return (utf8proc_category_t) utf8proc_get_property(c)->category;
 }
--- a/utf8proc.h
+++ b/utf8proc.h
@ -268,7 +268,9 @@ typedef struct utf8proc_property_struct {
  unsigned control_boundary:1;
  /** The width of the codepoint. */
  unsigned charwidth:2;
-  unsigned pad:2;
+  /** East Asian width class A */
+  unsigned ambiguous_width:1;
+  unsigned pad:1;
  /**
   * Boundclass.
   * @see utf8proc_boundclass_t.
@ -667,6 +669,14 @@ UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);
 * (analogous to `isprint` or `iscntrl`), use utf8proc_category(). */
 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);

+/**
+ * Given a codepoint, return whether it has East Asian width class A (Ambiguous)
+ *
+ * Codepoints with this property are considered to have charwidth 1 (if they are printable)
+ * but some East Asian fonts render them as double width.
+ */
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t codepoint);
+
 /**
 * Return the Unicode category for the codepoint (one of the
 * @ref utf8proc_category_t constants.)
--- a/utf8proc_data.c
+++ b/utf8proc_data.c