update graphemes for Unicode 7, add utf8proc_grapheme_break function

10 years ago · 397a1eabea
8 changed files with 10695 additions and 10397 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,5 @@ bench/icu
 bench/unistring
 normtest
 graphemetest
+utf8proc_data.c.new
+printproperty
--- a/Makefile
+++ b/Makefile
@@ -29,16 +29,19 @@ clean:
 	$(MAKE) -C bench clean

 update: utf8proc_data.c.new
+	cp -f utf8proc_data.c.new utf8proc_data.c

 # real targets

-utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
+utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
 	$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new

 UnicodeData.txt:
-
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

+GraphemeBreakProperty.txt:
+	$(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
+
 DerivedCoreProperties.txt:
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

@@ -72,10 +75,13 @@ GraphemeBreakTest.txt:
 	$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@

 normtest: normtest.c utf8proc.o mojibake.h tests.h
-	$(cc) normtest.c utf8proc.o -o normtest
+	$(cc) normtest.c utf8proc.o -o $@

 graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
-	$(cc) graphemetest.c utf8proc.o -o graphemetest
+	$(cc) graphemetest.c utf8proc.o -o $@
+
+printproperty: printproperty.c utf8proc.o mojibake.h tests.h
+	$(cc) printproperty.c utf8proc.o -o $@

 check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
 	./normtest
--- a/data_generator.rb
+++ b/data_generator.rb
@@ -75,13 +75,13 @@ $ignorable_list.each_line do |entry|
  end
 end

-$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
-$grapheme_extend = []
-$grapheme_extend_list.each_line do |entry|
-  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
-    $1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
-  elsif entry =~ /^[0-9A-F]+/
-    $grapheme_extend << $&.hex
+$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
+$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
+$grapheme_boundclass_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+    $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
+  elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
  end
 end

@@ -161,18 +161,18 @@ class UnicodeChar
    "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
    "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
    "#{ary2c decomp_mapping}, " <<
-    "#{bidi_mirrored}, " <<
+    "#{ary2c case_folding}, " <<
    "#{uppercase_mapping or -1}, " <<
    "#{lowercase_mapping or -1}, " <<
    "#{titlecase_mapping or -1}, " <<
    "#{comb1_indicies[code] ?
       (comb1_indicies[code]*comb2_indicies.keys.length) : -1
      }, #{comb2_indicies[code] or -1}, " <<
+    "#{bidi_mirrored}, " <<
    "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
    "#{$ignorable.include?(code)}, " <<
    "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
-    "#{$grapheme_extend.include?(code)}, " <<
-    "#{ary2c case_folding}},\n"
+    "#{$grapheme_boundclass[code]}},\n"
  end
 end

@@ -295,7 +295,7 @@ end
 $stdout << "};\n\n"

 $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
+$stdout << "  {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
 properties.each { |line|
  $stdout << line
 }
--- a/graphemetest.c
+++ b/graphemetest.c
@@ -7,7 +7,7 @@ int main(void)
    FILE *f = fopen("GraphemeBreakTest.txt", "r");
    uint8_t src[1024];
    
-    check(f != NULL, "error opening NormalizationTest.txt");
+    check(f != NULL, "error opening GraphemeBreakTest.txt");
    while (getline(&buf, &bufsize, f) > 0) {
        size_t bi = 0, si = 0;
        lineno += 1;
@@ -20,7 +20,7 @@ int main(void)
        while (buf[bi]) {
            bi = skipspaces(buf, bi);
            if (buf[bi] == '/') { /* grapheme break */
-                src[si++] = 0xff;
+                src[si++] = '/';
                bi++;
            }
            else if (buf[bi] == '+') { /* no break */
@@ -34,8 +34,8 @@ int main(void)
                while (src[si]) ++si; /* advance to NUL termination */
            }
        }
-        if (si && src[si-1] == 0xff)
-            --si; /* no 0xff after final grapheme */
+        if (si && src[si-1] == '/')
+            --si; /* no break after final grapheme */
        src[si] = 0; /* NUL-terminate */
        
        if (si) {
@@ -44,16 +44,27 @@ int main(void)
            ssize_t glen;
            uint8_t *g; /* utf8proc_map grapheme results */
            while (i < si) {
-                if (src[i] != 0xff)
+                if (src[i] != '/')
                    utf8[j++] = src[i++];
                else
                    i++;
            }
            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+                 /* the test file contains surrogate codepoints, which are only for UTF-16 */
+                 printf("line %zd: ignoring invalid UTF-8 codepoints\n", lineno);
+            }
+            else {
                 check(glen >= 0, "utf8proc_map error = %s",
                       utf8proc_errmsg(glen));
+                 for (i = 0; i <= glen; ++i)
+                      if (g[i] == 0xff)
+                           g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
+                 printf("line %zd\n", lineno);
                 check(!strcmp((char*)g, (char*)src),
-                  "grapheme mismatch: %s vs. %s", (char*)g, (char*)src);
+                       "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+            }
+            free(g);
        }
    }
    fclose(f);
--- a/mojibake.h
+++ b/mojibake.h
@@ -170,17 +170,17 @@ typedef struct utf8proc_property_struct {
  utf8proc_propval_t bidi_class;
  utf8proc_propval_t decomp_type;
  const int32_t *decomp_mapping;
-  unsigned bidi_mirrored:1;
+  const int32_t *casefold_mapping;
  int32_t uppercase_mapping;
  int32_t lowercase_mapping;
  int32_t titlecase_mapping;
  int32_t comb1st_index;
  int32_t comb2nd_index;
+  unsigned bidi_mirrored:1;
  unsigned comp_exclusion:1;
  unsigned ignorable:1;
  unsigned control_boundary:1;
-  unsigned extend:1;
-  const int32_t *casefold_mapping;
+  unsigned boundclass:4;
 } utf8proc_property_t;

 #define UTF8PROC_CATEGORY_LU  1
@@ -253,6 +253,21 @@ typedef struct utf8proc_property_struct {
 #define UTF8PROC_DECOMP_TYPE_FRACTION 15
 #define UTF8PROC_DECOMP_TYPE_COMPAT   16

+/* values for boundclass property: */
+#define UTF8PROC_BOUNDCLASS_START    0
+#define UTF8PROC_BOUNDCLASS_OTHER    1
+#define UTF8PROC_BOUNDCLASS_CR       2
+#define UTF8PROC_BOUNDCLASS_LF       3
+#define UTF8PROC_BOUNDCLASS_CONTROL  4
+#define UTF8PROC_BOUNDCLASS_EXTEND   5
+#define UTF8PROC_BOUNDCLASS_L        6
+#define UTF8PROC_BOUNDCLASS_V        7
+#define UTF8PROC_BOUNDCLASS_T        8
+#define UTF8PROC_BOUNDCLASS_LV       9
+#define UTF8PROC_BOUNDCLASS_LVT     10
+#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
+#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12
+
 DLLEXPORT extern const int8_t utf8proc_utf8class[256];

 DLLEXPORT const char *utf8proc_version(void);
@@ -367,6 +382,12 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options
 *           crash!
 */

+DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
+/*
+ * Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
+ * permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ */
+
 DLLEXPORT ssize_t utf8proc_map(
  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 );
--- a/printproperty.c
+++ b/printproperty.c
@@ -0,0 +1,53 @@
+/* simple test program to print out the utf8proc properties for a codepoint
+ *
+ * usage: printproperty HEX [HEX ...]
+ *
+ * Each argument is parsed as a hexadecimal codepoint and the fields of the
+ * utf8proc_property_t record returned by utf8proc_get_property are printed.
+ */
+
+#include "tests.h"
+
+int main(int argc, char **argv)
+{
+     int i;
+
+     for (i = 1; i < argc; ++i) {
+          /* "%x" stores through an unsigned int *, so c must be unsigned */
+          unsigned int c;
+          check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+          const utf8proc_property_t *p = utf8proc_get_property((int32_t) c);
+          /* the *_mapping fields are int32_t; cast so they match "%x" */
+          printf("U+%s:\n"
+                 "  category = %d\n"
+                 "  combining_class = %d\n"
+                 "  bidi_class = %d\n"
+                 "  decomp_type = %d\n"
+                 "  uppercase_mapping = %x\n"
+                 "  lowercase_mapping = %x\n"
+                 "  titlecase_mapping = %x\n"
+                 "  comb1st_index = %d\n"
+                 "  comb2nd_index = %d\n"
+                 "  bidi_mirrored = %d\n"
+                 "  comp_exclusion = %d\n"
+                 "  ignorable = %d\n"
+                 "  control_boundary = %d\n"
+                 "  boundclass = %d\n",
+                 argv[i],
+                 p->category,
+                 p->combining_class,
+                 p->bidi_class,
+                 p->decomp_type,
+                 (unsigned int) p->uppercase_mapping,
+                 (unsigned int) p->lowercase_mapping,
+                 (unsigned int) p->titlecase_mapping,
+                 p->comb1st_index,
+                 p->comb2nd_index,
+                 p->bidi_mirrored,
+                 p->comp_exclusion,
+                 p->ignorable,
+                 p->control_boundary,
+                 p->boundclass);
+     }
+     return 0;
+}
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -81,19 +81,6 @@ DLLEXPORT const int8_t utf8proc_utf8class[256] = {
 #define UTF8PROC_HANGUL_S_START  0xAC00
 #define UTF8PROC_HANGUL_S_END    0xD7A4

-
-#define UTF8PROC_BOUNDCLASS_START    0
-#define UTF8PROC_BOUNDCLASS_OTHER    1
-#define UTF8PROC_BOUNDCLASS_CR       2
-#define UTF8PROC_BOUNDCLASS_LF       3
-#define UTF8PROC_BOUNDCLASS_CONTROL  4
-#define UTF8PROC_BOUNDCLASS_EXTEND   5
-#define UTF8PROC_BOUNDCLASS_L        6
-#define UTF8PROC_BOUNDCLASS_V        7
-#define UTF8PROC_BOUNDCLASS_T        8
-#define UTF8PROC_BOUNDCLASS_LV       9
-#define UTF8PROC_BOUNDCLASS_LVT     10
-
 /* in libmojibake, we append "m" to whatever version of utf8proc
   we have merged with most recently + whatever increment would
   correspond to semantic versioning rules.   Currently, we use 1.2m
@@ -206,6 +193,38 @@ DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
  );
 }

+/* return whether there is a grapheme break between boundclasses lbc (of the preceding codepoint) and tbc (of the following codepoint) */
+static bool grapheme_break(int lbc, int tbc) {
+     return /* conditions are tested top to bottom; the first one that matches decides */
+          (lbc == UTF8PROC_BOUNDCLASS_START) ? true : /* START on the left: always break */
+          (lbc == UTF8PROC_BOUNDCLASS_CR &&
+           tbc == UTF8PROC_BOUNDCLASS_LF) ? false : /* CR followed by LF: no break */
+          (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* break after CR, LF, CONTROL */
+          (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* break before CR, LF, CONTROL */
+          (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : /* no break before EXTEND */
+          (lbc == UTF8PROC_BOUNDCLASS_L &&
+           (tbc == UTF8PROC_BOUNDCLASS_L ||
+            tbc == UTF8PROC_BOUNDCLASS_V ||
+            tbc == UTF8PROC_BOUNDCLASS_LV ||
+            tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : /* L followed by L, V, LV or LVT: no break */
+          ((lbc == UTF8PROC_BOUNDCLASS_LV ||
+            lbc == UTF8PROC_BOUNDCLASS_V) &&
+           (tbc == UTF8PROC_BOUNDCLASS_V ||
+            tbc == UTF8PROC_BOUNDCLASS_T)) ? false : /* LV or V followed by V or T: no break */
+          ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
+            lbc == UTF8PROC_BOUNDCLASS_T) &&
+           tbc == UTF8PROC_BOUNDCLASS_T) ? false : /* LVT or T followed by T: no break */
+          (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
+           tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : /* two REGIONAL_INDICATORs: no break */
+          (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK); /* otherwise break, except before SPACINGMARK */
+}
+
+/* return whether there is a grapheme break between codepoints c1 and c2 (c1 first, then c2); the decision uses only the boundclass property of each codepoint */
+DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) {
+     return grapheme_break(utf8proc_get_property(c1)->boundclass,
+                           utf8proc_get_property(c2)->boundclass);
+}
+
 #define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
  options & ~UTF8PROC_LUMP, last_boundclass)
@@ -302,48 +321,8 @@ DLLEXPORT ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufs
  }
  if (options & UTF8PROC_CHARBOUND) {
    bool boundary;
-    int tbc, lbc;
-    tbc =
-      (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
-      (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
-      ((category == UTF8PROC_CATEGORY_ZL ||
-        category == UTF8PROC_CATEGORY_ZP ||
-        category == UTF8PROC_CATEGORY_CC ||
-        category == UTF8PROC_CATEGORY_CF) &&
-        !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
-      property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
-      ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
-        uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
-      (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
-        UTF8PROC_BOUNDCLASS_V :
-      (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
-        UTF8PROC_BOUNDCLASS_T :
-      (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
-        ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
-          UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
-      ) :
-      UTF8PROC_BOUNDCLASS_OTHER;
-    lbc = *last_boundclass;
-    boundary =
-      (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-      (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-      (lbc == UTF8PROC_BOUNDCLASS_CR &&
-       tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-      (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-      (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-      (lbc == UTF8PROC_BOUNDCLASS_L &&
-       (tbc == UTF8PROC_BOUNDCLASS_L ||
-        tbc == UTF8PROC_BOUNDCLASS_V ||
-        tbc == UTF8PROC_BOUNDCLASS_LV ||
-        tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-      ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-        lbc == UTF8PROC_BOUNDCLASS_V) &&
-       (tbc == UTF8PROC_BOUNDCLASS_V ||
-        tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-      ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-        lbc == UTF8PROC_BOUNDCLASS_T) &&
-       tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-       true;
+    int tbc = property->boundclass;
+    boundary = grapheme_break(*last_boundclass, tbc);
    *last_boundclass = tbc;
    if (boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
--- a/utf8proc_data.c
+++ b/utf8proc_data.c