add benchmark (issue #12)

10 years ago · 20cff0757b
6 changed files with 215 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,7 @@
 *.dll
 *.dylib
 *.dSYM
+# benchmark products: downloaded data, timing output and the two binaries.
+# The binaries are anchored to bench/ because a bare "bench" pattern would
+# also match (and ignore) the bench/ source directory itself.
+*.txt
+*.out
+bench/bench
+bench/icu
--- a/bench/Makefile
+++ b/bench/Makefile
@ -0,0 +1,33 @@
+# tools and flags; override on the command line, e.g. make CC=clang
+CURL=curl
+
+CC = cc
+CFLAGS = -O2 -std=c99 -pedantic -Wall
+
+# default target: build the libmojibake benchmark only
+all: bench
+
+# static library under test; no rule here builds it, so it must already exist
+LIBMOJIBAKE = ../libmojibake.a
+
+# The library is listed as a prerequisite so that bench is relinked whenever
+# the library is rebuilt; otherwise a stale binary would keep timing old code.
+bench: bench.o util.o $(LIBMOJIBAKE)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ bench.o util.o $(LIBMOJIBAKE)
+
+# benchmark texts, downloaded on demand into this directory
+DATAURL = https://raw.githubusercontent.com/duerst/eprun/master/benchmark
+DATAFILES = Deutsch_.txt Japanese_.txt Korean_.txt Vietnamese_.txt
+
+# -f makes curl exit with an error on an HTTP failure instead of saving the
+# server's error page under the data file's name and reporting success
+$(DATAFILES):
+	$(CURL) -f -O $(DATAURL)/$@
+
+# time NFKC normalization of every data file with libmojibake
+bench.out: $(DATAFILES) bench
+	./bench -nfkc $(DATAFILES) > $@
+
+# you may need make CPPFLAGS=... LDFLAGS=... to help it find ICU
+icu: icu.o util.o # not part of "all"; build it explicitly with "make icu"
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ icu.o util.o -licuuc
+
+icu.out: $(DATAFILES) icu # ICU timings for every data file
+	./icu $(DATAFILES) > $@
+
+# all three sources include util.h, so rebuild their objects when it changes
+bench.o icu.o util.o: util.h
+
+# compile one source file; -I.. adds the parent directory to the include
+# path (presumably where mojibake.h lives -- confirm against the tree)
+.c.o:
+	$(CC) $(CPPFLAGS) -I.. $(CFLAGS) -c -o $@ $<
+
+# all and clean name actions, not files; without .PHONY a stray file called
+# "all" or "clean" would stop make from running them
+.PHONY: all clean
+
+# remove objects, both benchmark binaries, their output and the downloaded data
+clean:
+	rm -rf *.o *.txt bench *.out icu
--- a/bench/bench.c
+++ b/bench/bench.c
@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mojibake.h"
+#include "util.h"
+
+/* Benchmark utf8proc_map on the files named on the command line.
+
+   Usage: bench [-nfkc|-nfkd|-nfc|-nfd|-casefold]... FILE...
+
+   Arguments are processed left to right: each option ORs its flags into
+   the options used for every FILE that follows it.  For each FILE this
+   prints "FILE: SECONDS", the mean wall-clock time of one utf8proc_map
+   call over NRUNS calls.  Returns EXIT_FAILURE on an unrecognized option,
+   an unreadable file or a failed mapping, and EXIT_SUCCESS otherwise. */
+int main(int argc, char **argv)
+{
+	 enum { NRUNS = 100 }; /* calls averaged for each file */
+	 int options = 0;
+
+	 for (int i = 1; i < argc; ++i) {
+		  if (!strcmp(argv[i], "-nfkc")) {
+			   options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT;
+			   continue;
+		  }
+		  if (!strcmp(argv[i], "-nfkd")) {
+			   options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT;
+			   continue;
+		  }
+		  if (!strcmp(argv[i], "-nfc")) {
+			   options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE;
+			   continue;
+		  }
+		  if (!strcmp(argv[i], "-nfd")) {
+			   options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE;
+			   continue;
+		  }
+		  if (!strcmp(argv[i], "-casefold")) {
+			   options |= UTF8PROC_CASEFOLD;
+			   continue;
+		  }
+		  if (argv[i][0] == '-') {
+			   fprintf(stderr, "unrecognized option: %s\n", argv[i]);
+			   return EXIT_FAILURE;
+		  }
+
+		  /* anything else is a file to benchmark */
+		  size_t len;
+		  uint8_t *src = readfile(argv[i], &len);
+		  if (!src) {
+			   fprintf(stderr, "error reading %s\n", argv[i]);
+			   return EXIT_FAILURE;
+		  }
+
+		  mytime start = gettime();
+		  for (int run = 0; run < NRUNS; ++run) {
+			   uint8_t *dest = NULL;
+			   /* a negative result is an error code; averaging the time of
+				  failed calls would report a meaningless figure, so stop */
+			   if (utf8proc_map(src, len, &dest, options) < 0) {
+					fprintf(stderr, "error processing %s\n", argv[i]);
+					free(src);
+					return EXIT_FAILURE;
+			   }
+			   free(dest);
+		  }
+		  printf("%s: %g\n", argv[i], elapsed(gettime(), start) / NRUNS);
+		  free(src);
+	 }
+
+	 return EXIT_SUCCESS;
+}
--- a/bench/icu.c
+++ b/bench/icu.c
@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* ICU4C */
+#include <unicode/utypes.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include <unicode/unorm2.h>
+
+#include "util.h"
+
+/* report a failed ICU call on stderr and return EXIT_FAILURE, so that
+   callers can simply write "return icu_failure(...)" */
+static int icu_failure(const char *what, UErrorCode err)
+{
+	 fprintf(stderr, "%s failed: %s\n", what, u_errorName(err));
+	 return EXIT_FAILURE;
+}
+
+/* Benchmark ICU's NFKC normalization on the files named on the command line.
+
+   Each FILE is read as UTF-8 and converted to UTF-16 outside the timed
+   region, then normalized NRUNS times; the mean wall-clock time of one
+   unorm2_normalize call is printed as "FILE: SECONDS".  No options are
+   accepted.  Returns EXIT_FAILURE on any error (buffers are not released on
+   those paths because the process exits at once), EXIT_SUCCESS otherwise. */
+int main(int argc, char **argv)
+{
+	 enum { NRUNS = 100 }; /* calls averaged for each file */
+
+	 /* ICU functions do nothing when the error code already indicates a
+		failure on entry, so it has to start out as U_ZERO_ERROR */
+	 UErrorCode err = U_ZERO_ERROR;
+	 UConverter *uc = ucnv_open("UTF8", &err);
+	 if (U_FAILURE(err)) return icu_failure("ucnv_open", err);
+
+	 const UNormalizer2 *NFKC = unorm2_getNFKCInstance(&err);
+	 if (U_FAILURE(err)) return icu_failure("unorm2_getNFKCInstance", err);
+
+	 for (int i = 1; i < argc; ++i) {
+		  if (argv[i][0] == '-') {
+			   fprintf(stderr, "unrecognized option: %s\n", argv[i]);
+			   return EXIT_FAILURE;
+		  }
+
+		  size_t len;
+		  uint8_t *src = readfile(argv[i], &len);
+		  if (!src) {
+			   fprintf(stderr, "error reading %s\n", argv[i]);
+			   return EXIT_FAILURE;
+		  }
+
+		  /* ICU measures lengths in int32_t and the largest buffer below
+			 holds 10*len+1 two-byte units; refuse inputs for which either
+			 that count or its size in bytes could overflow */
+		  if (len > (size_t) (INT32_MAX / 20)) {
+			   fprintf(stderr, "%s is too large\n", argv[i]);
+			   return EXIT_FAILURE;
+		  }
+
+		  /* convert UTF8 data to ICU's UTF16; the +1 keeps the request
+			 non-zero for an empty file and leaves room for a terminator */
+		  int32_t ucap = 2 * (int32_t) len + 1;
+		  UChar *usrc = (UChar*) malloc((size_t) ucap * sizeof(UChar));
+		  if (!usrc) {
+			   fprintf(stderr, "out of memory\n");
+			   return EXIT_FAILURE;
+		  }
+		  /* use the length ICU returns: u_strlen would stop at an embedded
+			 U+0000 and would overrun the buffer if it were unterminated */
+		  int32_t ulen = ucnv_toUChars(uc, usrc, ucap,
+									   (const char*) src, (int32_t) len, &err);
+		  if (U_FAILURE(err)) return icu_failure("ucnv_toUChars", err);
+
+		  /* ICU's normalization API requires you to know the size of the
+			 destination buffer in advance, or alternatively to repeatedly
+			 try normalizing and double the buffer size until it succeeds.
+			 Here, I just allocate a huge destination buffer to avoid the
+			 issue. */
+		  int32_t dcap = 10 * ulen + 1;
+		  UChar *udest = (UChar*) malloc((size_t) dcap * sizeof(UChar));
+		  if (!udest) {
+			   fprintf(stderr, "out of memory\n");
+			   return EXIT_FAILURE;
+		  }
+
+		  mytime start = gettime();
+		  for (int run = 0; run < NRUNS; ++run) {
+			   unorm2_normalize(NFKC, usrc, ulen, udest, dcap, &err);
+			   if (U_FAILURE(err)) return icu_failure("unorm2_normalize", err);
+		  }
+		  printf("%s: %g\n", argv[i], elapsed(gettime(), start) / NRUNS);
+		  free(udest);
+		  free(usrc);
+		  free(src);
+	 }
+
+	 ucnv_close(uc);
+	 return EXIT_SUCCESS;
+}
--- a/bench/util.c
+++ b/bench/util.c
@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+
+#include "util.h"
+
+/* Read the whole file named FILENAME into a newly malloc'ed buffer.
+
+   On success, returns the buffer (the caller releases it with free) and
+   stores the number of bytes read in *len; an empty file yields a valid
+   buffer with *len == 0.  On any error, returns NULL and leaves *len at 0. */
+uint8_t *readfile(const char *filename, size_t *len)
+{
+	 *len = 0;
+
+	 struct stat st;
+	 if (0 != stat(filename, &st)) return NULL;
+	 size_t size = (size_t) st.st_size;
+
+	 /* "rb": the contents are raw bytes, so no text-mode translation */
+	 FILE *f = fopen(filename, "rb");
+	 if (!f) return NULL;
+
+	 /* malloc(0) is allowed to return NULL, which would make an empty file
+		look like an error, so always request at least one byte */
+	 uint8_t *s = (uint8_t *) malloc(size ? size : 1);
+	 if (s && fread(s, 1, size, f) != size) {
+		  free(s);
+		  s = NULL;
+	 }
+	 fclose(f); /* reached on every path, including allocation failure */
+
+	 if (s) *len = size;
+	 return s;
+}
+
+/* current time, as reported by gettimeofday */
+mytime gettime(void)
+{
+	 mytime now;
+
+	 (void) gettimeofday(&now, NULL);
+	 return now;
+}
+
+/* seconds from t0 to t1, as a double (negative when t1 precedes t0) */
+double elapsed(mytime t1, mytime t0)
+{
+	 double whole = (double)(t1.tv_sec - t0.tv_sec);
+	 double micro = (double)(t1.tv_usec - t0.tv_usec);
+
+	 return whole + micro * 1.0E-6;
+}
+
--- a/bench/util.h
+++ b/bench/util.h
@ -0,0 +1,22 @@
+#ifndef UTIL_H
+#define UTIL_H 1
+
+#include <inttypes.h>
+#include <sys/time.h>
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+uint8_t *readfile(const char *filename, size_t *len); /* read the file FILENAME into a
+   malloc'ed array of *len bytes; returns NULL on error (defined in util.c) */
+
+typedef struct timeval mytime; /* timestamp type passed between gettime and elapsed */
+mytime gettime(void); /* current time, obtained from gettimeofday */
+double elapsed(mytime t1, mytime t0); /* time difference t1 - t0, in seconds */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* UTIL_H */