Steven G. Johnson
10 years ago
6 changed files with 215 additions and 0 deletions
@ -0,0 +1,33 @@ |
|||
# Benchmark harness: builds `bench` (libmojibake) and `icu` (ICU4C),
# downloads the sample texts, and writes normalization timings to *.out.

CURL = curl

CC = cc
CFLAGS = -O2 -std=c99 -pedantic -Wall

# These targets do not name files produced by their recipes.
.PHONY: all clean

all: bench

LIBMOJIBAKE = ../libmojibake.a

bench: bench.o util.o
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ bench.o util.o $(LIBMOJIBAKE)

DATAURL = https://raw.githubusercontent.com/duerst/eprun/master/benchmark
DATAFILES = Deutsch_.txt Japanese_.txt Korean_.txt Vietnamese_.txt

# -f makes curl fail on an HTTP error instead of saving the server's
# error page under the data file's name.
$(DATAFILES):
	$(CURL) -f -O $(DATAURL)/$@

bench.out: $(DATAFILES) bench
	./bench -nfkc $(DATAFILES) > $@

# you may need make CPPFLAGS=... LDFLAGS=... to help it find ICU
icu: icu.o util.o
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ icu.o util.o -licuuc

icu.out: $(DATAFILES) icu
	./icu $(DATAFILES) > $@

.c.o:
	$(CC) $(CPPFLAGS) -I.. $(CFLAGS) -c -o $@ $<

# Every source file includes util.h, so rebuild the objects when it changes.
bench.o icu.o util.o: util.h

clean:
	rm -rf *.o *.txt bench *.out icu
@ -0,0 +1,56 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <string.h> |
|||
|
|||
#include "mojibake.h" |
|||
#include "util.h" |
|||
|
|||
int main(int argc, char **argv) |
|||
{ |
|||
int i; |
|||
int options = 0; |
|||
|
|||
for (i = 1; i < argc; ++i) { |
|||
if (!strcmp(argv[i], "-nfkc")) { |
|||
options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT; |
|||
continue; |
|||
} |
|||
if (!strcmp(argv[i], "-nfkd")) { |
|||
options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE|UTF8PROC_COMPAT; |
|||
continue; |
|||
} |
|||
if (!strcmp(argv[i], "-nfc")) { |
|||
options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE; |
|||
continue; |
|||
} |
|||
if (!strcmp(argv[i], "-nfd")) { |
|||
options |= UTF8PROC_STABLE|UTF8PROC_DECOMPOSE; |
|||
continue; |
|||
} |
|||
if (!strcmp(argv[i], "-casefold")) { |
|||
options |= UTF8PROC_CASEFOLD; |
|||
continue; |
|||
} |
|||
if (argv[i][0] == '-') { |
|||
fprintf(stderr, "unrecognized option: %s\n", argv[i]); |
|||
return EXIT_FAILURE; |
|||
} |
|||
|
|||
size_t len; |
|||
uint8_t *src = readfile(argv[i], &len); |
|||
if (!src) { |
|||
fprintf(stderr, "error reading %s\n", argv[i]); |
|||
return EXIT_FAILURE; |
|||
} |
|||
uint8_t *dest; |
|||
mytime start = gettime(); |
|||
for (int i = 0; i < 100; ++i) { |
|||
utf8proc_map(src, len, &dest, options); |
|||
free(dest); |
|||
} |
|||
printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); |
|||
free(src); |
|||
} |
|||
|
|||
return EXIT_SUCCESS; |
|||
} |
@ -0,0 +1,61 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
|
|||
/* ICU4C */ |
|||
#include <unicode/utypes.h> |
|||
#include <unicode/ustring.h> |
|||
#include <unicode/ucnv.h> |
|||
#include <unicode/unorm2.h> |
|||
|
|||
#include "util.h" |
|||
|
|||
int main(int argc, char **argv) |
|||
{ |
|||
int i; |
|||
|
|||
UErrorCode err; |
|||
UConverter *uc = ucnv_open("UTF8", &err); |
|||
if (U_FAILURE(err)) return EXIT_FAILURE; |
|||
|
|||
const UNormalizer2 *NFKC = unorm2_getNFKCInstance(&err); |
|||
if (U_FAILURE(err)) return EXIT_FAILURE; |
|||
|
|||
for (i = 1; i < argc; ++i) { |
|||
if (argv[i][0] == '-') { |
|||
fprintf(stderr, "unrecognized option: %s\n", argv[i]); |
|||
return EXIT_FAILURE; |
|||
} |
|||
|
|||
size_t len; |
|||
uint8_t *src = readfile(argv[i], &len); |
|||
if (!src) { |
|||
fprintf(stderr, "error reading %s\n", argv[i]); |
|||
return EXIT_FAILURE; |
|||
} |
|||
|
|||
/* convert UTF8 data to ICU's UTF16 */ |
|||
UChar *usrc = (UChar*) malloc(2*len * sizeof(UChar)); |
|||
ucnv_toUChars(uc, usrc, 2*len, (char*) src, len, &err); |
|||
if (U_FAILURE(err)) return EXIT_FAILURE; |
|||
size_t ulen = u_strlen(usrc); |
|||
|
|||
/* ICU's insane normalization API requires you to
|
|||
know the size of the destination buffer in advance, |
|||
or alternatively to repeatly try normalizing and |
|||
double the buffer size until it succeeds. Here, I just |
|||
allocate a huge destination buffer to avoid the issue. */ |
|||
UChar *udest = (UChar*) malloc(10*ulen * sizeof(UChar)); |
|||
|
|||
mytime start = gettime(); |
|||
for (int i = 0; i < 100; ++i) { |
|||
unorm2_normalize(NFKC, usrc, ulen, udest, 10*ulen, &err); |
|||
if (U_FAILURE(err)) return EXIT_FAILURE; |
|||
} |
|||
printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); |
|||
free(udest); |
|||
free(usrc); |
|||
free(src); |
|||
} |
|||
|
|||
return EXIT_SUCCESS; |
|||
} |
@ -0,0 +1,39 @@ |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <sys/stat.h> |
|||
|
|||
#include "util.h" |
|||
|
|||
/* Read the whole file named FILENAME into a newly allocated array and
   store its size in bytes in *len.

   Returns the array, which the caller must free(), or NULL on error
   (in which case *len is 0).  An empty file is not an error: it yields
   a non-NULL pointer with *len == 0. */
uint8_t *readfile(const char *filename, size_t *len)
{
    struct stat st;
    size_t size;
    FILE *f;
    uint8_t *s;

    *len = 0;
    if (0 != stat(filename, &st) || st.st_size < 0) return NULL;
    size = (size_t) st.st_size;

    /* binary mode: the data are raw bytes and must not be translated */
    f = fopen(filename, "rb");
    if (!f) return NULL;

    /* never request zero bytes: malloc(0) may legitimately return NULL,
       which would make an empty file look like an error */
    s = malloc(size ? size : 1);
    if (s && fread(s, 1, size, f) != size) {
        free(s);
        s = NULL;
    }
    fclose(f); /* closed on every path, including allocation failure */

    if (s) *len = size;
    return s;
}
|||
|
|||
/* Return the current time of day as reported by gettimeofday(). */
mytime gettime(void)
{
    mytime now;

    gettimeofday(&now, NULL);
    return now;
}
|||
|
|||
/* time difference in seconds */ |
|||
double elapsed(mytime t1, mytime t0) |
|||
{ |
|||
return (double)(t1.tv_sec - t0.tv_sec) + |
|||
(double)(t1.tv_usec - t0.tv_usec) * 1.0E-6; |
|||
} |
|||
|
@ -0,0 +1,22 @@ |
|||
#ifndef UTIL_H
#define UTIL_H 1

/* Small helpers shared by the benchmark programs: whole-file reading
   and timing based on gettimeofday(). */

#include <stddef.h>   /* size_t */
#include <inttypes.h> /* uint8_t */
#include <sys/time.h> /* struct timeval, gettimeofday */
#include <time.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Read the file named FILENAME into a newly allocated array of *len bytes.
   Returns NULL on error; otherwise the caller frees the result. */
uint8_t *readfile(const char *filename, size_t *len);

/* A point in time, as filled in by gettimeofday(). */
typedef struct timeval mytime;

/* Return the current time. */
mytime gettime(void);

/* Return the time difference t1 - t0 in seconds. */
double elapsed(mytime t1, mytime t0);

#ifdef __cplusplus
}
#endif

#endif /* UTIL_H */
Loading…
Reference in new issue