|
|
@ -36,16 +36,16 @@ |
|
|
|
* |
|
|
|
* The features of utf8proc include: |
|
|
|
* |
|
|
|
* - Transformation of strings (@ref utf8proc_map) to: |
|
|
|
* - Transformation of strings (utf8proc_map()) to: |
|
|
|
* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
|
|
|
|
* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) |
|
|
|
* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) |
|
|
|
* - case-folding (@ref UTF8PROC_CASEFOLD) |
|
|
|
* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC |
|
|
|
* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND) |
|
|
|
* - Character-width computation: @ref utf8proc_charwidth |
|
|
|
* - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string |
|
|
|
* - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8. |
|
|
|
* - Unicode normalization: utf8proc_NFD(), utf8proc_NFC(), utf8proc_NFKD(), utf8proc_NFKC() |
|
|
|
* - Detecting grapheme boundaries (utf8proc_grapheme_break() and @ref UTF8PROC_CHARBOUND) |
|
|
|
* - Character-width computation: utf8proc_charwidth() |
|
|
|
* - Classification of characters by Unicode category: utf8proc_category() and utf8proc_category_string() |
|
|
|
* - Encode (utf8proc_encode_char()) and decode (utf8proc_iterate()) Unicode codepoints to/from UTF-8. |
|
|
|
*/ |
|
|
|
|
|
|
|
/** @file */ |
|
|
@ -59,7 +59,7 @@ |
|
|
|
* semantic-versioning rules (http://semver.org) based on API
|
|
|
|
* compatibility. |
|
|
|
* |
|
|
|
* This is also returned at runtime by @ref utf8proc_version; however, the |
|
|
|
* This is also returned at runtime by utf8proc_version(); however, the |
|
|
|
* runtime version may append a string like "-dev" to the version number |
|
|
|
* for prerelease versions. |
|
|
|
* |
|
|
@ -261,7 +261,7 @@ typedef struct utf8proc_property_struct { |
|
|
|
/**
|
|
|
|
* Can this codepoint be ignored? |
|
|
|
* |
|
|
|
* Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is |
|
|
|
* Used by utf8proc_decompose_char() when @ref UTF8PROC_IGNORE is |
|
|
|
* passed as an option. |
|
|
|
*/ |
|
|
|
unsigned ignorable:1; |
|
|
@ -398,8 +398,8 @@ typedef enum { |
|
|
|
} utf8proc_indic_conjunct_break_t; |
|
|
|
|
|
|
|
/**
|
|
|
|
* Function pointer type passed to @ref utf8proc_map_custom and |
|
|
|
* @ref utf8proc_decompose_custom, which is used to specify a user-defined |
|
|
|
* Function pointer type passed to utf8proc_map_custom() and |
|
|
|
* utf8proc_decompose_custom(), which is used to specify a user-defined |
|
|
|
* mapping of codepoints to be applied in conjunction with other mappings. |
|
|
|
*/ |
|
|
|
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data); |
|
|
@ -424,7 +424,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void); |
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns an informative error string for the given utf8proc error code |
|
|
|
* (e.g. the error codes returned by @ref utf8proc_map). |
|
|
|
* (e.g. the error codes returned by utf8proc_map()). |
|
|
|
*/ |
|
|
|
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode); |
|
|
|
|
|
|
@ -496,7 +496,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int |
|
|
|
* |
|
|
|
* @return |
|
|
|
* In case of success, the number of codepoints written is returned; in case |
|
|
|
* of an error, a negative error code is returned (@ref utf8proc_errmsg). |
|
|
|
* of an error, a negative error code is returned (utf8proc_errmsg()). |
|
|
|
* @par |
|
|
|
* If the number of written codepoints would be bigger than `bufsize`, the |
|
|
|
* required buffer size is returned, while the buffer will be overwritten with |
|
|
@ -508,7 +508,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( |
|
|
|
); |
|
|
|
|
|
|
|
/**
|
|
|
|
* The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8 |
|
|
|
* The same as utf8proc_decompose_char(), but acts on a whole UTF-8 |
|
|
|
* string and orders the decomposed sequences correctly. |
|
|
|
* |
|
|
|
* If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing |
|
|
@ -517,8 +517,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char( |
|
|
|
* codepoints) is written into the buffer being pointed to by |
|
|
|
* `buffer` (which must contain at least `bufsize` entries). In case of |
|
|
|
* success, the number of codepoints written is returned; in case of an |
|
|
|
* error, a negative error code is returned (@ref utf8proc_errmsg). |
|
|
|
* See @ref utf8proc_decompose_custom to supply additional transformations. |
|
|
|
* error, a negative error code is returned (utf8proc_errmsg()). |
|
|
|
* See utf8proc_decompose_custom() to supply additional transformations. |
|
|
|
* |
|
|
|
* If the number of written codepoints would be bigger than `bufsize`, the |
|
|
|
* required buffer size is returned, while the buffer will be overwritten with |
|
|
@ -530,10 +530,10 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( |
|
|
|
); |
|
|
|
|
|
|
|
/**
|
|
|
|
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function |
|
|
|
* The same as utf8proc_decompose(), but also takes a `custom_func` mapping function |
|
|
|
* that is called on each codepoint in `str` before any other transformations |
|
|
|
* (along with a `custom_data` pointer that is passed through to `custom_func`). |
|
|
|
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom. |
|
|
|
* The `custom_func` argument is ignored if it is `NULL`. See also utf8proc_map_custom(). |
|
|
|
*/ |
|
|
|
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( |
|
|
|
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, |
|
|
@ -559,7 +559,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( |
|
|
|
* |
|
|
|
* @return |
|
|
|
* In case of success, the length (in codepoints) of the normalized UTF-32 string is |
|
|
|
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). |
|
|
|
* returned; otherwise, a negative error code is returned (utf8proc_errmsg()). |
|
|
|
* |
|
|
|
* @warning The entries of the array pointed to by `str` have to be in the |
|
|
|
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! |
|
|
@ -587,7 +587,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b |
|
|
|
* @return |
|
|
|
* In case of success, the length (in bytes) of the resulting nul-terminated |
|
|
|
* UTF-8 string is returned; otherwise, a negative error code is returned |
|
|
|
* (@ref utf8proc_errmsg). |
|
|
|
* (utf8proc_errmsg()). |
|
|
|
* |
|
|
|
* @warning The amount of free space pointed to by `buffer` must |
|
|
|
* exceed the amount of the input data by one byte, and the |
|
|
@ -617,7 +617,7 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( |
|
|
|
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state); |
|
|
|
|
|
|
|
/**
|
|
|
|
* Same as @ref utf8proc_grapheme_break_stateful, except without support for the |
|
|
|
* Same as utf8proc_grapheme_break_stateful(), except without support for the |
|
|
|
* Unicode 9 additions to the algorithm. Supported for legacy reasons. |
|
|
|
*/ |
|
|
|
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( |
|
|
@ -664,7 +664,7 @@ UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c); |
|
|
|
* |
|
|
|
* @note |
|
|
|
* If you want to check for particular types of non-printable characters, |
|
|
|
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ |
|
|
|
* (analogous to `isprint` or `iscntrl`), use utf8proc_category(). */ |
|
|
|
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint); |
|
|
|
|
|
|
|
/**
|
|
|
@ -690,7 +690,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi |
|
|
|
* contain NULL characters with the string if `str` contained NULL |
|
|
|
* characters). Other flags in the `options` field are passed to the |
|
|
|
* functions defined above, and regarded as described. See also |
|
|
|
* @ref utf8proc_map_custom to supply a custom codepoint transformation. |
|
|
|
* utf8proc_map_custom() to supply a custom codepoint transformation. |
|
|
|
* |
|
|
|
* In case of success the length of the new string is returned, |
|
|
|
* otherwise a negative error code is returned. |
|
|
@ -703,7 +703,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( |
|
|
|
); |
|
|
|
|
|
|
|
/**
|
|
|
|
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function |
|
|
|
* Like utf8proc_map(), but also takes a `custom_func` mapping function |
|
|
|
* that is called on each codepoint in `str` before any other transformations |
|
|
|
* (along with a `custom_data` pointer that is passed through to `custom_func`). |
|
|
|
* The `custom_func` argument is ignored if it is `NULL`. |
|
|
@ -717,7 +717,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( |
|
|
|
* |
|
|
|
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or |
|
|
|
* NFKC_Casefold normalized version of the null-terminated string `str`. These |
|
|
|
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM |
|
|
|
* are shortcuts to calling utf8proc_map() with @ref UTF8PROC_NULLTERM |
|
|
|
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization. |
|
|
|
*/ |
|
|
|
/** @{ */ |
|
|
|