libdas2
das2 core C utilities
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
utf8.h
1 /*
2  Basic UTF-8 manipulation routines
3  by Jeff Bezanson
4  placed in the public domain Fall 2005
5 
6  This code is designed to provide the utilities you need to manipulate
7  UTF-8 as an internal string encoding. These functions do not perform the
8  error checking normally needed when handling UTF-8 data, so if you happen
9  to be from the Unicode Consortium you will want to flay me alive.
10  I do this because error checking can be performed at the boundaries (I/O),
11  with these routines reserved for higher performance on data known to be
12  valid.
13  A UTF-8 validation routine is included.
14 */
15 
16 #ifndef UTF8_H
17 #define UTF8_H
18 
19 #include <stdio.h>
20 #include <stdarg.h>
21 #include <stdint.h>
22 #include <wchar.h>
23 #include <wctype.h>
24 #include <stdarg.h>
25 
26 #ifdef __cplusplus
27 extern "C" {
28 #endif
29 
/* Global flag, declared here and defined in another translation unit.
   NOTE(review): presumably nonzero when the active locale uses UTF-8 (see
   u8_is_locale_utf8 below) -- confirm where it is set and who reads it. */
30 extern int locale_is_utf8;
31 
32 /* is c the start of a utf8 sequence?
   True for every byte whose top two bits are not 10, i.e. anything other
   than a UTF-8 continuation byte (10xxxxxx): ASCII bytes and the lead bytes
   of multi-byte sequences. The argument is evaluated exactly once. */
33 #define isutf(c) (((c)&0xC0)!=0x80)
34 
/* Sentinel with all 32 bits set (0xFFFFFFFF), which lies outside the Unicode
   code point range.
   NOTE(review): presumably returned by the character-reading routines to
   signal end of input -- confirm which functions produce it. */
35 #define UEOF ((uint32_t)-1)
36 
37 /* convert UTF-8 data in src to wide (uint32_t) characters stored in dest.
   NOTE(review): sz is presumably the capacity of dest counted in wide
   characters, srcsz the number of bytes to read from src, and the return
   value the number of characters converted -- confirm against the
   implementation, including whether dest is terminated. */
38 size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz);
39 
40 /* the opposite conversion: encode the wide (uint32_t) characters in src as
   UTF-8 bytes stored in dest.
   NOTE(review): sz is presumably the capacity of dest in bytes, srcsz the
   number of wide characters in src, and the return value the number of
   characters processed -- confirm against the implementation. */
41 size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz);
42 
43 /* single character to UTF-8, returns # bytes written.
   ch is the character to encode and dest receives the encoded bytes. No
   buffer size is passed, so the caller must provide enough room; u8_charlen
   (declared below) reports the number of bytes a given character needs.
   NOTE(review): whether a NUL terminator is appended is not visible here. */
44 size_t u8_wc_toutf8(char *dest, uint32_t ch);
45 
46 /* character number to byte offset: returns the byte offset within str that
   corresponds to character index charnum.
   NOTE(review): presumably charnum is 0-based and str is NUL-terminated --
   confirm, along with the result when charnum exceeds the string length. */
47 size_t u8_offset(const char *str, size_t charnum);
48 
49 /* byte offset to character number: returns the character index within s
   that corresponds to byte position offset (the inverse of u8_offset).
   NOTE(review): behaviour when offset falls in the middle of a multi-byte
   sequence is not visible here -- confirm. */
50 size_t u8_charnum(const char *s, size_t offset);
51 
52 /* return next character, updating an index variable.
   The character is read from s and *i is advanced as a side effect.
   NOTE(review): *i is presumably a byte index into s that ends up at the
   start of the following character -- confirm, along with the value
   returned at the terminating NUL. */
53 uint32_t u8_nextchar(const char *s, size_t *i);
54 
55 /* next character without NUL character terminator: same calling shape as
   u8_nextchar, for data that is not NUL-terminated.
   NOTE(review): no length is passed, so the caller presumably has to make
   sure that a complete sequence is available at s[*i] -- confirm. */
56 uint32_t u8_nextmemchar(const char *s, size_t *i);
57 
58 /* move to next character: advances the index *i within s. The string is
   const and nothing is returned; *i is the only output. */
59 void u8_inc(const char *s, size_t *i);
60 
61 /* move to previous character: steps the index *i backwards within s. The
   string is const and nothing is returned; *i is the only output.
   NOTE(review): behaviour when *i is already 0 is not visible here. */
62 void u8_dec(const char *s, size_t *i);
63 
64 /* returns length of next utf-8 sequence, i.e. of the sequence found at s.
   NOTE(review): presumably a byte count derived from the lead byte s[0]
   alone, without validating the continuation bytes -- confirm. */
65 size_t u8_seqlen(const char *s);
66 
67 /* returns the # of bytes needed to encode a certain character (ch) as
   UTF-8. Useful for sizing the destination of u8_wc_toutf8. */
68 size_t u8_charlen(uint32_t ch);
69 
70 /* computes the # of bytes needed to encode a WC string as UTF-8.
   wcstr is the wide-character string to measure.
   NOTE(review): n is presumably the number of wide characters in wcstr, and
   the result presumably excludes any NUL terminator -- confirm.
   NOTE(review): wcstr is not const-qualified although measuring should not
   need to modify it; adding const requires changing the definition too. */
71 size_t u8_codingsize(uint32_t *wcstr, size_t n);
72 
/* NOTE(review): undocumented in this header. Judging by the name and the
   escape-sequence routines declared next to it, this presumably maps the
   letter that follows a backslash (such as 'n' or 't') to the control
   character it denotes -- confirm, including what is returned for letters
   that are not recognised. */
73 char read_escape_control_char(char c);
74 
75 /* assuming src points to the character after a backslash, read an
76  escape sequence, storing the result in dest and returning the number of
77  input characters processed.
    NOTE(review): ssz is presumably the number of bytes available at src --
    confirm. */
78 size_t u8_read_escape_sequence(const char *src, size_t ssz, uint32_t *dest);
79 
80 /* given a wide character (ch), convert it to an ASCII escape sequence
81  stored in buf, where buf is "sz" bytes. returns the number of characters
82  output. sz must be at least 3.
    Note that the count is returned as int although sz is a size_t. */
83 int u8_escape_wchar(char *buf, size_t sz, uint32_t ch);
84 
85 /* convert a string "src" containing escape sequences to UTF-8, writing the
   result to buf.
   NOTE(review): sz is presumably the capacity of buf in bytes and the return
   value the number of bytes written -- confirm, including whether the
   output is always NUL-terminated. */
86 size_t u8_unescape(char *buf, size_t sz, const char *src);
87 
88 /* Convert UTF-8 "src" to escape sequences, writing the result to buf.
89
90  sz is the size of buf in bytes. It must be at least 12.
91
92  If escape_quotes is nonzero, quote characters will be escaped.
93
94  If ascii is nonzero, the output is 7-bit ASCII; no UTF-8 survives.
95
96  Conversion starts at src[*pi], and *pi is updated to point to the first
97  unprocessed byte of the input.
98
99  end is one more than the last allowable value of *pi.
100
101  Returns the number of bytes placed in buf, including a NUL terminator.
102 */
103 size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end,
104  int escape_quotes, int ascii);
105 
106 /* utility predicates used by the above (the escape-sequence routines).
    NOTE(review): presumably each returns nonzero when c is an octal digit or
    a hexadecimal digit respectively, and 0 otherwise -- confirm. */
107 int octal_digit(char c);
108 int hex_digit(char c);
109 
110 /* return a pointer to the first occurrence of ch in s, or NULL if not
111  found. character index of found character returned in *charn.
     The result is a non-const char* even though s is const char*, so the
     qualifier is dropped on the way out (as with the C library's strchr). */
112 char *u8_strchr(const char *s, uint32_t ch, size_t *charn);
113 
114 /* same as the above (u8_strchr), but searches a buffer of a given size
115  instead of a NUL-terminated string.
     NOTE(review): sz is presumably the buffer length in bytes -- confirm. */
116 char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn);
117 
/* NOTE(review): undocumented in this header. By analogy with u8_memchr,
   this presumably returns a pointer to the LAST occurrence of ch within the
   first sz bytes of s, or NULL if it is not found -- confirm. Unlike
   u8_memchr there is no charn output for the character index. */
118 char *u8_memrchr(const char *s, uint32_t ch, size_t sz);
119 
120 /* count the number of characters in a UTF-8 string. The result counts
    characters, not bytes, so it can be smaller than what strlen reports. */
121 size_t u8_strlen(const char *s);
122 
123 /* number of columns occupied by a string when displayed.
    NOTE(review): presumably computed per character with wcwidth(), which
    would make the result depend on the current locale -- confirm. */
124 size_t u8_strwidth(const char *s);
125 
126 /* Only works on Linux.
127  * TODO: Update this function to pick up Windows code page 65001, which is
128  * a UTF-8 implementation.
     *
     * NOTE(review): locale is presumably a locale name string such as the one
     * returned by setlocale(), and the result nonzero when that name denotes
     * a UTF-8 locale -- confirm.
129  */
130 int u8_is_locale_utf8(const char *locale);
131 
132 /* printf where the format string and arguments may be in UTF-8.
133  you can avoid this function and just use ordinary printf() if the current
134  locale is UTF-8.
     u8_vprintf takes an already-started va_list; u8_printf is the variadic
     form.
     NOTE(review): the return value is presumably the number of characters
     printed -- confirm. */
135 size_t u8_vprintf(const char *fmt, va_list ap);
136 size_t u8_printf(const char *fmt, ...);
137 
138 /* determine whether a sequence of bytes is valid UTF-8. length is in bytes.
    This is the validation routine meant for use at I/O boundaries; the other
    functions in this header assume their input is already valid.
    NOTE(review): the meaning of the int result (presumably 0 for invalid
    input and nonzero for valid input) is not visible here -- confirm. */
139 int u8_isvalid(const char *str, size_t length);
140 
141 /* reverse a UTF-8 string. len is length in bytes. dest and src must both
142  be allocated to at least len+1 bytes. returns 1 for error, 0 otherwise.
     NOTE(review): src is declared as a non-const char*; whether it is
     actually modified is not visible here. */
143 int u8_reverse(char *dest, char *src, size_t len);
144 
/* NOTE(review): the documentation comment for this function (listing lines
   145-150) is missing from this view. By analogy with strncpy, it presumably
   copies at most len bytes of src into dest without splitting a multi-byte
   UTF-8 sequence, and returns dest -- confirm against the original header,
   in particular whether dest is always NUL-terminated. */
151 char* u8_strncpy(char* dest, const char* src, size_t len);
152 
153 #ifdef __cplusplus
154 }
155 #endif
156 
157 
158 #endif