libdas2
das2 core C utilities
Main Page
Modules
Data Structures
Related Pages
Files
File List
Globals
All
Data Structures
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
das2
utf8.h
1
/*
2
Basic UTF-8 manipulation routines
3
by Jeff Bezanson
4
placed in the public domain Fall 2005
5
6
This code is designed to provide the utilities you need to manipulate
7
UTF-8 as an internal string encoding. These functions do not perform the
8
error checking normally needed when handling UTF-8 data, so if you happen
9
to be from the Unicode Consortium you will want to flay me alive.
10
I do this because error checking can be performed at the boundaries (I/O),
11
with these routines reserved for higher performance on data known to be
12
valid.
13
A UTF-8 validation routine is included.
14
*/
15
16
#ifndef UTF8_H
17
#define UTF8_H
18
19
#include <stdio.h>
20
#include <stdarg.h>
21
#include <stdint.h>
22
#include <wchar.h>
23
#include <wctype.h>
24
#include <stdarg.h>
25
26
#ifdef __cplusplus
27
extern
"C"
{
28
#endif
29
30
/* Global flag; only declared here, the definition lives in another
   translation unit.
   NOTE(review): presumably nonzero when the active locale is UTF-8 --
   confirm against the definition and the code that sets it. */
extern int locale_is_utf8;
31
32
/* is c the start of a utf8 sequence? */
33
/* Nonzero when byte c is not a UTF-8 continuation byte (bit pattern
   10xxxxxx), i.e. when it can begin a sequence. Only the top two bits of
   the low byte are inspected; c is evaluated exactly once. */
#define isutf(c) (((c) & 0xC0) != 0x80)
34
35
/* All-ones uint32_t value (0xFFFFFFFF).
   NOTE(review): presumably the end-of-input / error sentinel for the
   character-returning routines -- confirm at the call sites. */
#define UEOF ((uint32_t)(-1))
36
37
/* convert UTF-8 data to wide character */
38
/* dest  - receives the decoded wide characters
   sz    - capacity of dest (NOTE(review): presumably counted in uint32_t
           elements rather than bytes -- confirm in the implementation)
   src   - UTF-8 input
   srcsz - size of src (presumably in bytes -- confirm)
   The meaning of the return value is not stated in this header. */
size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz);
39
40
/* the opposite conversion */
41
/* dest  - receives the UTF-8 encoding
   sz    - capacity of dest (presumably in bytes -- confirm)
   src   - wide-character input
   srcsz - size of src (presumably in uint32_t elements -- confirm)
   The meaning of the return value is not stated in this header. */
size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz);
42
43
/* single character to UTF-8, returns # bytes written */
44
/* dest - output buffer for the encoded bytes; no size is passed, so the
          caller must supply enough room (NOTE(review): confirm the maximum
          sequence length the implementation can emit)
   ch   - character to encode
   Returns the number of bytes written to dest. */
size_t u8_wc_toutf8(char *dest, uint32_t ch);
45
46
/* character number to byte offset */
47
/* str     - UTF-8 string
   charnum - character index to translate
   Returns the byte offset in str that corresponds to charnum. */
size_t u8_offset(const char *str, size_t charnum);
48
49
/* byte offset to character number */
50
/* s      - UTF-8 string
   offset - byte offset to translate
   Returns the character index that corresponds to offset. */
size_t u8_charnum(const char *s, size_t offset);
51
52
/* return next character, updating an index variable */
53
/* s - UTF-8 string
   i - in/out index into s; advanced past the character that is returned
       (NOTE(review): presumably a byte index -- confirm)
   Returns the decoded character. */
uint32_t u8_nextchar(const char *s, size_t *i);
54
55
/* next character without NUL character terminator */
56
/* Variant of u8_nextchar for buffers that are not NUL-terminated.
   s - UTF-8 data
   i - in/out index into s, updated by the call
   NOTE(review): no length is passed, so the caller presumably has to bound
   *i itself -- confirm in the implementation. */
uint32_t u8_nextmemchar(const char *s, size_t *i);
57
58
/* move to next character */
59
/* s - UTF-8 string
   i - in/out index into s; stepped forward by one character */
void u8_inc(const char *s, size_t *i);
60
61
/* move to previous character */
62
/* s - UTF-8 string
   i - in/out index into s; stepped back by one character
   NOTE(review): behaviour when *i is already 0 is not documented here --
   confirm before calling at the start of a string. */
void u8_dec(const char *s, size_t *i);
63
64
/* returns length of next utf-8 sequence */
65
/* s - points at the start of the sequence to measure
   Returns the length of that sequence (presumably in bytes -- confirm). */
size_t u8_seqlen(const char *s);
66
67
/* returns the # of bytes needed to encode a certain character */
68
/* ch - character to measure
   Returns how many bytes its UTF-8 encoding requires. */
size_t u8_charlen(uint32_t ch);
69
70
/* computes the # of bytes needed to encode a WC string as UTF-8 */
71
/* wcstr - wide-character string to measure
   n     - length of wcstr (presumably in characters -- confirm)
   Returns the number of bytes its UTF-8 encoding requires.
   NOTE(review): wcstr is not const-qualified although the description is
   read-only; adding const would also require changing the definition. */
size_t u8_codingsize(uint32_t *wcstr, size_t n);
72
73
/* NOTE(review): undocumented in this header. The name suggests it maps the
   letter of a backslash escape to the control character it denotes --
   confirm in the implementation before relying on it. */
char read_escape_control_char(char c);
74
75
/* assuming src points to the character after a backslash, read an
76
escape sequence, storing the result in dest and returning the number of
77
input characters processed */
78
/* src  - input positioned on the character right after the backslash
   ssz  - size of src (presumably in bytes -- confirm)
   dest - receives the character the escape sequence denotes
   Returns the number of input characters consumed. */
size_t u8_read_escape_sequence(const char *src, size_t ssz, uint32_t *dest);
79
80
/* given a wide character, convert it to an ASCII escape sequence stored in
81
buf, where buf is "sz" bytes. returns the number of characters output.
82
sz must be at least 3. */
83
/* buf - output buffer for the ASCII escape sequence
   sz  - size of buf in bytes; must be at least 3
   ch  - wide character to escape
   Returns the number of characters output. */
int u8_escape_wchar(char *buf, size_t sz, uint32_t ch);
84
85
/* convert a string "src" containing escape sequences to UTF-8 */
86
/* buf - receives the UTF-8 result
   sz  - size of buf (presumably in bytes -- confirm)
   src - string that may contain escape sequences
   The meaning of the return value is not stated in this header. */
size_t u8_unescape(char *buf, size_t sz, const char *src);
87
88
/* convert UTF-8 "src" to escape sequences.
89
90
sz is buf size in bytes. must be at least 12.
91
92
if escape_quotes is nonzero, quote characters will be escaped.
93
94
if ascii is nonzero, the output is 7-bit ASCII, no UTF-8 survives.
95
96
starts at src[*pi], updates *pi to point to the first unprocessed
97
byte of the input.
98
99
end is one more than the last allowable value of *pi.
100
101
returns number of bytes placed in buf, including a NUL terminator.
102
*/
103
/* Convert UTF-8 text in src to escape sequences.
   buf           - output buffer
   sz            - size of buf in bytes; must be at least 12
   src           - UTF-8 input
   pi            - in/out: processing starts at src[*pi]; on return *pi is
                   the index of the first unprocessed input byte
   end           - one more than the last allowable value of *pi
   escape_quotes - nonzero to escape quote characters as well
   ascii         - nonzero to produce pure 7-bit ASCII output
   Returns the number of bytes placed in buf, including the NUL terminator. */
size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end,
                 int escape_quotes, int ascii);
105
106
/* utility predicates used by the above */
107
/* Predicate on a single character.
   NOTE(review): presumably returns nonzero when c is one of '0'..'7' --
   confirm in the implementation. */
int octal_digit(char c);
108
/* Predicate on a single character.
   NOTE(review): presumably returns nonzero when c is a hexadecimal digit;
   whether both letter cases are accepted should be confirmed in the
   implementation. */
int hex_digit(char c);
109
110
/* return a pointer to the first occurrence of ch in s, or NULL if not
111
found. character index of found character returned in *charn. */
112
/* s     - NUL-terminated UTF-8 string to search
   ch    - character to look for
   charn - receives the character index of the match
   Returns a pointer to the first occurrence of ch in s, or NULL when it is
   absent. The result is a non-const pointer into the const input. */
char *u8_strchr(const char *s, uint32_t ch, size_t *charn);
113
114
/* same as the above, but searches a buffer of a given size instead of
115
a NUL-terminated string. */
116
/* Buffer-bounded counterpart of u8_strchr.
   s     - UTF-8 data, not required to be NUL-terminated
   ch    - character to look for
   sz    - size of the buffer (presumably in bytes -- confirm)
   charn - receives the character index of the match
   Returns a pointer to the match, or NULL when ch is absent. */
char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn);
117
118
/* NOTE(review): undocumented in this header. By analogy with u8_memchr the
   name suggests a search for the last occurrence of ch in a buffer of sz
   bytes -- confirm in the implementation. Unlike u8_memchr there is no
   character-index out-parameter. */
char *u8_memrchr(const char *s, uint32_t ch, size_t sz);
119
120
/* count the number of characters in a UTF-8 string */
121
/* s - UTF-8 string (presumably NUL-terminated, since no length is passed)
   Returns the number of characters, as opposed to bytes, in s. */
size_t u8_strlen(const char *s);
122
123
/* number of columns occupied by a string */
124
/* s - UTF-8 string (presumably NUL-terminated, since no length is passed)
   Returns the number of display columns s occupies.
   NOTE(review): how the width of each character is determined is not
   visible here -- confirm in the implementation. */
size_t u8_strwidth(const char *s);
125
126
/* Only works on Linux.
127
* TODO: Update this function to pick up Windows code page 65001, which is
128
* a UTF-8 implementation
129
*/
130
/* locale - locale string to examine
   NOTE(review): presumably returns nonzero when locale names a UTF-8
   locale -- the exact matching rule should be confirmed in the
   implementation. */
int u8_is_locale_utf8(const char *locale);
131
132
/* printf where the format string and arguments may be in UTF-8.
133
you can avoid this function and just use ordinary printf() if the current
134
locale is UTF-8. */
135
/* va_list form of u8_printf.
   fmt - printf-style format string, may contain UTF-8
   ap  - arguments for fmt
   The meaning of the return value is not stated in this header. */
size_t u8_vprintf(const char *fmt, va_list ap);
136
/* Variadic form of u8_vprintf.
   fmt - printf-style format string, may contain UTF-8; the remaining
         arguments are consumed according to fmt
   The meaning of the return value is not stated in this header. */
size_t u8_printf(const char *fmt, ...);
137
138
/* determine whether a sequence of bytes is valid UTF-8. length is in bytes */
139
/* str    - bytes to validate; no NUL terminator is implied by the signature
   length - number of bytes to examine
   NOTE(review): the encoding of the int result (which values mean valid or
   invalid) is not stated in this header -- confirm in the implementation
   before testing it as a plain boolean. */
int u8_isvalid(const char *str, size_t length);
140
141
/* reverse a UTF-8 string. len is length in bytes. dest and src must both
142
be allocated to at least len+1 bytes. returns 1 for error, 0 otherwise */
143
/* dest - receives the reversed string; at least len+1 bytes
   src  - UTF-8 string to reverse; at least len+1 bytes
   len  - length of src in bytes
   Returns 1 on error and 0 otherwise.
   NOTE(review): src is not const-qualified; whether the implementation
   writes to it cannot be determined from this header. */
int u8_reverse(char *dest, char *src, size_t len);
144
151
/* NOTE(review): this function's own documentation is not visible in this
   listing. From the signature: dest is the output buffer, src the input
   string and len a size limit. Whether len counts bytes or characters,
   whether dest is always NUL-terminated, and what pointer is returned all
   need to be confirmed in the implementation. */
char *u8_strncpy(char *dest, const char *src, size_t len);
152
153
#ifdef __cplusplus
154
}
155
#endif
156
157
158
#endif
Generated by
1.8.5