关于utf 8：如何在普通C中检测UTF-8？

How to detect UTF-8 in plain C?

我在寻找一个简单的旧C代码段，它检测给定的字符串是UTF-8编码。我知道regex的解决方案，但是由于各种原因，在这个特定的情况下最好避免使用普通的C以外的任何东西。

使用regex的解决方案如下(警告：忽略了各种检查)：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

/* Byte-level pattern that a whole string must match to be reported as UTF-8:
 * a restricted ASCII set (TAB, LF, CR, 0x20-0x7E) or a 2-, 3- or 4-byte
 * sequence whose lead and second byte fall in the listed ranges. */
#define UTF8_DETECT_REGEXP "^([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$"

const char *error;   /* receives the compile/study error message, if any */
int error_off;       /* receives the offset of a compile error in the pattern */
int rc;              /* result of pcre_exec() */
int vect[100];       /* output vector handed to pcre_exec() */

/* NOTE(review): utf8_re, utf8_pe, str and len are not declared in this
 * snippet; they are presumably provided by the surrounding code. The results
 * of pcre_compile()/pcre_study() are not checked here. */
utf8_re = pcre_compile(UTF8_DETECT_REGEXP, PCRE_CASELESS, &error, &error_off, NULL);
utf8_pe = pcre_study(utf8_re, 0, &error);

rc = pcre_exec(utf8_re, utf8_pe, str, len, 0, 0, vect, sizeof(vect)/sizeof(vect[0]));

/* A positive return value is treated as "the whole string matched". */
if (rc > 0) {
    printf("string is in UTF8\n");
} else {
    printf("string is not in UTF8\n");
}

相关讨论

下面是这个表达式在纯C语言中的一个(希望没有bug)实现：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

/*
 * Reports whether the NUL-terminated byte string `string` consists solely of
 * sequences accepted by the byte-range rules below.
 *
 * Returns 1 when every sequence is accepted (an empty string is accepted),
 * 0 for a null pointer or as soon as a byte does not fit.
 *
 * Accepted single bytes are TAB, LF, CR and 0x20..0x7E only; to allow all
 * ASCII control characters, test `lead <= 0x7F` instead.
 */
_Bool is_utf8(const char * string)
{
    if (!string)
        return 0;

    const unsigned char *p = (const unsigned char *)string;

    while (*p != 0) {
        const unsigned char lead = p[0];
        /* Permitted range of the byte right after the lead byte; narrowed
         * below for the lead bytes that need it. */
        unsigned char second_lo = 0x80;
        unsigned char second_hi = 0xBF;
        int tail; /* number of bytes that must follow the lead byte */

        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            (lead >= 0x20 && lead <= 0x7E)) {
            p += 1;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* non-overlong 2-byte */
            tail = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            tail = 2;
            if (lead == 0xE0)
                second_lo = 0xA0; /* excluding overlongs */
            else if (lead == 0xED)
                second_hi = 0x9F; /* excluding surrogates */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            tail = 3;
            if (lead == 0xF0)
                second_lo = 0x90; /* planes 1-3 */
            else if (lead == 0xF4)
                second_hi = 0x8F; /* plane 16 */
        } else {
            return 0;
        }

        /* p[1] always exists: p[0] is non-zero, so at worst p[1] is the
         * terminator, which fails the range check. */
        if (p[1] < second_lo || p[1] > second_hi)
            return 0;

        /* Each later byte is read only after the previous one was found to
         * be in range (hence non-zero), so the terminator is never passed. */
        for (int k = 2; k <= tail; k++) {
            if (p[k] < 0x80 || p[k] > 0xBF)
                return 0;
        }

        p += tail + 1;
    }

    return 1;
}

请注意，这是W3C为表单验证推荐的正则表达式的忠实翻译，它确实拒绝了一些有效的UTF-8序列(尤其是那些包含ASCII控制字符的序列)。

此外，即使在通过进行注释中提到的更改来解决此问题之后，它仍然假定零终止，这会阻止嵌入nul字符，尽管从技术上讲，这是合法的。

当我尝试创建自己的字符串库时，我使用了修改过的utf-8(即将nul编码为超长的两字节序列)-可以随意使用这个头作为模板，以提供不受上述缺点影响的验证例程。

相关讨论

这个译码器是我找到的最简单的。它还通过向它输入一个单字节以及保持一个状态来工作。状态对于解析通过网络以块形式传入的utf8非常有用。

http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

/* DFA states with a fixed meaning; any other state value means that more
 * bytes are needed to finish the current sequence. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Combined lookup table used by decode():
 *   utf8d[b]                   for b in 0..255: character class of byte b;
 *   utf8d[256 + s * 16 + c]    next state for state s (0..8) and class c.
 */
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feeds one byte into the decoder.
 *
 * state  in/out: current DFA state; start with UTF8_ACCEPT. Must hold a value
 *        previously produced by this function (0..8).
 * codep  in/out: code point being assembled; valid once UTF8_ACCEPT is
 *        returned.
 * byte   the next input byte; only the low 8 bits are used, so a
 *        sign-extended `char` cannot index outside the table.
 *
 * Returns the new state: UTF8_ACCEPT when a complete code point is available
 * in *codep, UTF8_REJECT when the input is invalid (the state then stays
 * UTF8_REJECT for all further bytes), any other value when more bytes are
 * required.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition in C99/C11, so a non-inlined call could fail to link.
 */
static inline uint32_t
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  byte &= 0xffu;

  uint32_t type = utf8d[byte];

  /* Inside a sequence: append the six payload bits of a continuation byte.
   * At the start of a sequence: the class number doubles as the shift that
   * strips the length-marker bits from the lead byte. */
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}

简单的验证器/检测器不需要代码点，因此可以这样编写(初始状态设置为UTF8_ACCEPT)：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

/*
 * Runs `len` bytes of `str` through the UTF-8 DFA without assembling code
 * points (a reduced form of decode()).
 *
 * state  in/out: DFA state carried across calls so that data can be fed in
 *        chunks; initialise to UTF8_ACCEPT before the first chunk.
 * str    bytes to check; only read, hence const.
 * len    number of bytes available at `str`.
 *
 * Returns the final *state: UTF8_ACCEPT if the data seen so far ends on a
 * sequence boundary, UTF8_REJECT if an invalid byte was met (scanning stops
 * there), any other value if the data ends in the middle of a sequence.
 */
uint32_t validate_utf8(uint32_t *state, const char *str, size_t len) {
    for (size_t i = 0; i < len; i++) {
        /* Cast through uint8_t so that a signed char cannot produce a
         * negative table index. */
        const uint32_t type = utf8d[(uint8_t)str[i]];

        *state = utf8d[256 + (*state) * 16 + type];

        if (*state == UTF8_REJECT)
            break;
    }

    return *state;
}

如果文本是有效的UTF-8，则返回UTF8_ACCEPT；如果无效，则返回UTF8_REJECT；如果还需要更多数据才能判断，则返回其他整数。

以块形式(例如从网络)提供数据的使用示例：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

char buf[128];
size_t bytes_read;
uint32_t state = UTF8_ACCEPT;

// Validate the UTF8 data in chunks. `state` is carried from one chunk to the
// next, so a sequence split across two chunks is still handled.
while ((bytes_read = get_new_data(buf, sizeof(buf)))) {
    if (validate_utf8(&state, buf, bytes_read) == UTF8_REJECT) {
        fprintf(stderr, "Invalid UTF8 data!\n");
        return -1;
    }
}

// If everything went well we should have proper UTF8,
// the data might instead have ended in the middle of a UTF8
// codepoint.
if (state != UTF8_ACCEPT) {
    fprintf(stderr, "Invalid UTF8, incomplete codepoint\n");
}

相关讨论

您无法检测给定的字符串(或字节序列)是否是UTF-8编码文本，例如，每个UTF-8八位字节序列也是一个有效的(如果是无意义的)拉丁-1(或某些其他编码)八位字节序列。但是，并非每个有效的拉丁-1八位字节系列都是有效的UTF-8系列。因此，可以排除不符合UTF-8编码模式的字符串：

1
2
3
4

U+0000-U+007F 0xxxxxxx
U+0080-U+07FF 110yyyxx 10xxxxxx
U+0800-U+FFFF 1110yyyy 10yyyyxx 10xxxxxx
U+10000-U+10FFFF 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx

相关讨论

您必须将字符串解析为utf-8，请参见http://www.rfc-editor.org/rfc/rfc3629.txt，这非常简单。如果解析失败，则不是UTF-8。有几个简单的UTF-8库可以做到这一点。

如果您知道字符串是普通的旧ASCII，或者它包含ASCII之外的字符(这些字符是UTF-8编码的)，那么可以简化它。在这种情况下，您通常不需要考虑差异，UTF-8的设计是现有的可以处理ASCII的程序，在大多数情况下可以透明地处理UTF-8。

请记住，ascii本身是以utf-8编码的，所以ascii是有效的utf-8。

C字符串可以是任何内容。您需要解决的问题是否在于：您不知道其内容是ASCII、GB 2312、CP437、UTF-16，还是其他十几种让程序员头疼的字符编码中的某一种？

无法检测给定的字节数组是否为UTF-8字符串。您可以可靠地确定它不可能是有效的UTF-8(这并不意味着它不是无效的UTF-8)；您也可以确定它可能是有效的UTF-8序列，但这有可能是误报。

对于一个简单的例子，使用随机数生成器生成一个3个随机字节的数组，并使用它来测试代码。这些是随机字节，因此不是UTF-8，所以代码认为"可能是UTF-8"的每个字符串都是误报。我猜(在这种情况下)你的代码在12%的时间内都会出错。

一旦你意识到这是不可能的，你就可以开始考虑返回一个信心水平(除了你的预测)。例如，您的函数可能返回"我88%确定这是UTF-8"。

现在对所有其他类型的数据执行此操作。例如，您可能有一个函数来检查数据是否是UTF-16，该函数可能返回"我95%相信这是UTF-16"，然后决定(因为95%高于88%)数据更有可能是UTF-16而不是UTF-8。

下一步是添加一些技巧来提高信心水平。例如，如果字符串似乎主要包含由空格分隔的组有效音节，那么您可以更加确信它实际上是UTF-8。同样，如果数据可能是HTML，那么您可以检查是否有可能是有效的HTML标记，并使用它来增加您的信心。

当然，这同样适用于其他类型的数据。例如，如果数据有一个有效的PE32或ELF头，或者正确的BMP或JPG或MP3头，那么您可以更加确信它根本不是UTF-8。

一个更好的方法是修复问题的实际原因。例如，可能会在您关心的所有文件的开头添加某种类型的"文档类型"标识符，或者可能会说"此软件采用UTF-8，不支持任何其他内容"；这样您就不需要首先进行不可靠的猜测。

您可以使用集成到Firefox中的UTF-8探测器。它位于通用字符集检测器(universal charset detector)中，基本上是一个独立的C++库。应该很容易找到识别UTF-8的类，并只取用它。这个类的基本功能是检测UTF-8特有的字符序列。

获取最新的Firefox主干(trunk)源代码
转到mozilla\extensions\universalchardet\目录
找到UTF-8探测器类(我不太记得它的确切名称)

根据我的计算，3个随机字节似乎有15.8%的机会成为有效的UTF-8：

128^3可能的纯ASCII序列=2097152

2^16-2^11可能的3字节UTF-8字符(假设允许代理项对和非字符)=63488

1920个2字节的UTF-8字符(在ASCII字符之前或之后)=1920*128*2=491520

除以3字节序列数=(2097152+63488+491520)/16777216.0=0.1580810546875

imho这大大超出了对不正确匹配的估计，因为文件只有3个字节长。随着字节数的增加，交集向下延伸。另外，非utf-8中的实际文本不是随机的，有大量的孤立字节具有高位集，这是无效的utf-8。

猜测失败几率的一个更有用的度量标准是具有高位集的字节序列有多可能是有效的UTF-8。我得到这些值：

1
2
3
4

1 byte = 0% # the really important number that is often ignored
2 byte = 11.7%
3 byte = 3.03% (assumes surrogate halves are valid)
4 byte = 1.76% (includes two 2-byte characters)

还可以尝试找到一个实际可读的字符串(在任何语言和任何编码中)，它也是一个有效的UTF-8字符串。这是非常困难的，表明这不是实际数据的问题。

我知道这是一个旧帖，但我想我会把我的解决方案贴在这里，因为我认为这是对@christoph的绝妙解决方案(我投了赞成票)的改进。

我不是专家，所以我可能读错了RFC，但在我看来，32字节的映射可以代替256字节的映射，从而节省了内存和时间。

这让我想到了一个简单的宏，它将字符串指针向前推进一个utf-8字符，将utf8代码点存储在一个32位有符号整数中，并在出错时存储值-1。

这是代码和一些注释。

#include <stdint.h>
/**
 * Maps the top 5 bits of a byte (`byte >> 3`, i.e. the `xxxxx` in
 * 0bxxxxx___) to a UTF-8 length class.
 *
 * Length class 0 == error.
 *
 * A leading byte maps to a value between 1 and 4, the number of bytes in the
 * sequence (5 or 0 in the leading position is an error).
 *
 * An intermediate (second, third or fourth) byte must map to 5.
 *
 * The table is read-only and therefore declared const.
 *
 * The map was populated using the following Ruby script:
 *
 * map = []; 32.times { map << 0 }; (0..0b1111).each {|i| map[i] = 1} ;
 * (0b10000..0b10111).each {|i| map[i] = 5} ;
 * (0b11000..0b11011).each {|i| map[i] = 2} ;
 * (0b11100..0b11101).each {|i| map[i] = 3} ;
 * map[0b11110] = 4; map;
 */
static const uint8_t fio_str_utf8_map[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xxxxxxx */
    5, 5, 5, 5, 5, 5, 5, 5,                         /* 10xxxxxx */
    2, 2, 2, 2,                                     /* 110xxxxx */
    3, 3,                                           /* 1110xxxx */
    4,                                              /* 11110xxx */
    0};                                             /* 11111xxx */

/**
 * Advances `ptr` by one UTF-8 character and stores that character's value in
 * `i32` (which must be a signed integer of 32 bits or more). On error, `i32`
 * is set to `-1` and `ptr` is left where it was.
 *
 * The length is chosen from the first byte through `fio_str_utf8_map`; every
 * following byte must map to class 5. No other check is made on the decoded
 * value (for example, the value is not compared against the shortest possible
 * encoding or against any range).
 *
 * `end` is only used for overflow protection in the 2-, 3- and 4-byte cases;
 * the 1-byte case does not consult it, so the caller must ensure `ptr` points
 * at a readable byte.
 *
 * The macro expands to a bare `switch` statement and evaluates `ptr` several
 * times, so pass a plain lvalue without side effects.
 */
#define FIO_STR_UTF8_CODE_POINT(ptr, end, i32) \
switch (fio_str_utf8_map[((uint8_t *)(ptr))[0] >> 3]) { \
case 1: \
(i32) = ((uint8_t *)(ptr))[0]; \
++(ptr); \
break; \
case 2: \
if (((ptr) + 2 > (end)) || \
fio_str_utf8_map[((uint8_t *)(ptr))[1] >> 3] != 5) { \
(i32) = -1; \
break; \
} \
(i32) = \
((((uint8_t *)(ptr))[0] & 31) << 6) | (((uint8_t *)(ptr))[1] & 63); \
(ptr) += 2; \
break; \
case 3: \
if (((ptr) + 3 > (end)) || \
fio_str_utf8_map[((uint8_t *)(ptr))[1] >> 3] != 5 || \
fio_str_utf8_map[((uint8_t *)(ptr))[2] >> 3] != 5) { \
(i32) = -1; \
break; \
} \
(i32) = ((((uint8_t *)(ptr))[0] & 15) << 12) | \
((((uint8_t *)(ptr))[1] & 63) << 6) | \
(((uint8_t *)(ptr))[2] & 63); \
(ptr) += 3; \
break; \
case 4: \
if (((ptr) + 4 > (end)) || \
fio_str_utf8_map[((uint8_t *)(ptr))[1] >> 3] != 5 || \
fio_str_utf8_map[((uint8_t *)(ptr))[2] >> 3] != 5 || \
fio_str_utf8_map[((uint8_t *)(ptr))[3] >> 3] != 5) { \
(i32) = -1; \
break; \
} \
(i32) = ((((uint8_t *)(ptr))[0] & 7) << 18) | \
((((uint8_t *)(ptr))[1] & 63) << 12) | \
((((uint8_t *)(ptr))[2] & 63) << 6) | \
(((uint8_t *)(ptr))[3] & 63); \
(ptr) += 4; \
break; \
default: \
(i32) = -1; \
break; \
}

/**
 * Walks `length` bytes starting at `str` one character at a time with
 * FIO_STR_UTF8_CODE_POINT.
 *
 * Returns 1 when the walk consumes exactly `length` bytes without a decoding
 * error, and 0 otherwise. A NULL `str` yields 0; a zero `length` yields 1.
 * The walk also stops at a decoded value of 0, so a NUL byte located before
 * the final position makes the result 0.
 */
inline static size_t fio_str_utf8_valid2(char const *str, size_t length) {
  if (!str)
    return 0;
  if (length == 0)
    return 1;

  char const *const stop = str + length;
  int32_t code = 0;

  for (;;) {
    FIO_STR_UTF8_CODE_POINT(str, stop, code);
    /* Leave on error (-1), on a decoded 0, or once the input is used up. */
    if (code <= 0 || str >= stop)
      break;
  }

  if (code < 0)
    return 0;
  return (size_t)(str == stop);
}

基本上，我检查给定的键(最多4个字符的字符串)是否与此链接中的格式匹配：http://www.fileformat.info/info/unicode/utf8.htm

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

/*
** Checks if the given NUL-terminated string has all bytes like: 10xxxxxx
** where x is either 0 or 1. Returns 1 if so (also for an empty string),
** 0 otherwise.
*/

static int chars_are_folow_uni(const unsigned char *chars)
{
    while (*chars)
    {
        if ((*chars >> 6) != 0x2)
            return (0);
        chars++;
    }
    return (1);
}

/*
** Checks whether the NUL-terminated string `key` holds exactly one character
** laid out like a UTF-8 sequence:
**
**   0xxxxxxx                              -> 1 byte
**   110xxxxx 10xxxxxx                     -> 2 bytes
**   1110xxxx 10xxxxxx 10xxxxxx            -> 3 bytes
**   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   -> 4 bytes
**
** Returns 1 when the first byte matches one of the lead patterns, the string
** length equals the length that pattern announces, and every remaining byte
** is of the form 10xxxxxx; returns 0 otherwise (including for an empty
** string). Only the bit layout is checked, not the decoded value.
*/
int char_is_utf8(const unsigned char *key)
{
    size_t required_len;

    if (key[0] >> 7 == 0)
        required_len = 1;
    else if (key[0] >> 5 == 0x6)
        required_len = 2;
    else if (key[0] >> 4 == 0xE)
        required_len = 3;
    /* 11110xxx: the five marker bits are what remains after >> 3. Shifting
    ** by 5 leaves only three bits, which can never equal 0x1E. */
    else if (key[0] >> 3 == 0x1E)
        required_len = 4;
    else
        return (0);
    /* strlen() takes a char pointer; the length test runs first, so key + 1
    ** is only examined when the string is long enough. */
    return (strlen((const char *)key) == required_len
            && chars_are_folow_uni(key + 1));
}

对我来说很好：

1
2
3
4
5

unsigned char buf[5];

ft_to_utf8(L'歓', buf);
printf("%d
", char_is_utf8(buf)); // => 1

The program below reads UTF-8 strings (ASCII and non-ASCII characters such as
the euro sign) from stdin. Each line is passed to
func_find_utf8. Because UTF-8 characters can span several bytes,
func_find_utf8 checks the leading bits of each byte to find whether the
character is ASCII or non-ASCII. If the character is non-ASCII, it determines
its width in bytes and passes that width and the position where it was found
to print_non_ascii.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

#include<stdio.h>

#include<string.h>

/*
 * UTF-8 lead-byte bit patterns.
 *
 * Bn_BYTE is the byte value whose top n bits are set. func_find_utf8 uses
 * the constants in pairs, as (mask, expected value):
 *
 *     (byte & B<n+1>_BYTE) == B<n>_BYTE
 *
 * which is true when the byte starts with n one-bits followed by a zero-bit:
 */

/* mask B1, value B0 : 0XXXXXXX  single-byte (ASCII) character */

/* (not tested)      : 10XXXXXX  continuation byte */

/* mask B3, value B2 : 110XXXXX  first byte of a 2-byte sequence */

/* mask B4, value B3 : 1110XXXX  first byte of a 3-byte sequence */

/* mask B5, value B4 : 11110XXX  first byte of a 4-byte sequence */

/* mask B6, value B5 : 111110XX  first byte of a 5-byte sequence */

/* mask B7, value B6 : 1111110X  first byte of a 6-byte sequence */

#define B0_BYTE 0x00

#define B1_BYTE 0x80

#define B2_BYTE 0xC0

#define B3_BYTE 0xE0

#define B4_BYTE 0xF0

#define B5_BYTE 0xF8

#define B6_BYTE 0xFC

#define B7_BYTE 0xFE

/* Maximum number of input lines main() reads; tune as needed. */

#define MAX_UTF8_STR 10

/* Size of one line buffer: 600 is used because 6 bytes x 100 chars. */

#define MAX_UTF8_CHR 600

/* Scans a NUL-terminated string and prints each multi-byte character. */
void func_find_utf8 (char *ptr_to_str);

/* Prints `bytes` bytes starting at `pbyte`, followed by a space. */
void print_non_ascii (int bytes, char *pbyte);

/* Storage for the lines read from stdin, one row per line. */
char strbuf[MAX_UTF8_STR][MAX_UTF8_CHR];

int
main (int ac, char *av[])
{

int i = 0;

char no_newln_str[MAX_UTF8_CHR];

i = 0;

printf ("

You can enter utf-8 string or Q/q to QUIT

");

while (i < MAX_UTF8_STR)
{

fgets (strbuf[i], MAX_UTF8_CHR, stdin);

if (!strlen (strbuf[i]))
break;

if ((strbuf[i][0] == 'Q') || (strbuf[i][0] == 'q'))
break;

strcpy (no_newln_str, strbuf[i]);

no_newln_str[strlen (no_newln_str) - 1] = 0;

func_find_utf8 (no_newln_str);

++i;

}

return 1;

}

/*
 * Scans the NUL-terminated string `ptr_to_str` byte by byte. ASCII bytes
 * (top bit clear) are skipped. For every other lead byte the width of the
 * sequence is derived from its leading one-bits (2 to 6) and the sequence is
 * handed to print_non_ascii. If any non-ASCII byte was seen, a closing
 * message is printed.
 *
 * Two malformed inputs are handled explicitly:
 *  - a byte that matches no lead pattern (a stray continuation byte, or
 *    0xFE/0xFF) is skipped, so the scan always makes progress;
 *  - a sequence cut short by the end of the string is printed and skipped
 *    only up to the terminator, so nothing beyond the string is read.
 */
void
func_find_utf8 (char *ptr_to_str)
{
  int found_non_ascii = 0;
  char *pbyte = ptr_to_str;

  while (*pbyte)
    {
      int width;
      int avail;

      if ((*pbyte & B1_BYTE) == B0_BYTE)
        {
          pbyte++;
          continue;
        }

      found_non_ascii = 1;

      /* Longest prefix first: each test masks one bit more than the
         pattern it expects. */
      if ((*pbyte & B7_BYTE) == B6_BYTE)
        width = 6;
      else if ((*pbyte & B6_BYTE) == B5_BYTE)
        width = 5;
      else if ((*pbyte & B5_BYTE) == B4_BYTE)
        width = 4;
      else if ((*pbyte & B4_BYTE) == B3_BYTE)
        width = 3;
      else if ((*pbyte & B3_BYTE) == B2_BYTE)
        width = 2;
      else
        {
          /* Not a lead byte: step over it instead of looping forever. */
          pbyte++;
          continue;
        }

      /* Count how many of the announced bytes exist before the NUL. */
      avail = 1;
      while (avail < width && pbyte[avail] != '\0')
        avail++;

      print_non_ascii (avail, pbyte);

      pbyte += avail;
    }

  if (found_non_ascii)
    printf (" These are Non Ascci chars\n");
}

/*
 * Writes `bytes` bytes starting at `pbyte` to stdout, followed by a single
 * space, then flushes stdout.
 *
 * The bytes are written straight from `pbyte`; no fixed-size temporary is
 * involved, so any positive `bytes` is safe as long as that many bytes are
 * readable. A `bytes` of zero or less writes only the space.
 */
void
print_non_ascii (int bytes, char *pbyte)
{
  if (bytes > 0)
    fwrite (pbyte, 1, (size_t) bytes, stdout);

  putchar (' ');

  fflush (stdout);
}

相关讨论