When implementing UTF-8 string functions in a library or programming language, you are required to continue processing even if you encounter an invalid byte string.
The minimum required function is a function that returns the number of bytes in the next character.
Standard C functions include mblen
and mbtowc
, but these functions return a negative value when they encounter an invalid byte string, so either stop processing or the invalid byte string. You will have to truncate the part behind it.
The Unicode Standard describes how to calculate the incorrect byte string size.
The following table appears in Chapter 3 of Unicode Standard 7.0 (Table 3-7. Well-Formed UTF-8 Byte Sequences).
Code point 1st byte 2nd byte 3rd byte 4th byte
U+0000 - U+007F 00 - 7F
U+0080 - U+07FF C2 - DF 80 - BF
U+0800 - U+0FFF E0 A0 - BF 80 - BF
U+1000 - U+CFFF E1 - EC 80 - BF 80 - BF
U+D000 - U+D7FF ED 80 - 9F 80 - BF
U+E000 - U+FFFF EE - EF 80 - BF 80 - BF
U+10000 - U+3FFFF F0 90 - BF 80 - BF 80 - BF
U+40000 - U+FFFFF F1 - F3 80 - BF 80 - BF 80 - BF
U+100000 - U+10FFFF F4 80 - 8F 80 - BF 80 - BF
RFC 3629 defines it as follows:
UTF8-octets = *( UTF8-char )
UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
UTF8-1 = %x00-7F
UTF8-2 = %xC2-DF UTF8-tail
UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / %xF4 %x80-8F 2( UTF8-tail )
UTF8-tail = %x80-BF
Use the test cases described in Chapter 3 of the Unicode Standard.
#include <stdio.h>
#include <stdint.h>
#define UTF8_LEAD(c) ((uint8_t)(c) < 0x80 || ((uint8_t)(c) > 0xC1 && (uint8_t)(c) < 0xF5))
#define UTF8_TRAIL(c) (((uint8_t)(c) & 0xC0) == 0x80)
uint8_t utf8_fwd(const char*);
uint8_t utf8_next(const char*, uint32_t*);
void print_each_char(const char*);
int main(void)
{
// http://en.wikipedia.org/wiki/UTF-8#Examples
// char* str = "\u0024\u00A2\u20AC\U00010348";
char* str = "\x24\xC2\xA2\xE2\x82\xAC\xF0\x90\x8D\x88";
// Unicode 7.0.0 D93b
// http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf
char* str2 = "\x41\xC0\xAF\x41\xF4\x80\x80\x41";
char* str3 = "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
puts("expected: <0024 00A2 20AC 10348>");
print_each_char(str);
puts("expected: <41 FFFD FFFD FFFD 41>");
print_each_char(str2);
puts("expected: <0061 FFFD FFFD FFFD 0062 FFFD 0063 FFFD FFFD 0064>");
print_each_char(str3);
return 0;
}
I referred to PCRE's pcre_valid_utf8.c (Thanks to mpyw for providing information). I used ʻuint8_t in
stdint.h (C99) instead of ʻunsigned char
.
The argument of print_each_char
is set to const char
for compatibility with C standard string functions (STR04-C. c-str04-c.html)).
You have the option of using bitwise operations, state transitions, etc. to shorten the code, but it can be difficult to read and slow down.
void print_each_char(const char* str)
{
uint8_t size;
while (*str) {
size = utf8_fwd(str);
printf("%.*s\n", size, str);
str += size;
}
}
uint8_t utf8_fwd(const char* str)
{
uint8_t lead = *str;
uint8_t trail;
uint8_t count;
uint8_t size;
if (lead == 0) {
return 0;
} else if (lead < 0x80) {
return 1;
} else if (lead < 0xC2) {
return 1;
} else if (lead < 0xE0) {
count = 2;
} else if (lead < 0xF0) {
count = 3;
} else if (lead < 0xF5) {
count = 4;
} else {
return 1;
}
trail = *(++str);
if (count == 3) {
if ((lead == 0xE0 && 0xA0 > trail) ||
(lead == 0xED && trail > 0x9F)
) {
return 1;
}
} else if (count == 4) {
if ((lead == 0xF0 && 0x90 > trail) ||
(lead == 0xF4 && trail > 0x8F)
) {
return 1;
}
}
for (size = 1; size < count; ++size) {
if (!UTF8_TRAIL(trail)) {
return size;
}
trail = *(++str);
}
return size;
}
void print_each_char(const char* str)
{
uint8_t size;
uint32_t cp;
while (*str) {
size = utf8_next(str, &cp);
if (cp == 0xFFFD) {
printf("U+FFFD %d \xEF\xBF\xBD\n", size);
} else {
printf("U+%X %.*s\n", cp, size, str);
}
str += size;
}
}
uint8_t utf8_next(const char* str, uint32_t* cp)
{
uint8_t lead = *str;
uint8_t trail;
uint8_t count;
uint8_t size;
*cp = 0xFFFD;
if (lead == 0) {
return 0;
} else if (lead < 0x80) {
*cp = lead;
return 1;
} else if (lead < 0xC2) {
return 1;
} else if (lead < 0xE0) {
count = 2;
} else if (lead < 0xF0) {
count = 3;
} else if (lead < 0xF5) {
count = 4;
} else {
return 1;
}
trail = *(++str);
if (count == 3) {
if ((lead == 0xE0 && 0xA0 > trail) ||
(lead == 0xED && trail > 0x9F)
) {
return 1;
}
} else if (count == 4) {
if ((lead == 0xF0 && 0x90 > trail) ||
(lead == 0xF4 && trail > 0x8F)
) {
return 1;
}
}
for (size = 1; size < count; ++size) {
if (!UTF8_TRAIL(trail)) {
return size;
}
trail = *(++str);
}
if (size == 2) {
*cp = ((lead & 0x1F) << 6) | (trail & 0x3F);
} else if (size == 3) {
*cp = ((lead & 0x1F) << 12)
| (((uint8_t) *(str - 2) & 0x3F) << 6)
| ((uint8_t) *(str - 1) & 0x3F);
} else {
*cp = ((lead & 0x7) << 18)
| (((uint8_t) *(str - 3) & 0x3F) << 12)
| (((uint8_t) *(str - 2) & 0x3F) << 6)
| ((uint8_t) *(str - 1) & 0x3F);
}
return size;
}
Macros such as ʻU8_FWD_1 and ʻU8_NEXT_OR_FFFD
(utf8.h) in ICU can be expected to improve speed. Please refer to this article for how to use these macros. When calculating the number of characters or substrings, no code point is required, so omitting the code for that amount can further improve the speed.
#define UTF8_TRAIL_BYTES(lead) \
((((uint8_t) lead) < 0xC2 ? \
0 : (((uint8_t) lead) < 0xE0 ? \
1 : (((uint8_t) lead) < 0xF0 ? \
2 : (((uint8_t) lead) < 0xF5 ? \
3 : 0)))))
#define UTF8_FWD(s, i) \
do { \
uint8_t __lead = (s)[(i)++]; \
if (__lead > 0x7F) { \
uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
uint8_t __trail = (s)[(i)]; \
if (__count == 2) { \
if ((__lead == 0xE0 && (0xA0 > __trail)) || \
(__lead == 0xED && (__trail > 0x9F)) \
) { \
__count = 0; \
} \
} else if (__count == 3) { \
if ((__lead == 0xF0 && (0x90 > __trail)) || \
(__lead == 0xF4 && (__trail > 0x8F)) \
) { \
__count = 0; \
} \
} \
while (__count > 0 && UTF8_TRAIL(__trail)) { \
__trail = (s)[++(i)]; \
--__count; \
} \
} \
} while (0)
void print_each_char(const char* str)
{
int32_t previous = 0;
int32_t current = 0;
while (str[current]) {
previous = current;
UTF8_FWD(str, current);
printf("%.*s\n", current - previous, str + previous);
}
}
#define UTF8_CP_UNSAFE(s, i, c) \
do { \
(c) = (uint8_t) *((s)+(i)); \
if ((c) > 0x7F) { \
if ((c) < 0xE0) { \
(c) = (((c) & 0x1F) << 6) | ((uint8_t) *((s)+(i)+1) & 0x3F); \
} else if ((c) < 0xF0) { \
(c) = (((c) & 0x1F) << 12) \
| (((uint8_t) *((s)+(i)+1) & 0x3F) << 6) \
| ((uint8_t) *((s)+(i)+2) & 0x3F); \
} else { \
(c) = (((c) & 0x7) << 18) \
| (((uint8_t) *((s)+(i)+1) & 0x3F) << 12) \
| (((uint8_t) *((s)+(i)+2) & 0x3F) << 6) \
| ((uint8_t) *((s)+(i)+3) & 0x3F); \
}\
} \
} while (0)
#define UTF8_TRAIL_BYTES(lead) \
((((uint8_t) lead) < 0xC2 ? \
0 : (((uint8_t) lead) < 0xE0 ? \
1 : (((uint8_t) lead) < 0xF0 ? \
2 : (((uint8_t) lead) < 0xF5 ? \
3 : 0)))))
#define UTF8_NEXT(s, i, c) \
do { \
uint8_t __lead = (s)[(i)++]; \
(c) = UTF8_LEAD(__lead) ? __lead : 0xFFFD; \
if (__lead > 0x7F) { \
uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
uint8_t __size = __count + 1; \
uint8_t __trail = (s)[(i)]; \
if (__count == 2) { \
if ((__lead == 0xE0 && (0xA0 > __trail)) || \
(__lead == 0xED && (__trail > 0x9F)) \
) { \
(c) = 0xFFFD; \
__count = 0; \
} \
} else if (__count == 3) { \
if ((__lead == 0xF0 && (0x90 > __trail)) || \
(__lead == 0xF4 && (__trail > 0x8F)) \
) { \
(c) = 0xFFFD; \
__count = 0; \
} \
} \
while (__count > 0) { \
if (!UTF8_TRAIL(__trail)) { \
(c) = 0xFFFD; \
break; \
} \
__trail = (s)[++(i)]; \
--__count; \
} \
if ((c) == __lead) { \
UTF8_CP_UNSAFE((s), (i) - __size, (c)); \
} \
} \
} while (0)
void print_each_char(const char* str)
{
int32_t previous = 0;
int32_t current = 0;
int32_t cp;
while (str[current]) {
previous = current;
UTF8_NEXT(str, current, cp);
if (cp == 0xFFFD) {
printf("U+%X \xEF\xBF\xBD\n", cp);
} else {
printf("U+%X %.*s\n", cp, current - previous, str + previous);
}
}
}
Recommended Posts