Extract character by character from UTF-8 string considering invalid byte string

When implementing UTF-8 string functions in a library or programming language, you are required to continue processing even if you encounter an invalid byte string.

The minimum required function is a function that returns the number of bytes in the next character. Standard C functions include mblen and mbtowc, but these functions return a negative value when they encounter an invalid byte string, so either stop processing or the invalid byte string. You will have to truncate the part behind it.

The Unicode Standard describes how to calculate the incorrect byte string size.

Byte sequence range table

The following table appears in Chapter 3 of Unicode Standard 7.0 (Table 3-7. Well-Formed UTF-8 Byte Sequences).

Code point 1st byte 2nd byte 3rd byte 4th byte
  U+0000 -   U+007F   00 - 7F
  U+0080 -   U+07FF   C2 - DF    80 - BF
  U+0800 -   U+0FFF   E0         A0 - BF     80 - BF
  U+1000 -   U+CFFF   E1 - EC    80 - BF     80 - BF
  U+D000 -   U+D7FF   ED         80 - 9F     80 - BF
  U+E000 -   U+FFFF   EE - EF    80 - BF     80 - BF
 U+10000 -  U+3FFFF   F0         90 - BF     80 - BF    80 - BF
 U+40000 -  U+FFFFF   F1 - F3    80 - BF     80 - BF    80 - BF
U+100000 - U+10FFFF   F4         80 - 8F     80 - BF    80 - BF

RFC 3629 defines it as follows:

UTF8-octets = *( UTF8-char )
UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
UTF8-1      = %x00-7F
UTF8-2      = %xC2-DF UTF8-tail
UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / %xF4 %x80-8F 2( UTF8-tail )
UTF8-tail   = %x80-BF

test case

Use the test cases described in Chapter 3 of the Unicode Standard.

#include <stdio.h>
#include <stdint.h>

#define UTF8_LEAD(c) ((uint8_t)(c) < 0x80 || ((uint8_t)(c) > 0xC1 && (uint8_t)(c) < 0xF5))
#define UTF8_TRAIL(c) (((uint8_t)(c) & 0xC0) == 0x80)

uint8_t utf8_fwd(const char*);
uint8_t utf8_next(const char*, uint32_t*);
void print_each_char(const char*);

int main(void)
{
    // http://en.wikipedia.org/wiki/UTF-8#Examples
    // char* str = "\u0024\u00A2\u20AC\U00010348";
    char* str = "\x24\xC2\xA2\xE2\x82\xAC\xF0\x90\x8D\x88";

    // Unicode 7.0.0 D93b
    // http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf
    char* str2 = "\x41\xC0\xAF\x41\xF4\x80\x80\x41";
    char* str3 = "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";

    puts("expected: <0024 00A2 20AC 10348>");
    print_each_char(str);
    puts("expected: <41 FFFD FFFD FFFD 41>");
    print_each_char(str2);
    puts("expected: <0061 FFFD FFFD FFFD 0062 FFFD 0063 FFFD FFFD 0064>");
    print_each_char(str3);

    return 0;
}

C language

I referred to PCRE's pcre_valid_utf8.c (Thanks to mpyw for providing information). I used ʻuint8_t in stdint.h (C99) instead of ʻunsigned char.

The argument of print_each_char is set to const char for compatibility with C standard string functions (STR04-C. c-str04-c.html)).

You have the option of using bitwise operations, state transitions, etc. to shorten the code, but it can be difficult to read and slow down.

Function version

Only the number of bytes of a character

void print_each_char(const char* str)
{
    uint8_t size;
    while (*str) {
        size = utf8_fwd(str);
        printf("%.*s\n", size, str);
        str += size;   
    }
}

uint8_t utf8_fwd(const char* str)
{
    uint8_t lead = *str;
    uint8_t trail;
    uint8_t count; 
    uint8_t size;

    if (lead == 0) {
        return 0;
    } else if (lead < 0x80) {
        return 1;
    } else if (lead < 0xC2) {
        return 1;
    } else if (lead < 0xE0) {
        count = 2;
    } else if (lead < 0xF0) {
        count = 3;
    } else if (lead < 0xF5) {
        count = 4;
    } else {
        return 1;
    }

    trail = *(++str);

    if (count == 3) {
        if ((lead == 0xE0 && 0xA0 > trail) ||
            (lead == 0xED && trail > 0x9F)
        ) {
            return 1;
        }
    } else if (count == 4) {
        if ((lead == 0xF0 && 0x90 > trail) ||
            (lead == 0xF4 && trail > 0x8F)
        ) {
            return 1;
        }
    }

    for (size = 1; size < count; ++size) {

        if (!UTF8_TRAIL(trail)) {
            return size;
        }

        trail = *(++str);
    }
    return size;
}

Code point

void print_each_char(const char* str)
{
    uint8_t size;
    uint32_t cp;

    while (*str) {
        size = utf8_next(str, &cp);

        if (cp == 0xFFFD) {
            printf("U+FFFD %d \xEF\xBF\xBD\n", size);
        } else {
            printf("U+%X %.*s\n", cp, size, str);
        }

        str += size;   
    }
}

uint8_t utf8_next(const char* str, uint32_t* cp)
{
    uint8_t lead = *str;
    uint8_t trail;
    uint8_t count; 
    uint8_t size;

    *cp = 0xFFFD;

    if (lead == 0) {
        return 0;
    } else if (lead < 0x80) {
        *cp = lead;
        return 1;
    } else if (lead < 0xC2) {
        return 1;
    } else if (lead < 0xE0) {
        count = 2;
    } else if (lead < 0xF0) {
        count = 3;
    } else if (lead < 0xF5) {
        count = 4;
    } else {
        return 1;
    }

    trail = *(++str);

    if (count == 3) {
        if ((lead == 0xE0 && 0xA0 > trail) ||
            (lead == 0xED && trail > 0x9F)
        ) {
            return 1;
        }
    } else if (count == 4) {
        if ((lead == 0xF0 && 0x90 > trail) ||
            (lead == 0xF4 && trail > 0x8F)
        ) {
            return 1;
        }
    }

    for (size = 1; size < count; ++size) {

        if (!UTF8_TRAIL(trail)) {
            return size;
        }

        trail = *(++str);
    }

    if (size == 2) {
        *cp = ((lead & 0x1F) << 6) | (trail & 0x3F);
    } else if (size == 3) {
        *cp = ((lead & 0x1F) << 12)
            | (((uint8_t) *(str - 2) & 0x3F) <<  6)
            | ((uint8_t) *(str - 1) & 0x3F);
    } else {
        *cp = ((lead & 0x7) << 18)
            | (((uint8_t) *(str - 3) & 0x3F) << 12)
            | (((uint8_t) *(str - 2) & 0x3F) << 6)
            | ((uint8_t) *(str - 1) & 0x3F);
    }

    return size;
}

Macro version

Macros such as ʻU8_FWD_1 and ʻU8_NEXT_OR_FFFD (utf8.h) in ICU can be expected to improve speed. Please refer to this article for how to use these macros. When calculating the number of characters or substrings, no code point is required, so omitting the code for that amount can further improve the speed.

Only the number of bytes of a character

#define UTF8_TRAIL_BYTES(lead) \
  ((((uint8_t) lead) < 0xC2 ? \
  0 : (((uint8_t) lead) < 0xE0 ? \
  1 : (((uint8_t) lead) < 0xF0 ? \
  2 : (((uint8_t) lead) < 0xF5 ? \
  3 : 0)))))

#define UTF8_FWD(s, i) \
  do { \
    uint8_t __lead = (s)[(i)++]; \
    if (__lead > 0x7F) { \
      uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
      uint8_t __trail = (s)[(i)]; \
      if (__count == 2) { \
        if ((__lead == 0xE0 && (0xA0 > __trail)) || \
            (__lead == 0xED && (__trail > 0x9F)) \
        ) { \
          __count = 0; \
        } \
      } else if (__count == 3) { \
        if ((__lead == 0xF0 && (0x90 > __trail)) || \
            (__lead == 0xF4 && (__trail > 0x8F)) \
        ) { \
          __count = 0; \
        } \
      } \
      while (__count > 0 && UTF8_TRAIL(__trail)) { \
        __trail = (s)[++(i)]; \
        --__count; \
      } \
    } \
  } while (0)

void print_each_char(const char* str)
{
    int32_t previous = 0;
    int32_t current = 0;

    while (str[current]) {
        previous = current;
        UTF8_FWD(str, current);
        printf("%.*s\n", current - previous, str + previous);
    }
}

Code point

#define UTF8_CP_UNSAFE(s, i, c) \
  do { \
    (c) = (uint8_t) *((s)+(i)); \
    if ((c) > 0x7F) { \
      if ((c) < 0xE0) { \
        (c) = (((c) & 0x1F) << 6) | ((uint8_t) *((s)+(i)+1) & 0x3F); \
      } else if ((c) < 0xF0) { \
        (c) = (((c) & 0x1F) << 12) \
            | (((uint8_t) *((s)+(i)+1) & 0x3F) << 6) \
            | ((uint8_t) *((s)+(i)+2) & 0x3F); \
      } else { \
        (c) = (((c) & 0x7) << 18) \
            | (((uint8_t) *((s)+(i)+1) & 0x3F) << 12) \
            | (((uint8_t) *((s)+(i)+2) & 0x3F) << 6) \
            | ((uint8_t) *((s)+(i)+3) & 0x3F); \
      }\
    } \
  } while (0)

#define UTF8_TRAIL_BYTES(lead) \
  ((((uint8_t) lead) < 0xC2 ? \
  0 : (((uint8_t) lead) < 0xE0 ? \
  1 : (((uint8_t) lead) < 0xF0 ? \
  2 : (((uint8_t) lead) < 0xF5 ? \
  3 : 0)))))

#define UTF8_NEXT(s, i, c) \
  do { \
    uint8_t __lead = (s)[(i)++]; \
    (c) = UTF8_LEAD(__lead) ? __lead : 0xFFFD; \
    if (__lead > 0x7F) { \
      uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
      uint8_t __size = __count + 1; \
      uint8_t __trail = (s)[(i)]; \
      if (__count == 2) { \
        if ((__lead == 0xE0 && (0xA0 > __trail)) || \
            (__lead == 0xED && (__trail > 0x9F)) \
        ) { \
          (c) = 0xFFFD; \
          __count = 0; \
        } \
      } else if (__count == 3) { \
        if ((__lead == 0xF0 && (0x90 > __trail)) || \
            (__lead == 0xF4 && (__trail > 0x8F)) \
        ) { \
          (c) = 0xFFFD; \
          __count = 0; \
        } \
      } \
      while (__count > 0) { \
        if (!UTF8_TRAIL(__trail)) { \
          (c) = 0xFFFD; \
          break; \
        } \
        __trail = (s)[++(i)]; \
        --__count; \
      } \
      if ((c) == __lead) { \
        UTF8_CP_UNSAFE((s), (i) - __size, (c)); \
      } \
    } \
  } while (0)

void print_each_char(const char* str)
{
    int32_t previous = 0;
    int32_t current = 0;
    int32_t cp;

    while (str[current]) {
        previous = current;
        UTF8_NEXT(str, current, cp);
        if (cp == 0xFFFD) {
            printf("U+%X \xEF\xBF\xBD\n", cp);
        } else {
            printf("U+%X %.*s\n", cp, current - previous, str + previous);
        }

    }
}

Recommended Posts