Extracting characters one at a time from a UTF-8 string using ICU

If you need to work with grapheme clusters, you need to use BreakIterator. Please refer to this article.

Build options

run:
	clang main.c -licuuc -o main
	./main

C language

utf8.h

U8_FWD_1

#include <stdio.h>
#include <string.h>
#include <unicode/utf8.h>

void print_each_char(const char*, size_t);

/* Entry point: hands the sample string and its byte length to print_each_char. */
int main(void)
{
    const char* sample = "?Noe";

    print_each_char(sample, strlen(sample));
    return 0;
}

/*
 * Prints every UTF-8 encoded character of str on its own line.
 *
 * str  - UTF-8 bytes to walk.
 * size - number of bytes in str. It must fit in int32_t because the ICU
 *        U8_* macros operate on int32_t offsets.
 */
void print_each_char(const char* str, size_t size)
{
    /* Convert by value. Reading a size_t through an int32_t* violates strict
       aliasing and picks up the wrong half on big-endian 64-bit targets. */
    const int32_t length = (int32_t) size;
    int32_t previous;
    int32_t current = 0;

    while (current < length) {
        previous = current;
        /* Move current past exactly one code point. */
        U8_FWD_1(str, current, length);
        /* The difference is the byte length of the character just skipped. */
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}

U8_NEXT_OR_FFFD

Unlike U8_FWD_1, U8_NEXT_OR_FFFD also decodes the character, so you need to supply a variable to receive the code point.

/*
 * Prints every UTF-8 encoded character of str on its own line, decoding each
 * code point with U8_NEXT_OR_FFFD while advancing.
 *
 * str  - UTF-8 bytes to walk.
 * size - number of bytes in str. It must fit in int32_t because the ICU
 *        U8_* macros operate on int32_t offsets.
 */
void print_each_char(const char* str, size_t size)
{
    /* Convert by value. Reading a size_t through an int32_t* violates strict
       aliasing and picks up the wrong half on big-endian 64-bit targets. */
    const int32_t length = (int32_t) size;
    int32_t previous;
    int32_t current = 0;
    UChar32 cp;

    while (current < length) {
        previous = current;
        /* Decodes one code point into cp and moves current past it. */
        U8_NEXT_OR_FFFD(str, current, length, cp);
        /* Only the byte range is printed; the decoded value is not needed here. */
        (void) cp;
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}

UText

#include <stdio.h>
#include <unicode/utext.h>

void print_each_char(const char*);

/* Entry point: prints the sample string one character per line. */
int main(void)
{
    const char* sample = "?Noe";

    print_each_char(sample);
    return 0;
}

void print_each_char(const char* str)
{
    UText *ut;
    UErrorCode status = U_ZERO_ERROR;
    UChar32 cp;
    uint8_t size = 0;
    int64_t previous;

    ut = utext_openUTF8(NULL, str, -1, &status);

    for (cp = utext_next32From(ut, 0); cp > -1; cp = UTEXT_NEXT32(ut)) {
        previous = utext_getPreviousNativeIndex(ut);
        size = 	UTEXT_GETNATIVEINDEX(ut) - previous;
        printf("%.*s\n", size, str + previous);
    }
}

UString

#include <stdio.h>
#include <string.h>
#include <unicode/ustring.h>
#include <unicode/utf8.h>

void print_each_char(const char*);

/* Entry point: prints the sample string one character per line. */
int main(void)
{
    const char* sample = "?Noe";

    print_each_char(sample);
    return 0;
}


void print_each_char(const char* str)
{
    UChar dest[128];
    int32_t capacity = 128;
    int32_t length;
    UErrorCode status;
    int32_t dest_pos = 0;
    UChar32 cp;

    char buf[5];
    int32_t size = 5;
    int32_t pos = 0;
    UBool error;

    u_strFromUTF8(dest, capacity, &length, str, -1, &status);

    while (dest_pos < length) {
        U16_NEXT(dest, dest_pos, length, cp);
        U8_APPEND((uint8_t*) buf, pos, size, cp, error);
        printf("%.*s\n", size, buf);
        memset(buf, 0, size);
        pos = 0;
    }
}

UCharIterator

#include <stdio.h>
#include <string.h>
#include <unicode/uiter.h>
#include <unicode/utf8.h>

void print_each_char(const char*);

/* Entry point: prints the sample string one character per line. */
int main(void)
{
    const char* sample = "?Noe";

    print_each_char(sample);
    return 0;
}


void print_each_char(const char* str)
{
    UCharIterator iter;
    UChar32 cp = 0;

    char buf[5];
    int32_t size = 5;
    int32_t pos = 0;
    UBool error;

    uiter_setUTF8(&iter, str, -1);
    
    for (cp = uiter_next32(&iter); cp !=  U_SENTINEL; cp = uiter_next32(&iter)) {
        U8_APPEND((uint8_t*) buf, pos, size, cp, error);
        printf("%.*s\n", size, buf);
        memset(buf, 0, size);
        pos = 0;
    }
}

C++

StringCharacterIterator

If you include `ustream.h` and add `-licuio` to the build options, you can output a UnicodeString with std::cout.

#include <iostream>
#include <unicode/schriter.h>
#include <unicode/ustream.h>

void print_each_char(std::string);

// Entry point: prints the sample string one character per line.
int main(void)
{
    const std::string sample{"?Noe"};

    print_each_char(sample);
    return 0;
}

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    StringCharacterIterator it(ustr);
    UChar32 cp;

    for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }
}

If you use UnicodeString::toUTF8String instead:

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    StringCharacterIterator it(ustr);
    UChar32 cp;

    for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
        UnicodeString ubuf(cp);
        std::string buf;
        ubuf.toUTF8String(buf);
        std::cout << buf << '\n';
    }
}

UCharCharacterIterator

Add `-licuio` to your build options to use `ustream.h`.

#include <iostream>
#include <unicode/uchriter.h>
#include <unicode/ustream.h>

int main(void)
{
    UnicodeString ustr("?Noe");
    UCharCharacterIterator it(ustr.getTerminatedBuffer(), ustr.length());
    UChar32 cp;

    while (it.hasNext()) {
        cp = it.next32PostInc();
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }

    return 0;
}

UnicodeString

Note that UnicodeString indices and lengths are in `UChar` (UTF-16 code unit) units, not code points.

#include <iostream>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

void print_each_char(std::string);

// Entry point: prints the sample string one character per line.
int main(void)
{
    const std::string sample{"?Noe"};

    print_each_char(sample);
    return 0;
}

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    int32_t length = ustr.length();
    int32_t size = 0;
    UChar32 cp;

    for (int32_t i = 0; i < length; i += size) {
        cp = ustr.char32At(i);
        size = cp < 0x10000 ? 1 : 2;
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }
}

Recommended Posts