If the string is a null-terminated char string that contains no invalid byte sequences, and the locale can be fixed, you can extract it one character at a time with mblen, mbrlen, mbtowc, mbrtowc, or mbrtoc32. The return value of each of these functions is the number of bytes in the next character.
The difference between mblen and mbrlen is whether you manage the conversion state yourself. With mbrlen, pass a pointer to the variable that holds the conversion state as the third argument.
The difference between mbtowc and mblen is whether you need the code point. mbtowc stores the code point through the pointer passed as its first argument. That pointer refers to a wchar_t, so the result depends on the OS. I have confirmed that wchar_t is 32 bits wide on Linux and Mac OS X, whereas it is 16 bits wide on Windows.
The difference between mbtowc and mbrtowc is whether you manage the conversion state yourself. With mbrtowc, pass a pointer to the variable that holds the conversion state as the fourth argument.
mbrtoc32 is a function added in C11 that, unlike mbrtowc, uses char32_t instead of wchar_t. As of January 2015, it is available with GCC.
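mbrtoc16 was introduced in C11 at the same time, but it is less convenient: a character in the range U+10000 to U+10FFFF does not fit in a single char16_t, so the function reports it through a special return value (which looks negative when stored in a signed type) instead of a plain byte count.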
If you need to support multiple operating systems, consider using ICU instead. Please refer to this article.
Let's try the following string. The character "𠮷" (U+20BB7) is included to check whether characters outside the Basic Multilingual Plane (the supplementary range U+10000 to U+10FFFF) are handled.
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
void print_each_char(const char *str);

/*
 * Demo driver: prints two test strings one multibyte character at a time.
 *
 * The first string is U+20BB7 U+91CE U+5BB6 encoded in the execution
 * character set (UTF-8 with the usual compiler defaults); U+20BB7 lies
 * outside the Basic Multilingual Plane. The second string is the same text
 * with a stray 0x80 byte inserted, to exercise the invalid-sequence path.
 *
 * Returns 0 on success, EXIT_FAILURE if the UTF-8 locale cannot be selected.
 */
int main(void)
{
    const char *str = "\U00020BB7\u91CE\u5BB6";
    /* Adjacent literals keep the \x80 escape from absorbing later characters. */
    const char *str2 = "\U00020BB7\u91CE" "\x80" "\u5BB6";

    /*
     * Without a UTF-8 LC_CTYPE the multibyte functions would decode in the
     * "C" locale, so a failure here must not be ignored.
     */
    if (setlocale(LC_CTYPE, "ja_JP.UTF-8") == NULL) {
        fputs("Failed to select the ja_JP.UTF-8 locale.\n", stderr);
        return EXIT_FAILURE;
    }

    print_each_char(str);
    puts("");
    print_each_char(str2);
    return 0;
}
mblen
/*
 * Print every multibyte character of str on its own line, using mblen().
 *
 * str must be a null-terminated string encoded for the current LC_CTYPE
 * locale. Printing stops with a diagnostic line at the first byte sequence
 * that mblen() does not accept.
 */
void print_each_char(const char* str)
{
    /*
     * mblen() returns int and signals an invalid sequence with -1. Keep the
     * result in an int: a plain char may be unsigned, in which case -1 would
     * turn into a large positive length and the error check would never fire.
     */
    int size;

    /* A null string pointer resets mblen()'s internal conversion state. */
    mblen(NULL, 0);

    while (*str) {
        size = mblen(str, MB_CUR_MAX);
        if (size < 1) {
            puts("Processing was interrupted because an invalid byte string was encountered.");
            break;
        }
        /* Print exactly the bytes that make up this character. */
        printf("%.*s\n", size, str);
        str += size;
    }
}
mbrlen
You need to include wchar.h to use mbstate_t.
#include <wchar.h>
/*
 * Print every multibyte character of str on its own line, using mbrlen().
 *
 * str must be a null-terminated string encoded for the current LC_CTYPE
 * locale. Printing stops with a diagnostic line at the first sequence that
 * mbrlen() reports as invalid or incomplete.
 */
void print_each_char(const char* str)
{
    /*
     * The conversion state must start out zeroed (the initial state);
     * an uninitialized mbstate_t has an indeterminate value.
     */
    mbstate_t state = {0};

    while (*str) {
        /*
         * mbrlen() returns size_t: (size_t)-1 for an invalid sequence,
         * (size_t)-2 for an incomplete one, 0 for the null character.
         * Keep the full-width value so these cases cannot be confused
         * with a byte count.
         */
        size_t size = mbrlen(str, MB_CUR_MAX, &state);

        if (size == (size_t)-1 || size == (size_t)-2 || size == 0) {
            puts("Processing was interrupted because an invalid byte string was encountered.");
            break;
        }
        /* The precision argument of %.*s must be an int. */
        printf("%.*s\n", (int)size, str);
        str += size;
    }
}
mbtowc
#include <wchar.h>
/*
 * Print every multibyte character of str on its own line, preceded by the
 * wide-character value mbtowc() produced for it (as "U+<hex>").
 *
 * str must be a null-terminated string encoded for the current LC_CTYPE
 * locale. Printing stops with a diagnostic line at the first byte sequence
 * that mbtowc() does not accept.
 */
void print_each_char(const char* str)
{
    /*
     * mbtowc() returns int and signals an invalid sequence with -1. Keep the
     * result in an int: a plain char may be unsigned, in which case -1 would
     * turn into a large positive length and the error check would never fire.
     */
    int size;
    wchar_t cp;

    /* A null string pointer resets mbtowc()'s internal conversion state. */
    mbtowc(&cp, NULL, 0);

    while (*str) {
        size = mbtowc(&cp, str, MB_CUR_MAX);
        if (size < 1) {
            puts("Processing was interrupted because an invalid byte string was encountered.");
            break;
        }
        /*
         * wchar_t has an implementation-defined integer type, so convert it
         * explicitly to match the %lX conversion.
         */
        printf("U+%lX %.*s\n", (unsigned long)cp, size, str);
        str += size;
    }
}
mbrtowc
#include <wchar.h>
/*
 * Print every multibyte character of str on its own line, preceded by the
 * wide-character value mbrtowc() produced for it (as "U+<hex>").
 *
 * str must be a null-terminated string encoded for the current LC_CTYPE
 * locale. Printing stops with a diagnostic line at the first sequence that
 * mbrtowc() reports as invalid or incomplete.
 */
void print_each_char(const char* str)
{
    wchar_t cp;
    /*
     * Zero-initialize to get the initial conversion state. mbsinit() only
     * tests whether a state is initial; it does not initialize anything.
     */
    mbstate_t status = {0};

    while (*str) {
        /*
         * mbrtowc() returns size_t: (size_t)-1 for an invalid sequence,
         * (size_t)-2 for an incomplete one, 0 for the null character.
         * Keep the full-width value so these cases cannot be confused
         * with a byte count.
         */
        size_t size = mbrtowc(&cp, str, MB_CUR_MAX, &status);

        if (size == (size_t)-1 || size == (size_t)-2 || size == 0) {
            puts("Processing was interrupted because an invalid byte string was encountered.");
            break;
        }
        /*
         * wchar_t has an implementation-defined integer type and the
         * precision of %.*s must be an int, so convert both explicitly.
         */
        printf("U+%lX %.*s\n", (unsigned long)cp, (int)size, str);
        str += size;
    }
}
mbrtoc32
mbrtoc32, introduced in C11, is available with GCC.
#include <wchar.h>
#include <uchar.h>
/*
 * Print every multibyte character of str on its own line, preceded by the
 * char32_t value mbrtoc32() produced for it (as "U+<hex>").
 *
 * str must be a null-terminated string encoded for the current LC_CTYPE
 * locale. Printing stops with a diagnostic line at the first sequence that
 * mbrtoc32() does not convert into a plain byte count.
 */
void print_each_char(const char* str)
{
    char32_t cp;
    /*
     * Zero-initialize to get the initial conversion state. mbsinit() only
     * tests whether a state is initial; it does not initialize anything.
     */
    mbstate_t status = {0};

    while (*str) {
        /*
         * mbrtoc32() returns size_t. The special values are (size_t)-1
         * (invalid sequence), (size_t)-2 (incomplete sequence), (size_t)-3
         * (a character was stored without consuming input) and 0 (null
         * character). None of them is a usable byte count, so stop on all.
         */
        size_t size = mbrtoc32(&cp, str, MB_CUR_MAX, &status);

        if (size == (size_t)-1 || size == (size_t)-2 ||
            size == (size_t)-3 || size == 0) {
            puts("Processing was interrupted because an invalid byte string was encountered.");
            break;
        }
        /*
         * char32_t is not necessarily unsigned int and the precision of
         * %.*s must be an int, so convert both explicitly.
         */
        printf("U+%lX %.*s\n", (unsigned long)cp, (int)size, str);
        str += size;
    }
}
If you only need the number of characters, you can use mbstowcs. However, when using mbstowcs you must take into account that wchar_t differs from OS to OS.
mbstowcs
#include <stdio.h>
#include <string.h>
#include <locale.h>
#include <stdlib.h>
/*
 * Demo driver: prints the number of wide characters mbstowcs() produces
 * for a test string.
 *
 * The string is U+20BB7 U+91CE U+5BB6 encoded in the execution character
 * set (UTF-8 with the usual compiler defaults); U+20BB7 lies outside the
 * Basic Multilingual Plane.
 *
 * Returns 0 on success, EXIT_FAILURE if the locale cannot be selected or
 * the string cannot be converted.
 */
int main(void)
{
    const char *str = "\U00020BB7\u91CE\u5BB6";
    wchar_t wchar[1000];
    size_t count;

    /*
     * Without a UTF-8 LC_CTYPE the conversion would run in the "C" locale,
     * so a failure here must not be ignored.
     */
    if (setlocale(LC_CTYPE, "ja_JP.UTF-8") == NULL) {
        fputs("Failed to select the ja_JP.UTF-8 locale.\n", stderr);
        return EXIT_FAILURE;
    }

    /*
     * The third argument limits how many wide characters are written, so
     * it is the capacity of the destination, not the source byte length.
     */
    count = mbstowcs(wchar, str, sizeof wchar / sizeof wchar[0]);
    if (count == (size_t)-1) {
        /* (size_t)-1 means an invalid multibyte sequence was found. */
        fputs("The string contains an invalid byte sequence.\n", stderr);
        return EXIT_FAILURE;
    }

    printf("%zu\n", count);
    return 0;
}
Recommended Posts