Chicken McNuggets said:
I think this is my big area of misunderstanding. How does one go about
converting between different encodings? I assume one has to use a
library to do so but which one?
Your C library might also have iconv(3), which can convert between a variety of
character set encodings. Converting between the UTF encodings and Unicode code
points is simple by design. Converting other character encodings to Unicode can
be much harder.
If my text is already ASCII then no changes to the encoding need to be
done due to the fact that ASCII is the same in the different Unicode
implementations but the problems come when working with characters in
different Unicode implementations such as UTF-8, UTF-16 and UTF-32.
If you handle 0x01 to 0x7F as ASCII and 0x80 to 0xFF as valid but unknown
characters, you should be able to handle UTF-8 with little or no modification.
All str* functions work on UTF-8, although calls like strchr won't be able to
find non-ASCII characters, because such a character occupies more than one byte.
However, strstr will be able to search for non-ASCII characters in UTF-8 strings.
How would one go about converting a UTF-8 string to a UTF-32 string for
instance in C?
http://en.wikipedia.org/wiki/UTF-8 describes the conversion between UTF-8 and
Unicode. It might sound frightening, but it's really easy and straightforward.
My functions to extract the first Unicode character from a UTF-8 C-string and
the remaining characters look something like
/*
 * Decode the first UTF-8 sequence of a NUL-terminated string.
 *
 * string: points at the first byte of the sequence to decode.
 *
 * Returns the decoded code point, or 0 when the string is empty or the
 * leading bytes do not form an acceptable sequence (stray continuation
 * byte, truncated sequence, or overlong encoding).
 *
 * The original 1- to 6-byte UTF-8 scheme (values up to 0x7FFFFFFF) is
 * accepted.  Surrogate values and values above 0x10FFFF are NOT rejected
 * here; callers that need strict Unicode must check the result themselves.
 */
unsigned firstunicode(const char *string) {
    /* Work on unsigned bytes so the result does not depend on whether
       plain char is signed. */
    const unsigned char *bytes = (const unsigned char *)string;
    unsigned lead = bytes[0];
    unsigned code;      /* payload bits gathered so far */
    unsigned minimum;   /* smallest value that really needs this length */
    int trailing;       /* number of continuation bytes expected */

    if (lead < 0x80) {
        return lead;                        /* plain ASCII, including NUL */
    } else if ((lead & 0xE0) == 0xC0) {
        trailing = 1; code = lead & 0x1F; minimum = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        trailing = 2; code = lead & 0x0F; minimum = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        trailing = 3; code = lead & 0x07; minimum = 0x10000;
    } else if ((lead & 0xFC) == 0xF8) {
        trailing = 4; code = lead & 0x03; minimum = 0x200000;
    } else if ((lead & 0xFE) == 0xFC) {
        trailing = 5; code = lead & 0x01; minimum = 0x4000000;
    } else {
        return 0;                           /* continuation byte, 0xFE or 0xFF */
    }

    /* Bytes are examined strictly in order and decoding stops at the first
       one that is not a continuation byte, so the terminating NUL of a
       truncated sequence is never read past. */
    for (int i = 1; i <= trailing; i++) {
        if ((bytes[i] & 0xC0) != 0x80) {
            return 0;
        }
        code = (code << 6) | (bytes[i] & 0x3Fu);
    }

    /* Reject overlong forms such as C1 81 for 'A': accepting them lets the
       same character hide behind several different byte sequences. */
    if (code < minimum) {
        return 0;
    }
    return code;
}
/*
 * Skip the first character of a UTF-8 C string.
 *
 * string: points at the first byte of a character (or at the terminator).
 *
 * Returns a pointer to the byte after the first byte and any continuation
 * bytes (10xxxxxx) that follow it.  When string already points at the
 * terminating NUL it is returned unchanged, so the function never steps
 * outside the string.
 */
char *restunicode(char *string) {
    if (*string == 0) {
        return string;          /* nothing to skip; stay on the terminator */
    }
    do {
        string++;
    } while ((*string & 0xC0) == 0x80);
    return string;
}
...
/* Decode the UTF-8 C string `string` into the code points u[0..l).
   Each character occupies at least one byte, so strlen(string) elements are
   always enough; the extra element keeps the array length non-zero when
   `string` is empty (a zero-length VLA is undefined behaviour). */
unsigned u[strlen(string) + 1];
int l = 0;
for (char *s = string; *s; s = restunicode(s))
    u[l++] = firstunicode(s);
...
and, to convert one Unicode code point to UTF-8,
/*
 * Append the UTF-8 encoding of one code point to a C string being built.
 *
 * code:        value to encode, 1 .. 0x7FFFFFFF (the original 1- to 6-byte
 *              UTF-8 scheme).  0 is refused because its encoding would be
 *              indistinguishable from the string terminator.
 * string:      position at which to write; normally the current terminator.
 * endplusone:  one past the last writable byte of the buffer.
 *
 * On success the encoded bytes and a new terminating NUL are written and a
 * pointer to that NUL is returned, ready to be passed to the next call.
 * Returns a null pointer, writing nothing, when code is 0 or too large,
 * when string is null, or when the encoding plus the NUL does not fit.
 */
char *addunicode(unsigned code, char *string, char *endplusone) {
    int trailing;       /* number of continuation bytes */
    unsigned lead;      /* length-marker bits of the first byte */

    if (string == 0 || code == 0) {
        return 0;
    } else if (code <= 0x7F) {
        trailing = 0; lead = 0x00;
    } else if (code <= 0x7FF) {
        trailing = 1; lead = 0xC0;
    } else if (code <= 0xFFFF) {
        trailing = 2; lead = 0xE0;
    } else if (code <= 0x1FFFFF) {
        trailing = 3; lead = 0xF0;
    } else if (code <= 0x3FFFFFF) {
        trailing = 4; lead = 0xF8;
    } else if (code <= 0x7FFFFFFF) {
        trailing = 5; lead = 0xFC;
    } else {
        return 0;
    }

    /* Need the lead byte, the continuation bytes and the NUL.  Comparing the
       remaining space avoids forming a pointer beyond the buffer. */
    if (endplusone - string < trailing + 2) {
        return 0;
    }

    /* The range checks above guarantee that the bits left after shifting
       fit beside the marker bits, so no extra mask is required here. */
    *string++ = (char)(lead | (code >> (6 * trailing)));
    for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
        *string++ = (char)(0x80 | ((code >> shift) & 0x3F));
    }
    *string = 0;
    return string;
}
...
unsigned u[strlen(string)]; int l = 0;
char t[5*l+1]; char *p = t;
for (int j=0; j<l; j++)
p = addunicode(u[j], p, t+5*l+1;
...