Here's how to deal with UTF-8 strings in GLBasic. These routines convert to and from ISO 8859-1 (Latin-1), so you can "PRINT" UTF-8 strings and write UTF-8 files from Latin-1 encoded text.
// --------------------------------- //
// Project: UTF8- Routines
// --------------------------------- //
//! Convert an ANSI text in codepage Latin-1 (ISO 8859-1) to a UTF-8 string.
//! Prefix the UTF8_BOM$() at the start of a file when you write it to disk.
//! latin2$ - the input text; every character code read with ASC() is encoded on its own.
//! Returns the encoded string: codes below 0x80 are copied as a single byte,
//! all other codes are written as a two-byte sequence.
FUNCTION UTF8_fromLatin1$: latin2$
LOCAL out$
// https://stackoverflow.com/a/7904190/2721136
// Walk the input one character position at a time.
FOR i% = 0 TO LEN(latin2$)-1
// https://de.wikipedia.org/wiki/ISO_8859-1
// NOTE(review): assumes ASC() yields 0..255 for bytes above 0x7f - confirm.
LOCAL cha = ASC(latin2$, i)
IF cha < 0x80
// 7-bit code: appended unchanged as one byte.
INC out$, CHR$(cha)
ELSE
// all 11 bit codepoints (0x0 -- 0x7ff)
// fit within a 2byte utf8 char
// firstbyte = 110 +xxxxx := 0xc0 + (char>>6) MSB
// second = 10 +xxxxxx := 0x80 + (char& 63) LSB
INC out$, CHR$(bOR(0xc0, bAND(ASR(cha,6), 0x1f))) // lead byte: marker 110 + upper 5 bits of the code
INC out$, CHR$(bOR(0x80, bAND(cha, 0x3f))) // continuation byte: marker 10 + lower 6 bits of the code
ENDIF
NEXT
RETURN out$
ENDFUNCTION
INLINE
// Converts the UTF-8 bytes in str[0 .. slen) to Latin-1 (ISO 8859-1) in place
// and terminates the result with '\0'.
//
//   str  - buffer holding the UTF-8 text; must be writable. The result is
//          never longer than the input, so at most str[slen] is written
//          (the terminator) - the buffer needs room for slen + 1 bytes.
//   slen - number of input bytes to decode.
//
// Handling of input that has no Latin-1 equivalent:
//   * code points above U+00FF are replaced by '?',
//   * a multi-byte sequence cut off by the end of the input is dropped,
//   * stray continuation bytes and invalid lead bytes (0xF8..0xFF) are dropped.
// Continuation bytes are not validated beyond masking their low 6 bits.
void Latin8FromUtf8(char* str, int slen) {
    if (!str) {
        return;
    }

    // Read through unsigned char so bytes >= 0x80 are never sign-extended.
    const unsigned char* src = reinterpret_cast<const unsigned char*>(str);
    const unsigned char* const end = src + slen;
    // The write cursor never overtakes the read cursor: every output byte
    // consumes at least one input byte.
    char* dest = str;

    while (src < end) {
        const unsigned int lead = *src++;

        // U+0000 .. U+007F: plain ASCII, copied unchanged.
        if (lead < 0x80u) {
            *dest++ = static_cast<char>(lead);
            continue;
        }

        int extra;            // number of continuation bytes that must follow
        unsigned long cp;     // code point being assembled
        if ((lead & 0xE0u) == 0xC0u) {          // U+0080 .. U+07FF
            extra = 1;
            cp = lead & 0x1Fu;
        } else if ((lead & 0xF0u) == 0xE0u) {   // U+0800 .. U+FFFF
            extra = 2;
            cp = lead & 0x0Fu;
        } else if ((lead & 0xF8u) == 0xF0u) {   // U+10000 .. U+10FFFF
            extra = 3;
            cp = lead & 0x07u;
        } else {
            // Stray continuation byte or invalid lead byte: skip it.
            continue;
        }

        // Never read beyond the caller-supplied length: a sequence that is
        // cut off by the end of the input is dropped.
        if (end - src < extra) {
            continue;
        }

        for (int k = 0; k < extra; ++k) {
            cp = (cp << 6) | (*src++ & 0x3Fu);
        }

        // Latin-1 only covers U+0000 .. U+00FF; anything else becomes '?'.
        *dest++ = (cp <= 0xFFul)
            ? static_cast<char>(static_cast<unsigned char>(cp))
            : '?';
    }
    *dest = '\0';
}
ENDINLINE
//! Convert the utf8$ string to a Latin-1 (ISO 8859-1) string. Use this to 'PRINT' a UTF-8 string.
//! utf8$ - the UTF-8 encoded input; it is copied first, so the argument itself is not modified here.
//! Returns the converted copy.
FUNCTION UTF8_toLatin1$: utf8$
// Work on a local copy; the conversion below rewrites the buffer in place.
LOCAL copy$ = utf8$
INLINE
// NOTE(review): copy_Str is presumably the C++ name generated for copy$, and the
// meaning of the argument to getbuffer() is not visible here - confirm against the GLBasic headers.
char* str = copy_Str.getbuffer(1);
// Decode the buffer in place, passing the string's length as the byte count.
Latin8FromUtf8(str, LEN(copy_Str));
// Hand the buffer back to the string object after the in-place edit.
copy_Str.releasebuffer();
ENDINLINE
RETURN copy$
ENDFUNCTION
//! Return the 3 byte UTF-8 BOM (byte order mark).
//! These are the first 3 bytes of a UTF-8 text file that carries a BOM.
//! Takes no arguments; returns the bytes 0xEF 0xBB 0xBF as a string.
FUNCTION UTF8_BOM$:
RETURN CHR$(0xef) + CHR$(0xbb) + CHR$(0xbf)
ENDFUNCTION