UTF-8 (UTF8) utilities

Previous topic - Next topic

Kitty Hello

Here's how to deal with UTF-8 strings in GLBasic. These routines convert to and from ISO 8859-1 (Latin-1), so you can "PRINT" UTF-8 strings and write UTF-8 files from Latin-1 encoded text.
Code (glbasic) Select

// --------------------------------- //
// Project: UTF8- Routines
// --------------------------------- //

//! Convert an ANSI text with codepage Latin-1 (ISO 8859-1) to a UTF-8 string.
//! Prefix the result with UTF8_BOM$() at the start of a file when you write it to disk.
// \param latin2$ - the Latin-1 encoded input text (one byte per character)
// \return the UTF-8 encoded text: codes below 0x80 are copied, all others become two bytes
FUNCTION UTF8_fromLatin1$: latin2$
    LOCAL out$
    // Declared explicitly so the function also compiles with explicit
    // declarations enabled and does not rely on an implicit variable.
    LOCAL i%
    // Integer, because the value is only used for comparisons and bit operations.
    LOCAL cha%

    // https://stackoverflow.com/a/7904190/2721136
    FOR i% = 0 TO LEN(latin2$)-1
        // https://de.wikipedia.org/wiki/ISO_8859-1
        cha% = ASC(latin2$, i%)
        IF cha% < 0x80
            // 7 bit codes are identical in Latin-1 and UTF-8
            INC out$, CHR$(cha%)
        ELSE
            // all 11 bit codepoints (0x0 -- 0x7ff)
            // fit within a 2 byte utf8 char
            // firstbyte = 110 +xxxxx := 0xc0 + (char>>6) MSB
            // second    = 10 +xxxxxx := 0x80 + (char& 63) LSB
            INC out$, CHR$(bOR(0xc0, bAND(ASR(cha%, 6), 0x1f))) // 2+1+5 bits
            INC out$, CHR$(bOR(0x80, bAND(cha%, 0x3f))) // 1+1+6 bits
        ENDIF
    NEXT

    RETURN out$
ENDFUNCTION

INLINE
// Decode the UTF-8 bytes in str[0 .. slen) to Latin-1 (ISO 8859-1) in place
// and write a terminating '\0' directly behind the decoded text.
//
//   str  - buffer holding the UTF-8 text; must be writable and have room for
//          the terminator (the result is never longer than the input).
//          A null pointer is ignored.
//   slen - number of input bytes to decode; values <= 0 yield an empty string.
//
// Handling of input that has no Latin-1 equivalent:
//   * code points above U+00FF are written as '?'
//   * stray continuation bytes, invalid lead bytes and lead bytes whose
//     sequence is truncated or not followed by continuation bytes are dropped
// No byte at or beyond str + slen is ever read as input.
void Latin8FromUtf8(char* str, int slen) {
    if(!str) {
        return;
    }

    // Read through unsigned char so bytes >= 0x80 are never sign-extended.
    const unsigned char* src = reinterpret_cast<const unsigned char*>(str);
    const unsigned char* const end = src + (slen > 0 ? slen : 0);

    // The write position can never overtake the read position, because every
    // byte written consumes at least one input byte.
    char* dest = str;

    while(src < end) {
        const unsigned int lead = *src++;

        // U-0 to U-7F: plain ASCII, copied unchanged
        if(lead < 0x80u) {
            *dest++ = static_cast<char>(lead);
            continue;
        }

        int extra;          // number of continuation bytes the lead announces
        unsigned long cp;   // code point assembled so far
        if((lead & 0xE0u) == 0xC0u) {          // U-80 to U-7FF
            extra = 1;
            cp = lead & 0x1Fu;
        } else if((lead & 0xF0u) == 0xE0u) {   // U-800 to U-FFFF
            extra = 2;
            cp = lead & 0x0Fu;
        } else if((lead & 0xF8u) == 0xF0u) {   // U-10000 to U-10FFFF
            extra = 3;
            cp = lead & 0x07u;
        } else {
            // stray continuation byte or invalid lead byte: drop it
            continue;
        }

        // Truncated sequence: drop the lead byte. Whatever follows is looked
        // at again by the next loop iterations.
        if(end - src < extra) {
            continue;
        }

        bool valid = true;
        for(int k = 0; k < extra; ++k) {
            if((src[k] & 0xC0u) != 0x80u) {
                valid = false;
                break;
            }
            cp = (cp << 6) | (src[k] & 0x3Fu);
        }
        if(!valid) {
            // Lead byte without proper continuation bytes: drop only the lead
            // so a following regular character is not swallowed.
            continue;
        }
        src += extra;

        // Latin-1 covers exactly U+0000 .. U+00FF.
        if(cp <= 0xFFul) {
            *dest++ = static_cast<char>(static_cast<unsigned char>(cp));
        } else {
            *dest++ = '?';
        }
    }
    *dest = '\0';
}
ENDINLINE


//! Convert the utf8$ string to a Latin-1 (ISO 8859-1) string. Use this to 'PRINT' a UTF-8 string.
// \param utf8$ - the UTF-8 encoded text; only a copy of it is modified
// \return the converted copy
FUNCTION UTF8_toLatin1$: utf8$
// Work on a copy, because the conversion below overwrites its buffer in place.
LOCAL copy$ = utf8$
INLINE
// copy_Str is presumably the C++ name generated for the GLBasic variable copy$ - confirm.
// NOTE(review): the meaning of the argument 1 passed to getbuffer() is not visible
// here - confirm that the returned buffer is writable and holds LEN(copy_Str)
// bytes plus room for a terminating '\0'.
char* str = copy_Str.getbuffer(1);
// Decodes in place and writes a terminating '\0' behind the result; the result
// is never longer than the input.
Latin8FromUtf8(str, LEN(copy_Str));
// NOTE(review): presumably re-synchronizes the string object with the modified
// buffer (e.g. its length) - confirm against the string class.
copy_Str.releasebuffer();
ENDINLINE
RETURN copy$

ENDFUNCTION


//! Return the UTF-8 BOM (byte order mark), i.e. the three bytes 0xEF 0xBB 0xBF.
//! Write it as the first 3 bytes of a UTF-8 text file.
// \return a string holding the 3 BOM bytes
FUNCTION UTF8_BOM$:
    LOCAL bom$ = CHR$(0xef)
    bom$ = bom$ + CHR$(0xbb)
    bom$ = bom$ + CHR$(0xbf)
    RETURN bom$
ENDFUNCTION