Tuesday, June 8, 2021

Convert a JavaScript string to a UTF-8 byte array


The logic of encoding Unicode in UTF-8 is basically:


Each character is encoded in up to 4 bytes, always using the fewest bytes possible.

Characters up to U+007F are encoded with a single byte.

For multibyte sequences, the number of leading 1 bits in the first byte gives the number of bytes for the character. The rest of the bits of the first byte can be used to encode bits of the character.

The continuation bytes begin with 10, and the other 6 bits encode bits of the character.

Here's a function I wrote a while back for encoding a JavaScript UTF-16 string in UTF-8:



/**
 * Encodes a JavaScript (UTF-16) string as UTF-8.
 *
 * Well-formed surrogate pairs are combined into a single code point and
 * written as a 4-byte sequence. Unpaired surrogates (a high surrogate not
 * followed by a low surrogate, or a low surrogate on its own) cannot be
 * represented in UTF-8, so each is replaced with U+FFFD (bytes EF BF BD),
 * the same substitution TextEncoder performs.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 encoding as an array of byte values (0-255).
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);
        if (charcode < 0x80) {
            // U+0000..U+007F: one byte, identical to ASCII.
            utf8.push(charcode);
        } else if (charcode < 0x800) {
            // U+0080..U+07FF: 110xxxxx 10xxxxxx
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        } else if (charcode < 0xd800 || charcode >= 0xe000) {
            // U+0800..U+FFFF excluding surrogates: 1110xxxx 10xxxxxx 10xxxxxx
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        } else {
            // Surrogate code unit: only a high surrogate (0xD800-0xDBFF)
            // immediately followed by a low surrogate (0xDC00-0xDFFF) is valid.
            var next = i + 1 < str.length ? str.charCodeAt(i + 1) : 0;
            var isPair = charcode < 0xdc00 && next >= 0xdc00 && next < 0xe000;
            if (isPair) {
                // Consume the low surrogate as well.
                i++;
                // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
                // splitting the 20 bits of 0x0-0xFFFFF into two halves.
                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
                          | (next & 0x3ff));
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8.push(0xf0 | (charcode >> 18),
                          0x80 | ((charcode >> 12) & 0x3f),
                          0x80 | ((charcode >> 6) & 0x3f),
                          0x80 | (charcode & 0x3f));
            } else {
                // Unpaired surrogate: emit U+FFFD and do NOT consume the
                // following code unit, so it is still encoded on its own.
                utf8.push(0xef, 0xbf, 0xbd);
            }
        }
    }
    return utf8;
}


references:

https://stackoverflow.com/questions/18729405/how-to-convert-utf8-string-to-byte-array


No comments:

Post a Comment