Simplify UTF-8 decoding: trivially convert each byte to a 16-bit Unicode char

ASM does the rest of the encoding (see ByteVector#encodeUTF8).
This commit is contained in:
Alexander Udalov
2016-01-19 03:34:08 +03:00
parent 9620893fa4
commit b216ed99b4
@@ -23,14 +23,6 @@ const val MAX_UTF8_INFO_LENGTH = 65535
// The character with code 0.
// NOTE(review): presumably prepended to strings to flag this encoding mode; confirm at the use sites.
const val UTF8_MODE_MARKER = 0.toChar()
// Bit pattern 110xxxxx: the prefix of the leading byte of a two-byte UTF-8 sequence.
// OR-ed with the two highest bits of an 8-bit value to form that leading byte.
private val LEADING_BYTE_MASK = 0b11000000
// Bit pattern 10xxxxxx: the prefix of a UTF-8 continuation byte.
// OR-ed with the six lowest bits of an 8-bit value to form the continuation byte.
private val CONTINUATION_BYTE_MASK = 0b10000000
// Selects the two lowest bits, i.e. the payload bits stored in the leading byte of a two-byte pair.
private val TWO_LOWER_BITS_MASK = 0b00000011
// Selects the six lowest bits, i.e. the payload bits stored in a continuation byte.
private val SIX_LOWER_BITS_MASK = 0b00111111
fun bytesToStrings(bytes: ByteArray): Array<String> {
val result = ArrayList<String>(1)
val buffer = StringBuilder()
@@ -41,30 +33,16 @@ fun bytesToStrings(bytes: ByteArray): Array<String> {
bytesInBuffer += 2
for (b in bytes) {
if (b >= 0) {
buffer.append(b.toChar())
val c = b.toInt() and 0xFF // 0 <= c <= 255
buffer.append(c.toChar())
if (0 < b && b <= 127) {
bytesInBuffer++
// Zeros occupy two bytes
if (b == 0.toByte()) bytesInBuffer++
}
else {
val int = b.toInt() and 0xFF
val leadingByte = LEADING_BYTE_MASK or (int shr 6)
val continuationByte = CONTINUATION_BYTE_MASK or (int and SIX_LOWER_BITS_MASK)
val encodedByte = (leadingByte shl 8) or continuationByte
buffer.append(encodedByte.toChar())
bytesInBuffer += 2
if (bytesInBuffer > MAX_UTF8_INFO_LENGTH) {
result.add(buffer.substring(0, buffer.length - 1))
buffer.setLength(0)
buffer.append(encodedByte.toChar())
bytesInBuffer = 2
}
}
if (bytesInBuffer == MAX_UTF8_INFO_LENGTH) {
if (bytesInBuffer >= MAX_UTF8_INFO_LENGTH - 1) {
result.add(buffer.toString())
buffer.setLength(0)
bytesInBuffer = 0
@@ -85,19 +63,7 @@ fun stringsToBytes(strings: Array<String>): ByteArray {
var i = 0
for (s in strings) {
for (si in 0..s.length - 1) {
val c = s[si]
val int = c.toInt()
if (int <= 127) {
result[i++] = c.toByte()
}
else {
val leadingByte = (int and 0xFFFF) shr 8
val continuationByte = int and 0xFF
val higherBits = (leadingByte and TWO_LOWER_BITS_MASK) shl 6
val lowerBits = continuationByte and SIX_LOWER_BITS_MASK
result[i++] = (higherBits or lowerBits).toByte()
}
result[i++] = s[si].toByte()
}
}