Simplify UTF-8 decoding: just trivially convert each byte to a 16-bit Unicode char.
ASM will do the rest (see ByteVector#encodeUTF8).
This commit is contained in:
+5
-39
@@ -23,14 +23,6 @@ const val MAX_UTF8_INFO_LENGTH = 65535
|
||||
|
||||
const val UTF8_MODE_MARKER = 0.toChar()
|
||||
|
||||
// Leading bytes of two-byte sequences are prefixed with 110 in UTF-8
|
||||
private val LEADING_BYTE_MASK = 0b11000000
|
||||
// Continuation bytes are prefixed with 10 in UTF-8
|
||||
private val CONTINUATION_BYTE_MASK = 0b10000000
|
||||
|
||||
private val TWO_LOWER_BITS_MASK = 0b00000011
|
||||
private val SIX_LOWER_BITS_MASK = 0b00111111
|
||||
|
||||
fun bytesToStrings(bytes: ByteArray): Array<String> {
|
||||
val result = ArrayList<String>(1)
|
||||
val buffer = StringBuilder()
|
||||
@@ -41,30 +33,16 @@ fun bytesToStrings(bytes: ByteArray): Array<String> {
|
||||
bytesInBuffer += 2
|
||||
|
||||
for (b in bytes) {
|
||||
if (b >= 0) {
|
||||
buffer.append(b.toChar())
|
||||
val c = b.toInt() and 0xFF // 0 <= c <= 255
|
||||
buffer.append(c.toChar())
|
||||
if (0 < b && b <= 127) {
|
||||
bytesInBuffer++
|
||||
// Zeros occupy two bytes
|
||||
if (b == 0.toByte()) bytesInBuffer++
|
||||
}
|
||||
else {
|
||||
val int = b.toInt() and 0xFF
|
||||
val leadingByte = LEADING_BYTE_MASK or (int shr 6)
|
||||
val continuationByte = CONTINUATION_BYTE_MASK or (int and SIX_LOWER_BITS_MASK)
|
||||
val encodedByte = (leadingByte shl 8) or continuationByte
|
||||
|
||||
buffer.append(encodedByte.toChar())
|
||||
bytesInBuffer += 2
|
||||
|
||||
if (bytesInBuffer > MAX_UTF8_INFO_LENGTH) {
|
||||
result.add(buffer.substring(0, buffer.length - 1))
|
||||
buffer.setLength(0)
|
||||
buffer.append(encodedByte.toChar())
|
||||
bytesInBuffer = 2
|
||||
}
|
||||
}
|
||||
|
||||
if (bytesInBuffer == MAX_UTF8_INFO_LENGTH) {
|
||||
if (bytesInBuffer >= MAX_UTF8_INFO_LENGTH - 1) {
|
||||
result.add(buffer.toString())
|
||||
buffer.setLength(0)
|
||||
bytesInBuffer = 0
|
||||
@@ -85,19 +63,7 @@ fun stringsToBytes(strings: Array<String>): ByteArray {
|
||||
var i = 0
|
||||
for (s in strings) {
|
||||
for (si in 0..s.length - 1) {
|
||||
val c = s[si]
|
||||
|
||||
val int = c.toInt()
|
||||
if (int <= 127) {
|
||||
result[i++] = c.toByte()
|
||||
}
|
||||
else {
|
||||
val leadingByte = (int and 0xFFFF) shr 8
|
||||
val continuationByte = int and 0xFF
|
||||
val higherBits = (leadingByte and TWO_LOWER_BITS_MASK) shl 6
|
||||
val lowerBits = continuationByte and SIX_LOWER_BITS_MASK
|
||||
result[i++] = (higherBits or lowerBits).toByte()
|
||||
}
|
||||
result[i++] = s[si].toByte()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user