Minor, simplify utfEncoding, add option to force using 8to7 encoding

This commit is contained in:
Alexander Udalov
2016-01-11 18:38:19 +03:00
parent fe15984a59
commit 5585c7da7f
2 changed files with 12 additions and 14 deletions
@@ -21,8 +21,10 @@ import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
import static org.jetbrains.kotlin.serialization.jvm.UtfEncodingKt.MAX_UTF8_INFO_LENGTH;
public class BitEncoding {
private static boolean NEW = true;
private static final boolean FORCE_8TO7_ENCODING = "true".equals(System.getProperty("kotlin.jvm.serialization.use8to7"));
private BitEncoding() {
}
@@ -36,9 +38,9 @@ public class BitEncoding {
*/
@NotNull
public static String[] encodeBytes(@NotNull byte[] data) {
if (NEW) {
List<String> strings = UtfEncodingKt.bytesToStrings(data);
return strings.toArray(new String[strings.size()]);
// TODO: try both encodings here and choose the best one (with the smallest size)
if (!FORCE_8TO7_ENCODING) {
return UtfEncodingKt.bytesToStrings(data);
}
byte[] bytes = encode8to7(data);
// Since 0x0 byte is encoded as two bytes in the Modified UTF-8 (0xc0 0x80) and zero is rather common to byte arrays, we increment
@@ -111,9 +113,6 @@ public class BitEncoding {
}
}
// The maximum possible length of the byte array in the CONSTANT_Utf8_info structure in the bytecode, as per JVMS7 4.4.7
private static final int MAX_UTF8_INFO_LENGTH = 65535;
/**
* Converts a big byte array into the array of strings, where each string, when written to the constant pool table in bytecode, produces
* a byte array of not more than MAX_UTF8_INFO_LENGTH. Each byte, except those which are 0x0, occupies exactly one byte in the constant
@@ -163,7 +162,7 @@ public class BitEncoding {
*/
@NotNull
public static byte[] decodeBytes(@NotNull String[] data) {
if (NEW) {
if (!FORCE_8TO7_ENCODING) {
return UtfEncodingKt.stringsToBytes(data);
}
byte[] bytes = combineStringArrayIntoBytes(data);
@@ -19,18 +19,17 @@ package org.jetbrains.kotlin.serialization.jvm
import java.util.*
// The maximum possible length of the byte array in the CONSTANT_Utf8_info structure in the bytecode, as per JVMS7 4.4.7
private val MAX_UTF8_INFO_LENGTH = 65535
const val MAX_UTF8_INFO_LENGTH = 65535
// Leading bytes of two-byte sequences are prefixed with 110 in UTF-8
private val LEADING_BYTE_MASK = 0b11000000
// Continuation bytes are prefixed with 10 in UTF-8
private val CONTINUATION_BYTE_MASK = 0b10000000
private val TWO_HIGHER_BITS_MASK = 0b11000000
private val TWO_LOWER_BITS_MASK = 0b00000011
private val SIX_LOWER_BITS_MASK = 0b00111111
fun bytesToStrings(bytes: ByteArray): List<String> {
fun bytesToStrings(bytes: ByteArray): Array<String> {
val result = ArrayList<String>(1)
val buffer = StringBuilder()
var bytesInBuffer = 0
@@ -42,7 +41,7 @@ fun bytesToStrings(bytes: ByteArray): List<String> {
}
else {
val int = b.toInt() and 0xFF
val leadingByte = LEADING_BYTE_MASK or ((int and TWO_HIGHER_BITS_MASK) shr 6)
val leadingByte = LEADING_BYTE_MASK or (int shr 6)
val continuationByte = CONTINUATION_BYTE_MASK or (int and SIX_LOWER_BITS_MASK)
val encodedByte = (leadingByte shl 8) or continuationByte
@@ -68,7 +67,7 @@ fun bytesToStrings(bytes: ByteArray): List<String> {
result.add(buffer.toString())
}
return result
return result.toTypedArray()
}
fun stringsToBytes(strings: Array<String>): ByteArray {
@@ -95,4 +94,4 @@ fun stringsToBytes(strings: Array<String>): ByteArray {
}
return result
}
}