Convert between byte[] and String[] efficiently
This commit is contained in:
committed by
Pavel V. Talanov
parent
2af8dd4298
commit
32c5624531
+184
-30
@@ -24,7 +24,9 @@ import org.jetbrains.asm4.commons.Method;
|
||||
import org.jetbrains.jet.lang.resolve.name.FqName;
|
||||
import org.jetbrains.jet.lang.resolve.name.Name;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static org.jetbrains.asm4.Type.*;
|
||||
|
||||
@@ -298,57 +300,209 @@ public class JavaProtoBufUtil {
|
||||
* Converts a byte array of serialized data to an array of {@code String} satisfying JVM annotation value argument restrictions:
|
||||
* <ol>
|
||||
* <li>Each string's length should be no more than 65535</li>
|
||||
* <li>UTF-8 representation of each string cannot contain byte 0x0 or bytes in the range 0xf0..0xff</li>
|
||||
* <li>UTF-8 representation of each string cannot contain bytes in the range 0xf0..0xff</li>
|
||||
* </ol>
|
||||
*/
|
||||
@NotNull
|
||||
public static String[] encodeBytes(@NotNull byte[] data) {
|
||||
// Each byte of data is split into two 4-bit parts ('lo' and 'hi'), then lo + 1 and hi + 1 are appended to the string. Hence, each
|
||||
// byte of the string is in the range 0x01..0x10 and this guarantees there's no byte 0x0 and no bytes in the range 0xf0..0xff
|
||||
// TODO: use Scala's approach instead (break data into chunks of 7 bits)
|
||||
int m = 32766;
|
||||
assert 2 * m <= 65535 : m;
|
||||
byte[] bytes = encode8to7(data);
|
||||
// Since 0x0 byte is encoded as two bytes in the Modified UTF-8 (0xc0 0x80) and zero is rather common to byte arrays, we increment
|
||||
// every byte by one modulo max byte value, so that the less common value 0x7f will be represented as two bytes instead.
|
||||
addModuloByte(bytes, 1);
|
||||
return splitBytesToStringArray(bytes);
|
||||
}
|
||||
|
||||
int n = data.length;
|
||||
String[] result = new String[(n + m - 1) / m];
|
||||
for (int offset = 0, resultIndex = 0; offset < n; offset += m, resultIndex++) {
|
||||
int length = Math.min(n - offset, m);
|
||||
byte[] a = new byte[length * 2];
|
||||
for (int i = 0; i < length; i++) {
|
||||
int lo = data[offset + i] & 0x0f;
|
||||
int hi = (data[offset + i] & 0xf0) >>> 4;
|
||||
a[2 * i] = (byte) (lo + 1);
|
||||
a[2 * i + 1] = (byte) (hi + 1);
|
||||
/**
|
||||
* Converts a byte array to another byte array, every element of which is in the range 0x0..0x7f.
|
||||
*
|
||||
* The conversion is equivalent to the following: input bytes are combined into one long bit string. This big string is then split into
|
||||
* groups of 7 bits. Each resulting 7-bit chunk is then converted to a byte (with a leading bit = 0). The last chunk may have less than
|
||||
* 7 bits, it's prepended with zeros to form a byte. The result is then the array of these bytes, each of which is obviously in the
|
||||
* range 0x0..0x7f.
|
||||
*
|
||||
* Suppose the input of 4 bytes is given (bytes are listed from the beginning to the end, each byte from the least significant bit to
|
||||
* the most significant bit, bits within each byte are numbered):
|
||||
*
|
||||
* 01234567 01234567 01234567 01234567
|
||||
*
|
||||
* The output for this kind of input will be of the following form ('#' represents a zero bit):
|
||||
*
|
||||
* 0123456# 7012345# 6701234# 5670123# 4567####
|
||||
*/
|
||||
@NotNull
|
||||
private static byte[] encode8to7(@NotNull byte[] data) {
|
||||
// ceil(data.length * 8 / 7)
|
||||
int resultLength = (data.length * 8 + 6) / 7;
|
||||
byte[] result = new byte[resultLength];
|
||||
|
||||
// We maintain a pointer to the bit in the input, which is represented by two numbers: index of the current byte in the input and
|
||||
// the index of a bit inside this byte (0 is least significant, 7 is most significant)
|
||||
int byteIndex = 0;
|
||||
int bit = 0;
|
||||
|
||||
// Write all resulting bytes except the last one. To do this we need to collect exactly 7 bits, starting from the current, into a
|
||||
// byte. In almost all cases these 7 bits can be collected from two parts: the first is several (at least one) most significant bits
|
||||
// from the current byte, the second is several (maybe zero) least significant bits from the next byte. The special case is when the
|
||||
// current bit is the first (least significant) bit in its byte (bit == 0): then the 7 needed bits are just the 7 least significant
|
||||
// of the current byte.
|
||||
for (int i = 0; i < resultLength - 1; i++) {
|
||||
if (bit == 0) {
|
||||
result[i] = (byte) (data[byteIndex] & 0x7f);
|
||||
bit = 7;
|
||||
continue;
|
||||
}
|
||||
result[resultIndex] = new String(a);
|
||||
|
||||
int firstPart = (data[byteIndex] & 0xff) >>> bit;
|
||||
int newBit = (bit + 7) & 7;
|
||||
int secondPart = (data[++byteIndex] & ((1 << newBit) - 1)) << 8 - bit;
|
||||
result[i] = (byte) (firstPart + secondPart);
|
||||
bit = newBit;
|
||||
}
|
||||
|
||||
// Write the last byte, which is just several most significant bits of the last byte in the input, padded with zeros
|
||||
if (resultLength > 0) {
|
||||
assert bit != 0 : "The last chunk cannot start from the input byte since otherwise at least one bit will remain unprocessed";
|
||||
assert byteIndex == data.length - 1 : "The last 7-bit chunk should be encoded from the last input byte: " +
|
||||
byteIndex + " != " + (data.length - 1);
|
||||
result[resultLength - 1] = (byte) ((data[byteIndex] & 0xff) >>> bit);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private static void addModuloByte(@NotNull byte[] data, int increment) {
|
||||
for (int i = 0, n = data.length; i < n; i++) {
|
||||
data[i] = (byte) ((data[i] + increment) & 0x7f);
|
||||
}
|
||||
}
|
||||
|
||||
// The maximum possible length of the byte array in the CONSTANT_Utf8_info structure in the bytecode, as per JVMS7 4.4.7
|
||||
private static final int MAX_UTF8_INFO_LENGTH = 65535;
|
||||
|
||||
/**
|
||||
* Converts a big byte array into the array of strings, where each string, when written to the constant pool table in bytecode, produces
|
||||
* a byte array of not more than MAX_UTF8_INFO_LENGTH. Each byte, except those which are 0x0, occupies exactly one byte in the constant
|
||||
* pool table. Zero bytes occupy two bytes in the table each.
|
||||
*
|
||||
* When strings are constructed from the array of bytes here, they are encoded in the platform's default encoding. This is fine: the
|
||||
* conversion to the Modified UTF-8 (which here would be equivalent to replacing each 0x0 with 0xc0 0x80) will happen later by ASM, when
|
||||
* it writes these strings to the bytecode
|
||||
*/
|
||||
@NotNull
|
||||
private static String[] splitBytesToStringArray(@NotNull byte[] data) {
|
||||
List<String> result = new ArrayList<String>();
|
||||
|
||||
// The offset where the currently processed string starts
|
||||
int off = 0;
|
||||
|
||||
// The effective length the bytes of the current string would occupy in the constant pool table
|
||||
int len = 0;
|
||||
|
||||
for (int i = 0, n = data.length; i < n; i++) {
|
||||
// When the effective length reaches at least MAX - 1, we add the current string to the result. Note that the effective length
|
||||
// is at most MAX here: non-zero bytes occupy 1 byte and zero bytes occupy 2 bytes, so we couldn't jump over more than one byte
|
||||
if (len >= MAX_UTF8_INFO_LENGTH - 1) {
|
||||
assert len <= MAX_UTF8_INFO_LENGTH : "Produced strings cannot contain more than " + MAX_UTF8_INFO_LENGTH + " bytes: " + len;
|
||||
result.add(new String(data, off, i - off));
|
||||
off = i;
|
||||
len = 0;
|
||||
}
|
||||
|
||||
if (data[i] == 0) {
|
||||
len += 2;
|
||||
}
|
||||
else {
|
||||
len++;
|
||||
}
|
||||
}
|
||||
|
||||
if (len >= 0) {
|
||||
result.add(new String(data, off, data.length - off));
|
||||
}
|
||||
|
||||
return result.toArray(new String[result.size()]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts encoded array of {@code String} obtained by {@link JavaProtoBufUtil#encodeBytes(byte[])} back to a byte array.
|
||||
*/
|
||||
@NotNull
|
||||
public static byte[] decodeBytes(@NotNull String[] data) {
|
||||
int length = 0;
|
||||
for (String s : data) {
|
||||
assert s.length() % 2 == 0 : s.length();
|
||||
length += s.length() / 2;
|
||||
}
|
||||
byte[] result = new byte[length];
|
||||
byte[] bytes = combineStringArrayIntoBytes(data);
|
||||
// Adding 0x7f modulo max byte value is equivalent to subtracting 1 the same modulo, which is inverse to what happens in encodeBytes
|
||||
addModuloByte(bytes, 0x7f);
|
||||
return decode7to8(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines the array of strings resulted from encodeBytes() into one long byte array
|
||||
*/
|
||||
@NotNull
|
||||
private static byte[] combineStringArrayIntoBytes(@NotNull String[] data) {
|
||||
int resultLength = 0;
|
||||
for (String s : data) {
|
||||
assert s.length() <= MAX_UTF8_INFO_LENGTH : "Too long string: " + s.length();
|
||||
resultLength += s.length();
|
||||
}
|
||||
|
||||
byte[] result = new byte[resultLength];
|
||||
int p = 0;
|
||||
for (String s : data) {
|
||||
for (int i = 0, n = s.length(); i < n; i += 2) {
|
||||
int lo = s.charAt(i) - 1;
|
||||
int hi = s.charAt(i + 1) - 1;
|
||||
assert 0 <= lo && lo < 0xf0 : lo;
|
||||
assert 0 <= hi && hi < 0xf0 : hi;
|
||||
result[p++] = (byte) (lo + (hi << 4));
|
||||
for (int i = 0, n = s.length(); i < n; i++) {
|
||||
result[p++] = (byte) s.charAt(i);
|
||||
}
|
||||
}
|
||||
|
||||
assert p == length;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the byte array resulted from encode8to7().
|
||||
*
|
||||
* Each byte of the input array has at most 7 valuable bits of information. So the decoding is equivalent to the following: least
|
||||
* significant 7 bits of all input bytes are combined into one long bit string. This bit string is then split into groups of 8 bits,
|
||||
* each of which forms a byte in the output. If there are any leftovers, they are ignored, since they were added just as a padding and
|
||||
* do not comprise a full byte.
|
||||
*
|
||||
* Suppose the following encoded byte array is given (bits are numbered the same way as in encode8to7() doc):
|
||||
*
|
||||
* 01234567 01234567 01234567 01234567
|
||||
*
|
||||
* The output of the following form would be produced:
|
||||
*
|
||||
* 01234560 12345601 23456012
|
||||
*
|
||||
* Note how all most significant bits and leftovers are dropped, since they don't contain any useful information
|
||||
*/
|
||||
@NotNull
|
||||
private static byte[] decode7to8(@NotNull byte[] data) {
|
||||
// floor(7 * data.length / 8)
|
||||
int resultLength = 7 * data.length / 8;
|
||||
|
||||
byte[] result = new byte[resultLength];
|
||||
|
||||
// We maintain a pointer to an input bit in the same fashion as in encode8to7(): it's represented as two numbers: index of the
|
||||
// current byte in the input and index of the bit in the byte
|
||||
int byteIndex = 0;
|
||||
int bit = 0;
|
||||
|
||||
// A resulting byte is comprised of 8 bits, starting from the current bit. Since each input byte only "contains 7 bytes", a
|
||||
// resulting byte always consists of two parts: several most significant bits of the current byte and several least significant bits
|
||||
// of the next byte
|
||||
for (int i = 0; i < resultLength; i++) {
|
||||
int firstPart = (data[byteIndex] & 0xff) >>> bit;
|
||||
byteIndex++;
|
||||
int secondPart = (data[byteIndex] & ((1 << (bit + 1)) - 1)) << 7 - bit;
|
||||
result[i] = (byte) (firstPart + secondPart);
|
||||
|
||||
if (bit == 6) {
|
||||
byteIndex++;
|
||||
bit = 0;
|
||||
}
|
||||
else {
|
||||
bit++;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user