From c8a4fa58cdb9c003d03a1e7c1c6c3c57bfd12b5f Mon Sep 17 00:00:00 2001 From: Abduqodiri Qurbonzoda Date: Thu, 11 Apr 2019 03:12:29 +0300 Subject: [PATCH] Implement String to/from ByteArray conversion (KT-24810) --- libraries/stdlib/common/src/kotlin/TextH.kt | 46 +++ libraries/stdlib/common/test/testUtils.kt | 2 + libraries/stdlib/js/src/kotlin/text/string.kt | 46 +++ libraries/stdlib/js/src/kotlin/text/text.kt | 12 +- .../stdlib/js/src/kotlin/text/utf8Encoding.kt | 272 ++++++++++++++ libraries/stdlib/js/test/core/testUtils.kt | 4 + .../js/test/text/StringEncodingTestJs.kt | 11 + .../jvm/runtime/kotlin/text/TypeAliases.kt | 7 + .../stdlib/jvm/src/kotlin/text/StringsJVM.kt | 72 ++++ libraries/stdlib/jvm/test/testUtilsJVM.kt | 9 + .../jvm/test/text/StringEncodingTestJvm.kt | 11 + .../test/collections/UnsignedArraysTest.kt | 6 +- libraries/stdlib/test/testUtils.kt | 12 +- .../stdlib/test/text/StringEncodingTest.kt | 337 ++++++++++++++++++ .../kotlin-stdlib-runtime-merged.txt | 4 + 15 files changed, 844 insertions(+), 7 deletions(-) create mode 100644 libraries/stdlib/js/src/kotlin/text/utf8Encoding.kt create mode 100644 libraries/stdlib/js/test/text/StringEncodingTestJs.kt create mode 100644 libraries/stdlib/jvm/test/text/StringEncodingTestJvm.kt create mode 100644 libraries/stdlib/test/text/StringEncodingTest.kt diff --git a/libraries/stdlib/common/src/kotlin/TextH.kt b/libraries/stdlib/common/src/kotlin/TextH.kt index d787fe943a9..1ac9c1b4946 100644 --- a/libraries/stdlib/common/src/kotlin/TextH.kt +++ b/libraries/stdlib/common/src/kotlin/TextH.kt @@ -82,6 +82,13 @@ expect enum class RegexOption { MULTILINE } +/** + * The exception thrown when a character encoding or decoding error occurs. 
+ */ +@SinceKotlin("1.3") +@ExperimentalStdlibApi +public expect open class CharacterCodingException() : Exception + // From char.kt @@ -135,6 +142,45 @@ public expect fun CharArray.concatToString(startIndex: Int = 0, endIndex: Int = @ExperimentalStdlibApi public expect fun String.toCharArray(startIndex: Int = 0, endIndex: Int = this.length): CharArray +/** + * Decodes a string from the bytes in UTF-8 encoding in this array or its subrange. + * + * @param startIndex the beginning (inclusive) of the subrange to decode, 0 by default. + * @param endIndex the end (exclusive) of the subrange to decode, size of this array by default. + * @param throwOnInvalidSequence specifies whether to throw an exception on malformed byte sequence or replace it by the replacement char `\uFFFD`. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the size of this array. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. + * @throws CharacterCodingException if the byte array contains malformed UTF-8 byte sequence and [throwOnInvalidSequence] is true. + */ +@SinceKotlin("1.3") +@ExperimentalStdlibApi +public expect fun ByteArray.decodeToString( + startIndex: Int = 0, + endIndex: Int = this.size, + throwOnInvalidSequence: Boolean = false +): String + +/** + * Encodes this string or its substring to an array of bytes in UTF-8 encoding. + * + * @param startIndex the beginning (inclusive) of the substring to encode, 0 by default. + * @param endIndex the end (exclusive) of the substring to encode, length of this string by default. + * @param throwOnInvalidSequence specifies whether to throw an exception on malformed char sequence or replace it by the replacement byte sequence. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the length of this string. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. 
+ * @throws CharacterCodingException if this string contains malformed char sequence and [throwOnInvalidSequence] is true. + */ +@SinceKotlin("1.3") +@ExperimentalStdlibApi +public expect fun String.encodeToByteArray( + startIndex: Int = 0, + endIndex: Int = this.length, + throwOnInvalidSequence: Boolean = false +): ByteArray + + internal expect fun String.nativeIndexOf(str: String, fromIndex: Int): Int internal expect fun String.nativeLastIndexOf(str: String, fromIndex: Int): Int diff --git a/libraries/stdlib/common/test/testUtils.kt b/libraries/stdlib/common/test/testUtils.kt index d9bb8055a35..10d38a32136 100644 --- a/libraries/stdlib/common/test/testUtils.kt +++ b/libraries/stdlib/common/test/testUtils.kt @@ -8,3 +8,5 @@ package test public expect fun assertTypeEquals(expected: Any?, actual: Any?) internal expect fun String.removeLeadingPlusOnJava6(): String + +internal expect inline fun testOnNonJvm6And7(f: () -> Unit) diff --git a/libraries/stdlib/js/src/kotlin/text/string.kt b/libraries/stdlib/js/src/kotlin/text/string.kt index 42c12933951..8f6e7ed2b7d 100644 --- a/libraries/stdlib/js/src/kotlin/text/string.kt +++ b/libraries/stdlib/js/src/kotlin/text/string.kt @@ -74,6 +74,52 @@ public actual fun String.toCharArray(startIndex: Int = 0, endIndex: Int = this.l return CharArray(endIndex - startIndex) { get(startIndex + it) } } +/** + * Decodes a string from the bytes in UTF-8 encoding in this array or its subrange. + * + * @param startIndex the beginning (inclusive) of the subrange to decode, 0 by default. + * @param endIndex the end (exclusive) of the subrange to decode, size of this array by default. + * @param throwOnInvalidSequence specifies whether to throw an exception on malformed byte sequence or replace it by the replacement char `\uFFFD`. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the size of this array. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. 
+ * @throws CharacterCodingException if the byte array contains malformed UTF-8 byte sequence and [throwOnInvalidSequence] is true. + */ +@SinceKotlin("1.3") +@Suppress("ACTUAL_FUNCTION_WITH_DEFAULT_ARGUMENTS") +@ExperimentalStdlibApi +public actual fun ByteArray.decodeToString( + startIndex: Int = 0, + endIndex: Int = this.size, + throwOnInvalidSequence: Boolean = false +): String { + AbstractList.checkBoundsIndexes(startIndex, endIndex, this.size) + return decodeUtf8(this, startIndex, endIndex, throwOnInvalidSequence) +} + +/** + * Encodes this string or its substring to an array of bytes in UTF-8 encoding. + * + * @param startIndex the beginning (inclusive) of the substring to encode, 0 by default. + * @param endIndex the end (exclusive) of the substring to encode, length of this string by default. + * @param throwOnInvalidSequence specifies whether to throw an exception on malformed char sequence or replace it by the replacement byte sequence. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the length of this string. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. + * @throws CharacterCodingException if this string contains malformed char sequence and [throwOnInvalidSequence] is true. + */ +@SinceKotlin("1.3") +@Suppress("ACTUAL_FUNCTION_WITH_DEFAULT_ARGUMENTS") +@ExperimentalStdlibApi +public actual fun String.encodeToByteArray( + startIndex: Int = 0, + endIndex: Int = this.length, + throwOnInvalidSequence: Boolean = false +): ByteArray { + AbstractList.checkBoundsIndexes(startIndex, endIndex, length) + return encodeUtf8(this, startIndex, endIndex, throwOnInvalidSequence) +} + /** * Returns a copy of this string converted to upper case using the rules of the default locale. 
* diff --git a/libraries/stdlib/js/src/kotlin/text/text.kt b/libraries/stdlib/js/src/kotlin/text/text.kt index aad92732dd0..7031b210418 100644 --- a/libraries/stdlib/js/src/kotlin/text/text.kt +++ b/libraries/stdlib/js/src/kotlin/text/text.kt @@ -77,4 +77,14 @@ public actual class StringBuilder(content: String = "") : Appendable, CharSequen */ @SinceKotlin("1.3") @Suppress("EXTENSION_SHADOWED_BY_MEMBER", "NOTHING_TO_INLINE") -public actual inline fun StringBuilder.clear(): StringBuilder = this.clear() \ No newline at end of file +public actual inline fun StringBuilder.clear(): StringBuilder = this.clear() + + +/** + * The exception thrown when a character encoding or decoding error occurs. + */ +@SinceKotlin("1.3") +@ExperimentalStdlibApi +public actual open class CharacterCodingException(message: String?) : Exception(message) { + actual constructor() : this(null) +} diff --git a/libraries/stdlib/js/src/kotlin/text/utf8Encoding.kt b/libraries/stdlib/js/src/kotlin/text/utf8Encoding.kt new file mode 100644 index 00000000000..64badc8fcea --- /dev/null +++ b/libraries/stdlib/js/src/kotlin/text/utf8Encoding.kt @@ -0,0 +1,272 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors. + * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. + */ + +package kotlin.text + +/** Returns the negative [size] if [throwOnMalformed] is false, throws [CharacterCodingException] otherwise. */ +@UseExperimental(ExperimentalStdlibApi::class) +private fun malformed(size: Int, index: Int, throwOnMalformed: Boolean): Int { + if (throwOnMalformed) throw CharacterCodingException("Malformed sequence starting at ${index - 1}") + return -size +} + +/** + * Returns code point corresponding to UTF-16 surrogate pair, + * where the first of the pair is the [high] and the second is in the [string] at the [index]. + * Returns zero if the pair is malformed and [throwOnMalformed] is false. 
+ * + * @throws CharacterCodingException if the pair is malformed and [throwOnMalformed] is true. + */ +private fun codePointFromSurrogate(string: String, high: Int, index: Int, endIndex: Int, throwOnMalformed: Boolean): Int { + if (high !in 0xD800..0xDBFF || index >= endIndex) { + return malformed(0, index, throwOnMalformed) + } + val low = string[index].toInt() + if (low !in 0xDC00..0xDFFF) { + return malformed(0, index, throwOnMalformed) + } + return 0x10000 + ((high and 0x3FF) shl 10) or (low and 0x3FF) +} + +/** + * Returns code point corresponding to UTF-8 sequence of two bytes, + * where the first byte of the sequence is the [byte1] and the second byte is in the [bytes] array at the [index]. + * Returns zero if the sequence is malformed and [throwOnMalformed] is false. + * + * @throws CharacterCodingException if the sequence of two bytes is malformed and [throwOnMalformed] is true. + */ +private fun codePointFrom2(bytes: ByteArray, byte1: Int, index: Int, endIndex: Int, throwOnMalformed: Boolean): Int { + if (byte1 and 0x1E == 0 || index >= endIndex) { + return malformed(0, index, throwOnMalformed) + } + val byte2 = bytes[index].toInt() + if (byte2 and 0xC0 != 0x80) { + return malformed(0, index, throwOnMalformed) + } + return (byte1 shl 6) xor byte2 xor 0xF80 +} + +/** + * Returns code point corresponding to UTF-8 sequence of three bytes, + * where the first byte of the sequence is the [byte1] and the others are in the [bytes] array starting from the [index]. + * Returns a non-positive value indicating number of bytes from [bytes] included in malformed sequence + * if the sequence is malformed and [throwOnMalformed] is false. + * + * @throws CharacterCodingException if the sequence of three bytes is malformed and [throwOnMalformed] is true. 
+ */ +private fun codePointFrom3(bytes: ByteArray, byte1: Int, index: Int, endIndex: Int, throwOnMalformed: Boolean): Int { + if (index >= endIndex) { + return malformed(0, index, throwOnMalformed) + } + + val byte2 = bytes[index].toInt() + if (byte1 and 0xF == 0) { + if (byte2 and 0xE0 != 0xA0) { + // Non-shortest form + return malformed(0, index, throwOnMalformed) + } + } else if (byte1 and 0xF == 0xD) { + if (byte2 and 0xE0 != 0x80) { + // Surrogate code point + return malformed(0, index, throwOnMalformed) + } + } else if (byte2 and 0xC0 != 0x80) { + return malformed(0, index, throwOnMalformed) + } + + if (index + 1 == endIndex) { + return malformed(1, index, throwOnMalformed) + } + val byte3 = bytes[index + 1].toInt() + if (byte3 and 0xC0 != 0x80) { + return malformed(1, index, throwOnMalformed) + } + + return (byte1 shl 12) xor (byte2 shl 6) xor byte3 xor -0x1E080 +} + +/** + * Returns code point corresponding to UTF-8 sequence of four bytes, + * where the first byte of the sequence is the [byte1] and the others are in the [bytes] array starting from the [index]. + * Returns a non-positive value indicating number of bytes from [bytes] included in malformed sequence + * if the sequence is malformed and [throwOnMalformed] is false. + * + * @throws CharacterCodingException if the sequence of four bytes is malformed and [throwOnMalformed] is true. 
+ */ +private fun codePointFrom4(bytes: ByteArray, byte1: Int, index: Int, endIndex: Int, throwOnMalformed: Boolean): Int { + if (index >= endIndex) { + return malformed(0, index, throwOnMalformed) // Truncated sequence: bytes[index] is outside the range to decode + } + + val byte2 = bytes[index].toInt() + if (byte1 and 0xF == 0x0) { + if (byte2 and 0xF0 <= 0x80) { + // Non-shortest form + return malformed(0, index, throwOnMalformed) + } + } else if (byte1 and 0xF == 0x4) { + if (byte2 and 0xF0 != 0x80) { + // Out of Unicode code points domain (larger than U+10FFFF) + return malformed(0, index, throwOnMalformed) + } + } else if (byte1 and 0xF > 0x4) { + return malformed(0, index, throwOnMalformed) + } else if (byte2 and 0xC0 != 0x80) { + return malformed(0, index, throwOnMalformed) + } + + if (index + 1 == endIndex) { + return malformed(1, index, throwOnMalformed) + } + val byte3 = bytes[index + 1].toInt() + if (byte3 and 0xC0 != 0x80) { + return malformed(1, index, throwOnMalformed) + } + + if (index + 2 == endIndex) { + return malformed(2, index, throwOnMalformed) + } + val byte4 = bytes[index + 2].toInt() + if (byte4 and 0xC0 != 0x80) { + return malformed(2, index, throwOnMalformed) + } + return (byte1 shl 18) xor (byte2 shl 12) xor (byte3 shl 6) xor byte4 xor 0x381F80 +} + +/** + * Maximum number of bytes needed to encode a single char. + * + * Code points in `0..0x7F` are encoded in a single byte. + * Code points in `0x80..0x7FF` are encoded in two bytes. + * Code points in `0x800..0xD7FF` or in `0xE000..0xFFFF` are encoded in three bytes. + * Surrogate code points in `0xD800..0xDFFF` are not Unicode scalar values, therefore aren't encoded. + * Code points in `0x10000..0x10FFFF` are represented by a pair of surrogate `Char`s and are encoded in four bytes. + */ +private const val MAX_BYTES_PER_CHAR = 3 + +/** + * The byte sequence a malformed UTF-16 char sequence is replaced by. 
+ */ +private val REPLACEMENT_BYTE_SEQUENCE: ByteArray = byteArrayOf(0xEF.toByte(), 0xBF.toByte(), 0xBD.toByte()) + +/** + * Encodes the [string] using UTF-8 and returns the resulting [ByteArray]. + * + * @param string the string to encode. + * @param startIndex the start offset (inclusive) of the substring to encode. + * @param endIndex the end offset (exclusive) of the substring to encode. + * @param throwOnMalformed whether to throw on malformed char sequence or replace by the [REPLACEMENT_BYTE_SEQUENCE]. + * + * @throws CharacterCodingException if the char sequence is malformed and [throwOnMalformed] is true. + */ +internal fun encodeUtf8(string: String, startIndex: Int, endIndex: Int, throwOnMalformed: Boolean): ByteArray { + require(startIndex >= 0 && endIndex <= string.length && startIndex <= endIndex) + + val bytes = ByteArray((endIndex - startIndex) * MAX_BYTES_PER_CHAR) + var byteIndex = 0 + var charIndex = startIndex + + while (charIndex < endIndex) { + val code = string[charIndex++].toInt() + when { + code < 0x80 -> + bytes[byteIndex++] = code.toByte() + code < 0x800 -> { + bytes[byteIndex++] = ((code shr 6) or 0xC0).toByte() + bytes[byteIndex++] = ((code and 0x3F) or 0x80).toByte() + } + code < 0xD800 || code >= 0xE000 -> { + bytes[byteIndex++] = ((code shr 12) or 0xE0).toByte() + bytes[byteIndex++] = (((code shr 6) and 0x3F) or 0x80).toByte() + bytes[byteIndex++] = ((code and 0x3F) or 0x80).toByte() + } + else -> { // Surrogate char value + val codePoint = codePointFromSurrogate(string, code, charIndex, endIndex, throwOnMalformed) + if (codePoint <= 0) { + bytes[byteIndex++] = REPLACEMENT_BYTE_SEQUENCE[0] + bytes[byteIndex++] = REPLACEMENT_BYTE_SEQUENCE[1] + bytes[byteIndex++] = REPLACEMENT_BYTE_SEQUENCE[2] + } else { + bytes[byteIndex++] = ((codePoint shr 18) or 0xF0).toByte() + bytes[byteIndex++] = (((codePoint shr 12) and 0x3F) or 0x80).toByte() + bytes[byteIndex++] = (((codePoint shr 6) and 0x3F) or 0x80).toByte() + bytes[byteIndex++] = 
((codePoint and 0x3F) or 0x80).toByte() + charIndex++ + } + } + } + } + + return if (bytes.size == byteIndex) bytes else bytes.copyOf(byteIndex) +} + +/** + * The character a malformed UTF-8 byte sequence is replaced by. + */ +private const val REPLACEMENT_CHAR = '\uFFFD' + +/** + * Decodes the UTF-8 [bytes] array and returns the resulting [String]. + * + * @param bytes the byte array to decode. + * @param startIndex the start offset (inclusive) of the array to be decoded. + * @param endIndex the end offset (exclusive) of the array to be decoded. + * @param throwOnMalformed whether to throw on malformed byte sequence or replace by the [REPLACEMENT_CHAR]. + * + * @throws CharacterCodingException if the array contains a malformed UTF-8 byte sequence and [throwOnMalformed] is true. + */ +internal fun decodeUtf8(bytes: ByteArray, startIndex: Int, endIndex: Int, throwOnMalformed: Boolean): String { + require(startIndex >= 0 && endIndex <= bytes.size && startIndex <= endIndex) + + var byteIndex = startIndex + val stringBuilder = StringBuilder() + + while (byteIndex < endIndex) { + val byte = bytes[byteIndex++].toInt() + when { + byte >= 0 -> + stringBuilder.append(byte.toChar()) + byte shr 5 == -2 -> { + val code = codePointFrom2(bytes, byte, byteIndex, endIndex, throwOnMalformed) + if (code <= 0) { + stringBuilder.append(REPLACEMENT_CHAR) + byteIndex += -code + } else { + stringBuilder.append(code.toChar()) + byteIndex += 1 + } + } + byte shr 4 == -2 -> { + val code = codePointFrom3(bytes, byte, byteIndex, endIndex, throwOnMalformed) + if (code <= 0) { + stringBuilder.append(REPLACEMENT_CHAR) + byteIndex += -code + } else { + stringBuilder.append(code.toChar()) + byteIndex += 2 + } + } + byte shr 3 == -2 -> { + val code = codePointFrom4(bytes, byte, byteIndex, endIndex, throwOnMalformed) + if (code <= 0) { + stringBuilder.append(REPLACEMENT_CHAR) + byteIndex += -code + } else { + val high = (code - 0x10000) shr 10 or 0xD800 + val low = (code and 0x3FF) or 0xDC00 + 
stringBuilder.append(high.toChar()) + stringBuilder.append(low.toChar()) + byteIndex += 3 + } + } + else -> { + malformed(0, byteIndex, throwOnMalformed) + stringBuilder.append(REPLACEMENT_CHAR) + } + } + } + + return stringBuilder.toString() +} \ No newline at end of file diff --git a/libraries/stdlib/js/test/core/testUtils.kt b/libraries/stdlib/js/test/core/testUtils.kt index 245cac85b49..66b141e6f85 100644 --- a/libraries/stdlib/js/test/core/testUtils.kt +++ b/libraries/stdlib/js/test/core/testUtils.kt @@ -13,3 +13,7 @@ public actual fun assertTypeEquals(expected: Any?, actual: Any?) { @Suppress("NOTHING_TO_INLINE") internal actual inline fun String.removeLeadingPlusOnJava6(): String = this + +internal actual inline fun testOnNonJvm6And7(f: () -> Unit) { + f() +} diff --git a/libraries/stdlib/js/test/text/StringEncodingTestJs.kt b/libraries/stdlib/js/test/text/StringEncodingTestJs.kt new file mode 100644 index 00000000000..7b27caa0369 --- /dev/null +++ b/libraries/stdlib/js/test/text/StringEncodingTestJs.kt @@ -0,0 +1,11 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors. + * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. 
+ */ + +package test.text + + +internal actual val surrogateCodePointDecoding: String = "���" + +internal actual val surrogateCharEncoding: ByteArray = byteArrayOf(0xEF.toByte(), 0xBF.toByte(), 0xBD.toByte()) \ No newline at end of file diff --git a/libraries/stdlib/jvm/runtime/kotlin/text/TypeAliases.kt b/libraries/stdlib/jvm/runtime/kotlin/text/TypeAliases.kt index 465b536d7aa..f01ea47f4a5 100644 --- a/libraries/stdlib/jvm/runtime/kotlin/text/TypeAliases.kt +++ b/libraries/stdlib/jvm/runtime/kotlin/text/TypeAliases.kt @@ -11,3 +11,10 @@ package kotlin.text @Suppress("ACTUAL_WITHOUT_EXPECT") // TODO: some supertypes are missing @SinceKotlin("1.1") public actual typealias StringBuilder = java.lang.StringBuilder + +/** + * The exception thrown when a character encoding or decoding error occurs. + */ +@SinceKotlin("1.3") +@ExperimentalStdlibApi +public actual typealias CharacterCodingException = java.nio.charset.CharacterCodingException \ No newline at end of file diff --git a/libraries/stdlib/jvm/src/kotlin/text/StringsJVM.kt b/libraries/stdlib/jvm/src/kotlin/text/StringsJVM.kt index 41d66215588..40a517fc606 100644 --- a/libraries/stdlib/jvm/src/kotlin/text/StringsJVM.kt +++ b/libraries/stdlib/jvm/src/kotlin/text/StringsJVM.kt @@ -9,7 +9,10 @@ package kotlin.text +import java.nio.ByteBuffer +import java.nio.CharBuffer import java.nio.charset.Charset +import java.nio.charset.CodingErrorAction import java.util.Locale import java.util.regex.Pattern @@ -142,6 +145,75 @@ public actual fun String.toCharArray(startIndex: Int = 0, endIndex: Int = this.l return toCharArray(CharArray(endIndex - startIndex), 0, startIndex, endIndex) } +/** + * Decodes a string from the bytes in UTF-8 encoding in this array or its subrange. + * + * @param startIndex the beginning (inclusive) of the subrange to decode, 0 by default. + * @param endIndex the end (exclusive) of the subrange to decode, size of this array by default. 
+ * @param throwOnInvalidSequence specifies whether to throw an exception on malformed byte sequence or replace it by the replacement char `\uFFFD`. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the size of this array. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. + * @throws CharacterCodingException if the byte array contains malformed UTF-8 byte sequence and [throwOnInvalidSequence] is true. + */ +@SinceKotlin("1.3") +@Suppress("ACTUAL_FUNCTION_WITH_DEFAULT_ARGUMENTS") +@ExperimentalStdlibApi +public actual fun ByteArray.decodeToString( + startIndex: Int = 0, + endIndex: Int = this.size, + throwOnInvalidSequence: Boolean = false +): String { + AbstractList.checkBoundsIndexes(startIndex, endIndex, this.size) + + if (!throwOnInvalidSequence) { + return String(this, startIndex, endIndex - startIndex) + } + + val decoder = Charsets.UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + + return decoder.decode(ByteBuffer.wrap(this, startIndex, endIndex - startIndex)).toString() +} + +/** + * Encodes this string or its substring to an array of bytes in UTF-8 encoding. + * + * @param startIndex the beginning (inclusive) of the substring to encode, 0 by default. + * @param endIndex the end (exclusive) of the substring to encode, length of this string by default. + * @param throwOnInvalidSequence specifies whether to throw an exception on malformed char sequence or replace it by the replacement byte sequence. + * + * @throws IndexOutOfBoundsException if [startIndex] is less than zero or [endIndex] is greater than the length of this string. + * @throws IllegalArgumentException if [startIndex] is greater than [endIndex]. + * @throws CharacterCodingException if this string contains malformed char sequence and [throwOnInvalidSequence] is true. 
+ */ +@SinceKotlin("1.3") +@Suppress("ACTUAL_FUNCTION_WITH_DEFAULT_ARGUMENTS") +@ExperimentalStdlibApi +public actual fun String.encodeToByteArray( + startIndex: Int = 0, + endIndex: Int = this.length, + throwOnInvalidSequence: Boolean = false +): ByteArray { + AbstractList.checkBoundsIndexes(startIndex, endIndex, length) + + if (!throwOnInvalidSequence) { + return this.substring(startIndex, endIndex).toByteArray(Charsets.UTF_8) + } + + val encoder = Charsets.UTF_8.newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT) + + val byteBuffer = encoder.encode(CharBuffer.wrap(this, startIndex, endIndex)) + return if (byteBuffer.hasArray() && byteBuffer.arrayOffset() == 0 && byteBuffer.remaining() == byteBuffer.array()!!.size) { + byteBuffer.array() + } else { + ByteArray(byteBuffer.remaining()).also { byteBuffer.get(it) } + } +} + /** * Returns a new character array containing the characters from this string. */ diff --git a/libraries/stdlib/jvm/test/testUtilsJVM.kt b/libraries/stdlib/jvm/test/testUtilsJVM.kt index b3cf0ec7dc1..2246955adbf 100644 --- a/libraries/stdlib/jvm/test/testUtilsJVM.kt +++ b/libraries/stdlib/jvm/test/testUtilsJVM.kt @@ -16,3 +16,12 @@ private val isJava6 = System.getProperty("java.version").startsWith("1.6.") internal actual fun String.removeLeadingPlusOnJava6(): String = if (isJava6) removePrefix("+") else this +private val isJava7 = System.getProperty("java.version").startsWith("1.7.") + +private val isJava8AndAbove = !isJava6 && !isJava7 + +internal actual inline fun testOnNonJvm6And7(f: () -> Unit) { + if (isJava8AndAbove) { + f() + } +} \ No newline at end of file diff --git a/libraries/stdlib/jvm/test/text/StringEncodingTestJvm.kt b/libraries/stdlib/jvm/test/text/StringEncodingTestJvm.kt new file mode 100644 index 00000000000..ebad75d68c7 --- /dev/null +++ b/libraries/stdlib/jvm/test/text/StringEncodingTestJvm.kt @@ -0,0 +1,11 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. 
and Kotlin Programming Language contributors. + * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. + */ + +package test.text + + +internal actual val surrogateCodePointDecoding: String = "�" + +internal actual val surrogateCharEncoding: ByteArray = byteArrayOf(0x3F) \ No newline at end of file diff --git a/libraries/stdlib/test/collections/UnsignedArraysTest.kt b/libraries/stdlib/test/collections/UnsignedArraysTest.kt index d3dedc16736..ed66a3a0119 100644 --- a/libraries/stdlib/test/collections/UnsignedArraysTest.kt +++ b/libraries/stdlib/test/collections/UnsignedArraysTest.kt @@ -6,16 +6,12 @@ @file:Suppress("SIGNED_CONSTANT_CONVERTED_TO_UNSIGNED") package test.collections +import test.assertArrayContentEquals import test.collections.behaviors.collectionBehavior import test.collections.behaviors.listBehavior import test.collections.behaviors.iteratorBehavior import kotlin.test.* -fun assertArrayContentEquals(expected: UIntArray, actual: UIntArray, message: String = "") { assertTrue(expected contentEquals actual, message) } -fun assertArrayContentEquals(expected: ULongArray, actual: ULongArray, message: String = "") { assertTrue(expected contentEquals actual, message) } -fun assertArrayContentEquals(expected: UShortArray, actual: UShortArray, message: String = "") { assertTrue(expected contentEquals actual, message) } -fun assertArrayContentEquals(expected: UByteArray, actual: UByteArray, message: String = "") { assertTrue(expected contentEquals actual, message) } - class UnsignedArraysTest { diff --git a/libraries/stdlib/test/testUtils.kt b/libraries/stdlib/test/testUtils.kt index c851c7e1125..6c08207aab5 100644 --- a/libraries/stdlib/test/testUtils.kt +++ b/libraries/stdlib/test/testUtils.kt @@ -5,6 +5,7 @@ package test +import kotlin.test.assertTrue import kotlin.test.fail // just a static type check @@ -17,4 +18,13 @@ inline fun assertStaticAndRuntimeTypeIs(value: @kotlin.internal.NoIn if ((value 
as Any?) !is T) { fail("Expected value $value to have ${T::class} type") } -} \ No newline at end of file +} + + +fun assertArrayContentEquals(expected: ByteArray, actual: ByteArray, message: String? = null) = assertTrue(expected contentEquals actual, message) +fun assertArrayContentEquals(expected: CharArray, actual: CharArray, message: String? = null) = assertTrue(expected contentEquals actual, message) + +fun assertArrayContentEquals(expected: UIntArray, actual: UIntArray, message: String? = null) = assertTrue(expected contentEquals actual, message) +fun assertArrayContentEquals(expected: ULongArray, actual: ULongArray, message: String? = null) = assertTrue(expected contentEquals actual, message) +fun assertArrayContentEquals(expected: UShortArray, actual: UShortArray, message: String? = null) = assertTrue(expected contentEquals actual, message) +fun assertArrayContentEquals(expected: UByteArray, actual: UByteArray, message: String? = null) = assertTrue(expected contentEquals actual, message) \ No newline at end of file diff --git a/libraries/stdlib/test/text/StringEncodingTest.kt b/libraries/stdlib/test/text/StringEncodingTest.kt new file mode 100644 index 00000000000..1fdb44ca595 --- /dev/null +++ b/libraries/stdlib/test/text/StringEncodingTest.kt @@ -0,0 +1,337 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors. + * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. + */ + +package test.text + +import test.assertArrayContentEquals +import test.testOnNonJvm6And7 +import kotlin.test.* + +// When decoding utf-8, JVM and JS implementations replace the sequence reflecting a surrogate code point differently. +// JS replaces each byte of the sequence by the replacement char, whereas JVM replaces the whole sequence with a single replacement char. +// See corresponding actual to find out the replacement. 
+internal expect val surrogateCodePointDecoding: String + +// The byte sequence used to replace a surrogate char. +// JVM default replacement sequence consist of single 0x3F byte. +// JS and Native replacement byte sequence is [0xEF, 0xBF, 0xBD]. +internal expect val surrogateCharEncoding: ByteArray + +class StringEncodingTest { + private fun bytes(vararg elements: Int) = ByteArray(elements.size) { elements[it].toByte() } + + private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String) { + assertArrayContentEquals(expected, string.encodeToByteArray()) + if (!isWellFormed) { + assertFailsWith { string.encodeToByteArray(throwOnInvalidSequence = true) } + } else { + assertArrayContentEquals(expected, string.encodeToByteArray(throwOnInvalidSequence = true)) + assertEquals(string, string.encodeToByteArray(throwOnInvalidSequence = true).decodeToString()) + } + } + + private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String, startIndex: Int, endIndex: Int) { + assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex)) + if (!isWellFormed) { + assertFailsWith { string.encodeToByteArray(startIndex, endIndex, true) } + } else { + assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex, true)) + assertEquals( + string.substring(startIndex, endIndex), + string.encodeToByteArray(startIndex, endIndex, true).decodeToString() + ) + } + } + + @Test + fun encodeToByteArray() { + // empty string + testEncoding(true, bytes(), "") + + // 1-byte chars + testEncoding(true, bytes(0), "\u0000") + testEncoding(true, bytes(0x2D), "-") + testEncoding(true, bytes(0x7F), "\u007F") + + // 2-byte chars + testEncoding(true, bytes(0xC2, 0x80), "\u0080") + testEncoding(true, bytes(0xC2, 0xBF), "¿") + testEncoding(true, bytes(0xDF, 0xBF), "\u07FF") + + // 3-byte chars + testEncoding(true, bytes(0xE0, 0xA0, 0x80), "\u0800") + testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤") + testEncoding(true, 
bytes(0xED, 0x9F, 0xBF), "\uD7FF") + + // surrogate chars + testEncoding(false, surrogateCharEncoding, "\uD800") + testEncoding(false, surrogateCharEncoding, "\uDB6A") + testEncoding(false, surrogateCharEncoding, "\uDFFF") + + // 3-byte chars + testEncoding(true, bytes(0xEE, 0x80, 0x80), "\uE000") + testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C") + testEncoding(true, bytes(0xEF, 0xBF, 0xBF), "\uFFFF") + + // 4-byte surrogate pairs + testEncoding(true, bytes(0xF0, 0x90, 0x80, 0x80), "\uD800\uDC00") + testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC") + testEncoding(true, bytes(0xF4, 0x8F, 0xBF, 0xBF), "\uDBFF\uDFFF") + + // reversed surrogate pairs + testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, "\uDC00\uD800") + testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, "\uDDFC\uDA49") + testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, "\uDFFF\uDBFF") + + testEncoding( + false, + bytes( + 0, /**/ 0x2D, /**/ 0x7F, /**/ 0xC2, 0x80, /**/ 0xC2, 0xBF, /**/ 0xDF, 0xBF, /**/ 0xE0, 0xA0, 0x80, /**/ + 0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A + ) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding, + "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz\uDFFF\uD800z\uDB6Az\uDB6A" + ) + + testEncoding( + true, + bytes( + 0xEE, 0x80, 0x80, /**/ 0xEF, 0x98, 0xBC, /**/ 0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ + 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC, /**/ 0xF4, 0x8F, 0xBF, 0xBF + ), + "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF" + ) + + val longChars = CharArray(200_000) { 'k' } + val longBytes = longChars.concatToString().encodeToByteArray() + assertEquals(200_000, longBytes.size) + assertTrue { longBytes.all { it == 0x6B.toByte() } } + } + + @Test + fun encodeToByteArraySlice() { + assertFailsWith { "".encodeToByteArray(startIndex = 1) } + assertFailsWith { "123".encodeToByteArray(startIndex = 10) } + 
assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = -1) } + assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(endIndex = 10) } + assertFailsWith<IllegalArgumentException> { "123".encodeToByteArray(endIndex = -1) } + assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = 5, endIndex = 10) } + assertFailsWith<IllegalArgumentException> { "123".encodeToByteArray(startIndex = 5, endIndex = 2) } + assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = 1, endIndex = 4) } + + testEncoding(true, bytes(), "abc", 0, 0) + testEncoding(true, bytes(), "abc", 3, 3) + testEncoding(true, bytes(0x62, 0x63), "abc", 1, 3) + testEncoding(true, bytes(0x61, 0x62), "abc", 0, 2) + testEncoding(true, bytes(0x62), "abc", 1, 2) + + testEncoding(true, bytes(0x2D), "-", 0, 1) + testEncoding(true, bytes(0xC2, 0xBF), "¿", 0, 1) + testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤", 0, 1) + + testEncoding(false, surrogateCharEncoding, "\uDB6A", 0, 1) + + testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C", 0, 1) + + testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC", 0, 2) + testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 0, 1) + testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 1, 2) + + testEncoding( + false, + bytes(0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding, + "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz\uDFFF\uD800z\uDB6Az\uDB6A", + startIndex = 7, + endIndex = 12 + ) + + testEncoding( + false, + bytes(0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC) /**/ + surrogateCharEncoding, + "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF", + startIndex = 2, + endIndex = 9 + ) + + val longChars = CharArray(200_000) { 'k' } + val longBytes = longChars.concatToString().encodeToByteArray(startIndex = 5000, endIndex = 195_000) + assertEquals(190_000, longBytes.size) + assertTrue { longBytes.all { it == 0x6B.toByte() } } + } + + private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray) { +
assertEquals(expected, bytes.decodeToString()) + if (!isWellFormed) { + assertFailsWith<CharacterCodingException> { bytes.decodeToString(throwOnInvalidSequence = true) } + } else { + assertEquals(expected, bytes.decodeToString(throwOnInvalidSequence = true)) + assertArrayContentEquals(bytes, bytes.decodeToString(throwOnInvalidSequence = true).encodeToByteArray()) + } + } + + private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray, startIndex: Int, endIndex: Int) { + assertEquals(expected, bytes.decodeToString(startIndex, endIndex)) + if (!isWellFormed) { + assertFailsWith<CharacterCodingException> { bytes.decodeToString(startIndex, endIndex, true) } + } else { + assertEquals(expected, bytes.decodeToString(startIndex, endIndex, true)) + assertArrayContentEquals( + bytes.sliceArray(startIndex until endIndex), + bytes.decodeToString(startIndex, endIndex, true).encodeToByteArray() + ) + } + } + + private fun truncatedSurrogateDecoding() = + surrogateCodePointDecoding.let { if (it.length > 1) it.dropLast(1) else it } + + @Test + fun decodeToString() { + testDecoding(true, "", bytes()) // empty + testDecoding(true, "\u0000", bytes(0x0)) // null char + testDecoding(true, "zC", bytes(0x7A, 0x43)) // 1-byte chars + + testDecoding(false, "��", bytes(0x85, 0xAF)) // invalid bytes starting with 1 bit + testDecoding(true, "¿", bytes(0xC2, 0xBF)) // 2-byte char + testDecoding(false, "�z", bytes(0xCF, 0x7A)) // 2-byte char, second byte starts with 0 bit + testDecoding(false, "��", bytes(0xC1, 0xAA)) // 1-byte char written in two bytes + + testDecoding(false, "�z", bytes(0xEF, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit + testDecoding(false, "���", bytes(0xE0, 0x9F, 0xAF)) // 2-byte char written in three bytes + testDecoding(false, "�z", bytes(0xE0, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit + testDecoding(true, "\u1FFF", bytes(0xE1, 0xBF, 0xBF)) // 3-byte char + + testOnNonJvm6And7 { + testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF)) // 3-byte
high-surrogate char + testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xB3, 0x9A)) // 3-byte low-surrogate char + testDecoding( + false, + surrogateCodePointDecoding + surrogateCodePointDecoding, + bytes(0xED, 0xAF, 0xBF, /**/ 0xED, 0xB3, 0x9A) + ) // surrogate pair chars + testDecoding(false, "�z", bytes(0xEF, 0x7A)) // 3-byte char, second byte starts with 0 bit, third byte missing + + testDecoding(false, "�����", bytes(0xF9, 0x94, 0x80, 0x80, 0x80)) // 5-byte code point larger than 0x10FFFF + testDecoding(false, "������", bytes(0xFD, 0x94, 0x80, 0x80, 0x80, 0x80)) // 6-byte code point larger than 0x10FFFF + + // Ill-Formed Sequences for Surrogates + testDecoding( + false, + surrogateCodePointDecoding + surrogateCodePointDecoding + truncatedSurrogateDecoding() + "A", + bytes(0xED, 0xA0, 0x80, /**/ 0xED, 0xBF, 0xBF, /**/ 0xED, 0xAF, /**/ 0x41) + ) + // Truncated Sequences + testDecoding(false, "����A", bytes(0xE1, 0x80, /**/ 0xE2, /**/ 0xF0, 0x91, 0x92, /**/ 0xF1, 0xBF, /**/ 0x41)) + } + + testDecoding(false, "�", bytes(0xE0, 0xAF)) // 3-byte char, third byte missing + + testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F)) // 4-byte char + testDecoding(false, "����", bytes(0xF0, 0x8F, 0x9F, 0x9F)) // 3-byte char written in four bytes + testDecoding(false, "����", bytes(0xF4, 0x9F, 0x9F, 0x9F)) // 4-byte code point larger than 0x10FFFF + testDecoding(false, "����", bytes(0xF5, 0x80, 0x80, 0x80)) // 4-byte code point larger than 0x10FFFF + + // Non-Shortest Form Sequences + testDecoding(false, "��������A", bytes(0xC0, 0xAF, /**/ 0xE0, 0x80, 0xBF, /**/ 0xF0, 0x81, 0x82, /**/ 0x41)) + // Other Ill-Formed Sequences + testDecoding(false, "�����A��B", bytes(0xF4, 0x91, 0x92, 0x93, /**/ 0xFF, /**/ 0x41, /**/ 0x80, 0xBF, /**/ 0x42)) + + val longBytes = ByteArray(200_000) { 0x6B.toByte() } + val longString = longBytes.decodeToString() + assertEquals(200_000, longString.length) + assertTrue { longString.all { it == 'k' } } + } + + @Test + fun 
decodeToStringSlice() { + assertFailsWith<IllegalArgumentException> { bytes().decodeToString(1, 0) } + assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 10) } + assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = -1) } + assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = 10) } + assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = -1) } + assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 10) } + assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 2) } + assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 1, endIndex = 4) } + + testDecoding(true, "", bytes(), startIndex = 0, endIndex = 0) + testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 0) + testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 3, endIndex = 3) + testDecoding(true, "abc", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 3) + testDecoding(true, "ab", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 2) + testDecoding(true, "bc", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 3) + testDecoding(true, "b", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 2) + + testDecoding(true, "¿", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 2) + testDecoding(false, "�", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 1) + testDecoding(false, "�", bytes(0xC2, 0xBF), startIndex = 1, endIndex = 2) + + testDecoding(false, "�", bytes(0xEF, 0xAF, 0x7A), startIndex = 0, endIndex = 2) + testDecoding(false, "�z", bytes(0xEF, 0xAF, 0x7A), startIndex = 1, endIndex = 3) + testDecoding(true, "z", bytes(0xEF, 0xAF, 0x7A), startIndex = 2, endIndex = 3) + + testOnNonJvm6And7 { + testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF), startIndex = 0, endIndex = 3) + testDecoding(false, truncatedSurrogateDecoding(), bytes(0xED, 0xB3, 0x9A), startIndex = 0, endIndex = 2) + testDecoding(false, "���", bytes(0xED, 0xAF, 0xBF, 0xED, 0xB3, 0x9A),
startIndex = 1, endIndex = 4) + testDecoding(false, "�", bytes(0xEF, 0x7A), startIndex = 0, endIndex = 1) + testDecoding(true, "z", bytes(0xEF, 0x7A), startIndex = 1, endIndex = 2) + } + + testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 0, endIndex = 4) + testDecoding(false, "��", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 2, endIndex = 4) + testDecoding(false, "��", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 1, endIndex = 3) + + val longBytes = ByteArray(200_000) { 0x6B.toByte() } + val longString = longBytes.decodeToString(startIndex = 5000, endIndex = 195_000) + assertEquals(190_000, longString.length) + assertTrue { longString.all { it == 'k' } } + } + + @Test + fun kotlinxIOUnicodeTest() { + fun String.readHex(): ByteArray = split(" ") + .filter { it.isNotBlank() } + .map { it.toInt(16).toByte() } + .toByteArray() + + val smokeTestData = "\ud83c\udf00" + val smokeTestDataCharArray: CharArray = smokeTestData.toCharArray() + val smokeTestDataAsBytes = "f0 9f 8c 80".readHex() + + val testData = "file content with unicode " + + "\ud83c\udf00 :" + + " \u0437\u0434\u043e\u0440\u043e\u0432\u0430\u0442\u044c\u0441\u044f :" + + " \uc5ec\ubcf4\uc138\uc694 :" + + " \u4f60\u597d :" + + " \u00f1\u00e7" + val testDataCharArray: CharArray = testData.toCharArray() + val testDataAsBytes: ByteArray = ("66 69 6c 65 20 63 6f 6e 74 65 6e 74 20 77 69 74 " + + " 68 20 75 6e 69 63 6f 64 65 20 f0 9f 8c 80 20 3a 20 d0 b7 d0 b4 d0 be d1 " + + "80 d0 be d0 b2 d0 b0 d1 82 d1 8c d1 81 d1 8f 20 3a 20 ec 97 ac eb b3 b4 ec " + + " 84 b8 ec 9a 94 20 3a 20 e4 bd a0 e5 a5 bd 20 3a 20 c3 b1 c3 a7").readHex() + + + assertArrayContentEquals(smokeTestDataAsBytes, smokeTestData.encodeToByteArray()) + assertArrayContentEquals(testDataAsBytes, testData.encodeToByteArray()) + + assertEquals(smokeTestData, smokeTestDataAsBytes.decodeToString()) + assertEquals(testData, testDataAsBytes.decodeToString()) + + assertEquals(smokeTestData, 
smokeTestDataCharArray.concatToString()) + assertEquals(testData, testDataCharArray.concatToString()) + + assertArrayContentEquals(smokeTestDataCharArray, smokeTestData.toCharArray()) + assertArrayContentEquals(testDataCharArray, testData.toCharArray()) + + assertArrayContentEquals(smokeTestDataAsBytes, smokeTestDataCharArray.concatToString().encodeToByteArray()) + assertArrayContentEquals(testDataAsBytes, testDataCharArray.concatToString().encodeToByteArray()) + + assertArrayContentEquals(smokeTestDataCharArray, smokeTestDataAsBytes.decodeToString().toCharArray()) + assertArrayContentEquals(testDataCharArray, testDataAsBytes.decodeToString().toCharArray()) + + assertEquals("\uD858\uDE18\n", bytes(0xF0, 0xA6, 0x88, 0x98, 0x0a).decodeToString()) + assertEquals("\u0BF5\n", bytes(0xE0, 0xAF, 0xB5, 0x0A).decodeToString()) + assertEquals("\u041a\n", bytes(0xD0, 0x9A, 0x0A).decodeToString()) + } +} diff --git a/libraries/tools/binary-compatibility-validator/reference-public-api/kotlin-stdlib-runtime-merged.txt b/libraries/tools/binary-compatibility-validator/reference-public-api/kotlin-stdlib-runtime-merged.txt index a295b2ea792..b9f119b4b71 100644 --- a/libraries/tools/binary-compatibility-validator/reference-public-api/kotlin-stdlib-runtime-merged.txt +++ b/libraries/tools/binary-compatibility-validator/reference-public-api/kotlin-stdlib-runtime-merged.txt @@ -4914,6 +4914,8 @@ public final class kotlin/text/StringsKt { public static synthetic fun contains$default (Ljava/lang/CharSequence;Ljava/lang/CharSequence;ZILjava/lang/Object;)Z public static final fun count (Ljava/lang/CharSequence;Lkotlin/jvm/functions/Function1;)I public static final fun decapitalize (Ljava/lang/String;)Ljava/lang/String; + public static final fun decodeToString ([BIIZ)Ljava/lang/String; + public static synthetic fun decodeToString$default ([BIIZILjava/lang/Object;)Ljava/lang/String; public static final fun drop (Ljava/lang/CharSequence;I)Ljava/lang/CharSequence; public static final fun drop 
(Ljava/lang/String;I)Ljava/lang/String; public static final fun dropLast (Ljava/lang/CharSequence;I)Ljava/lang/CharSequence; @@ -4922,6 +4924,8 @@ public final class kotlin/text/StringsKt { public static final fun dropLastWhile (Ljava/lang/String;Lkotlin/jvm/functions/Function1;)Ljava/lang/String; public static final fun dropWhile (Ljava/lang/CharSequence;Lkotlin/jvm/functions/Function1;)Ljava/lang/CharSequence; public static final fun dropWhile (Ljava/lang/String;Lkotlin/jvm/functions/Function1;)Ljava/lang/String; + public static final fun encodeToByteArray (Ljava/lang/String;IIZ)[B + public static synthetic fun encodeToByteArray$default (Ljava/lang/String;IIZILjava/lang/Object;)[B public static final fun endsWith (Ljava/lang/CharSequence;CZ)Z public static final fun endsWith (Ljava/lang/CharSequence;Ljava/lang/CharSequence;Z)Z public static final fun endsWith (Ljava/lang/String;Ljava/lang/String;Z)Z