Files
kotlin-fork/libraries/stdlib/jvm/test/io/Console.kt
T
Roman Elizarov e26a3ad033 Speed up stdlib readLine function (#3185)
There are several performance optimizations:

* ByteBuffer/CharBuffer/StringBuilder objects are pre-allocated and
  reused on each call to readLine.
* The state for readLine is lazily allocated via JVM classloading
  (using a singleton object).
* There is an auto-detection heuristic for "directEOL" encodings which
  represent LF ('\n') directly as the corresponding byte
  (UTF-8 and many single-byte encodings are like that).
  When "directEOL" encoding is used, then bytes are batched into
  ByteBuffer for a single call to CharsetDecoder.decode which
  results in higher throughput. Otherwise (UTF-16, etc), slower
  byte-by-byte approach is used.
* Bytes and chars are directly moved in/out of byte/char arrays and
  ByteBuffer/CharBuffer wrappers are used only to interface with
  JVM CharsetDecoder class (which is the slowest piece).
* StringBuilder is not used at all for short lines (<=32 chars).

There are also some functional improvements to readLine:

* Restriction on "max chars per byte" is lifted, so readLine works with
  all encodings that JVM supports.
* It supports on-the-fly changes to the system default charset, because
  it rechecks the current charset on each call and updates its decoder
  when needed.

All the other features of readLine function are retained:

* It does not read more bytes from System.in than needed, so it
  is compatible with other ways to read System.in. On-the-fly
  changes to System.in are supported.
* It is thread-safe. Its internal mutable state is protected by
  synchronization.
* There is an internal method for tests that supports explicit
  charset specification, but the name of this method has changed.

There are additional tests:

* Check all supported encodings on JVM to make sure that readLine
  works correctly with them all.
* Check unicode code points of different bit lengths with all standard
  unicode encodings (UTF-8, UTF-16, and UTF-32 in LE/BE byte orders).

Benchmarks that compare different implementations of readLine,
including this one (readLine6NoLV in the set) can be found here:
https://github.com/elizarov/ReadLineBenchmark

Taking BufferedReader as 100% baseline we see that:

* Current readLine is 7.5 times slower than BufferedReader baseline.
* New implementation in this commit is 2.5 times slower than baseline.
  It is ~3 times faster than existing implementation of readLine.

Altogether these optimizations are enough to enable reading of
~500K lines in a sports programming setting under 2s time-limit with
plenty of headroom in time. Example that is using this version of
readLine can be found here:
https://codeforces.com/contest/1322/submission/73005366

#KT-37416 Fixed
2020-03-23 14:36:55 +03:00

168 lines
6.5 KiB
Kotlin

/*
* Copyright 2010-2018 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
@file:Suppress("INVISIBLE_REFERENCE", "INVISIBLE_MEMBER")
package test.io
import org.junit.Test
import java.nio.charset.Charset
import kotlin.random.Random
import kotlin.random.nextInt
import kotlin.test.*
/**
 * Tests for `LineReader.readLine`, driven with explicitly specified charsets.
 *
 * A scenario encodes a text into bytes, reads the bytes back line by line through `LineReader`
 * and compares the result with the expected lines and/or with what `BufferedReader.readLine`
 * returns for the same bytes.
 */
class ConsoleTest {
    private val linuxLineSeparator = "\n"
    private val windowsLineSeparator = "\r\n"

    /** Both separators, in the order the scenarios iterate over them. */
    private val lineSeparators: List<String> = listOf(linuxLineSeparator, windowsLineSeparator)

    /** Empty input yields no lines at all. */
    @Test
    fun shouldReadEmptyLine() {
        testReadLine(text = "", expected = emptyList())
    }

    /** Unterminated inputs "a", "ab" and "abc" are each read back as exactly one line. */
    @Test
    fun shouldReadSingleLine() {
        for (length in 1..3) {
            val line = ('a' until 'a' + length).joinToString("")
            testReadLine(line, listOf(line))
        }
    }

    /** A separator at the very end of the input terminates the last line and does not start a new one. */
    @Test
    fun trailingEmptyLineIsIgnored() {
        for (content in listOf("", "a")) {
            for (separator in lineSeparators) {
                testReadLine(content + separator, listOf(content))
            }
        }
    }

    /** A single unterminated word is one line. */
    @Test
    fun shouldReadOneLine() {
        val only = "first"
        testReadLine(only, listOf(only))
    }

    /** Two words split by LF are two lines. */
    @Test
    fun shouldReadTwoLines() {
        val expected = listOf("first", "second")
        testReadLine(expected.joinToString(linuxLineSeparator), expected)
    }

    /** Two separators in a row, in every LF / CRLF combination, produce two empty lines. */
    @Test
    fun shouldReadConsecutiveEmptyLines() {
        val twoEmptyLines = listOf("", "")
        for (first in lineSeparators) {
            for (second in lineSeparators) {
                testReadLine(first + second, twoEmptyLines)
            }
        }
    }

    /** Two words split by CRLF are two lines. */
    @Test
    fun shouldReadWindowsLineSeparator() {
        val expected = listOf("first", "second")
        testReadLine(expected.joinToString(windowsLineSeparator), expected)
    }

    /** Same as the two-line LF scenario, but with the bytes encoded in UTF-32. */
    @Test
    fun shouldReadMultibyteEncodings() {
        val expected = listOf("first", "second")
        testReadLine(expected.joinToString(linuxLineSeparator), expected, charset = Charsets.UTF_32)
    }

    /**
     * Runs a fixed set of ASCII lines through every charset reported by the JVM that can both
     * encode and round-trip them via the reference reader.
     */
    @Test
    fun shouldReadAllSupportedEncodings() {
        val lines = listOf(
            "ONE", "TWICE", "", "0123456",
            "This is a very long line that will overflow buffers that are allocated in the code of LineReader object",
            "This line is quite short",
            "x".repeat(1000), // stress
            "7", "8", "9" // some short stuff at the end
        )

        // Only a charset that can create an encoder is testable; decode-only charsets are dropped.
        fun supportsEncoding(charset: Charset): Boolean =
            try {
                charset.newEncoder()
                true
            } catch (e: UnsupportedOperationException) {
                false
            }

        val charsets: List<Charset> = Charset.availableCharsets().values.filter(::supportsEncoding)

        for (separator in lineSeparators) {
            val text = lines.joinToString(separator)
            for (charset in charsets) {
                // When the reference reader cannot round-trip the text, the encoding does not support
                // the ASCII chars under test, so there is nothing meaningful to compare against.
                if (readLinesReference(text, charset) == lines) {
                    assertEquals(lines, readLines(text, charset), "Comparing with $charset")
                }
            }
        }
    }

    /**
     * Builds lines of ever-increasing length from sampled unicode code points, to stress corner
     * cases in line lengths and in the bit lengths of code points, then reads them back in the
     * UTF-8, UTF-16 (BE/LE) and UTF-32 (BE/LE) encodings.
     */
    @Test
    fun shouldReadAllUnicodeCodePoints() {
        var nextCodePoint = 0
        val random = Random(1)
        val logFactor = 7 // log of number of code points that are sampled per each bit length of code point

        // Returns the current sample and advances the cursor by a random step.
        fun nextCP(): Int {
            // Line endings (LF = 10, CR = 13) must never appear inside a generated line.
            if (nextCodePoint == 10 || nextCodePoint == 13) nextCodePoint++
            // Jumps from U+D800 straight to U+10000: this skips the surrogates and, as written,
            // every other code point up to U+FFFF as well.
            if (nextCodePoint in 0xD800..0xFFFF) nextCodePoint = 0x10000
            // To keep the test fast not every code point is used: the larger they are, the sparser
            // the sampling, giving roughly 2^logFactor samples per bit length of the code point.
            val maxStep = nextCodePoint.coerceAtLeast(1 shl logFactor).takeHighestOneBit() shr logFactor
            val step = random.nextInt(1..maxStep)
            val sampled = nextCodePoint
            nextCodePoint += step
            return sampled
        }

        val lines = ArrayList<String>()
        var lineLength = 1
        while (nextCodePoint < Character.MAX_CODE_POINT) {
            val line = StringBuilder()
            for (position in 0 until lineLength) {
                line.appendCodePoint(nextCP())
                // The last line may end up shorter once the code point space is exhausted.
                if (nextCodePoint >= Character.MAX_CODE_POINT) break
            }
            lines.add(line.toString())
            lineLength++
        }

        // Standard unicode encodings that should be able to represent all code points.
        val unicodeCharsets = listOf(
            Charsets.UTF_8,
            Charsets.UTF_16BE,
            Charsets.UTF_16LE,
            Charsets.UTF_32BE,
            Charsets.UTF_32LE
        )
        for (separator in lineSeparators) {
            val text = lines.joinToString(separator)
            for (charset in unicodeCharsets) {
                testReadLine(text, lines, charset)
            }
        }
    }

    /** A code point encoded as a surrogate pair survives at line starts, line ends and back to back. */
    @Test
    fun readSurrogatePairs() {
        val thumbsUp = "\uD83D\uDC4D" // thumb-up emoji
        testReadLine(thumbsUp + linuxLineSeparator, listOf(thumbsUp))
        testReadLine("e " + thumbsUp + linuxLineSeparator, listOf("e $thumbsUp"))
        testReadLine(thumbsUp + windowsLineSeparator, listOf(thumbsUp))
        testReadLine("e " + thumbsUp + thumbsUp, listOf("e $thumbsUp$thumbsUp"))
        testReadLine("e " + thumbsUp + linuxLineSeparator + thumbsUp, listOf("e $thumbsUp", thumbsUp))
    }

    /**
     * Reads [text] encoded with [charset] through `LineReader` and checks the result first against
     * [expected] and then against the lines returned by the reference reader.
     */
    private fun testReadLine(text: String, expected: List<String>, charset: Charset = Charsets.UTF_8) {
        val actual = readLines(text, charset)
        assertEquals(expected, actual, "Comparing with $charset")
        assertEquals(readLinesReference(text, charset), actual, "Comparing to reference readLine")
    }

    /**
     * Encodes [text] with [charset] and collects lines from `LineReader.readLine` until it returns
     * `null`, then asserts that the underlying stream has no bytes left.
     */
    private fun readLines(text: String, charset: Charset): List<String> =
        text.byteInputStream(charset).use { stream ->
            @Suppress("INVISIBLE_REFERENCE", "INVISIBLE_MEMBER")
            val collected = generateSequence { LineReader.readLine(stream, charset) }.toList()
            assertTrue("All bytes should be read") { stream.read() == -1 }
            collected
        }

    /** Encodes [text] with [charset] and reads it back with `BufferedReader.readLine` as the reference. */
    private fun readLinesReference(text: String, charset: Charset): List<String> =
        text.byteInputStream(charset).bufferedReader(charset).use { reader ->
            generateSequence { reader.readLine() }.toList()
        }
}