Commonize CharCategory and related functions #KT-39177 #KT-43216 #KT-39906 #KT-30652

This commit is contained in:
Abduqodiri Qurbonzoda
2020-10-13 02:39:07 +03:00
parent 9b429fb535
commit 46b7a774b5
40 changed files with 3573 additions and 62 deletions
+87
View File
@@ -1,6 +1,8 @@
@kotlin.SinceKotlin(version = "1.2")
public val kotlin.String.Companion.CASE_INSENSITIVE_ORDER: kotlin.Comparator<kotlin.String> { get; }
public val kotlin.Char.category: kotlin.text.CharCategory { get; }
public val kotlin.CharSequence.indices: kotlin.ranges.IntRange { get; }
public val kotlin.CharSequence.lastIndex: kotlin.Int { get; }
@@ -361,13 +363,25 @@ public inline fun kotlin.text.StringBuilder.insertRange(index: kotlin.Int, value
public fun kotlin.CharSequence.isBlank(): kotlin.Boolean
public fun kotlin.Char.isDefined(): kotlin.Boolean
public fun kotlin.Char.isDigit(): kotlin.Boolean
@kotlin.internal.InlineOnly
public inline fun kotlin.CharSequence.isEmpty(): kotlin.Boolean
public fun kotlin.Char.isHighSurrogate(): kotlin.Boolean
public fun kotlin.Char.isISOControl(): kotlin.Boolean
public fun kotlin.Char.isLetter(): kotlin.Boolean
public fun kotlin.Char.isLetterOrDigit(): kotlin.Boolean
public fun kotlin.Char.isLowSurrogate(): kotlin.Boolean
public fun kotlin.Char.isLowerCase(): kotlin.Boolean
@kotlin.internal.InlineOnly
public inline fun kotlin.CharSequence.isNotBlank(): kotlin.Boolean
@@ -382,6 +396,10 @@ public inline fun kotlin.CharSequence?.isNullOrEmpty(): kotlin.Boolean
public fun kotlin.Char.isSurrogate(): kotlin.Boolean
public fun kotlin.Char.isTitleCase(): kotlin.Boolean
public fun kotlin.Char.isUpperCase(): kotlin.Boolean
public fun kotlin.Char.isWhitespace(): kotlin.Boolean
public operator fun kotlin.CharSequence.iterator(): kotlin.collections.CharIterator
@@ -1178,6 +1196,75 @@ public interface Appendable {
public abstract fun append(value: kotlin.CharSequence?, startIndex: kotlin.Int, endIndex: kotlin.Int): kotlin.text.Appendable
}
public final enum class CharCategory : kotlin.Enum<kotlin.text.CharCategory> {
enum entry UNASSIGNED
enum entry UPPERCASE_LETTER
enum entry LOWERCASE_LETTER
enum entry TITLECASE_LETTER
enum entry MODIFIER_LETTER
enum entry OTHER_LETTER
enum entry NON_SPACING_MARK
enum entry ENCLOSING_MARK
enum entry COMBINING_SPACING_MARK
enum entry DECIMAL_DIGIT_NUMBER
enum entry LETTER_NUMBER
enum entry OTHER_NUMBER
enum entry SPACE_SEPARATOR
enum entry LINE_SEPARATOR
enum entry PARAGRAPH_SEPARATOR
enum entry CONTROL
enum entry FORMAT
enum entry PRIVATE_USE
enum entry SURROGATE
enum entry DASH_PUNCTUATION
enum entry START_PUNCTUATION
enum entry END_PUNCTUATION
enum entry CONNECTOR_PUNCTUATION
enum entry OTHER_PUNCTUATION
enum entry MATH_SYMBOL
enum entry CURRENCY_SYMBOL
enum entry MODIFIER_SYMBOL
enum entry OTHER_SYMBOL
enum entry INITIAL_QUOTE_PUNCTUATION
enum entry FINAL_QUOTE_PUNCTUATION
public final val code: kotlin.String { get; }
public final operator fun contains(char: kotlin.Char): kotlin.Boolean
public companion object of CharCategory {
}
}
@kotlin.SinceKotlin(version = "1.4")
@kotlin.WasExperimental(markerClass = {kotlin.ExperimentalStdlibApi::class})
public open class CharacterCodingException : kotlin.Exception {
+87
View File
@@ -1,6 +1,8 @@
@kotlin.SinceKotlin(version = "1.2")
public val kotlin.String.Companion.CASE_INSENSITIVE_ORDER: kotlin.Comparator<kotlin.String> { get; }
public val kotlin.Char.category: kotlin.text.CharCategory { get; }
public val kotlin.CharSequence.indices: kotlin.ranges.IntRange { get; }
public val kotlin.CharSequence.lastIndex: kotlin.Int { get; }
@@ -361,13 +363,25 @@ public inline fun kotlin.text.StringBuilder.insertRange(index: kotlin.Int, value
public fun kotlin.CharSequence.isBlank(): kotlin.Boolean
public fun kotlin.Char.isDefined(): kotlin.Boolean
public fun kotlin.Char.isDigit(): kotlin.Boolean
@kotlin.internal.InlineOnly
public inline fun kotlin.CharSequence.isEmpty(): kotlin.Boolean
public fun kotlin.Char.isHighSurrogate(): kotlin.Boolean
public fun kotlin.Char.isISOControl(): kotlin.Boolean
public fun kotlin.Char.isLetter(): kotlin.Boolean
public fun kotlin.Char.isLetterOrDigit(): kotlin.Boolean
public fun kotlin.Char.isLowSurrogate(): kotlin.Boolean
public fun kotlin.Char.isLowerCase(): kotlin.Boolean
@kotlin.internal.InlineOnly
public inline fun kotlin.CharSequence.isNotBlank(): kotlin.Boolean
@@ -382,6 +396,10 @@ public inline fun kotlin.CharSequence?.isNullOrEmpty(): kotlin.Boolean
public fun kotlin.Char.isSurrogate(): kotlin.Boolean
public fun kotlin.Char.isTitleCase(): kotlin.Boolean
public fun kotlin.Char.isUpperCase(): kotlin.Boolean
public fun kotlin.Char.isWhitespace(): kotlin.Boolean
public operator fun kotlin.CharSequence.iterator(): kotlin.collections.CharIterator
@@ -1178,6 +1196,75 @@ public interface Appendable {
public abstract fun append(value: kotlin.CharSequence?, startIndex: kotlin.Int, endIndex: kotlin.Int): kotlin.text.Appendable
}
public final enum class CharCategory : kotlin.Enum<kotlin.text.CharCategory> {
enum entry UNASSIGNED
enum entry UPPERCASE_LETTER
enum entry LOWERCASE_LETTER
enum entry TITLECASE_LETTER
enum entry MODIFIER_LETTER
enum entry OTHER_LETTER
enum entry NON_SPACING_MARK
enum entry ENCLOSING_MARK
enum entry COMBINING_SPACING_MARK
enum entry DECIMAL_DIGIT_NUMBER
enum entry LETTER_NUMBER
enum entry OTHER_NUMBER
enum entry SPACE_SEPARATOR
enum entry LINE_SEPARATOR
enum entry PARAGRAPH_SEPARATOR
enum entry CONTROL
enum entry FORMAT
enum entry PRIVATE_USE
enum entry SURROGATE
enum entry DASH_PUNCTUATION
enum entry START_PUNCTUATION
enum entry END_PUNCTUATION
enum entry CONNECTOR_PUNCTUATION
enum entry OTHER_PUNCTUATION
enum entry MATH_SYMBOL
enum entry CURRENCY_SYMBOL
enum entry MODIFIER_SYMBOL
enum entry OTHER_SYMBOL
enum entry INITIAL_QUOTE_PUNCTUATION
enum entry FINAL_QUOTE_PUNCTUATION
public final val code: kotlin.String { get; }
public final operator fun contains(char: kotlin.Char): kotlin.Boolean
public companion object of CharCategory {
}
}
@kotlin.SinceKotlin(version = "1.4")
@kotlin.WasExperimental(markerClass = {kotlin.ExperimentalStdlibApi::class})
public open class CharacterCodingException : kotlin.Exception {
@@ -63,7 +63,6 @@ expect enum class RegexOption {
// From char.kt
expect fun Char.isWhitespace(): Boolean
expect fun Char.isHighSurrogate(): Boolean
expect fun Char.isLowSurrogate(): Boolean
@@ -0,0 +1,84 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 1343 ranges in total
/**
 * Holds the two parallel tables used by `getCategoryValue`: the start code of every range and
 * the packed category code of that range. Both tables are decoded from base64 text in `init`.
 */
private object Category {
// Start code of each range, in ascending order (built as running sums, first entry 0).
val decodedRangeStart: IntArray
// Packed category code of each range; unpacked per character by `categoryValueFrom`.
val decodedRangeCategory: IntArray
init {
// Build the reverse lookup: character code of a base64 symbol -> its 6-bit value.
val toBase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = 1482
// Each decoded value is the distance from one range start to the next.
val rangeStartDiff = "gBCFEDCKCDCaDDaDBhBCEEDDDDDEDXBHYBH5BRwBGDCHDCIDFHDCHFDCDEIRTEE7BGHDDJlCBbSEMOFGERwDEDDDDECEFCRBJhBFDCYFFCCzBvBjBBFC3BOhDBmBDGpBDDCtBBJIbEECLGDFCLDCgBBKVKEDiDDHCFECECKCEODBebC5CLBOKhBJDDDDWEBHFCFCPBZDEL1BVBSLPBgBB2BDBDICFBHKCCKCPDBHEDWBHEDDDDEDEDIBDGDCKCCGDDDCGECCWBFMDDCDEDDCHDDHKDDBKDBHFCWBFGFDBDDFEDBPDDKCHBGDCHEDWBFGFDCEDEDBHDDGDCKCGJEGDBFDDFDDDDDMEFDBFDCGBOKDFDFDCGFCXBQDDDDDBEGEDFDDKHBHDDGFCXBKBFCEFCFCHCHECCKDNCCHFCoBEDECFDDDDHDCCKJBGDCSDYBJEHBFDDEBIGKDCMuBFHEBGBIBKCkBFBFBXEIFJDFDGCKCEgBBDPEDGKKGECIBkBEOBDFFLBkBBIBEFFEClBrBCEBEGDBKGGDDDDDCHDENDCFEKDDlBDDFrBCDpKBECGEECpBBEChBBECGEECPB5BBECjCCDJUDQKG2CCGDsTCRBaCDrCDDIHNBEDLSDCJSCMLFCCM0BDHGFLBFDDKGKGEFDDBKGjBB1BHFChBDFmCKfDDDDDDCGDCFDKeCFLsBEaGKBDiBXDDD1BDGDEIGJEKGKGHBGCMF/BEBvBCEDDFHEKHKJJDDeDDGDKsBFEDCIEkBIICCDFKDDKeGCJHrBCDIIDBNBHEBEFDBFsB/BNBiBlB6BBF1EIiDJIGCGCIIIIGCGCIIIIOCIIIIIIDFEDDBFEDDDDEBDIFDDFEDBLFGCEEICFBJCDEDCLDKBFBKCCGDDKDDNDgBQNEBDMPFFDEDEBFFHECEBEEDFBEDDQjBCEDEFFCCJHBeEEfsIIEUCHCxCBeZoBGlCZLV8BuCW3FBJB2BIvDB4HOesBFCfKQgIjEW/BEgBCiIwBVCGnBCgBBpDvBBuBEDBHEFGCCjDCGEDCFCFlBDDF4BHCOBXJHBHBHBHBHBHBHBHBgBCECGHGEDIFBKCEDMEtBaB5CM2GaMEDDCKCGFCJEDFDDDC2CDDDB6CDCFrBB+CDEKgBkBMQfBKeIBPgBKnBPgKguGgC9vUDVB3jBD3BJoBGCsIBDQKCUuBDDKCcCCmCKCGIXJCNC/BBHGKDECEVFBEMCEEBqBDDGDFDXDCEBDGEG0BEICyBQCICKGSGDEBKcICXLCLBdDDBvBDECCDNCKECFCJKFBpBFEDCJDBICCKCEQBGDDByBEDCEFBYDCLEDDCKGCGCGJHBHBrBBEJDEwCjBIDCKGk9KMXExBEggCgoGuLCqDmBHMFFCKBNBFBIsDQRrLCQgCC2BoBMCCQGEGQDCQDDDDFDGDECEEFBnEEBFEDCKCDCaDDaDBFCKBtBCfDGCGCFEDDDDCECKDC"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, 1342)
// One more start than there are diffs: start[0] keeps the IntArray default of 0.
val start = IntArray(diff.size + 1)
for (i in diff.indices) {
start[i + 1] = start[i] + diff[i]
}
decodedRangeStart = start
// rangeCategory.length = 2033
// NOTE(review): the literal below is split across two physical lines here; a regular Kotlin
// string literal cannot contain a raw line break - confirm against the generator output.
val rangeCategory = "PsY44a41W54UYJYZYB14W7XC15WZPsYa84bl9Zw8b85Lr7C44brlerrYBZBCZCiBiBiBhCiiBhChiBhiCBhhChiCihBhChCChiBhChiClBCFhjCiBiBihDhiBhCCihBiBBhCCFCEbEbEb7EbGhCk7BixRkiCi4BRbh4BhRhCBRBCiiBBCiBChiZBCBCiBcGHhChCiBRBxxEYC40Rx8c6RGUm4GRFRFYRQZ44acG4wRYFEFGJYllGFlYGwcGmkEmcGFJFl8cYxwFGFGRFGFRJFGkkcYkxRm6aFGEGmmEmEGRYRFGxxYFRFRFRGQGIFmIFIGIooGFGFGYJ4EFmoIRFlxRlxRFRFxlRxlFllRxmFIGxxIoxRomFRIRxlFlmGRJFaL86F4mRxmGoRFRFRFRFllRxGIGRxmGxmGmxRxGRFlRRJmmFllGYRmmIRFllRlRFRFllRFxxGFIGmmRoxImxRFRllGmxRJ4aRFGxmIoRFlxRlxRFRFllRFxxGlImoGmmRxoIxoIGRmmIRxlFlmGRJ8FLRxmFFRFllRllRxxFlRlxRxlFRFRFRooGRIooRomRxFRIRJLc8aRmoIoGFllRlRFRFRlmGmoIooRGRGRxmGFRllGmxRJRYL8lGooYFllRlRFRFRFRmlIIxGooRGRIRlxFGRJxlFRGIFllRlRFlmGIGxIooRomF8xRxxFllILFGRJLcFxmIoRFRFRFxlRFRxxGxxIooGmmRRIRJxxIoYRFllGGRaFEGYJYRxlFRFRFlRFllGGlxRFxEGRJRFRFcY84c8mGcJL8G1WIFRFRGIGmmYFGRGRcGc88RYcYRFIGIGmmIomGFJYFooGmlFllGmmFIFIFGFmoIGIomFJIm8cBhRRxxBC4ECFRFRFlRFRFRFRFRFRFlRFRFRFRFRFRGYLRFcRBRCxxUF8YFMF1WRFYKFRFRFGRFGYRFGRFllRlRGRFmmIGIooGGY44E46FmxRJRLRY44U44GmmQRJRFEFRFGFlGRFRFxmGmoIooGmoIoxRxxIoGIGRxxcx4YJFRFRFRFRJLRcFmmIomRx4YFoGGmRomIGIGmxRJRJRYEYRGmmHRGIFmIGmIIooGFRJYcGcRmmIFomGmmIomGmlFJFmoGooGGIRYFIGIGRYJRFJFEYCRBRBYRGYGIGFGFllGomGFRCECECEGRGhCCiBCBCRBRCBCBCRBRCxBCBCRCDCDCDCiiRBj7CbCiiRBj7b7iCiiRxiCBRbCBbxxCiiRBj7bRMQUY9+V9+VYtOQMY9eY43X44Z1WY54XYMQRQrERLZ12ELZ12RERaRGHGHGR88B88BihBhiChhC8hcZBc8BB8CBCFi8cihBZBC8Z8CLKhCKr8cRZcZc88ZcZc85Z8ZcZc1WcZc1WcZcZcZcRcRLcLcZcZcZcZc1WLcZ1WZ1WZcZ1WZ1WZ1WZcZcZcRcRcBRCixBBCiBBihCCEBhCCchCGhCRY44LCiRRxxCFRkYRGFRFRFRFRFRFRFRFRFRGY9eY49eY44U49e49e1WYEYUY04VY48cRcRcRcRcRs4Y48ElK1Wc1W12U2cKGooUE88KqqEl4c8RFxxGm7bkkFUF4kEkFRFRFx8cLcFcRFcRLcLcLcLcLcFcFRFEFRcRFEYFEYFJFRhClmHnnYG4EhCEGFKGYRbEbhCCiBECiBhCk7bhClBihCiBBCBhCRhiBhhCCRhiFkkCFlGllGllGFooGmIcGRL88aRFYRIFIGRYJRGFYl4FGJFGYFGIRYFRGIFmoIGIGIYxEJRYFmEFJFRFGmoImoIGRFGFmIRJRYFEFcloGIFmlGmlFGFlmGFRllEYFomGo4YlkEoGRFRFRFRFRFRCbECk7bRCFooG4oGRJRFRFRFRTSFRFRCRCRlGFZFRFRlxFFbRF2VRFRFRF6cRGY41WRG40UX1W44V24Y44X33Y44R44U1WY50Z5R46YRFRFxxQY44a41W54U
YJYZYB14W7XC15WZ12YYFEFEFRFRFRFlxRllRxxa65b86axcZcRQcR"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, 1343)
}
}
/**
 * Unpacks the category value stored in a range's packed [code] for a given position.
 *
 * The packed code takes one of three shapes, told apart by magnitude:
 * - below `0x20`: a single value that applies to every position;
 * - below `0x400`: two 5-bit values, the upper one used at odd positions, the lower at even ones;
 * - otherwise: three values selected by the position modulo 3.
 *
 * @param code the packed category code of the range.
 * @param ch the position to resolve; the caller in this file passes the offset from the range start.
 * @return the unpacked category value.
 */
private fun categoryValueFrom(code: Int, ch: Int): Int {
    // Single category for the whole range.
    if (code < 0x20) {
        return code
    }
    // Two alternating categories, chosen by parity.
    if (code < 0x400) {
        val isOdd = (ch and 1) == 1
        return if (isOdd) code shr 5 else code and 0x1f
    }
    // Three repeating categories, chosen by remainder.
    return when (ch % 3) {
        2 -> code shr 10
        1 -> (code shr 5) and 0x1f
        else -> code and 0x1f
    }
}
/**
 * Looks up the general category value of this character in the [Category] tables.
 *
 * The range containing the character is located by binary search over the range starts; the
 * range's packed code is then unpacked for the character's offset inside that range.
 * A decoded value of 17 is reported as the value of [CharCategory.UNASSIGNED].
 */
internal fun Char.getCategoryValue(): Int {
    val codeUnit = this.toInt()
    val rangeIndex = binarySearchRange(Category.decodedRangeStart, codeUnit)
    val offsetInRange = codeUnit - Category.decodedRangeStart[rangeIndex]
    val packed = Category.decodedRangeCategory[rangeIndex]
    return when (val decoded = categoryValueFrom(packed, offsetInRange)) {
        17 -> CharCategory.UNASSIGNED.value
        else -> decoded
    }
}
/**
 * Decodes a string of variable-length integers written with base64 symbols.
 *
 * Every symbol contributes its low five bits to the current integer, least significant group
 * first. A symbol whose 6-bit value is below `0x20` ends the current integer; a larger value
 * means further groups follow.
 *
 * @param base64 the encoded text.
 * @param fromBase64 table mapping a symbol's character code to its 6-bit value.
 * @param resultLength size of the returned array; slots beyond the decoded integers stay 0.
 * @return the decoded integers in order of appearance.
 */
internal fun decodeVarLenBase64(base64: String, fromBase64: IntArray, resultLength: Int): IntArray {
    val decoded = IntArray(resultLength)
    var written = 0
    var accumulator = 0
    var bitOffset = 0
    for (position in base64.indices) {
        val sextet = fromBase64[base64[position].toInt()]
        accumulator = accumulator or ((sextet and 0x1f) shl bitOffset)
        if (sextet >= 0x20) {
            // Continuation flag set: the next symbol supplies the next five bits.
            bitOffset += 5
            continue
        }
        decoded[written] = accumulator
        written += 1
        accumulator = 0
        bitOffset = 0
    }
    return decoded
}
@@ -0,0 +1,47 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 37 ranges in total
/**
 * Table of digit ranges consulted by `isDigitImpl`.
 */
private object Digit {
    /**
     * Start code of every digit range, in ascending order.
     * `isDigitImpl` treats each range as spanning the start code and the nine codes after it.
     */
    internal val rangeStart = intArrayOf(
        0x0030, 0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66, 0x0ae6,
        0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66, 0x0de6, 0x0e50, 0x0ed0,
        0x0f20, 0x1040, 0x1090, 0x17e0, 0x1810, 0x1946, 0x19d0, 0x1a80,
        0x1a90, 0x1b50, 0x1bb0, 0x1c40, 0x1c50, 0xa620, 0xa8d0, 0xa900,
        0xa9d0, 0xa9f0, 0xaa50, 0xabf0, 0xff10,
    )
}
/**
 * Binary-searches [array] (expected to be sorted ascending) for the range that [needle] falls into.
 *
 * @param array ascending range start values.
 * @param needle the value to locate.
 * @return the index of [needle] when present; otherwise the index of the last probed element if
 * it is smaller than [needle], or one less than that index if it is larger. For a non-empty
 * array this is the index of the greatest element not exceeding [needle], or `-1` when every
 * element is greater.
 */
internal fun binarySearchRange(array: IntArray, needle: Int): Int {
    var low = 0
    var high = array.size - 1
    var probeIndex = -1
    var probeValue = 0
    while (low <= high) {
        probeIndex = (low + high) / 2
        probeValue = array[probeIndex]
        when {
            needle == probeValue -> return probeIndex
            needle > probeValue -> low = probeIndex + 1
            else -> high = probeIndex - 1
        }
    }
    // The last probe overshot when it is larger than the needle; step back one slot in that case.
    return if (needle < probeValue) probeIndex - 1 else probeIndex
}
/**
 * Returns `true` if this character is a digit.
 *
 * The character is a digit when it lies within the ten codes starting at the closest
 * [Digit.rangeStart] entry that does not exceed it.
 */
internal fun Char.isDigitImpl(): Boolean {
    val ch = this.toInt()
    val index = binarySearchRange(Digit.rangeStart, ch)
    // A negative index means ch precedes the first digit range; reading rangeStart[index]
    // would be out of bounds, and such a character is not a digit.
    if (index < 0) {
        return false
    }
    val high = Digit.rangeStart[index] + 9
    return ch <= high
}
@@ -0,0 +1,114 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 222 ranges in total
/**
 * Holds the three parallel tables used by `getLetterType`: start, length and packed type code
 * of every letter range. All tables are decoded from base64 text in `init`.
 */
private object Letter {
// Start code of each range, in ascending order (running sums of the decoded diffs).
val decodedRangeStart: IntArray
// Number of codes covered by each range.
val decodedRangeLength: IntArray
// Packed letter-type code of each range; interpreted by `getLetterType`.
val decodedRangeCategory: IntArray
init {
// Build the reverse lookup: character code of a base64 symbol -> its 6-bit value.
val toBase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = 356
val rangeStartDiff = "hCgBpCQGYHZH5BRpBPPPPPPRMP5BPPlCPP6BkEPPPPcPXPzBvBrB3BOiDoBHwD+E3DauCnFmBmB2D6E1BlBTiBmBlBP5BhBiBrBvBjBqBnBPRtBiCmCtBlB0BmB5BiB7BmBgEmChBZgCoEoGVpBSfRhBPqKQ2BwBYoFgB4CJuTiEvBuCuDrF5DgEgFlJ1DgFmBQtBsBRGsB+BPiBlD1EIjDPRPPPQPPPPPGQSQS/DxENVNU+B9zCwBwBPPCkDPNnBPqDYY1R8B7FkFgTgwGgwUwmBgKwBuBScmEP/BPPPPPPrBP8B7F1B/ErBqC6B7BiBmBfQsBUwCw/KwqIwLwETPcPjQgJxFgBlBsD"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, 222)
// The first diff is the first start itself; every later start adds its diff to the previous start.
val start = IntArray(diff.size)
for (i in diff.indices) {
if (i == 0) start[i] = diff[i]
else start[i] = start[i - 1] + diff[i]
}
decodedRangeStart = start
// rangeLength.length = 328
val rangeLength = "aaMBXHYH5BRpBPPPPPPRMP5BPPlCPPzBDOOPPcPXPzBvBjB3BOhDmBBpB7DoDYxB+EiBP1DoExBkBQhBekBPmBgBhBctBiBMWOOXhCsBpBkBUV3Ba4BkB0DlCgBXgBtD4FSdBfPhBPpKP0BvBXjEQ2CGsT8DhBtCqDpFvD1D3E0IrD2EkBJrBDOBsB+BPiBlB1EIjDPPPPPPPPPPPGPPMNLsBNPNPKCvBvBPPCkDPBmBPhDXXgD4B6FzEgDguG9vUtkB9JcuBSckEP/BPPPPPPBPf4FrBjEhBpC3B5BKaWPrBOwCk/KsCuLqDHPbPxPsFtEaaqDL"
decodedRangeLength = decodeVarLenBase64(rangeLength, fromBase64, 222)
// rangeCategory.length = 959
val rangeCategory = "GFjgggUHGGFFZZZmzpz5qB6s6020B60ptltB6smt2sB60mz22B1+vv+8BZZ5s2850BW5q1ymtB506smzBF3q1q1qB1q1q1+Bgii4wDTm74g3KiggxqM60q1q1Bq1o1q1BF1qlrqrBZ2q5wprBGFZWWZGHFsjiooLowgmOowjkwCkgoiIk7ligGogiioBkwkiYkzj2oNoi+sbkwj04DghhkQ8wgiYkgoioDsgnkwC4gikQ//v+85BkwvoIsgoyI4yguI0whiwEowri4CoghsJowgqYowgm4DkwgsY/nwnzPowhmYkg6wI8yggZswikwHgxgmIoxgqYkwgk4DkxgmIkgoioBsgssoBgzgyI8g9gL8g9kI0wgwJoxgkoC0wgioFkw/wI0w53iF4gioYowjmgBHGq1qkgwBF1q1q8qBHwghuIwghyKk0goQkwgoQk3goQHGFHkyg0pBgxj6IoinkxDswno7Ikwhz9Bo0gioB8z48Rwli0xN0mpjoX8w78pDwltoqKHFGGwwgsIHFH3q1q16BFHWFZ1q10q1B2qlwq1B1q10q1B2q1yq1B6q1gq1Biq1qhxBir1qp1Bqt1q1qB1g1q1+B//3q16B///q1qBH/qlqq9Bholqq9B1i00a1q10qD1op1HkwmigEigiy6Cptogq1Bixo1kDq7/j00B2qgoBWGFm1lz50B6s5q1+BGWhggzhwBFFhgk4//Bo2jigE8wguI8wguI8wgugUog1qoB4qjmIwwi2KgkYHHH4lBgiFWkgIWoghssMmz5smrBZ3q1y50B5sm7gzBtz1smzB5smz50BqzqtmzB5sgzqzBF2/9//5BowgoIwmnkzPkwgk4C8ys65BkgoqI0wgy6FghquZo2giY0ghiIsgh24B4ghsQ8QF/v1q1OFs0O8iCHHF1qggz/B8wg6Iznv+//B08QgohsjK0QGFk7hsQ4gB"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, 222)
}
}
/**
 * Tells whether this character is a letter, i.e. whether its letter type is non-zero.
 */
internal fun Char.isLetterImpl(): Boolean = getLetterType() != 0
/**
 * Tells whether this character is a lower case letter, i.e. whether its letter type is `1`.
 */
internal fun Char.isLowerCaseImpl(): Boolean = getLetterType() == 1
/**
 * Tells whether this character is an upper case letter, i.e. whether its letter type is `2`.
 */
internal fun Char.isUpperCaseImpl(): Boolean = getLetterType() == 2
/**
 * Classifies this character using the [Letter] range tables.
 *
 * Returns
 * - `1` if the character is a lower case letter,
 * - `2` if the character is an upper case letter,
 * - `3` if the character is a letter but not a lower or upper case letter,
 * - `0` otherwise, including when the character lies before the first range or past the end
 *   of the range found for it.
 */
private fun Char.getLetterType(): Int {
    val ch = this.toInt()
    val index = binarySearchRange(Letter.decodedRangeStart, ch)
    // A negative index means ch precedes the first range start; indexing the tables with it
    // would be out of bounds, and such a character is not a letter.
    if (index < 0) {
        return 0
    }

    val rangeStart = Letter.decodedRangeStart[index]
    val rangeEnd = rangeStart + Letter.decodedRangeLength[index] - 1
    val code = Letter.decodedRangeCategory[index]

    if (ch > rangeEnd) {
        return 0
    }

    val lastTwoBits = code and 0x3

    if (lastTwoBits == 0) { // gap pattern
        // Above the two low bits the code holds 7-bit run lengths that alternate between
        // a run of type-3 letters and a run of non-letters; up to two such pairs are read.
        var shift = 2
        var threshold = rangeStart
        for (i in 0..1) {
            threshold += (code shr shift) and 0x7f
            if (threshold > ch) {
                return 3
            }
            shift += 7
            threshold += (code shr shift) and 0x7f
            if (threshold > ch) {
                return 0
            }
            shift += 7
        }
        // Everything after the encoded runs is a type-3 letter.
        return 3
    }

    // A code that fits in three bits carries one type for the whole range.
    if (code <= 0x7) {
        return lastTwoBits
    }

    // Otherwise the code is a sequence of 2-bit types: two alternating ones when the code fits
    // in five bits, else one per position in the range.
    val distance = (ch - rangeStart)
    val shift = if (code <= 0x1F) distance % 2 else distance
    return (code shr (2 * shift)) and 0x3
}
@@ -0,0 +1,31 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 9 ranges in total
/**
 * Tells whether this character is a whitespace, by matching its code against a fixed set of
 * single codes and code ranges.
 */
internal fun Char.isWhitespaceImpl(): Boolean {
    return when (this.toInt()) {
        // Codes up to 0x00a0.
        in 0x0009..0x000d,
        in 0x001c..0x0020,
        0x00a0 -> true
        // Codes above 0x1000.
        0x1680,
        in 0x2000..0x200a,
        0x2028,
        0x2029,
        0x202f,
        0x205f,
        0x3000 -> true
        else -> false
    }
}
@@ -0,0 +1,84 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 1343 ranges in total
/**
 * Holds the two parallel tables used by `getCategoryValue`: the start code of every range and
 * the packed category code of that range. Both tables are decoded from base64 text in `init`.
 */
private object Category {
// Start code of each range, in ascending order (built as running sums, first entry 0).
val decodedRangeStart: IntArray
// Packed category code of each range; unpacked per character by `categoryValueFrom`.
val decodedRangeCategory: IntArray
init {
// Build the reverse lookup: character code of a base64 symbol -> its 6-bit value.
val toBase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = 1482
// Each decoded value is the distance from one range start to the next.
val rangeStartDiff = "gBCFEDCKCDCaDDaDBhBCEEDDDDDEDXBHYBH5BRwBGDCHDCIDFHDCHFDCDEIRTEE7BGHDDJlCBbSEMOFGERwDEDDDDECEFCRBJhBFDCYFFCCzBvBjBBFC3BOhDBmBDGpBDDCtBBJIbEECLGDFCLDCgBBKVKEDiDDHCFECECKCEODBebC5CLBOKhBJDDDDWEBHFCFCPBZDEL1BVBSLPBgBB2BDBDICFBHKCCKCPDBHEDWBHEDDDDEDEDIBDGDCKCCGDDDCGECCWBFMDDCDEDDCHDDHKDDBKDBHFCWBFGFDBDDFEDBPDDKCHBGDCHEDWBFGFDCEDEDBHDDGDCKCGJEGDBFDDFDDDDDMEFDBFDCGBOKDFDFDCGFCXBQDDDDDBEGEDFDDKHBHDDGFCXBKBFCEFCFCHCHECCKDNCCHFCoBEDECFDDDDHDCCKJBGDCSDYBJEHBFDDEBIGKDCMuBFHEBGBIBKCkBFBFBXEIFJDFDGCKCEgBBDPEDGKKGECIBkBEOBDFFLBkBBIBEFFEClBrBCEBEGDBKGGDDDDDCHDENDCFEKDDlBDDFrBCDpKBECGEECpBBEChBBECGEECPB5BBECjCCDJUDQKG2CCGDsTCRBaCDrCDDIHNBEDLSDCJSCMLFCCM0BDHGFLBFDDKGKGEFDDBKGjBB1BHFChBDFmCKfDDDDDDCGDCFDKeCFLsBEaGKBDiBXDDD1BDGDEIGJEKGKGHBGCMF/BEBvBCEDDFHEKHKJJDDeDDGDKsBFEDCIEkBIICCDFKDDKeGCJHrBCDIIDBNBHEBEFDBFsB/BNBiBlB6BBF1EIiDJIGCGCIIIIGCGCIIIIOCIIIIIIDFEDDBFEDDDDEBDIFDDFEDBLFGCEEICFBJCDEDCLDKBFBKCCGDDKDDNDgBQNEBDMPFFDEDEBFFHECEBEEDFBEDDQjBCEDEFFCCJHBeEEfsIIEUCHCxCBeZoBGlCZLV8BuCW3FBJB2BIvDB4HOesBFCfKQgIjEW/BEgBCiIwBVCGnBCgBBpDvBBuBEDBHEFGCCjDCGEDCFCFlBDDF4BHCOBXJHBHBHBHBHBHBHBHBgBCECGHGEDIFBKCEDMEtBaB5CM2GaMEDDCKCGFCJEDFDDDC2CDDDB6CDCFrBB+CDEKgBkBMQfBKeIBPgBKnBPgKguGgC9vUDVB3jBD3BJoBGCsIBDQKCUuBDDKCcCCmCKCGIXJCNC/BBHGKDECEVFBEMCEEBqBDDGDFDXDCEBDGEG0BEICyBQCICKGSGDEBKcICXLCLBdDDBvBDECCDNCKECFCJKFBpBFEDCJDBICCKCEQBGDDByBEDCEFBYDCLEDDCKGCGCGJHBHBrBBEJDEwCjBIDCKGk9KMXExBEggCgoGuLCqDmBHMFFCKBNBFBIsDQRrLCQgCC2BoBMCCQGEGQDCQDDDDFDGDECEEFBnEEBFEDCKCDCaDDaDBFCKBtBCfDGCGCFEDDDDCECKDC"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, 1342)
// One more start than there are diffs: start[0] keeps the IntArray default of 0.
val start = IntArray(diff.size + 1)
for (i in diff.indices) {
start[i + 1] = start[i] + diff[i]
}
decodedRangeStart = start
// rangeCategory.length = 2033
// NOTE(review): the literal below is split across two physical lines here; a regular Kotlin
// string literal cannot contain a raw line break - confirm against the generator output.
val rangeCategory = "PsY44a41W54UYJYZYB14W7XC15WZPsYa84bl9Zw8b85Lr7C44brlerrYBZBCZCiBiBiBhCiiBhChiBhiCBhhChiCihBhChCChiBhChiClBCFhjCiBiBihDhiBhCCihBiBBhCCFCEbEbEb7EbGhCk7BixRkiCi4BRbh4BhRhCBRBCiiBBCiBChiZBCBCiBcGHhChCiBRBxxEYC40Rx8c6RGUm4GRFRFYRQZ44acG4wRYFEFGJYllGFlYGwcGmkEmcGFJFl8cYxwFGFGRFGFRJFGkkcYkxRm6aFGEGmmEmEGRYRFGxxYFRFRFRGQGIFmIFIGIooGFGFGYJ4EFmoIRFlxRlxRFRFxlRxlFllRxmFIGxxIoxRomFRIRxlFlmGRJFaL86F4mRxmGoRFRFRFRFllRxGIGRxmGxmGmxRxGRFlRRJmmFllGYRmmIRFllRlRFRFllRFxxGFIGmmRoxImxRFRllGmxRJ4aRFGxmIoRFlxRlxRFRFllRFxxGlImoGmmRxoIxoIGRmmIRxlFlmGRJ8FLRxmFFRFllRllRxxFlRlxRxlFRFRFRooGRIooRomRxFRIRJLc8aRmoIoGFllRlRFRFRlmGmoIooRGRGRxmGFRllGmxRJRYL8lGooYFllRlRFRFRFRmlIIxGooRGRIRlxFGRJxlFRGIFllRlRFlmGIGxIooRomF8xRxxFllILFGRJLcFxmIoRFRFRFxlRFRxxGxxIooGmmRRIRJxxIoYRFllGGRaFEGYJYRxlFRFRFlRFllGGlxRFxEGRJRFRFcY84c8mGcJL8G1WIFRFRGIGmmYFGRGRcGc88RYcYRFIGIGmmIomGFJYFooGmlFllGmmFIFIFGFmoIGIomFJIm8cBhRRxxBC4ECFRFRFlRFRFRFRFRFRFlRFRFRFRFRFRGYLRFcRBRCxxUF8YFMF1WRFYKFRFRFGRFGYRFGRFllRlRGRFmmIGIooGGY44E46FmxRJRLRY44U44GmmQRJRFEFRFGFlGRFRFxmGmoIooGmoIoxRxxIoGIGRxxcx4YJFRFRFRFRJLRcFmmIomRx4YFoGGmRomIGIGmxRJRJRYEYRGmmHRGIFmIGmIIooGFRJYcGcRmmIFomGmmIomGmlFJFmoGooGGIRYFIGIGRYJRFJFEYCRBRBYRGYGIGFGFllGomGFRCECECEGRGhCCiBCBCRBRCBCBCRBRCxBCBCRCDCDCDCiiRBj7CbCiiRBj7b7iCiiRxiCBRbCBbxxCiiRBj7bRMQUY9+V9+VYtOQMY9eY43X44Z1WY54XYMQRQrERLZ12ELZ12RERaRGHGHGR88B88BihBhiChhC8hcZBc8BB8CBCFi8cihBZBC8Z8CLKhCKr8cRZcZc88ZcZc85Z8ZcZc1WcZc1WcZcZcZcRcRLcLcZcZcZcZc1WLcZ1WZ1WZcZ1WZ1WZ1WZcZcZcRcRcBRCixBBCiBBihCCEBhCCchCGhCRY44LCiRRxxCFRkYRGFRFRFRFRFRFRFRFRFRGY9eY49eY44U49e49e1WYEYUY04VY48cRcRcRcRcRs4Y48ElK1Wc1W12U2cKGooUE88KqqEl4c8RFxxGm7bkkFUF4kEkFRFRFx8cLcFcRFcRLcLcLcLcLcFcFRFEFRcRFEYFEYFJFRhClmHnnYG4EhCEGFKGYRbEbhCCiBECiBhCk7bhClBihCiBBCBhCRhiBhhCCRhiFkkCFlGllGllGFooGmIcGRL88aRFYRIFIGRYJRGFYl4FGJFGYFGIRYFRGIFmoIGIGIYxEJRYFmEFJFRFGmoImoIGRFGFmIRJRYFEFcloGIFmlGmlFGFlmGFRllEYFomGo4YlkEoGRFRFRFRFRFRCbECk7bRCFooG4oGRJRFRFRFRTSFRFRCRCRlGFZFRFRlxFFbRF2VRFRFRF6cRGY41WRG40UX1W44V24Y44X33Y44R44U1WY50Z5R46YRFRFxxQY44a41W54U
YJYZYB14W7XC15WZ12YYFEFEFRFRFRFlxRllRxxa65b86axcZcRQcR"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, 1343)
}
}
/**
 * Unpacks the category value stored in a range's packed [code] for a given position.
 *
 * The packed code takes one of three shapes, told apart by magnitude:
 * - below `0x20`: a single value that applies to every position;
 * - below `0x400`: two 5-bit values, the upper one used at odd positions, the lower at even ones;
 * - otherwise: three values selected by the position modulo 3.
 *
 * @param code the packed category code of the range.
 * @param ch the position to resolve; the caller in this file passes the offset from the range start.
 * @return the unpacked category value.
 */
private fun categoryValueFrom(code: Int, ch: Int): Int {
    // Single category for the whole range.
    if (code < 0x20) {
        return code
    }
    // Two alternating categories, chosen by parity.
    if (code < 0x400) {
        val isOdd = (ch and 1) == 1
        return if (isOdd) code shr 5 else code and 0x1f
    }
    // Three repeating categories, chosen by remainder.
    return when (ch % 3) {
        2 -> code shr 10
        1 -> (code shr 5) and 0x1f
        else -> code and 0x1f
    }
}
/**
 * Looks up the general category value of this character in the [Category] tables.
 *
 * The range containing the character is located by binary search over the range starts; the
 * range's packed code is then unpacked for the character's offset inside that range.
 * A decoded value of 17 is reported as the value of [CharCategory.UNASSIGNED].
 */
internal fun Char.getCategoryValue(): Int {
    val codeUnit = this.toInt()
    val rangeIndex = binarySearchRange(Category.decodedRangeStart, codeUnit)
    val offsetInRange = codeUnit - Category.decodedRangeStart[rangeIndex]
    val packed = Category.decodedRangeCategory[rangeIndex]
    return when (val decoded = categoryValueFrom(packed, offsetInRange)) {
        17 -> CharCategory.UNASSIGNED.value
        else -> decoded
    }
}
/**
 * Decodes a string of variable-length integers written with base64 symbols.
 *
 * Every symbol contributes its low five bits to the current integer, least significant group
 * first. A symbol whose 6-bit value is below `0x20` ends the current integer; a larger value
 * means further groups follow.
 *
 * @param base64 the encoded text.
 * @param fromBase64 table mapping a symbol's character code to its 6-bit value.
 * @param resultLength size of the returned array; slots beyond the decoded integers stay 0.
 * @return the decoded integers in order of appearance.
 */
internal fun decodeVarLenBase64(base64: String, fromBase64: IntArray, resultLength: Int): IntArray {
    val decoded = IntArray(resultLength)
    var written = 0
    var accumulator = 0
    var bitOffset = 0
    for (position in base64.indices) {
        val sextet = fromBase64[base64[position].toInt()]
        accumulator = accumulator or ((sextet and 0x1f) shl bitOffset)
        if (sextet >= 0x20) {
            // Continuation flag set: the next symbol supplies the next five bits.
            bitOffset += 5
            continue
        }
        decoded[written] = accumulator
        written += 1
        accumulator = 0
        bitOffset = 0
    }
    return decoded
}
@@ -0,0 +1,47 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 37 ranges in total
/**
 * Table of digit ranges consulted by `isDigitImpl`.
 */
private object Digit {
    /**
     * Start code of every digit range, in ascending order.
     * `isDigitImpl` treats each range as spanning the start code and the nine codes after it.
     */
    internal val rangeStart = intArrayOf(
        0x0030, 0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66, 0x0ae6,
        0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66, 0x0de6, 0x0e50, 0x0ed0,
        0x0f20, 0x1040, 0x1090, 0x17e0, 0x1810, 0x1946, 0x19d0, 0x1a80,
        0x1a90, 0x1b50, 0x1bb0, 0x1c40, 0x1c50, 0xa620, 0xa8d0, 0xa900,
        0xa9d0, 0xa9f0, 0xaa50, 0xabf0, 0xff10,
    )
}
/**
 * Binary-searches [array] (expected to be sorted ascending) for the range that [needle] falls into.
 *
 * @param array ascending range start values.
 * @param needle the value to locate.
 * @return the index of [needle] when present; otherwise the index of the last probed element if
 * it is smaller than [needle], or one less than that index if it is larger. For a non-empty
 * array this is the index of the greatest element not exceeding [needle], or `-1` when every
 * element is greater.
 */
internal fun binarySearchRange(array: IntArray, needle: Int): Int {
    var low = 0
    var high = array.size - 1
    var probeIndex = -1
    var probeValue = 0
    while (low <= high) {
        probeIndex = (low + high) / 2
        probeValue = array[probeIndex]
        when {
            needle == probeValue -> return probeIndex
            needle > probeValue -> low = probeIndex + 1
            else -> high = probeIndex - 1
        }
    }
    // The last probe overshot when it is larger than the needle; step back one slot in that case.
    return if (needle < probeValue) probeIndex - 1 else probeIndex
}
/**
 * Returns `true` if this character is a digit.
 *
 * The character is a digit when it lies within the ten codes starting at the closest
 * [Digit.rangeStart] entry that does not exceed it.
 */
internal fun Char.isDigitImpl(): Boolean {
    val ch = this.toInt()
    val index = binarySearchRange(Digit.rangeStart, ch)
    // A negative index means ch precedes the first digit range; reading rangeStart[index]
    // would be out of bounds, and such a character is not a digit.
    if (index < 0) {
        return false
    }
    val high = Digit.rangeStart[index] + 9
    return ch <= high
}
@@ -0,0 +1,114 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 222 ranges in total
/**
 * Holds the three parallel tables used by `getLetterType`: start, length and packed type code
 * of every letter range. All tables are decoded from base64 text in `init`.
 */
private object Letter {
// Start code of each range, in ascending order (running sums of the decoded diffs).
val decodedRangeStart: IntArray
// Number of codes covered by each range.
val decodedRangeLength: IntArray
// Packed letter-type code of each range; interpreted by `getLetterType`.
val decodedRangeCategory: IntArray
init {
// Build the reverse lookup: character code of a base64 symbol -> its 6-bit value.
val toBase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = 356
val rangeStartDiff = "hCgBpCQGYHZH5BRpBPPPPPPRMP5BPPlCPP6BkEPPPPcPXPzBvBrB3BOiDoBHwD+E3DauCnFmBmB2D6E1BlBTiBmBlBP5BhBiBrBvBjBqBnBPRtBiCmCtBlB0BmB5BiB7BmBgEmChBZgCoEoGVpBSfRhBPqKQ2BwBYoFgB4CJuTiEvBuCuDrF5DgEgFlJ1DgFmBQtBsBRGsB+BPiBlD1EIjDPRPPPQPPPPPGQSQS/DxENVNU+B9zCwBwBPPCkDPNnBPqDYY1R8B7FkFgTgwGgwUwmBgKwBuBScmEP/BPPPPPPrBP8B7F1B/ErBqC6B7BiBmBfQsBUwCw/KwqIwLwETPcPjQgJxFgBlBsD"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, 222)
// The first diff is the first start itself; every later start adds its diff to the previous start.
val start = IntArray(diff.size)
for (i in diff.indices) {
if (i == 0) start[i] = diff[i]
else start[i] = start[i - 1] + diff[i]
}
decodedRangeStart = start
// rangeLength.length = 328
val rangeLength = "aaMBXHYH5BRpBPPPPPPRMP5BPPlCPPzBDOOPPcPXPzBvBjB3BOhDmBBpB7DoDYxB+EiBP1DoExBkBQhBekBPmBgBhBctBiBMWOOXhCsBpBkBUV3Ba4BkB0DlCgBXgBtD4FSdBfPhBPpKP0BvBXjEQ2CGsT8DhBtCqDpFvD1D3E0IrD2EkBJrBDOBsB+BPiBlB1EIjDPPPPPPPPPPPGPPMNLsBNPNPKCvBvBPPCkDPBmBPhDXXgD4B6FzEgDguG9vUtkB9JcuBSckEP/BPPPPPPBPf4FrBjEhBpC3B5BKaWPrBOwCk/KsCuLqDHPbPxPsFtEaaqDL"
decodedRangeLength = decodeVarLenBase64(rangeLength, fromBase64, 222)
// rangeCategory.length = 959
val rangeCategory = "GFjgggUHGGFFZZZmzpz5qB6s6020B60ptltB6smt2sB60mz22B1+vv+8BZZ5s2850BW5q1ymtB506smzBF3q1q1qB1q1q1+Bgii4wDTm74g3KiggxqM60q1q1Bq1o1q1BF1qlrqrBZ2q5wprBGFZWWZGHFsjiooLowgmOowjkwCkgoiIk7ligGogiioBkwkiYkzj2oNoi+sbkwj04DghhkQ8wgiYkgoioDsgnkwC4gikQ//v+85BkwvoIsgoyI4yguI0whiwEowri4CoghsJowgqYowgm4DkwgsY/nwnzPowhmYkg6wI8yggZswikwHgxgmIoxgqYkwgk4DkxgmIkgoioBsgssoBgzgyI8g9gL8g9kI0wgwJoxgkoC0wgioFkw/wI0w53iF4gioYowjmgBHGq1qkgwBF1q1q8qBHwghuIwghyKk0goQkwgoQk3goQHGFHkyg0pBgxj6IoinkxDswno7Ikwhz9Bo0gioB8z48Rwli0xN0mpjoX8w78pDwltoqKHFGGwwgsIHFH3q1q16BFHWFZ1q10q1B2qlwq1B1q10q1B2q1yq1B6q1gq1Biq1qhxBir1qp1Bqt1q1qB1g1q1+B//3q16B///q1qBH/qlqq9Bholqq9B1i00a1q10qD1op1HkwmigEigiy6Cptogq1Bixo1kDq7/j00B2qgoBWGFm1lz50B6s5q1+BGWhggzhwBFFhgk4//Bo2jigE8wguI8wguI8wgugUog1qoB4qjmIwwi2KgkYHHH4lBgiFWkgIWoghssMmz5smrBZ3q1y50B5sm7gzBtz1smzB5smz50BqzqtmzB5sgzqzBF2/9//5BowgoIwmnkzPkwgk4C8ys65BkgoqI0wgy6FghquZo2giY0ghiIsgh24B4ghsQ8QF/v1q1OFs0O8iCHHF1qggz/B8wg6Iznv+//B08QgohsjK0QGFk7hsQ4gB"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, 222)
}
}
/**
 * Tells whether this character is a letter, i.e. whether its letter type is non-zero.
 */
internal fun Char.isLetterImpl(): Boolean = getLetterType() != 0
/**
 * Tells whether this character is a lower case letter, i.e. whether its letter type is `1`.
 */
internal fun Char.isLowerCaseImpl(): Boolean = getLetterType() == 1
/**
 * Tells whether this character is an upper case letter, i.e. whether its letter type is `2`.
 */
internal fun Char.isUpperCaseImpl(): Boolean = getLetterType() == 2
/**
 * Classifies this character using the [Letter] range tables.
 *
 * Returns
 * - `1` if the character is a lower case letter,
 * - `2` if the character is an upper case letter,
 * - `3` if the character is a letter but not a lower or upper case letter,
 * - `0` otherwise, including when the character lies before the first range or past the end
 *   of the range found for it.
 */
private fun Char.getLetterType(): Int {
    val ch = this.toInt()
    val index = binarySearchRange(Letter.decodedRangeStart, ch)
    // A negative index means ch precedes the first range start; indexing the tables with it
    // would be out of bounds, and such a character is not a letter.
    if (index < 0) {
        return 0
    }

    val rangeStart = Letter.decodedRangeStart[index]
    val rangeEnd = rangeStart + Letter.decodedRangeLength[index] - 1
    val code = Letter.decodedRangeCategory[index]

    if (ch > rangeEnd) {
        return 0
    }

    val lastTwoBits = code and 0x3

    if (lastTwoBits == 0) { // gap pattern
        // Above the two low bits the code holds 7-bit run lengths that alternate between
        // a run of type-3 letters and a run of non-letters; up to two such pairs are read.
        var shift = 2
        var threshold = rangeStart
        for (i in 0..1) {
            threshold += (code shr shift) and 0x7f
            if (threshold > ch) {
                return 3
            }
            shift += 7
            threshold += (code shr shift) and 0x7f
            if (threshold > ch) {
                return 0
            }
            shift += 7
        }
        // Everything after the encoded runs is a type-3 letter.
        return 3
    }

    // A code that fits in three bits carries one type for the whole range.
    if (code <= 0x7) {
        return lastTwoBits
    }

    // Otherwise the code is a sequence of 2-bit types: two alternating ones when the code fits
    // in five bits, else one per position in the range.
    val distance = (ch - rangeStart)
    val shift = if (code <= 0x1F) distance % 2 else distance
    return (code shr (2 * shift)) and 0x3
}
@@ -0,0 +1,31 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateUnicodeData.kt
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//
// 9 ranges in total
/**
 * Tells whether this character is a whitespace, by matching its code against a fixed set of
 * single codes and code ranges.
 */
internal fun Char.isWhitespaceImpl(): Boolean {
    return when (this.toInt()) {
        // Codes up to 0x00a0.
        in 0x0009..0x000d,
        in 0x001c..0x0020,
        0x00a0 -> true
        // Codes above 0x1000.
        0x1680,
        in 0x2000..0x200a,
        0x2028,
        0x2029,
        0x202f,
        0x205f,
        0x3000 -> true
        else -> false
    }
}
@@ -0,0 +1,172 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
/**
 * Represents the character general category in the Unicode specification.
 *
 * @property value integer identifier of the category; [contains] compares it with the
 * result of `Char.getCategoryValue()`. No entry uses the value `17`.
 * @property code two-letter code of this general category in the Unicode specification.
 */
public actual enum class CharCategory(internal val value: Int, public actual val code: String) {
    /** General category "Cn" in the Unicode specification. */
    UNASSIGNED(0, "Cn"),

    /** General category "Lu" in the Unicode specification. */
    UPPERCASE_LETTER(1, "Lu"),

    /** General category "Ll" in the Unicode specification. */
    LOWERCASE_LETTER(2, "Ll"),

    /** General category "Lt" in the Unicode specification. */
    TITLECASE_LETTER(3, "Lt"),

    /** General category "Lm" in the Unicode specification. */
    MODIFIER_LETTER(4, "Lm"),

    /** General category "Lo" in the Unicode specification. */
    OTHER_LETTER(5, "Lo"),

    /** General category "Mn" in the Unicode specification. */
    NON_SPACING_MARK(6, "Mn"),

    /** General category "Me" in the Unicode specification. */
    ENCLOSING_MARK(7, "Me"),

    /** General category "Mc" in the Unicode specification. */
    COMBINING_SPACING_MARK(8, "Mc"),

    /** General category "Nd" in the Unicode specification. */
    DECIMAL_DIGIT_NUMBER(9, "Nd"),

    /** General category "Nl" in the Unicode specification. */
    LETTER_NUMBER(10, "Nl"),

    /** General category "No" in the Unicode specification. */
    OTHER_NUMBER(11, "No"),

    /** General category "Zs" in the Unicode specification. */
    SPACE_SEPARATOR(12, "Zs"),

    /** General category "Zl" in the Unicode specification. */
    LINE_SEPARATOR(13, "Zl"),

    /** General category "Zp" in the Unicode specification. */
    PARAGRAPH_SEPARATOR(14, "Zp"),

    /** General category "Cc" in the Unicode specification. */
    CONTROL(15, "Cc"),

    /** General category "Cf" in the Unicode specification. */
    FORMAT(16, "Cf"),

    /** General category "Co" in the Unicode specification. */
    PRIVATE_USE(18, "Co"),

    /** General category "Cs" in the Unicode specification. */
    SURROGATE(19, "Cs"),

    /** General category "Pd" in the Unicode specification. */
    DASH_PUNCTUATION(20, "Pd"),

    /** General category "Ps" in the Unicode specification. */
    START_PUNCTUATION(21, "Ps"),

    /** General category "Pe" in the Unicode specification. */
    END_PUNCTUATION(22, "Pe"),

    /** General category "Pc" in the Unicode specification. */
    CONNECTOR_PUNCTUATION(23, "Pc"),

    /** General category "Po" in the Unicode specification. */
    OTHER_PUNCTUATION(24, "Po"),

    /** General category "Sm" in the Unicode specification. */
    MATH_SYMBOL(25, "Sm"),

    /** General category "Sc" in the Unicode specification. */
    CURRENCY_SYMBOL(26, "Sc"),

    /** General category "Sk" in the Unicode specification. */
    MODIFIER_SYMBOL(27, "Sk"),

    /** General category "So" in the Unicode specification. */
    OTHER_SYMBOL(28, "So"),

    /** General category "Pi" in the Unicode specification. */
    INITIAL_QUOTE_PUNCTUATION(29, "Pi"),

    /** General category "Pf" in the Unicode specification. */
    FINAL_QUOTE_PUNCTUATION(30, "Pf");

    /**
     * Checks whether [char] belongs to this category.
     */
    public actual operator fun contains(char: Char): Boolean {
        val categoryOfChar = char.getCategoryValue()
        return categoryOfChar == this.value
    }

    companion object {
        /**
         * Looks up the entry whose [value] equals [category].
         *
         * Values `0..16` coincide with the entry ordinal; values `18..30` are one above the
         * ordinal because no entry carries the value `17`.
         *
         * @throws IllegalArgumentException if [category] is `17` or outside `0..30`.
         */
        internal fun valueOf(category: Int): CharCategory {
            val ordinal = when {
                category in 0..16 -> category
                category in 18..30 -> category - 1
                else -> throw IllegalArgumentException("Category #$category is not defined.")
            }
            return values()[ordinal]
        }
    }
}
+139 -3
View File
@@ -5,9 +5,6 @@
package kotlin.text
// actually \s is enough to match all whitespace, but \xA0 added because of different regexp behavior of Rhino used in Selenium tests
public actual fun Char.isWhitespace(): Boolean = toString().matches("[\\s\\xA0]")
/**
* Converts this character to lower case using Unicode mapping rules of the invariant locale.
*/
@@ -91,3 +88,142 @@ public actual fun Char.isHighSurrogate(): Boolean = this in Char.MIN_HIGH_SURROG
* Returns `true` if this character is a Unicode low-surrogate code unit (also known as trailing-surrogate code unit).
*/
public actual fun Char.isLowSurrogate(): Boolean = this in Char.MIN_LOW_SURROGATE..Char.MAX_LOW_SURROGATE
/**
 * The Unicode general category this character belongs to.
 */
public actual val Char.category: CharCategory
    get() {
        val categoryValue = getCategoryValue()
        return CharCategory.valueOf(categoryValue)
    }
/**
 * Returns `true` if this character (Unicode code point) is defined in Unicode.
 *
 * A character counts as defined when its [category] is not [CharCategory.UNASSIGNED].
 * Characters below `\u0080` are answered directly without a category lookup.
 */
public actual fun Char.isDefined(): Boolean =
    this < '\u0080' || getCategoryValue() != CharCategory.UNASSIGNED.value
/**
 * Returns `true` if this character is a letter.
 *
 * A letter is a character whose [category] is [CharCategory.UPPERCASE_LETTER],
 * [CharCategory.LOWERCASE_LETTER], [CharCategory.TITLECASE_LETTER], [CharCategory.MODIFIER_LETTER],
 * or [CharCategory.OTHER_LETTER]. Characters below `\u0080` are decided without a table lookup.
 *
 * @sample samples.text.Chars.isLetter
 */
public actual fun Char.isLetter(): Boolean = when {
    this in 'a'..'z' || this in 'A'..'Z' -> true
    this < '\u0080' -> false
    else -> isLetterImpl()
}
/**
 * Returns `true` if this character is a letter or a digit.
 *
 * Characters below `\u0080` are decided without a table lookup.
 *
 * @see isLetter
 * @see isDigit
 *
 * @sample samples.text.Chars.isLetterOrDigit
 */
public actual fun Char.isLetterOrDigit(): Boolean {
    val isAsciiLetterOrDigit = this in 'a'..'z' || this in 'A'..'Z' || this in '0'..'9'
    return when {
        isAsciiLetterOrDigit -> true
        this < '\u0080' -> false
        else -> isDigitImpl() || isLetterImpl()
    }
}
/**
 * Returns `true` if this character is a digit.
 *
 * A digit is a character whose [category] is [CharCategory.DECIMAL_DIGIT_NUMBER].
 * Characters below `\u0080` are decided without a table lookup.
 *
 * @sample samples.text.Chars.isDigit
 */
public actual fun Char.isDigit(): Boolean =
    this in '0'..'9' || (this >= '\u0080' && isDigitImpl())
/**
 * Returns `true` if this character is an upper case letter.
 *
 * An upper case letter is a character whose [category] is [CharCategory.UPPERCASE_LETTER].
 * Characters below `\u0080` are decided without a table lookup.
 *
 * @sample samples.text.Chars.isUpperCase
 */
public actual fun Char.isUpperCase(): Boolean = when {
    this in 'A'..'Z' -> true
    this < '\u0080' -> false
    else -> isUpperCaseImpl()
}
/**
 * Returns `true` if this character is a lower case letter.
 *
 * A lower case letter is a character whose [category] is [CharCategory.LOWERCASE_LETTER].
 * Characters below `\u0080` are decided without a table lookup.
 *
 * @sample samples.text.Chars.isLowerCase
 */
public actual fun Char.isLowerCase(): Boolean = when {
    this in 'a'..'z' -> true
    this < '\u0080' -> false
    else -> isLowerCaseImpl()
}
/**
 * Returns `true` if this character is a title case letter.
 *
 * A title case letter is a character whose [category] is [CharCategory.TITLECASE_LETTER].
 * Characters below `\u0080` always yield `false` without a category lookup.
 *
 * @sample samples.text.Chars.isTitleCase
 */
public actual fun Char.isTitleCase(): Boolean =
    this >= '\u0080' && getCategoryValue() == CharCategory.TITLECASE_LETTER.value
/**
 * Returns `true` if this character is an ISO control character.
 *
 * An ISO control character is a character whose [category] is [CharCategory.CONTROL];
 * the check is done against the ranges `\u0000..\u001F` and `\u007F..\u009F`.
 *
 * @sample samples.text.Chars.isISOControl
 */
public actual fun Char.isISOControl(): Boolean = when (this) {
    in '\u0000'..'\u001F', in '\u007F'..'\u009F' -> true
    else -> false
}
/**
 * Determines whether a character is whitespace according to the Unicode standard.
 *
 * @return `true` if the character is whitespace.
 *
 * @sample samples.text.Chars.isWhitespace
 */
public actual fun Char.isWhitespace(): Boolean {
    return isWhitespaceImpl()
}
@@ -0,0 +1,177 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
/**
 * Represents the character general category in the Unicode specification.
 *
 * @property value integer identifier of the category; [contains] compares it with the
 * result of `Character.getType`. No entry uses the value `17`.
 * @property code two-letter code of this general category in the Unicode specification.
 */
public actual enum class CharCategory(public val value: Int, public actual val code: String) {
    /** General category "Cn" in the Unicode specification. */
    UNASSIGNED(0, "Cn"),

    /** General category "Lu" in the Unicode specification. */
    UPPERCASE_LETTER(1, "Lu"),

    /** General category "Ll" in the Unicode specification. */
    LOWERCASE_LETTER(2, "Ll"),

    /** General category "Lt" in the Unicode specification. */
    TITLECASE_LETTER(3, "Lt"),

    /** General category "Lm" in the Unicode specification. */
    MODIFIER_LETTER(4, "Lm"),

    /** General category "Lo" in the Unicode specification. */
    OTHER_LETTER(5, "Lo"),

    /** General category "Mn" in the Unicode specification. */
    NON_SPACING_MARK(6, "Mn"),

    /** General category "Me" in the Unicode specification. */
    ENCLOSING_MARK(7, "Me"),

    /** General category "Mc" in the Unicode specification. */
    COMBINING_SPACING_MARK(8, "Mc"),

    /** General category "Nd" in the Unicode specification. */
    DECIMAL_DIGIT_NUMBER(9, "Nd"),

    /** General category "Nl" in the Unicode specification. */
    LETTER_NUMBER(10, "Nl"),

    /** General category "No" in the Unicode specification. */
    OTHER_NUMBER(11, "No"),

    /** General category "Zs" in the Unicode specification. */
    SPACE_SEPARATOR(12, "Zs"),

    /** General category "Zl" in the Unicode specification. */
    LINE_SEPARATOR(13, "Zl"),

    /** General category "Zp" in the Unicode specification. */
    PARAGRAPH_SEPARATOR(14, "Zp"),

    /** General category "Cc" in the Unicode specification. */
    CONTROL(15, "Cc"),

    /** General category "Cf" in the Unicode specification. */
    FORMAT(16, "Cf"),

    /** General category "Co" in the Unicode specification. */
    PRIVATE_USE(18, "Co"),

    /** General category "Cs" in the Unicode specification. */
    SURROGATE(19, "Cs"),

    /** General category "Pd" in the Unicode specification. */
    DASH_PUNCTUATION(20, "Pd"),

    /** General category "Ps" in the Unicode specification. */
    START_PUNCTUATION(21, "Ps"),

    /** General category "Pe" in the Unicode specification. */
    END_PUNCTUATION(22, "Pe"),

    /** General category "Pc" in the Unicode specification. */
    CONNECTOR_PUNCTUATION(23, "Pc"),

    /** General category "Po" in the Unicode specification. */
    OTHER_PUNCTUATION(24, "Po"),

    /** General category "Sm" in the Unicode specification. */
    MATH_SYMBOL(25, "Sm"),

    /** General category "Sc" in the Unicode specification. */
    CURRENCY_SYMBOL(26, "Sc"),

    /** General category "Sk" in the Unicode specification. */
    MODIFIER_SYMBOL(27, "Sk"),

    /** General category "So" in the Unicode specification. */
    OTHER_SYMBOL(28, "So"),

    /** General category "Pi" in the Unicode specification. */
    INITIAL_QUOTE_PUNCTUATION(29, "Pi"),

    /** General category "Pf" in the Unicode specification. */
    FINAL_QUOTE_PUNCTUATION(30, "Pf");

    /**
     * Checks whether [char] belongs to this category.
     */
    public actual operator fun contains(char: Char): Boolean {
        val javaType = Character.getType(char)
        return javaType == this.value
    }

    companion object {
        /**
         * Looks up the [CharCategory] for the given [category], a Java general category constant.
         *
         * Values `0..16` coincide with the entry ordinal; values `18..30` are one above the
         * ordinal because no entry carries the value `17`.
         *
         * @throws IllegalArgumentException if the [category] does not represent a Java general category constant.
         */
        public fun valueOf(category: Int): CharCategory {
            val ordinal = when {
                category in 0..16 -> category
                category in 18..30 -> category - 1
                else -> throw IllegalArgumentException("Category #$category is not defined.")
            }
            return values()[ordinal]
        }
    }
}
+32 -14
View File
@@ -10,32 +10,51 @@ package kotlin.text
import java.util.Locale
/**
* Returns the Unicode general category of this character.
*/
public actual val Char.category: CharCategory
get() = CharCategory.valueOf(Character.getType(this))
/**
* Returns `true` if this character (Unicode code point) is defined in Unicode.
*
* A character is considered to be defined in Unicode if its [category] is not [CharCategory.UNASSIGNED].
*/
@kotlin.internal.InlineOnly
public inline fun Char.isDefined(): Boolean = Character.isDefined(this)
public actual inline fun Char.isDefined(): Boolean = Character.isDefined(this)
/**
* Returns `true` if this character is a letter.
*
* A character is considered to be a letter if its [category] is [CharCategory.UPPERCASE_LETTER],
* [CharCategory.LOWERCASE_LETTER], [CharCategory.TITLECASE_LETTER], [CharCategory.MODIFIER_LETTER], or [CharCategory.OTHER_LETTER].
*
* @sample samples.text.Chars.isLetter
*/
@kotlin.internal.InlineOnly
public inline fun Char.isLetter(): Boolean = Character.isLetter(this)
public actual inline fun Char.isLetter(): Boolean = Character.isLetter(this)
/**
* Returns `true` if this character is a letter or digit.
*
* @see isLetter
* @see isDigit
*
* @sample samples.text.Chars.isLetterOrDigit
*/
@kotlin.internal.InlineOnly
public inline fun Char.isLetterOrDigit(): Boolean = Character.isLetterOrDigit(this)
public actual inline fun Char.isLetterOrDigit(): Boolean = Character.isLetterOrDigit(this)
/**
* Returns `true` if this character (Unicode code point) is a digit.
* Returns `true` if this character is a digit.
*
* A character is considered to be a digit if its [category] is [CharCategory.DECIMAL_DIGIT_NUMBER].
*
* @sample samples.text.Chars.isDigit
*/
@kotlin.internal.InlineOnly
public inline fun Char.isDigit(): Boolean = Character.isDigit(this)
public actual inline fun Char.isDigit(): Boolean = Character.isDigit(this)
/**
@@ -47,10 +66,13 @@ public inline fun Char.isIdentifierIgnorable(): Boolean = Character.isIdentifier
/**
* Returns `true` if this character is an ISO control character.
*
* A character is considered to be an ISO control character if its [category] is [CharCategory.CONTROL].
*
* @sample samples.text.Chars.isISOControl
*/
@kotlin.internal.InlineOnly
public inline fun Char.isISOControl(): Boolean = Character.isISOControl(this)
public actual inline fun Char.isISOControl(): Boolean = Character.isISOControl(this)
/**
* Returns `true` if this character (Unicode code point) may be part of a Java identifier as other than the first character.
@@ -69,6 +91,7 @@ public inline fun Char.isJavaIdentifierStart(): Boolean = Character.isJavaIdenti
/**
 * Determines whether a character is whitespace according to the Unicode standard.
 *
 * The character is accepted when either `Character.isWhitespace` or
 * `Character.isSpaceChar` accepts it.
 *
 * @return `true` if the character is whitespace.
 *
 * @sample samples.text.Chars.isWhitespace
 */
public actual fun Char.isWhitespace(): Boolean {
    if (Character.isWhitespace(this)) return true
    return Character.isSpaceChar(this)
}
@@ -78,14 +101,14 @@ public actual fun Char.isWhitespace(): Boolean = Character.isWhitespace(this) ||
* @sample samples.text.Chars.isUpperCase
*/
@kotlin.internal.InlineOnly
public inline fun Char.isUpperCase(): Boolean = Character.isUpperCase(this)
public actual inline fun Char.isUpperCase(): Boolean = Character.isUpperCase(this)
/**
* Returns `true` if this character is lower case.
* @sample samples.text.Chars.isLowerCase
*/
@kotlin.internal.InlineOnly
public inline fun Char.isLowerCase(): Boolean = Character.isLowerCase(this)
public actual inline fun Char.isLowerCase(): Boolean = Character.isLowerCase(this)
/**
* Converts this character to lower case using Unicode mapping rules of the invariant locale.
@@ -192,7 +215,7 @@ public fun Char.lowercase(locale: Locale): String = toString().lowercase(locale)
* @sample samples.text.Chars.isTitleCase
*/
@kotlin.internal.InlineOnly
public inline fun Char.isTitleCase(): Boolean = Character.isTitleCase(this)
public actual inline fun Char.isTitleCase(): Boolean = Character.isTitleCase(this)
/**
* Converts this character to title case using Unicode mapping rules of the invariant locale.
@@ -260,11 +283,6 @@ public fun Char.titlecase(locale: Locale): String {
return titlecaseChar().toString()
}
/**
* Returns a value indicating a character's general category.
*/
public val Char.category: CharCategory get() = CharCategory.valueOf(Character.getType(this))
/**
* Returns the Unicode directionality property for the given character.
*/
+85
View File
@@ -227,3 +227,88 @@ public fun Char.equals(other: Char, ignoreCase: Boolean = false): Boolean {
* Returns `true` if this character is a Unicode surrogate code unit.
*/
public fun Char.isSurrogate(): Boolean = this in Char.MIN_SURROGATE..Char.MAX_SURROGATE
/**
* Returns the Unicode general category of this character.
*/
public expect val Char.category: CharCategory
/**
* Returns `true` if this character (Unicode code point) is defined in Unicode.
*
* A character is considered to be defined in Unicode if its [category] is not [CharCategory.UNASSIGNED].
*/
public expect fun Char.isDefined(): Boolean
/**
* Returns `true` if this character is a letter.
*
* A character is considered to be a letter if its [category] is [CharCategory.UPPERCASE_LETTER],
* [CharCategory.LOWERCASE_LETTER], [CharCategory.TITLECASE_LETTER], [CharCategory.MODIFIER_LETTER], or [CharCategory.OTHER_LETTER].
*
* @sample samples.text.Chars.isLetter
*/
public expect fun Char.isLetter(): Boolean
/**
* Returns `true` if this character is a letter or digit.
*
* @see isLetter
* @see isDigit
*
* @sample samples.text.Chars.isLetterOrDigit
*/
public expect fun Char.isLetterOrDigit(): Boolean
/**
* Returns `true` if this character is a digit.
*
* A character is considered to be a digit if its [category] is [CharCategory.DECIMAL_DIGIT_NUMBER].
*
* @sample samples.text.Chars.isDigit
*/
public expect fun Char.isDigit(): Boolean
/**
* Returns `true` if this character is an upper case letter.
*
* A character is considered to be an upper case letter if its [category] is [CharCategory.UPPERCASE_LETTER].
*
* @sample samples.text.Chars.isUpperCase
*/
public expect fun Char.isUpperCase(): Boolean
/**
* Returns `true` if this character is a lower case letter.
*
* A character is considered to be a lower case letter if its [category] is [CharCategory.LOWERCASE_LETTER].
*
* @sample samples.text.Chars.isLowerCase
*/
public expect fun Char.isLowerCase(): Boolean
/**
* Returns `true` if this character is a title case letter.
*
* A character is considered to be a title case letter if its [category] is [CharCategory.TITLECASE_LETTER].
*
* @sample samples.text.Chars.isTitleCase
*/
public expect fun Char.isTitleCase(): Boolean
/**
* Returns `true` if this character is an ISO control character.
*
* A character is considered to be an ISO control character if its [category] is [CharCategory.CONTROL].
*
* @sample samples.text.Chars.isISOControl
*/
public expect fun Char.isISOControl(): Boolean
/**
* Determines whether a character is whitespace according to the Unicode standard.
* Returns `true` if the character is whitespace.
*
* @sample samples.text.Chars.isWhitespace
*/
public expect fun Char.isWhitespace(): Boolean
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2018 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
@@ -8,166 +8,164 @@ package kotlin.text
/**
* Represents the character general category in the Unicode specification.
*/
public enum class CharCategory(public val value: Int, public val code: String) {
public expect enum class CharCategory {
/**
* General category "Cn" in the Unicode specification.
*/
UNASSIGNED(Character.UNASSIGNED.toInt(), "Cn"),
UNASSIGNED,
/**
* General category "Lu" in the Unicode specification.
*/
UPPERCASE_LETTER(Character.UPPERCASE_LETTER.toInt(), "Lu"),
UPPERCASE_LETTER,
/**
* General category "Ll" in the Unicode specification.
*/
LOWERCASE_LETTER(Character.LOWERCASE_LETTER.toInt(), "Ll"),
LOWERCASE_LETTER,
/**
* General category "Lt" in the Unicode specification.
*/
TITLECASE_LETTER(Character.TITLECASE_LETTER.toInt(), "Lt"),
TITLECASE_LETTER,
/**
* General category "Lm" in the Unicode specification.
*/
MODIFIER_LETTER(Character.MODIFIER_LETTER.toInt(), "Lm"),
MODIFIER_LETTER,
/**
* General category "Lo" in the Unicode specification.
*/
OTHER_LETTER(Character.OTHER_LETTER.toInt(), "Lo"),
OTHER_LETTER,
/**
* General category "Mn" in the Unicode specification.
*/
NON_SPACING_MARK(Character.NON_SPACING_MARK.toInt(), "Mn"),
NON_SPACING_MARK,
/**
* General category "Me" in the Unicode specification.
*/
ENCLOSING_MARK(Character.ENCLOSING_MARK.toInt(), "Me"),
ENCLOSING_MARK,
/**
* General category "Mc" in the Unicode specification.
*/
COMBINING_SPACING_MARK(Character.COMBINING_SPACING_MARK.toInt(), "Mc"),
COMBINING_SPACING_MARK,
/**
* General category "Nd" in the Unicode specification.
*/
DECIMAL_DIGIT_NUMBER(Character.DECIMAL_DIGIT_NUMBER.toInt(), "Nd"),
DECIMAL_DIGIT_NUMBER,
/**
* General category "Nl" in the Unicode specification.
*/
LETTER_NUMBER(Character.LETTER_NUMBER.toInt(), "Nl"),
LETTER_NUMBER,
/**
* General category "No" in the Unicode specification.
*/
OTHER_NUMBER(Character.OTHER_NUMBER.toInt(), "No"),
OTHER_NUMBER,
/**
* General category "Zs" in the Unicode specification.
*/
SPACE_SEPARATOR(Character.SPACE_SEPARATOR.toInt(), "Zs"),
SPACE_SEPARATOR,
/**
* General category "Zl" in the Unicode specification.
*/
LINE_SEPARATOR(Character.LINE_SEPARATOR.toInt(), "Zl"),
LINE_SEPARATOR,
/**
* General category "Zp" in the Unicode specification.
*/
PARAGRAPH_SEPARATOR(Character.PARAGRAPH_SEPARATOR.toInt(), "Zp"),
PARAGRAPH_SEPARATOR,
/**
* General category "Cc" in the Unicode specification.
*/
CONTROL(Character.CONTROL.toInt(), "Cc"),
CONTROL,
/**
* General category "Cf" in the Unicode specification.
*/
FORMAT(Character.FORMAT.toInt(), "Cf"),
FORMAT,
/**
* General category "Co" in the Unicode specification.
*/
PRIVATE_USE(Character.PRIVATE_USE.toInt(), "Co"),
PRIVATE_USE,
/**
* General category "Cs" in the Unicode specification.
*/
SURROGATE(Character.SURROGATE.toInt(), "Cs"),
SURROGATE,
/**
* General category "Pd" in the Unicode specification.
*/
DASH_PUNCTUATION(Character.DASH_PUNCTUATION.toInt(), "Pd"),
DASH_PUNCTUATION,
/**
* General category "Ps" in the Unicode specification.
*/
START_PUNCTUATION(Character.START_PUNCTUATION.toInt(), "Ps"),
START_PUNCTUATION,
/**
* General category "Pe" in the Unicode specification.
*/
END_PUNCTUATION(Character.END_PUNCTUATION.toInt(), "Pe"),
END_PUNCTUATION,
/**
* General category "Pc" in the Unicode specification.
*/
CONNECTOR_PUNCTUATION(Character.CONNECTOR_PUNCTUATION.toInt(), "Pc"),
CONNECTOR_PUNCTUATION,
/**
* General category "Po" in the Unicode specification.
*/
OTHER_PUNCTUATION(Character.OTHER_PUNCTUATION.toInt(), "Po"),
OTHER_PUNCTUATION,
/**
* General category "Sm" in the Unicode specification.
*/
MATH_SYMBOL(Character.MATH_SYMBOL.toInt(), "Sm"),
MATH_SYMBOL,
/**
* General category "Sc" in the Unicode specification.
*/
CURRENCY_SYMBOL(Character.CURRENCY_SYMBOL.toInt(), "Sc"),
CURRENCY_SYMBOL,
/**
* General category "Sk" in the Unicode specification.
*/
MODIFIER_SYMBOL(Character.MODIFIER_SYMBOL.toInt(), "Sk"),
MODIFIER_SYMBOL,
/**
* General category "So" in the Unicode specification.
*/
OTHER_SYMBOL(Character.OTHER_SYMBOL.toInt(), "So"),
OTHER_SYMBOL,
/**
* General category "Pi" in the Unicode specification.
*/
INITIAL_QUOTE_PUNCTUATION(Character.INITIAL_QUOTE_PUNCTUATION.toInt(), "Pi"),
INITIAL_QUOTE_PUNCTUATION,
/**
* General category "Pf" in the Unicode specification.
*/
FINAL_QUOTE_PUNCTUATION(Character.FINAL_QUOTE_PUNCTUATION.toInt(), "Pf");
FINAL_QUOTE_PUNCTUATION;
/**
* Two-letter code of this general category in the Unicode specification.
*/
public val code: String
/**
* Returns `true` if [char] character belongs to this category.
*/
public operator fun contains(char: Char): Boolean = Character.getType(char) == this.value
public companion object {
private val categoryMap by lazy { CharCategory.values().associateBy { it.value } }
public fun valueOf(category: Int): CharCategory = categoryMap[category] ?: throw IllegalArgumentException("Category #$category is not defined.")
}
public operator fun contains(char: Char): Boolean
}
+155
View File
@@ -7,6 +7,8 @@ package test.text
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFalse
import kotlin.test.assertTrue
import kotlin.test.assertFails
class CharTest {
@@ -146,4 +148,157 @@ class CharTest {
testFails(100, radix = 36)
testFails(100, radix = 110)
}
/**
 * Sample characters, one per Unicode general category, mapped to the expected two-letter
 * category code.
 *
 * Non-ASCII characters are written as `\u` escapes so the expectations do not depend on how
 * the source file is encoded or rendered.
 */
private fun charToCategory(): Map<Char, String> = mapOf(
    '\u0378' to "Cn",
    'A' to "Lu", // \u0041
    'a' to "Ll", // \u0061
    '\u01C5' to "Lt", // Latin capital letter D with small letter z with caron
    '\u02B0' to "Lm", // modifier letter small h
    '\u01BB' to "Lo", // Latin letter two with stroke
    '\u0300' to "Mn",
    '\u0489' to "Me",
    '\u0903' to "Mc",
    '0' to "Nd", // \u0030
    '\u2160' to "Nl", // Roman numeral one
    '\u00B2' to "No", // superscript two
    ' ' to "Zs", // \u0020
    '\u2028' to "Zl",
    '\u2029' to "Zp",
    '\u0018' to "Cc",
    '\u00AD' to "Cf",
    '\uE000' to "Co",
    '\uD800' to "Cs",
    '\u002D' to "Pd",
    '(' to "Ps", // \u0028
    ')' to "Pe", // \u0029
    '_' to "Pc", // \u005F
    '!' to "Po", // \u0021
    '+' to "Sm", // \u002B
    '$' to "Sc", // \u0024
    '^' to "Sk", // \u005E
    '\u00A9' to "So", // copyright sign
    '\u00AB' to "Pi", // left-pointing double angle quotation mark
    '\u00BB' to "Pf" // right-pointing double angle quotation mark
)
// Each sample character must report the expected two-letter category code.
@Test
fun charCategory() {
    charToCategory().forEach { (char, categoryCode) ->
        val hexCode = char.toInt().toString(radix = 16)
        assertEquals(categoryCode, char.category.code, "char code: $hexCode")
    }
}
// U+0378 is expected to be unassigned.
@Test
fun charCategoryUnassigned() {
    val sample = '\u0378'
    assertFalse(sample.isDefined())
    assertEquals(CharCategory.UNASSIGNED, sample.category)
    assertEquals("Cn", CharCategory.UNASSIGNED.code)
}
// Latin capital letter A is an upper case letter.
@Test
fun charCategoryUppercaseLetter() {
    val sample = '\u0041' // 'A'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isLetter())
    assertTrue(sample.isUpperCase())
    assertEquals(CharCategory.UPPERCASE_LETTER, sample.category)
    assertEquals("Lu", CharCategory.UPPERCASE_LETTER.code)
}
// Latin small letter a is a lower case letter.
@Test
fun charCategoryLowercaseLetter() {
    val sample = '\u0061' // 'a'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isLetter())
    assertTrue(sample.isLowerCase())
    assertEquals(CharCategory.LOWERCASE_LETTER, sample.category)
    assertEquals("Ll", CharCategory.LOWERCASE_LETTER.code)
}
// U+01C5 (Latin capital letter D with small letter z with caron) is a title case letter.
@Test
fun charCategoryTitlecaseLetter() {
    val sample = '\u01C5'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isLetter())
    assertTrue(sample.isTitleCase())
    assertEquals(CharCategory.TITLECASE_LETTER, sample.category)
    assertEquals("Lt", CharCategory.TITLECASE_LETTER.code)
}
// U+02B0 (modifier letter small h) is a modifier letter.
@Test
fun charCategoryModifierLetter() {
    val sample = '\u02B0'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isLetter())
    assertEquals(CharCategory.MODIFIER_LETTER, sample.category)
    assertEquals("Lm", CharCategory.MODIFIER_LETTER.code)
}
// U+01BB (Latin letter two with stroke) is an "other" letter.
@Test
fun charCategoryOtherLetter() {
    val sample = '\u01BB'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isLetter())
    assertEquals(CharCategory.OTHER_LETTER, sample.category)
    assertEquals("Lo", CharCategory.OTHER_LETTER.code)
}
// Digit zero is a decimal digit number.
@Test
fun charCategoryDecimalDigitNumber() {
    val sample = '\u0030' // '0'
    assertTrue(sample.isLetterOrDigit())
    assertTrue(sample.isDigit())
    assertEquals(CharCategory.DECIMAL_DIGIT_NUMBER, sample.category)
    assertEquals("Nd", CharCategory.DECIMAL_DIGIT_NUMBER.code)
}
// Roman numeral one (U+2160) is a letter number and must not be reported as a digit.
@Test
fun charCategoryLetterNumber() {
    // Written as an escape: an empty character literal ('') does not compile.
    val romanNumberOne = '\u2160'
    assertFalse(romanNumberOne.isDigit())
    assertEquals(CharCategory.LETTER_NUMBER, romanNumberOne.category)
    assertEquals("Nl", CharCategory.LETTER_NUMBER.code)
}
// Superscript two (U+00B2) is an "other" number and must not be reported as a digit.
@Test
fun charCategoryOtherNumber() {
    val sample = '\u00B2'
    assertFalse(sample.isDigit())
    assertEquals(CharCategory.OTHER_NUMBER, sample.category)
    assertEquals("No", CharCategory.OTHER_NUMBER.code)
}
// The ASCII space is a space separator and counts as whitespace.
@Test
fun charCategorySpaceSeparator() {
    // Previously named `superscriptTwo`, a leftover from the preceding test.
    val space = ' ' // \u0020
    assertTrue(space.isWhitespace())
    assertEquals(CharCategory.SPACE_SEPARATOR, space.category)
    assertEquals("Zs", CharCategory.SPACE_SEPARATOR.code)
}
// U+2028 is the line separator and counts as whitespace.
@Test
fun charCategoryLineSeparator() {
    val sample = '\u2028'
    assertTrue(sample.isWhitespace())
    assertEquals(CharCategory.LINE_SEPARATOR, sample.category)
    assertEquals("Zl", CharCategory.LINE_SEPARATOR.code)
}
// U+2029 is the paragraph separator and counts as whitespace.
@Test
fun charCategoryParagraphSeparator() {
    val sample = '\u2029'
    assertTrue(sample.isWhitespace())
    assertEquals(CharCategory.PARAGRAPH_SEPARATOR, sample.category)
    assertEquals("Zp", CharCategory.PARAGRAPH_SEPARATOR.code)
}
// U+0018 (cancel) is an ISO control character.
@Test
fun charCategoryControl() {
    val sample = '\u0018'
    assertTrue(sample.isISOControl())
    assertEquals(CharCategory.CONTROL, sample.category)
    assertEquals("Cc", CharCategory.CONTROL.code)
}
}
+86 -1
View File
@@ -63,7 +63,6 @@ actual enum class RegexOption {
// From char.kt
actual fun Char.isWhitespace(): Boolean = TODO("Wasm stdlib: Text")
// Placeholder: not implemented for this target yet, so every call throws NotImplementedError.
actual fun Char.isHighSurrogate(): Boolean {
    TODO("Wasm stdlib: Text")
}
// Placeholder: not implemented for this target yet, so every call throws NotImplementedError.
actual fun Char.isLowSurrogate(): Boolean {
    TODO("Wasm stdlib: Text")
}
@@ -131,6 +130,92 @@ public actual fun Char.uppercaseChar(): Char = TODO("Wasm stdlib: Text")
@ExperimentalStdlibApi
public actual fun Char.uppercase(): String = TODO("Wasm stdlib: Text")
/**
 * The Unicode general category this character belongs to.
 *
 * Not implemented for this target yet: reading the property throws [NotImplementedError].
 */
public actual val Char.category: CharCategory
    get() {
        TODO("Wasm stdlib: Text")
    }
/**
 * Returns `true` if this character (Unicode code point) is defined in Unicode.
 *
 * A character counts as defined when its [category] is not [CharCategory.UNASSIGNED].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 */
public actual fun Char.isDefined(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is a letter.
 *
 * A letter is a character whose [category] is [CharCategory.UPPERCASE_LETTER],
 * [CharCategory.LOWERCASE_LETTER], [CharCategory.TITLECASE_LETTER], [CharCategory.MODIFIER_LETTER],
 * or [CharCategory.OTHER_LETTER].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isLetter
 */
public actual fun Char.isLetter(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is a letter or a digit.
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @see isLetter
 * @see isDigit
 *
 * @sample samples.text.Chars.isLetterOrDigit
 */
public actual fun Char.isLetterOrDigit(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is a digit.
 *
 * A digit is a character whose [category] is [CharCategory.DECIMAL_DIGIT_NUMBER].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isDigit
 */
public actual fun Char.isDigit(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is an upper case letter.
 *
 * An upper case letter is a character whose [category] is [CharCategory.UPPERCASE_LETTER].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isUpperCase
 */
public actual fun Char.isUpperCase(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is a lower case letter.
 *
 * A lower case letter is a character whose [category] is [CharCategory.LOWERCASE_LETTER].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isLowerCase
 */
public actual fun Char.isLowerCase(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is a title case letter.
 *
 * A title case letter is a character whose [category] is [CharCategory.TITLECASE_LETTER].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isTitleCase
 */
public actual fun Char.isTitleCase(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Returns `true` if this character is an ISO control character.
 *
 * An ISO control character is a character whose [category] is [CharCategory.CONTROL].
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @sample samples.text.Chars.isISOControl
 */
public actual fun Char.isISOControl(): Boolean {
    TODO("Wasm stdlib: Text")
}
/**
 * Determines whether a character is whitespace according to the Unicode standard.
 *
 * Not implemented for this target yet: every call throws [NotImplementedError].
 *
 * @return `true` if the character is whitespace.
 *
 * @sample samples.text.Chars.isWhitespace
 */
public actual fun Char.isWhitespace(): Boolean {
    TODO("Wasm stdlib: Text")
}
// From string.kt
@@ -0,0 +1,171 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package kotlin.text
/**
* Represents the character general category in the Unicode specification.
*
* In this declaration both members, [code] and [contains], are `TODO` stubs ("Wasm stdlib: Text"):
* using either of them currently throws [NotImplementedError].
*/
public actual enum class CharCategory {
/**
* General category "Cn" in the Unicode specification.
*/
UNASSIGNED,
/**
* General category "Lu" in the Unicode specification.
*/
UPPERCASE_LETTER,
/**
* General category "Ll" in the Unicode specification.
*/
LOWERCASE_LETTER,
/**
* General category "Lt" in the Unicode specification.
*/
TITLECASE_LETTER,
/**
* General category "Lm" in the Unicode specification.
*/
MODIFIER_LETTER,
/**
* General category "Lo" in the Unicode specification.
*/
OTHER_LETTER,
/**
* General category "Mn" in the Unicode specification.
*/
NON_SPACING_MARK,
/**
* General category "Me" in the Unicode specification.
*/
ENCLOSING_MARK,
/**
* General category "Mc" in the Unicode specification.
*/
COMBINING_SPACING_MARK,
/**
* General category "Nd" in the Unicode specification.
*/
DECIMAL_DIGIT_NUMBER,
/**
* General category "Nl" in the Unicode specification.
*/
LETTER_NUMBER,
/**
* General category "No" in the Unicode specification.
*/
OTHER_NUMBER,
/**
* General category "Zs" in the Unicode specification.
*/
SPACE_SEPARATOR,
/**
* General category "Zl" in the Unicode specification.
*/
LINE_SEPARATOR,
/**
* General category "Zp" in the Unicode specification.
*/
PARAGRAPH_SEPARATOR,
/**
* General category "Cc" in the Unicode specification.
*/
CONTROL,
/**
* General category "Cf" in the Unicode specification.
*/
FORMAT,
/**
* General category "Co" in the Unicode specification.
*/
PRIVATE_USE,
/**
* General category "Cs" in the Unicode specification.
*/
SURROGATE,
/**
* General category "Pd" in the Unicode specification.
*/
DASH_PUNCTUATION,
/**
* General category "Ps" in the Unicode specification.
*/
START_PUNCTUATION,
/**
* General category "Pe" in the Unicode specification.
*/
END_PUNCTUATION,
/**
* General category "Pc" in the Unicode specification.
*/
CONNECTOR_PUNCTUATION,
/**
* General category "Po" in the Unicode specification.
*/
OTHER_PUNCTUATION,
/**
* General category "Sm" in the Unicode specification.
*/
MATH_SYMBOL,
/**
* General category "Sc" in the Unicode specification.
*/
CURRENCY_SYMBOL,
/**
* General category "Sk" in the Unicode specification.
*/
MODIFIER_SYMBOL,
/**
* General category "So" in the Unicode specification.
*/
OTHER_SYMBOL,
/**
* General category "Pi" in the Unicode specification.
*/
INITIAL_QUOTE_PUNCTUATION,
/**
* General category "Pf" in the Unicode specification.
*/
FINAL_QUOTE_PUNCTUATION;
/**
* Two-letter code of this general category in the Unicode specification.
*
* Not implemented yet: reading this property currently throws [NotImplementedError].
*/
public actual val code: String get() = TODO("Wasm stdlib: Text")
/**
* Returns `true` if [char] character belongs to this category.
*
* Not implemented yet: every call (including the `in` operator form) currently throws [NotImplementedError].
*/
public actual operator fun contains(char: Char): Boolean = TODO("Wasm stdlib: Text")
}
@@ -34,4 +34,11 @@ task run(type: JavaExec) {
classpath sourceSets.main.runtimeClasspath
args = ["${rootDir}"]
systemProperty 'line.separator', '\n'
}
// Runs the Unicode data generator (generators.unicode.GenerateUnicodeDataKt) on the main runtime classpath,
// passing the root project directory as its single argument.
task generateUnicodeData(type: JavaExec) {
group 'application'
main 'generators.unicode.GenerateUnicodeDataKt'
classpath sourceSets.main.runtimeClasspath
args = ["${rootDir}"]
}
@@ -0,0 +1,99 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode
import generators.unicode.ranges.CharCategoryTestGenerator
import generators.unicode.ranges.RangesGenerator
import templates.COPYRIGHT_NOTICE
import templates.KotlinTarget
import templates.readCopyrightNoticeFromProfile
import java.io.File
import java.net.URL
import kotlin.system.exitProcess
// Go to https://www.unicode.org/versions/latest/ to find out the latest public version of the Unicode Character Database files.
private const val unicodeDataUrl = "https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt"
/**
 * This program generates sources related to UnicodeData.txt.
 * There are two ways to run the program.
 * 1. Pass the root directory of the project to generate sources for js and js-ir.
 *  _CharCategoryTest.kt and supporting files are also generated to test the generated sources.
 *  The generated test is meant to be run after updating Unicode version and should not be merged to master.
 * 2. Pass the name of the target to generate sources for, and the directory to generate sources in.
 *  No tests are generated.
 *
 * With any other number of arguments the usage text is printed and the process exits with status 1.
 */
fun main(args: Array<String>) {
    // Downloaded on first access only, so an invalid invocation prints the usage text without touching the network.
    val unicodeDataLines by lazy { URL(unicodeDataUrl).openStream().reader().readLines() }

    val generators = mutableListOf<UnicodeDataGenerator>()

    // Registers the four range generators that write their output files into [generatedDir].
    fun addRangesGenerators(generatedDir: File, target: KotlinTarget) {
        val categoryRangesGenerator = RangesGenerator.forCharCategory(generatedDir.resolve("_CharCategories.kt"), target)
        val digitRangesGenerator = RangesGenerator.forDigit(generatedDir.resolve("_DigitChars.kt"), target)
        val letterRangesGenerator = RangesGenerator.forLetter(generatedDir.resolve("_LetterChars.kt"), target)
        val whitespaceRangesGenerator = RangesGenerator.forWhitespace(generatedDir.resolve("_WhitespaceChars.kt"))
        generators.add(categoryRangesGenerator)
        generators.add(digitRangesGenerator)
        generators.add(letterRangesGenerator)
        generators.add(whitespaceRangesGenerator)
    }

    when (args.size) {
        1 -> {
            val baseDir = File(args.first())

            val categoryTestFile = baseDir.resolve("libraries/stdlib/js/test/text/unicodeData/_CharCategoryTest.kt")
            val categoryTestGenerator = CharCategoryTestGenerator(categoryTestFile)
            generators.add(categoryTestGenerator)

            val jsGeneratedDir = baseDir.resolve("libraries/stdlib/js/src/generated/")
            addRangesGenerators(jsGeneratedDir, KotlinTarget.JS)

            val jsIrGeneratedDir = baseDir.resolve("libraries/stdlib/js-ir/src/generated/")
            addRangesGenerators(jsIrGeneratedDir, KotlinTarget.JS_IR)

            // For debugging. To see the file content
            val unicodeDataFile = baseDir.resolve("libraries/tools/kotlin-stdlib-gen/src/generators/unicode/UnicodeData.txt")
            unicodeDataFile.writeText(unicodeDataLines.joinToString(separator = "\n"))
        }
        2 -> {
            val (targetName, targetDir) = args

            val target = KotlinTarget.values.singleOrNull { it.name.equals(targetName, ignoreCase = true) }
                ?: error("Invalid target: $targetName")

            addRangesGenerators(File(targetDir), target)
        }
        else -> {
            // The second form takes exactly two arguments, matching the `2 ->` branch above.
            println(
                """Parameters:
<kotlin-base-dir> - generates UnicodeData.txt sources for js and js-ir targets using paths derived from specified base path
<target> <target-dir> - generates UnicodeData.txt sources for the specified target in the specified target directory
"""
            )
            exitProcess(1)
        }
    }

    COPYRIGHT_NOTICE =
        readCopyrightNoticeFromProfile { Thread.currentThread().contextClassLoader.getResourceAsStream("apache.xml").reader() }

    // Each line is split on ';'; only entries whose code has at most four hex digits (up to U+FFFF) are processed.
    unicodeDataLines.forEach { line ->
        val parts = line.split(";")
        if (parts[0].length <= 4) {
            generators.forEach { it.appendChar(parts[0], parts[1], parts[2]) }
        }
    }

    generators.forEach { it.close() }
}
/**
* A consumer of UnicodeData.txt entries that produces generated output.
*
* The generator program feeds every processed entry to [appendChar] and calls [close] once after the last entry.
*/
internal interface UnicodeDataGenerator {
/**
* Accepts one entry of UnicodeData.txt.
*
* @param char the first field of the entry: the code of the character in hexadecimal notation
* @param name the second field of the entry; names ending with ", First>" and ", Last>" delimit a range of chars
* @param categoryCode the third field of the entry: the general category code of the character
*/
fun appendChar(char: String, name: String, categoryCode: String)
/**
* Called after all entries have been appended; implementations finish writing their output here.
*/
fun close()
}
@@ -0,0 +1,173 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges
import generators.unicode.UnicodeDataGenerator
import generators.unicode.ranges.writers.writeHeader
import java.io.File
import java.io.FileWriter
/**
 * Generates the `_CharCategoryTest.kt` test ([outputFile]) together with the supporting files it needs.
 * All files are placed in the directory of [outputFile]:
 * - `_UnicodeData<N>.kt` files, each declaring an array `unicodeData<N>` with at most [MAX_ARRAY_SIZE] appended chars,
 * - `_UnicodeDataFlatten.kt` that joins those arrays into the single `unicodeData` list,
 * - `_CharProperties.kt` that declares the `CharProperties` class.
 */
internal class CharCategoryTestGenerator(private val outputFile: File) : UnicodeDataGenerator {
    // Number of arrays that have been filled completely; also the index of the array the next char goes to.
    private var arrayIndex = 0

    // Number of chars already written to the array with index [arrayIndex]; 0 means that array has not been started.
    private var arraySize = 0

    // Writer of the file that is currently being generated.
    private var writer: FileWriter? = null

    init {
        outputFile.parentFile.mkdirs()
    }

    /**
     * Writes the char as a `CharProperties(...)` element of the current array,
     * finishing the previous array file and starting a new one when needed.
     */
    override fun appendChar(char: String, name: String, categoryCode: String) {
        if (arraySize == 0) {
            // Finish the previous array file (if any) before starting the next one.
            writer?.appendLine(")")
            writer?.close()

            generateUnicodeDataHeader(arrayIndex)
        }

        val isStart = name.endsWith(", First>")

        writer?.appendLine(" CharProperties(char = '\\u$char', isStartOfARange = $isStart, categoryCode = \"$categoryCode\"),")

        arraySize++
        if (arraySize == MAX_ARRAY_SIZE) {
            arraySize = 0
            arrayIndex++
        }
    }

    /**
     * Finishes the last array file and generates the remaining files.
     */
    override fun close() {
        writer?.appendLine(")")
        writer?.close()

        generateFlattenUnicodeData()
        generateCharProperties()
        generateCharCategoryTest()
    }

    /**
     * Generates `_UnicodeDataFlatten.kt` that references every generated `unicodeData<N>` array.
     */
    private fun generateFlattenUnicodeData() {
        val file = outputFile.resolveSibling("_UnicodeDataFlatten.kt")
        generateFileHeader(file)

        // [arrayIndex] counts the completely filled arrays. One more array file exists only if it has
        // already received chars; otherwise (char count is zero or a multiple of MAX_ARRAY_SIZE)
        // `unicodeData<arrayIndex>` was never declared and must not be referenced.
        val arrayCount = if (arraySize == 0) arrayIndex else arrayIndex + 1

        writer?.appendLine("internal val unicodeData = arrayOf<Array<CharProperties>>(")
        for (index in 0 until arrayCount) {
            writer?.appendLine(" unicodeData$index,")
        }
        writer?.appendLine(").flatten()")

        writer?.close()
    }

    /**
     * Generates `_CharProperties.kt` with the declaration of the `CharProperties` class.
     */
    private fun generateCharProperties() {
        val file = outputFile.resolveSibling("_CharProperties.kt")
        generateFileHeader(file)

        writer?.appendLine("data class CharProperties(val char: Char, val isStartOfARange: Boolean, val categoryCode: String)")
        writer?.close()
    }

    /**
     * Generates the test class itself into [outputFile].
     * The template lines carry no indentation relative to each other, so the text is emitted exactly as listed.
     */
    private fun generateCharCategoryTest() {
        generateFileHeader(outputFile)

        writer?.appendLine(
            """
            import kotlin.test.*
            class CharCategoryTest {
            @Test
            fun category() {
            val charProperties = hashMapOf<Char, CharProperties>()
            for (properties in unicodeData) {
            charProperties[properties.char] = properties
            }
            var properties: CharProperties? = null
            for (char in Char.MIN_VALUE..Char.MAX_VALUE) {
            if (charProperties.containsKey(char)) {
            properties = charProperties.getValue(char)
            } else if (properties?.isStartOfARange != true) {
            properties = null
            }
            val charCode = char.toInt().toString(radix = 16).padStart(length = 4, padChar = '0')
            val expectedCategoryCode = properties?.categoryCode ?: CharCategory.UNASSIGNED.code
            fun <T> test(expected: T, actual: T, name: String) {
            assertEquals(expected, actual, "Char:[${"$"}char] with code:[${"$"}charCode] in Unicode has ${"$"}name = ${"$"}expected, but in Kotlin ${"$"}name = ${"$"}actual")
            }
            test(expectedCategoryCode, char.category.code, "category")
            val expectedIsDigit = isDigit(expectedCategoryCode)
            test(expectedIsDigit, char.isDigit(), "isDigit()")
            val expectedIsLetter = isLetter(expectedCategoryCode)
            test(expectedIsLetter, char.isLetter(), "isLetter()")
            val expectedIsLetterOrDigit = expectedIsLetter || expectedIsDigit
            test(expectedIsLetterOrDigit, char.isLetterOrDigit(), "isLetterOrDigit()")
            val expectedIsLowerCase = isLowerCase(expectedCategoryCode)
            test(expectedIsLowerCase, char.isLowerCase(), "isLowerCase()")
            val expectedIsUpperCase = isUpperCase(expectedCategoryCode)
            test(expectedIsUpperCase, char.isUpperCase(), "isUpperCase()")
            val expectedIsWhitespace = isWhitespace(char, expectedCategoryCode)
            test(expectedIsWhitespace, char.isWhitespace(), "isWhitespace()")
            }
            }
            private fun isDigit(categoryCode: String): Boolean {
            return categoryCode == CharCategory.DECIMAL_DIGIT_NUMBER.code
            }
            private fun isLetter(categoryCode: String): Boolean {
            return categoryCode in listOf(
            CharCategory.UPPERCASE_LETTER,
            CharCategory.LOWERCASE_LETTER,
            CharCategory.TITLECASE_LETTER,
            CharCategory.MODIFIER_LETTER,
            CharCategory.OTHER_LETTER
            ).map { it.code }
            }
            private fun isLowerCase(categoryCode: String): Boolean {
            return categoryCode == CharCategory.LOWERCASE_LETTER.code
            }
            private fun isUpperCase(categoryCode: String): Boolean {
            return categoryCode == CharCategory.UPPERCASE_LETTER.code
            }
            private fun isWhitespace(char: Char, categoryCode: String): Boolean {
            return categoryCode in listOf(
            CharCategory.SPACE_SEPARATOR.code,
            CharCategory.LINE_SEPARATOR.code,
            CharCategory.PARAGRAPH_SEPARATOR.code
            ) || char in '\u0009'..'\u000D' || char in '\u001C'..'\u001F'
            }
            }
            """.trimIndent()
        )

        writer?.close()
    }

    /**
     * Starts the `_UnicodeData<arrayIndex>.kt` file and opens the declaration of its array.
     */
    private fun generateUnicodeDataHeader(arrayIndex: Int) {
        val file = outputFile.resolveSibling("_UnicodeData$arrayIndex.kt")
        generateFileHeader(file)

        writer?.appendLine("internal val unicodeData$arrayIndex = arrayOf<CharProperties>(")
    }

    /**
     * Makes [writer] point to a new [file] and writes the file header followed by an empty line.
     * The previously used writer is not closed here; callers close it before switching files.
     */
    private fun generateFileHeader(file: File) {
        writer = FileWriter(file)
        writer?.writeHeader(file, "test.text.unicodeData")
        writer?.appendLine()
    }

    private companion object {
        // Maximum number of chars per generated array.
        // NOTE(review): presumably chosen to keep each generated file/array small enough to compile - confirm.
        const val MAX_ARRAY_SIZE = 2048
    }
}
@@ -0,0 +1,72 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges
import generators.requireExistingDir
import generators.unicode.UnicodeDataGenerator
import generators.unicode.ranges.builders.*
import generators.unicode.ranges.writers.*
import templates.KotlinTarget
import templates.Platform
import java.io.File
import java.io.FileWriter
/**
 * A [UnicodeDataGenerator] that collects appended chars with [rangesBuilder]
 * and, on [close], writes the built ranges to [outputFile] using [rangesWriter].
 *
 * Instances are created with the factory functions of the companion object.
 */
internal class RangesGenerator private constructor(
    private val outputFile: File,
    private val rangesBuilder: RangesBuilder,
    private val rangesWriter: RangesWriter,
) : UnicodeDataGenerator {

    init {
        // The directory of the output file must already exist.
        outputFile.parentFile.requireExistingDir()
    }

    override fun appendChar(char: String, name: String, categoryCode: String) {
        rangesBuilder.append(char, name, categoryCode)
    }

    /**
     * Builds the ranges and writes the output file: header, empty line, ranges count comment and the ranges.
     */
    override fun close() {
        val built = rangesBuilder.build()
        val starts = built.first
        val ends = built.second
        val categories = built.third

        FileWriter(outputFile).use { out ->
            out.writeHeader(outputFile, "kotlin.text")
            out.appendLine()
            out.appendLine("// ${starts.size} ranges totally")

            rangesWriter.write(starts, ends, categories, out)
        }
    }

    companion object {
        /** Creates a generator of char category ranges for the [target]. */
        fun forCharCategory(outputFile: File, target: KotlinTarget): RangesGenerator {
            val builder = CharCategoryRangesBuilder()
            val strategy = RangesWritingStrategy.of(target, "Category")
            val writer: RangesWriter =
                if (target.platform == Platform.JS) VarLenBase64CategoryRangesWriter(strategy) else CategoryRangesWriter(strategy)
            return RangesGenerator(outputFile, builder, writer)
        }

        /** Creates a generator of letter ranges for the [target]. */
        fun forLetter(outputFile: File, target: KotlinTarget): RangesGenerator {
            val builder = LetterRangesBuilder()
            val strategy = RangesWritingStrategy.of(target, "Letter")
            val writer: RangesWriter =
                if (target.platform == Platform.JS) VarLenBase64LetterRangesWriter(strategy) else LetterRangesWriter(strategy)
            return RangesGenerator(outputFile, builder, writer)
        }

        /** Creates a generator of digit ranges for the [target]. */
        fun forDigit(outputFile: File, target: KotlinTarget): RangesGenerator {
            val builder = DigitRangesBuilder()
            val writer = DigitRangesWriter(RangesWritingStrategy.of(target, "Digit"))
            return RangesGenerator(outputFile, builder, writer)
        }

        /** Creates a generator of whitespace ranges. */
        fun forWhitespace(outputFile: File): RangesGenerator {
            val builder = WhitespaceRangesBuilder()
            val writer = WhitespaceRangesWriter()
            return RangesGenerator(outputFile, builder, writer)
        }
    }
}
@@ -0,0 +1,56 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges
import templates.KotlinTarget
import templates.Platform
import java.io.FileWriter
/**
 * Describes the target-dependent details of how generated ranges are written:
 * indentation, annotation, visibility, text emitted before and after the ranges,
 * and the way a range declaration is referenced by name.
 */
internal sealed class RangesWritingStrategy {
    abstract val indentation: String
    abstract val rangesAnnotation: String
    abstract val rangesVisibilityModifier: String

    abstract fun beforeWritingRanges(writer: FileWriter)
    abstract fun afterWritingRanges(writer: FileWriter)
    abstract fun rangeRef(name: String): String

    companion object {
        /**
         * Returns [JsRangesWritingStrategy] with the given [wrapperName] for the JS platform,
         * and [NativeRangesWritingStrategy] for any other platform.
         */
        fun of(target: KotlinTarget, wrapperName: String): RangesWritingStrategy =
            if (target.platform == Platform.JS) JsRangesWritingStrategy(wrapperName) else NativeRangesWritingStrategy
    }
}
/**
 * Writes ranges as private declarations annotated with `@SharedImmutable`,
 * without indentation and without any wrapping text; ranges are referenced directly by name.
 */
internal object NativeRangesWritingStrategy : RangesWritingStrategy() {
    override val indentation: String = ""
    override val rangesAnnotation: String = "@SharedImmutable\n"
    override val rangesVisibilityModifier: String = "private"

    // Nothing is written before or after the ranges.
    override fun beforeWritingRanges(writer: FileWriter) = Unit
    override fun afterWritingRanges(writer: FileWriter) = Unit

    override fun rangeRef(name: String): String {
        return name
    }
}
// see KT-42461, KT-40482
/**
 * Writes ranges as internal declarations nested in a `private object` named [wrapperName],
 * indented by four spaces; ranges are referenced through the wrapper object.
 */
internal class JsRangesWritingStrategy(private val wrapperName: String) : RangesWritingStrategy() {
    override val indentation: String = " ".repeat(4)
    override val rangesAnnotation: String = ""
    override val rangesVisibilityModifier: String = "internal"

    // Opens the wrapper object.
    override fun beforeWritingRanges(writer: FileWriter) {
        val wrapperHeader = "private object $wrapperName {"
        writer.appendLine(wrapperHeader)
    }

    // Closes the wrapper object.
    override fun afterWritingRanges(writer: FileWriter) {
        writer.appendLine("}")
    }

    override fun rangeRef(name: String): String {
        return "$wrapperName.$name"
    }
}
@@ -0,0 +1,47 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.builders
import generators.unicode.ranges.patterns.PeriodicRangePattern
import generators.unicode.ranges.patterns.RangePattern
/**
 * Builds ranges of chars for all general categories: the category code itself is used as the category id
 * and no category is skipped.
 */
internal class CharCategoryRangesBuilder : RangesBuilder() {
    override fun categoryId(categoryCode: String): String = categoryCode

    override fun shouldSkip(categoryId: String): Boolean = false

    override val makeOnePeriodCategory: (Array<String>) -> Int
        get() = ::periodPatternCategory

    /**
     * Tries to turn the [lastRange] into a periodic range with a longer period that accommodates the [charCode]:
     * period 1 is evolved to 2 and then to 3, period 2 is evolved to 3, longer periods are not evolved.
     */
    override fun evolveLastRange(lastRange: RangePattern, charCode: Int, categoryId: String): RangePattern? {
        require(lastRange is PeriodicRangePattern)

        fun withPeriod(period: Int): RangePattern? =
            PeriodicRangePattern.from(lastRange, charCode, categoryId, sequenceLength = period, isPeriodic = true, unassignedCategoryId, ::periodPatternCategory)

        val currentPeriod = lastRange.sequenceLength
        if (currentPeriod == 1) {
            return withPeriod(2) ?: withPeriod(3)
        }
        if (currentPeriod == 2) {
            return withPeriod(3)
        }
        return null
    }
}
// 17 and 31 category values are not reserved. Use 17 to replace UNASSIGNED value (0) to be able to encode range pattern categories.
internal const val UNASSIGNED_CATEGORY_VALUE_REPLACEMENT = 17
// Maps the code of each CharCategory to the value used in encoded patterns:
// the category's own value, except that value 0 is replaced with UNASSIGNED_CATEGORY_VALUE_REPLACEMENT.
private val categoryCodeToValue = CharCategory.values().associateBy({ it.code }, { if (it.value == 0) UNASSIGNED_CATEGORY_VALUE_REPLACEMENT else it.value })
/**
 * Packs the categories of one period into an Int: the value of the category at position `index`
 * occupies the 5 bits starting at bit `5 * index`.
 *
 * @throws IllegalStateException if [categoryIds] contains a code that is not a code of any [CharCategory]
 */
private fun periodPatternCategory(categoryIds: Array<String>): Int {
    // Each category value is <= 30, thus 5 bits is enough to represent it.
    var pattern = 0
    for ((index, categoryId) in categoryIds.withIndex()) {
        // Fail with the offending code instead of a bare NullPointerException.
        val value = categoryCodeToValue[categoryId] ?: error("Unknown category code: $categoryId")
        pattern = pattern or (value shl (5 * index))
    }
    return pattern
}
@@ -0,0 +1,25 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.builders
import generators.unicode.ranges.patterns.RangePattern
/**
 * Builds ranges of chars whose category is [CharCategory.DECIMAL_DIGIT_NUMBER]; all other chars are skipped.
 * Ranges are never evolved to another pattern, and the category value of a range is always 0.
 */
internal class DigitRangesBuilder : RangesBuilder() {
    override fun categoryId(categoryCode: String): String = categoryCode

    override fun shouldSkip(categoryId: String): Boolean =
        categoryId != CharCategory.DECIMAL_DIGIT_NUMBER.code

    override val makeOnePeriodCategory: (Array<String>) -> Int
        get() = { _ -> 0 }

    override fun evolveLastRange(lastRange: RangePattern, charCode: Int, categoryId: String): RangePattern? = null
}
@@ -0,0 +1,89 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.builders
import generators.unicode.ranges.patterns.PeriodicRangePattern
import generators.unicode.ranges.patterns.RangePattern
import generators.unicode.ranges.patterns.GapRangePattern
/**
 * Builds ranges of letter chars.
 *
 * Lowercase and uppercase letters keep their category code as the category id,
 * other letters get the "OL" id, and all non-letter chars get the "NL" id and are skipped.
 */
internal class LetterRangesBuilder : RangesBuilder() {
    override fun categoryId(categoryCode: String): String {
        val isLowerOrUpper =
            categoryCode == CharCategory.LOWERCASE_LETTER.code || categoryCode == CharCategory.UPPERCASE_LETTER.code
        if (isLowerOrUpper) return categoryCode
        return if (categoryCode in letterCategoryCodes) "OL" /* other letter */ else "NL" /* not a letter */
    }

    override fun shouldSkip(categoryId: String): Boolean = categoryId == "NL"

    override val makeOnePeriodCategory: (Array<String>) -> Int
        get() = ::periodPatternCategory

    /**
     * Tries to transform the [lastRange] to a pattern that accommodates the [charCode]. Candidates are tried in order:
     * - periodic range with period 1: periodic with period 2, then gap pattern, then non-periodic sequence of length 15;
     * - periodic range with period 2: gap pattern, then non-periodic sequence of length 15;
     * - gap pattern: non-periodic sequence of length 15;
     * - any other periodic range is not evolved.
     */
    override fun evolveLastRange(lastRange: RangePattern, charCode: Int, categoryId: String): RangePattern? {
        fun periodicOfTwo(): RangePattern? =
            PeriodicRangePattern.from(lastRange, charCode, categoryId, sequenceLength = 2, isPeriodic = true, unassignedCategoryId, ::periodPatternCategory)

        fun withGaps(): RangePattern? =
            GapRangePattern.from(lastRange, charCode, categoryId, unassignedCategoryId, ::gapPatternCategory)

        fun sequenceOfFifteen(): RangePattern? =
            PeriodicRangePattern.from(lastRange, charCode, categoryId, sequenceLength = 15, isPeriodic = false, unassignedCategoryId, ::periodPatternCategory)

        if (lastRange is GapRangePattern) {
            return sequenceOfFifteen()
        }
        if (lastRange !is PeriodicRangePattern) {
            error("Unreachable")
        }
        return when (lastRange.sequenceLength) {
            1 -> periodicOfTwo() ?: withGaps() ?: sequenceOfFifteen()
            2 -> withGaps() ?: sequenceOfFifteen()
            else -> null
        }
    }
}
// Codes of the general categories whose chars are treated as letters by LetterRangesBuilder.
private val letterCategoryCodes = listOf(
CharCategory.UPPERCASE_LETTER.code,
CharCategory.LOWERCASE_LETTER.code,
CharCategory.TITLECASE_LETTER.code,
CharCategory.MODIFIER_LETTER.code,
CharCategory.OTHER_LETTER.code
)
/**
 * Returns the two-bit value of the letter [categoryId]:
 * `01` for lowercase, `10` for uppercase, `11` for other letters ("OL"),
 * and `00` for not-a-letter ("NL") and for the empty id.
 *
 * Fails with [IllegalStateException] for any other id.
 */
private fun bitmask(categoryId: String): Int {
    if (categoryId == CharCategory.LOWERCASE_LETTER.code) return 0b01
    if (categoryId == CharCategory.UPPERCASE_LETTER.code) return 0b10
    return when (categoryId) {
        "OL" -> 0b11
        "NL", "" -> 0b00
        else -> error("Unknown categoryID: $categoryId")
    }
}
/**
 * Packs the letter categories of one period into an Int: the [bitmask] of the category at position `index`
 * occupies the two bits starting at bit `2 * index`, and a single marker bit is set right above the last category.
 *
 * The two lowest bits of the result are checked to be non-zero.
 */
private fun periodPatternCategory(categoryIds: Array<String>): Int {
    val categoryBits = categoryIds.foldIndexed(0) { index, bits, categoryId ->
        bits or (bitmask(categoryId) shl (2 * index))
    }
    val markerBit = 1 shl (2 * categoryIds.size)
    val pattern = categoryBits or markerBit
    check(pattern and 0x3 != 0)
    return pattern
}
/**
 * Packs the [gaps] of a gap range into an Int. Starting at bit 2, for every gap the number of chars
 * between the previous gap (or the range [start]) and the gap is added, followed by the length of the gap;
 * the fields are [GapRangePattern.CHARS_BITS] and [GapRangePattern.GAP_BITS] bits apart.
 *
 * The two lowest bits of the result are checked to be zero.
 */
private fun gapPatternCategory(start: Int, end: Int, gaps: List<GapRangePattern.Companion.Gap>): Int {
    var pattern = 0
    var shift = 2
    // The first char after the previous gap; the start of the range for the first gap.
    var previousGapEnd = start

    for (gap in gaps) {
        val charsBeforeGap = gap.start - previousGapEnd
        pattern += charsBeforeGap shl shift
        shift += GapRangePattern.CHARS_BITS

        pattern += gap.length shl shift
        shift += GapRangePattern.GAP_BITS

        previousGapEnd = gap.start + gap.length
    }

    check(pattern and 0x3 == 0)
    return pattern
}
@@ -0,0 +1,192 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.builders
import generators.unicode.ranges.patterns.PeriodicRangePattern
import generators.unicode.ranges.patterns.RangePattern
import generators.unicode.ranges.patterns.rangeLength
/**
 * The base class of character ranges builders.
 *
 * Chars are appended in the order of the UnicodeData.txt entries via [append];
 * [build] then returns the resulting ranges.
 */
internal abstract class RangesBuilder {
    private val ranges = mutableListOf<RangePattern>()
    private var lastAppendedCharCode = -1

    /**
     * Appends a line from the UnicodeData.txt file.
     *
     * @param char the code of the char in hexadecimal notation
     * @param name the name of the char; names ending with ", First>" and ", Last>" mark the bounds of a range
     * @param categoryCode the Unicode general category code of the char
     */
    fun append(char: String, name: String, categoryCode: String) {
        val charCode = char.toInt(radix = 16)
        val categoryId = categoryId(categoryCode)

        if (name.endsWith(", First>")) {
            rangeFirst(charCode, categoryId)
        } else if (name.endsWith(", Last>")) {
            rangeLast(charCode, categoryId)
        } else {
            append(charCode, categoryId)
        }

        lastAppendedCharCode = charCode
    }

    /**
     * Optimizes the number of ranges and returns them.
     *
     * Returns a [Triple] containing lists of range starts, ends and categories in that particular order.
     */
    fun build(): Triple<List<Int>, List<Int>, List<Int>> {
        // Chars after the last appended one, up to 0xffff, are unassigned.
        for (code in (lastAppendedCharCode + 1)..0xffff) {
            appendSingleChar(code, unassignedCategoryId)
        }

        // Walk the ranges backwards, moving the last char of a range into the following range while that range accepts it.
        var index = ranges.lastIndex
        while (index > 0) {
            val previous = ranges[index - 1]
            val previousEnd = previous.rangeEnd()
            val previousEndCategory = previous.categoryIdOf(previousEnd)
            val current = ranges[index]

            if (!current.prepend(previousEnd, previousEndCategory)) {
                index--
                continue
            }

            val shrunkPrevious = removeLast(previous)
            if (shrunkPrevious == null) {
                // The previous range consisted of the moved char only.
                ranges.removeAt(index - 1)
                index--
            } else {
                ranges[index - 1] = shrunkPrevious
            }
        }

//        if (this is LetterRangesBuilder) {
//            println(ranges.joinToString(separator = "\n"))
//        }
//        if (this is CharCategoryRangesBuilder) {
//            println(ranges.subList(fromIndex = 0, toIndex = 10).joinToString(separator = "\n"))
//        }

        val starts = ranges.map { range -> range.rangeStart() }
        val ends = ranges.map { range -> range.rangeEnd() }
        val categories = ranges.map { range -> range.category() }
        return Triple(starts, ends, categories)
    }

    /**
     * Appends the [charCode] as the start of a range of chars with the specified [categoryId].
     */
    private fun rangeFirst(charCode: Int, categoryId: String) = append(charCode, categoryId)

    /**
     * Appends the [charCode] as the end of a range of chars with the specified [categoryId].
     * Chars between last appended char and the [charCode] are considered to have the specified [categoryId].
     */
    private fun rangeLast(charCode: Int, categoryId: String) {
        if (!shouldSkip(categoryId)) {
            // The start of the range must be the last appended char and have the same category id.
            val lastRange = ranges.last()
            check(lastRange.rangeEnd() == lastAppendedCharCode)
            check(lastRange.categoryIdOf(lastAppendedCharCode) == categoryId)
        }

        for (code in (lastAppendedCharCode + 1)..charCode) {
            appendSingleChar(code, categoryId)
        }
    }

    /**
     * Appends the [charCode] with the specified [categoryId].
     * Chars between last appended char and the [charCode] are considered to be unassigned.
     */
    private fun append(charCode: Int, categoryId: String) {
        for (code in (lastAppendedCharCode + 1) until charCode) {
            appendSingleChar(code, unassignedCategoryId)
        }
        appendSingleChar(charCode, categoryId)
    }

    /**
     * Appends the [charCode] with the specified [categoryId] to the last range, or a new range containing the [charCode] is created.
     * The last range can be transformed to another range type to accommodate the [charCode].
     */
    private fun appendSingleChar(charCode: Int, categoryId: String) {
        if (shouldSkip(categoryId)) return

        val lastRange = ranges.lastOrNull()
        if (lastRange == null) {
            ranges.add(createRange(charCode, categoryId))
            return
        }

        if (lastRange.append(charCode, categoryId)) return

        val evolvedRange = evolveLastRange(lastRange, charCode, categoryId)
        if (evolvedRange == null) {
            ranges.add(createRange(charCode, categoryId))
        } else {
            ranges[ranges.lastIndex] = evolvedRange
        }
    }

    /**
     * Category id used for unassigned chars.
     */
    protected val unassignedCategoryId: String
        get() = categoryId(CharCategory.UNASSIGNED.code)

    /**
     * Creates the simplest range containing the single [charCode].
     */
    private fun createRange(charCode: Int, categoryId: String): RangePattern =
        PeriodicRangePattern.from(charCode, categoryId, sequenceLength = 1, isPeriodic = true, unassignedCategoryId, makeOnePeriodCategory)

    /**
     * Removes the last char in the specified [range].
     * Returns the simplest pattern that accommodated the remaining chars in the [range],
     * or `null` if the [range] contained a single char.
     */
    private fun removeLast(range: RangePattern): RangePattern? {
        if (range.rangeLength() == 1) return null

        // Rebuild the range from scratch out of all its chars but the last one.
        val rangeStart = range.rangeStart()
        var rebuilt = createRange(rangeStart, range.categoryIdOf(rangeStart))

        for (code in (rangeStart + 1) until range.rangeEnd()) {
            val categoryId = range.categoryIdOf(code)
            if (shouldSkip(categoryId)) continue

            if (!rebuilt.append(code, categoryId)) {
                rebuilt = evolveLastRange(rebuilt, code, categoryId)!!
            }
        }

        return rebuilt
    }

    /**
     * The id to use for the [categoryCode] - the Unicode general category code.
     */
    protected abstract fun categoryId(categoryCode: String): String

    /**
     * Returns true if this range builder skips chars with the specified [categoryId].
     */
    protected abstract fun shouldSkip(categoryId: String): Boolean

    /**
     * The function to use to transform periodic ranges with period equal to 1 to an Int representation.
     */
    protected abstract val makeOnePeriodCategory: (Array<String>) -> Int

    /**
     * Appends the [charCode] with the specified [categoryId] to the [lastRange] and returns the resulting range,
     * or returns `null` if [charCode] can't be appended to the [lastRange].
     * The [lastRange] can be transformed to another range type to accommodate the [charCode].
     */
    protected abstract fun evolveLastRange(
        lastRange: RangePattern,
        charCode: Int,
        categoryId: String
    ): RangePattern?
}
@@ -0,0 +1,43 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.builders
import generators.unicode.ranges.patterns.RangePattern
/**
 * Builds ranges of whitespace chars: chars of the space, line and paragraph separator categories,
 * plus the control chars in `0009..000D` and `001C..001F` that are appended on construction.
 * All other chars are skipped. Ranges are never evolved, and the category value of a range is always 0.
 */
internal class WhitespaceRangesBuilder : RangesBuilder() {
    init {
        // Cc CONTROL spaces
        for ((first, last) in listOf("0009" to "000D", "001C" to "001F")) {
            append(first, "<Space, First>", WS)
            append(last, "<Space, Last>", WS)
        }
    }

    override fun categoryId(categoryCode: String): String {
        val isWhitespace = categoryCode == WS || categoryCode in whitespaceCategories
        return if (isWhitespace) WS else NOT_WS
    }

    override fun shouldSkip(categoryId: String): Boolean = categoryId == NOT_WS

    override val makeOnePeriodCategory: (Array<String>) -> Int
        get() = { _ -> 0 }

    override fun evolveLastRange(lastRange: RangePattern, charCode: Int, categoryId: String): RangePattern? = null
}
// Category id of whitespace chars.
private const val WS = "WS"
// Category id of all other chars; chars with this id are skipped by WhitespaceRangesBuilder.
private const val NOT_WS = "NOT_WS"
// Codes of the general categories whose chars are whitespace.
private val whitespaceCategories = listOf(
CharCategory.SPACE_SEPARATOR.code,
CharCategory.LINE_SEPARATOR.code,
CharCategory.PARAGRAPH_SEPARATOR.code
)
@@ -0,0 +1,138 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.patterns
import generators.unicode.ranges.writers.hex
/**
 * A range of consecutive chars that starts with a letter and ends with a letter,
 * and may contain several runs ("gaps") of consecutive not-letter chars.
 *
 * All letter chars in this range have the same category id, which is required to be "OL".
 *
 * @param charCode the start of this range
 * @param categoryId the category id of the char with the specified [charCode]
 * @param unassignedCategoryId the categoryId of the unassigned chars.
 *          Chars that are not appended or prepended are considered to be unassigned
 * @param makeCategory the function used to transform this range to an Int representation that is returned from the [category] function.
 */
internal class GapRangePattern private constructor(
    charCode: Int,
    private val categoryId: String,
    private val unassignedCategoryId: String,
    private val makeCategory: (start: Int, end: Int, gaps: List<Gap>) -> Int
) : RangePattern {
    private val start: Int = charCode
    private var end: Int = charCode
    private val gaps = mutableListOf<Gap>()

    init {
        require(categoryId == "OL")
    }

    /**
     * Unassigned chars are accepted without changing the range; chars of another category are rejected.
     * A letter directly after the current end extends the range; a letter further away
     * creates a new gap if the gap can still be encoded, otherwise the letter is rejected.
     */
    override fun append(charCode: Int, categoryId: String): Boolean {
        require(charCode > end)

        when {
            categoryId == unassignedCategoryId -> return true
            categoryId != this.categoryId -> return false
        }

        // lll_gap_lll_X_l
        val skippedChars = charCode - end - 1
        if (skippedChars == 0) {
            // _X_ is empty -> append the letter
            end = charCode
            return true
        }

        val gapStart = end + 1
        val previousGapEnd = gaps.lastOrNull()?.let { it.start + it.length } ?: start
        val charsBeforeNewGap = gapStart - previousGapEnd
        val requiredBits = (gaps.size + 1) * (CHARS_BITS + GAP_BITS)

        if (!isValid(charsBeforeNewGap, skippedChars) || requiredBits > TOTAL_BITS) {
            return false
        }

        gaps.add(Gap(start = gapStart, length = skippedChars))
        end = charCode
        return true
    }

    // Prepending is never supported by this pattern.
    override fun prepend(charCode: Int, categoryId: String): Boolean {
        assert(charCode < start)
        return false
    }

    override fun rangeStart(): Int = start

    override fun rangeEnd(): Int = end

    override fun category(): Int = makeCategory(start, end, gaps)

    /**
     * Returns [unassignedCategoryId] for chars inside a gap and the letter category id for all other chars of the range.
     */
    override fun categoryIdOf(charCode: Int): String {
        require(charCode in start..end)

        // The first gap that ends after the char decides: the char is either inside that gap or before it.
        val decisiveGap = gaps.firstOrNull { gap -> charCode < gap.start + gap.length }
        val isInsideGap = decisiveGap != null && charCode >= decisiveGap.start
        return if (isInsideGap) unassignedCategoryId else categoryId
    }

    override fun toString(): String =
        "GapPattern{start=${start.hex()}, end=${end.hex()}, length=${rangeLength()}, gaps=$gaps, categoryId=$categoryId}"

    companion object {
        internal const val CHARS_BITS = 7
        internal const val GAP_BITS = 7
        private const val TOTAL_BITS = 29

        internal data class Gap(val start: Int, val length: Int)

        /**
         * Creates a gap range out of the chars of the [range] followed by the [charCode] with the specified [categoryId].
         * Returns `null` if the first char of the [range] and the [charCode] are not both of the "OL" category,
         * or if the chars can not be appended to the new gap range.
         */
        fun from(
            range: RangePattern,
            charCode: Int,
            categoryId: String,
            unassignedCategoryId: String,
            makeCategory: (start: Int, end: Int, gaps: List<Gap>) -> Int
        ): RangePattern? {
            val start = range.rangeStart()
            val startCategoryId = range.categoryIdOf(start)
            check(startCategoryId != unassignedCategoryId)

            val bothAreOtherLetters = startCategoryId == categoryId && categoryId == "OL"
            if (!bothAreOtherLetters) return null

            val gapRange = GapRangePattern(start, startCategoryId, unassignedCategoryId, makeCategory)
            return gapRange.takeIf {
                it.append(start + 1, range.rangeEnd(), range::categoryIdOf, charCode, categoryId)
            }
        }

        // Both values must fit into their bit fields.
        private fun isValid(charsBeforeGap: Int, gapLength: Int): Boolean =
            charsBeforeGap < (1 shl CHARS_BITS) && gapLength < (1 shl GAP_BITS)
    }
}
@@ -0,0 +1,195 @@
/*
* Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.patterns
import generators.unicode.ranges.writers.hex
/**
 * A range of consecutive chars.
 *
 * The chars in the range may have periodic categories, e.g., [Lu, Ll, Lu, Ll, ...].
 *
 * @param charCode the start of this range
 * @param categoryId the category id of the char with the specified [charCode]
 * @param sequenceLength the maximum length this range can have.
 * If [isPeriodic] is true then this range can be longer, provided that
 * for every `charCode >= start + sequenceLength` categoryIdOf(charCode) is equal to categoryIdOf(charCode - sequenceLength)
 * @param isPeriodic true if this range is a periodic range with period [sequenceLength]
 * @param unassignedCategoryId the categoryId of the unassigned chars.
 * Chars that are skipped over while appending or prepending are considered to be unassigned
 * @param makeCategory the function used to transform this range to an Int representation that is returned from the [category] function.
 * [makeCategory] is called with an array having its size equal to `minOf(sequenceLength, rangeLength())`.
 */
internal class PeriodicRangePattern private constructor(
    charCode: Int,
    categoryId: String,
    val sequenceLength: Int,
    isPeriodic: Boolean,
    unassignedCategoryId: String,
    private val makeCategory: (Array<String>) -> Int
) : RangePattern {
    private var start: Int = charCode
    private var end: Int = charCode
    private val bag: Bag = Bag(sequenceLength, isPeriodic, unassignedCategoryId)

    init {
        bag.fill(charCode, categoryId)
    }

    /**
     * Extends the range up to [charCode]; the chars between the current end and [charCode] are treated as unassigned.
     * The range is left untouched when the chars do not fit.
     */
    override fun append(charCode: Int, categoryId: String): Boolean {
        require(charCode > end)
        val accepted = bag.fill(end + 1, charCode - 1, { bag.unassignedCategoryId }, charCode, categoryId)
        if (accepted) {
            end = charCode
        }
        return accepted
    }

    /**
     * Extends the range down to [charCode]; the chars between [charCode] and the current start are treated as unassigned.
     * The range is left untouched when the chars do not fit.
     */
    override fun prepend(charCode: Int, categoryId: String): Boolean {
        require(charCode < start)
        val accepted = bag.fill(charCode + 1, start - 1, { bag.unassignedCategoryId }, charCode, categoryId)
        if (accepted) {
            start = charCode
        }
        return accepted
    }

    override fun rangeStart(): Int = start

    override fun rangeEnd(): Int = end

    override fun category(): Int = makeCategory(orderedCategoryIds())

    /** Category ids of the first `minOf(sequenceLength, rangeLength())` chars of this range, in char code order. */
    private fun orderedCategoryIds(): Array<String> {
        val count = minOf(sequenceLength, rangeLength())
        return (0 until count).map { offset -> categoryIdOf(start + offset) }.toTypedArray()
    }

    override fun categoryIdOf(charCode: Int): String {
        if (charCode < start || charCode > end) {
            throw IllegalArgumentException("Char code ${charCode.hex()} is not in $this")
        }
        val categoryId = bag.categoryIdOf(charCode)
        check(categoryId != null)
        return categoryId
    }

    override fun toString(): String =
        "PeriodicRangePattern{start=${start.hex()}, end=${end.hex()}, length=${rangeLength()}" +
                ", orderedCategoryIds=${orderedCategoryIds().contentToString()}, bag=$bag}"

    companion object {
        /**
         * Tries to rebuild the given [range], followed by the char [charCode] of category [categoryId],
         * as a [PeriodicRangePattern]. Returns `null` if the chars do not fit the pattern.
         */
        fun from(
            range: RangePattern,
            charCode: Int,
            categoryId: String,
            sequenceLength: Int,
            isPeriodic: Boolean,
            unassignedCategoryId: String,
            makeCategory: (Array<String>) -> Int
        ): PeriodicRangePattern? {
            require(charCode > range.rangeEnd())
            val first = range.rangeStart()
            return from(first, range.categoryIdOf(first), sequenceLength, isPeriodic, unassignedCategoryId, makeCategory)
                .takeIf { it.append(first + 1, range.rangeEnd(), range::categoryIdOf, charCode, categoryId) }
        }

        /** Creates a pattern that consists of the single char [charCode] with the given [categoryId]. */
        fun from(
            charCode: Int,
            categoryId: String,
            sequenceLength: Int,
            isPeriodic: Boolean,
            unassignedCategoryId: String,
            makeCategory: (Array<String>) -> Int
        ): PeriodicRangePattern =
            PeriodicRangePattern(charCode, categoryId, sequenceLength, isPeriodic, unassignedCategoryId, makeCategory)
    }
}
/**
 * A set of chars with their corresponding categories.
 *
 * Category id of a char with code equal to `charCode` is stored in the slot `charCode % sequenceLength` of [categoryIds].
 */
private class Bag(
    private val sequenceLength: Int,
    private val isPeriodic: Boolean,
    val unassignedCategoryId: String
) {
    private val categoryIds = arrayOfNulls<String>(sequenceLength)

    /** Index of the slot that belongs to the given [charCode]. */
    private fun slotOf(charCode: Int): Int = charCode % sequenceLength

    /** Returns the category id stored in the slot of the [charCode], or `null` if the slot is still empty. */
    fun categoryIdOf(charCode: Int): String? = categoryIds[slotOf(charCode)]

    /**
     * Returns true if a range with the specified [rangeStart], [rangeEnd] and [categoryIdOf] was successfully added
     * together with a char with the specified [charCode] and [categoryId].
     *
     * The [charCode] must go immediately after the [rangeEnd] or before the [rangeStart].
     * Nothing is stored when any of the chars can not be placed.
     */
    fun fill(rangeStart: Int, rangeEnd: Int, categoryIdOf: (Int) -> String, charCode: Int, categoryId: String): Boolean {
        require(charCode == rangeStart - 1 || charCode == rangeEnd + 1)
        // Work on a copy so that a failed attempt leaves the stored categories intact.
        val scratch = categoryIds.copyOf()
        val rangeFits = (rangeStart..rangeEnd).all { code -> scratch.tryPlace(code, categoryIdOf(code)) }
        if (!rangeFits || !scratch.tryPlace(charCode, categoryId)) return false
        scratch.copyInto(categoryIds)
        return true
    }

    /**
     * Returns true if the [charCode] with the [categoryId] was successfully placed in [categoryIds].
     */
    fun fill(charCode: Int, categoryId: String): Boolean = categoryIds.tryPlace(charCode, categoryId)

    /**
     * Returns true if the [charCode] with the [categoryId] was successfully placed in this array.
     *
     * The slot must be empty, or, for a periodic bag, already hold the same [categoryId].
     */
    private fun Array<String?>.tryPlace(charCode: Int, categoryId: String): Boolean {
        val slot = slotOf(charCode)
        val occupant = this[slot]
        val fits = occupant == null || (isPeriodic && occupant == categoryId)
        if (fits) {
            this[slot] = categoryId
        }
        return fits
    }

    override fun toString(): String =
        "Bag{sequenceLength=$sequenceLength, isPeriodic=$isPeriodic, unassignedCategoryId=$unassignedCategoryId" +
                ", categoryIds=${categoryIds.contentToString()}}"
}
@@ -0,0 +1,61 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.patterns
/**
 * A range of consecutive chars that fit within a particular pattern.
 *
 * A pattern is grown one char at a time with [append] and [prepend]; each of them reports
 * whether the char could be accommodated.
 */
internal interface RangePattern {
/**
 * Appends the [charCode] to this range pattern.
 * Returns true if the [charCode] with the specified [categoryId] could be accommodated within this pattern.
 * Returns false otherwise.
 */
fun append(charCode: Int, categoryId: String): Boolean
/**
 * Prepends the [charCode] to this range pattern.
 * Returns true if the [charCode] with the specified [categoryId] could be accommodated within this pattern.
 * Returns false otherwise.
 */
fun prepend(charCode: Int, categoryId: String): Boolean
/**
 * Char code of the first char in this range.
 */
fun rangeStart(): Int
/**
 * Char code of the last char in this range.
 */
fun rangeEnd(): Int
/**
 * An integer value that contains information about the category of each char in this range.
 */
fun category(): Int
/**
 * Returns category id of the char with the specified [charCode].
 * Throws an exception if the [charCode] is not in `rangeStart()..rangeEnd()`.
 */
fun categoryIdOf(charCode: Int): String
}
/** Number of char codes covered by this range, both ends included. */
internal fun RangePattern.rangeLength(): Int {
    val last = rangeEnd()
    val first = rangeStart()
    return last - first + 1
}
/**
 * Appends every char of `rangeStart..rangeEnd`, with the category id supplied by [categoryIdOf],
 * and then the char [charCode] with the specified [categoryId].
 *
 * Stops at the first char that is rejected and returns false; chars appended before it stay appended.
 * Returns true if all the chars were accepted.
 */
internal fun RangePattern.append(rangeStart: Int, rangeEnd: Int, categoryIdOf: (Int) -> String, charCode: Int, categoryId: String): Boolean {
    // `all` stops at the first rejected char, exactly like an early return from a loop.
    val rangeAccepted = (rangeStart..rangeEnd).all { code -> append(code, categoryIdOf(code)) }
    return rangeAccepted && append(charCode, categoryId)
}
@@ -0,0 +1,172 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import generators.unicode.ranges.RangesWritingStrategy
import generators.unicode.ranges.builders.UNASSIGNED_CATEGORY_VALUE_REPLACEMENT
import java.io.FileWriter
/**
 * Writes the char category ranges and the source of the functions that look a category up in them.
 *
 * The output consists of the `rangeStart` and `rangeCategory` arrays, wrapped by whatever the [strategy]
 * writes before and after the ranges, followed by the generated `categoryValueFrom` and
 * `Char.getCategoryValue` functions. Subclasses can replace every step and the way the arrays are referenced.
 */
internal open class CategoryRangesWriter(protected val strategy: RangesWritingStrategy) : RangesWriter {
// Fixed sequence of steps; `rangeEnd` is only forwarded to writeInit, which does nothing in this class.
override fun write(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
beforeWritingRanges(writer)
writeRangeStart(rangeStart, writer)
writeRangeCategory(rangeCategory, writer)
writeInit(rangeStart, rangeEnd, rangeCategory, writer)
afterWritingRanges(writer)
}
// Delegates to the strategy to write whatever must precede the ranges.
protected open fun beforeWritingRanges(writer: FileWriter) {
strategy.beforeWritingRanges(writer)
}
// Lets the strategy write whatever must follow the ranges, then emits the generated lookup functions.
protected open fun afterWritingRanges(writer: FileWriter) {
strategy.afterWritingRanges(writer)
writer.appendLine()
writer.appendLine(categoryValueFrom())
writer.appendLine()
writer.appendLine(getCategoryValue())
}
// Writes the range starts as an int array named "rangeStart".
protected open fun writeRangeStart(elements: List<Int>, writer: FileWriter) {
writer.writeIntArray("rangeStart", elements, strategy)
writer.appendLine()
}
// Writes the range categories as an int array named "rangeCategory".
protected open fun writeRangeCategory(elements: List<Int>, writer: FileWriter) {
writer.writeIntArray("rangeCategory", elements, strategy)
writer.appendLine()
}
// Hook for subclasses that need to emit initialization code; intentionally empty here.
protected open fun writeInit(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {}
// Source of the generated `categoryValueFrom(code, ch)`:
// - code < 0x20: the code itself is the result;
// - code < 0x400: the code holds two 5-bit values, selected by the parity of `ch`;
// - otherwise: the code holds three 5-bit values, selected by `ch % 3`.
// The text between the triple quotes is emitted verbatim (after trimIndent), so its whitespace is significant.
private fun categoryValueFrom(): String = """
private fun categoryValueFrom(code: Int, ch: Int): Int {
return when {
code < 0x20 -> code
code < 0x400 -> if ((ch and 1) == 1) code shr 5 else code and 0x1f
else ->
when (ch % 3) {
2 -> code shr 10
1 -> (code shr 5) and 0x1f
else -> code and 0x1f
}
}
}
""".trimIndent()
// Source of the generated `Char.getCategoryValue()`: finds the range of the char, passes the range code and
// the offset of the char within the range to `categoryValueFrom`, and maps the replacement value
// UNASSIGNED_CATEGORY_VALUE_REPLACEMENT back to `CharCategory.UNASSIGNED.value`.
private fun getCategoryValue(): String = """
/**
* Returns the Unicode general category of this character as an Int.
*/
internal fun Char.getCategoryValue(): Int {
val ch = this.toInt()
val index = ${indexOf("ch")}
val start = ${startAt("index")}
val code = ${categoryAt("index")}
val value = categoryValueFrom(code, ch - start)
return if (value == $UNASSIGNED_CATEGORY_VALUE_REPLACEMENT) CharCategory.UNASSIGNED.value else value
}
""".trimIndent()
// Generated expression that finds the index of the range for the char code expression [charCode].
protected open fun indexOf(charCode: String): String {
return "binarySearchRange(${strategy.rangeRef("rangeStart")}, $charCode)"
}
// Generated expression that reads the start of the range at [index].
protected open fun startAt(index: String): String {
return "${strategy.rangeRef("rangeStart")}[$index]"
}
// Generated expression that reads the category code of the range at [index].
protected open fun categoryAt(index: String): String {
return "${strategy.rangeRef("rangeCategory")}[$index]"
}
}
/**
 * A [CategoryRangesWriter] that does not write the ranges as int array literals.
 *
 * Instead it emits two strings in a variable-length base64 encoding together with an `init` block that
 * decodes them into `decodedRangeStart` and `decodedRangeCategory`, and the source of the
 * `decodeVarLenBase64` function used by that block.
 */
internal class VarLenBase64CategoryRangesWriter(strategy: RangesWritingStrategy) : CategoryRangesWriter(strategy) {
// In addition to what the base class emits, appends the source of the decoder function.
override fun afterWritingRanges(writer: FileWriter) {
super.afterWritingRanges(writer)
writer.appendLine()
writer.appendLine(decodeVarLenBase64())
}
// Emits the encoded data and the decoding `init` block. `rangeEnd` is not used.
// Only the differences between neighbouring range starts are encoded; the generated code rebuilds the starts
// from them and leaves `start[0]` at its default value 0.
// NOTE(review): this presumably relies on the first range always starting at 0 - confirm against the ranges builder.
override fun writeInit(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
val rangeLength = rangeStart.zipWithNext { a, b -> b - a }
val base64RangeLength = rangeLength.toVarLenBase64()
val base64RangeCategory = rangeCategory.toVarLenBase64()
writer.appendLine(
"""
val decodedRangeStart: IntArray
val decodedRangeCategory: IntArray
init {
val toBase64 = "$TO_BASE64"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = ${base64RangeLength.length}
val rangeStartDiff = "$base64RangeLength"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, ${rangeLength.size})
val start = IntArray(diff.size + 1)
for (i in diff.indices) {
start[i + 1] = start[i] + diff[i]
}
decodedRangeStart = start
// rangeCategory.length = ${base64RangeCategory.length}
val rangeCategory = "$base64RangeCategory"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, ${rangeCategory.size})
}
""".replaceIndent(strategy.indentation)
)
}
// The int array literals are replaced by the encoded strings written in writeInit.
override fun writeRangeStart(elements: List<Int>, writer: FileWriter) {}
override fun writeRangeCategory(elements: List<Int>, writer: FileWriter) {}
// Source of the generated decoder: every base64 char contributes its five low bits to the current int,
// least significant group first; a six-bit value below 0x20 marks the last group of an int.
private fun decodeVarLenBase64() = """
internal fun decodeVarLenBase64(base64: String, fromBase64: IntArray, resultLength: Int): IntArray {
val result = IntArray(resultLength)
var index = 0
var int = 0
var shift = 0
for (char in base64) {
val sixBit = fromBase64[char.toInt()]
int = int or ((sixBit and 0x1f) shl shift)
if (sixBit < 0x20) {
result[index++] = int
int = 0
shift = 0
} else {
shift += 5
}
}
return result
}
""".trimIndent()
// The generated lookups read the decoded arrays instead of the array literals.
override fun indexOf(charCode: String): String {
return "binarySearchRange(${strategy.rangeRef("decodedRangeStart")}, $charCode)"
}
override fun startAt(index: String): String {
return "${strategy.rangeRef("decodedRangeStart")}[$index]"
}
override fun categoryAt(index: String): String {
return "${strategy.rangeRef("decodedRangeCategory")}[$index]"
}
}
@@ -0,0 +1,60 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import generators.unicode.ranges.RangesWritingStrategy
import java.io.FileWriter
/**
 * Writes the starts of the digit ranges together with the source of the `binarySearchRange` and
 * `Char.isDigitImpl` functions that use them.
 *
 * The raw-string templates below are emitted verbatim (after `trimIndent`), so their whitespace is significant.
 */
internal class DigitRangesWriter(private val strategy: RangesWritingStrategy) : RangesWriter {
    /**
     * Writes the `rangeStart` array and the generated functions to the [writer].
     *
     * @param rangeStart char codes of the first digit of every range
     * @param rangeEnd char codes of the last digit of every range; only used for validation
     * @param rangeCategory not used
     * @throws IllegalStateException if some range does not consist of exactly 10 chars
     */
    override fun write(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
        // digit ranges always have length equal to 10, so that the difference between the last char code in range and the first one is always 9.
        // Therefore, no need to generate ranges end
        check(rangeStart.indices.all { rangeEnd[it] - rangeStart[it] == 9 })

        strategy.beforeWritingRanges(writer)
        writer.writeIntArray("rangeStart", rangeStart, strategy)
        strategy.afterWritingRanges(writer)
        writer.appendLine()
        writer.appendLine(binarySearchRange())
        writer.appendLine()
        writer.appendLine(isDigitImpl())
    }

    /**
     * Source of the generated `binarySearchRange`: returns the index of the last element of `array`
     * that is not greater than `needle`, or `-1` if there is no such element.
     */
    private fun binarySearchRange(): String = """
internal fun binarySearchRange(array: IntArray, needle: Int): Int {
var bottom = 0
var top = array.size - 1
var middle = -1
var value = 0
while (bottom <= top) {
middle = (bottom + top) / 2
value = array[middle]
if (needle > value)
bottom = middle + 1
else if (needle == value)
return middle
else
top = middle - 1
}
return middle - (if (needle < value) 1 else 0)
}
""".trimIndent()

    /**
     * Source of the generated `Char.isDigitImpl`: a char is a digit if it lies within the first 10 chars
     * of the range found by `binarySearchRange`.
     */
    private fun isDigitImpl(): String {
        val rangeStart = strategy.rangeRef("rangeStart")
        // `binarySearchRange` returns -1 for a char code below the first range start; the generated code
        // must not use that value as an array index, hence the `index < 0` guard.
        return """
/**
* Returns `true` if this character is a digit.
*/
internal fun Char.isDigitImpl(): Boolean {
val ch = this.toInt()
val index = binarySearchRange($rangeStart, ch)
if (index < 0) return false
val high = $rangeStart[index] + 9
return ch <= high
}
""".trimIndent()
    }
}
@@ -0,0 +1,76 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import generators.unicode.ranges.RangesWritingStrategy
import templates.COPYRIGHT_NOTICE
import templates.autoGeneratedWarning
import java.io.File
import java.io.FileWriter
/**
 * Reports the [file] being generated on the standard output and writes the common file header:
 * the copyright notice, the `package` directive for [pkg], an empty line and the auto-generation warning.
 */
internal fun FileWriter.writeHeader(file: File, pkg: String) {
    println("Generating file: $file")
    appendLine(COPYRIGHT_NOTICE)
        .appendLine("package $pkg")
        .appendLine()
        .appendLine(autoGeneratedWarning("GenerateUnicodeData.kt"))
}
/**
 * Writes the declaration `val name = intArrayOf(...)` with the given [elements] in hexadecimal form,
 * 20 elements per line.
 *
 * The annotation, the visibility modifier and the indentation of the declaration are supplied by the [strategy].
 */
internal fun FileWriter.writeIntArray(
    name: String,
    elements: List<Int>,
    strategy: RangesWritingStrategy
) {
    val appendIndented: (String) -> Unit = { text -> append(strategy.indentation + text) }

    append(strategy.rangesAnnotation)
    appendIndented("${strategy.rangesVisibilityModifier} val $name = intArrayOf(")
    // Every chunk of up to 20 elements goes on a line of its own.
    elements.chunked(20).forEach { row ->
        appendLine()
        appendIndented(" ")
        row.forEach { element -> append(element.hex() + ", ") }
    }
    appendLine()
    appendIndented(")")
    appendLine()
}
// The base64 alphabet: the char at index `i` encodes the six-bit value `i`.
internal const val TO_BASE64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

/**
 * Encodes the ints of this list as a single string: every int is split into six-bit groups
 * and every group is written as the char of [TO_BASE64] at that index.
 */
internal fun List<Int>.toVarLenBase64(): String {
    val sixBitGroups = mutableListOf<Int>()
    for (value in this) {
        sixBitGroups += value.to6Bits()
    }
    return buildString {
        for (group in sixBitGroups) append(TO_BASE64[group])
    }
}
/**
 * Splits this non-negative int into groups of five bits, least significant group first.
 *
 * Every group except the last one has the bit `0x20` set to signal that more groups follow.
 *
 * @throws IllegalArgumentException if this int is negative
 */
private fun Int.to6Bits(): List<Int> {
    require(this >= 0)
    val groups = mutableListOf<Int>()
    var remaining = this
    while (true) {
        val lowFive = remaining and 0x1f
        remaining = remaining shr 5
        if (remaining == 0) {
            // the last group carries no continuation bit
            groups.add(lowFive)
            return groups
        }
        groups.add(lowFive or 0x20)
    }
}
/**
 * Formats this int as a hexadecimal literal: `0x` followed by at least four lower case hex digits.
 * A negative value is written with a leading minus sign, e.g. `-0x0001`.
 */
internal fun Int.hex(): String {
    val digits = toString(radix = 16)
    val negative = digits.startsWith('-')
    val magnitude = if (negative) digits.drop(1) else digits
    val sign = if (negative) "-" else ""
    return sign + "0x" + magnitude.padStart(4, '0')
}
@@ -0,0 +1,210 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import generators.unicode.ranges.RangesWritingStrategy
import generators.unicode.ranges.patterns.GapRangePattern
import java.io.FileWriter
/**
 * Writes the letter ranges and the source of the functions that classify a char with them.
 *
 * The output consists of the `rangeStart`, `rangeLength` and `rangeCategory` arrays, wrapped by whatever
 * the [strategy] writes before and after the ranges, followed by the generated `Char.isLetterImpl`,
 * `Char.isLowerCaseImpl`, `Char.isUpperCaseImpl` and `Char.getLetterType` functions.
 * Subclasses can replace every step and the way the arrays are referenced.
 *
 * The raw-string template below is emitted verbatim (after `trimIndent`), so its whitespace is significant.
 */
internal open class LetterRangesWriter(protected val strategy: RangesWritingStrategy) : RangesWriter {
    /**
     * Runs the writing steps in a fixed order. The length of a range is `rangeEnd - rangeStart + 1`.
     */
    override fun write(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
        beforeWritingRanges(writer)
        writeRangeStart(rangeStart, writer)
        writeRangeLength(rangeEnd.mapIndexed { i, e -> e - rangeStart[i] + 1 }, writer)
        writeRangeCategory(rangeCategory, writer)
        writeInit(rangeStart, rangeEnd, rangeCategory, writer)
        afterWritingRanges(writer)
    }

    /** Lets the strategy write whatever must precede the ranges. */
    protected open fun beforeWritingRanges(writer: FileWriter) {
        strategy.beforeWritingRanges(writer)
    }

    /** Lets the strategy write whatever must follow the ranges, then emits the generated functions. */
    protected open fun afterWritingRanges(writer: FileWriter) {
        strategy.afterWritingRanges(writer)
        writer.appendLine()
        writer.appendLine(getLetterType())
    }

    /** Writes the range starts as an int array named "rangeStart". */
    protected open fun writeRangeStart(elements: List<Int>, writer: FileWriter) {
        writer.writeIntArray("rangeStart", elements, strategy)
        writer.appendLine()
    }

    /** Writes the range lengths as an int array named "rangeLength". */
    protected open fun writeRangeLength(elements: List<Int>, writer: FileWriter) {
        writer.writeIntArray("rangeLength", elements, strategy)
        writer.appendLine()
    }

    /** Writes the range categories as an int array named "rangeCategory". */
    protected open fun writeRangeCategory(elements: List<Int>, writer: FileWriter) {
        writer.writeIntArray("rangeCategory", elements, strategy)
        writer.appendLine()
    }

    /** Hook for subclasses that need to emit initialization code; intentionally empty here. */
    protected open fun writeInit(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {}

    /**
     * Source of the generated letter classification functions.
     *
     * The generated `getLetterType` looks the range of the char up and decodes the range code:
     * - two zero low bits: a gap pattern of up to two (chars, gap) pairs, whose bit widths are
     *   [GapRangePattern.CHARS_BITS] and [GapRangePattern.GAP_BITS];
     * - a code not greater than 0x7: all the chars of the range share the type stored in the two low bits;
     * - a code not greater than 0x1F: two alternating two-bit types, selected by the parity of the distance from the range start;
     * - otherwise: one two-bit type per char, selected by the distance from the range start.
     *
     * `binarySearchRange` returns -1 for a char code below the first range start; the generated code
     * must not use that value as an array index, hence the `index < 0` guard.
     */
    private fun getLetterType(): String = """
/**
* Returns `true` if this character is a letter.
*/
internal fun Char.isLetterImpl(): Boolean {
return getLetterType() != 0
}
/**
* Returns `true` if this character is a lower case letter.
*/
internal fun Char.isLowerCaseImpl(): Boolean {
return getLetterType() == 1
}
/**
* Returns `true` if this character is an upper case letter.
*/
internal fun Char.isUpperCaseImpl(): Boolean {
return getLetterType() == 2
}
/**
* Returns
* - `1` if the character is a lower case letter,
* - `2` if the character is an upper case letter,
* - `3` if the character is a letter but not a lower or upper case letter,
* - `0` otherwise.
*/
private fun Char.getLetterType(): Int {
val ch = this.toInt()
val index = ${indexOf("ch")}
if (index < 0) {
return 0
}
val rangeStart = ${startAt("index")}
val rangeEnd = rangeStart + ${lengthAt("index")} - 1
val code = ${categoryAt("index")}
if (ch > rangeEnd) {
return 0
}
val lastTwoBits = code and 0x3
if (lastTwoBits == 0) { // gap pattern
var shift = 2
var threshold = rangeStart
for (i in 0..1) {
threshold += (code shr shift) and 0x${((1 shl GapRangePattern.CHARS_BITS) - 1).toString(16)}
if (threshold > ch) {
return 3
}
shift += ${GapRangePattern.CHARS_BITS}
threshold += (code shr shift) and 0x${((1 shl GapRangePattern.GAP_BITS) - 1).toString(16)}
if (threshold > ch) {
return 0
}
shift += ${GapRangePattern.GAP_BITS}
}
return 3
}
if (code <= 0x7) {
return lastTwoBits
}
val distance = (ch - rangeStart)
val shift = if (code <= 0x1F) distance % 2 else distance
return (code shr (2 * shift)) and 0x3
}
""".trimIndent()

    /** Generated expression that finds the index of the range for the char code expression [charCode]. */
    protected open fun indexOf(charCode: String): String {
        return "binarySearchRange(${strategy.rangeRef("rangeStart")}, $charCode)"
    }

    /** Generated expression that reads the start of the range at [index]. */
    protected open fun startAt(index: String): String {
        return "${strategy.rangeRef("rangeStart")}[$index]"
    }

    /** Generated expression that reads the length of the range at [index]. */
    protected open fun lengthAt(index: String): String {
        return "${strategy.rangeRef("rangeLength")}[$index]"
    }

    /** Generated expression that reads the category code of the range at [index]. */
    protected open fun categoryAt(index: String): String {
        return "${strategy.rangeRef("rangeCategory")}[$index]"
    }
}
/**
 * A [LetterRangesWriter] that does not write the ranges as int array literals.
 *
 * Instead it emits three strings in a variable-length base64 encoding together with an `init` block that
 * decodes them into `decodedRangeStart`, `decodedRangeLength` and `decodedRangeCategory`.
 */
internal class VarLenBase64LetterRangesWriter(strategy: RangesWritingStrategy) : LetterRangesWriter(strategy) {
// Same output as the base class, followed by an empty line.
override fun afterWritingRanges(writer: FileWriter) {
super.afterWritingRanges(writer)
writer.appendLine()
}
// Emits the encoded data and the decoding `init` block.
// The range starts are encoded as differences from the previous start; the first start is encoded as is.
// NOTE(review): the generated code calls `decodeVarLenBase64`, which this class does not emit; presumably it
// relies on the definition emitted by VarLenBase64CategoryRangesWriter - confirm.
override fun writeInit(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
val rangeStartDiff = rangeStart.mapIndexed { i, e -> if (i == 0) e else e - rangeStart[i - 1] }
val rangeLength = rangeEnd.mapIndexed { i, e -> e - rangeStart[i] + 1 }
val base64RangeStartDiff = rangeStartDiff.toVarLenBase64()
val base64RangeLength = rangeLength.toVarLenBase64()
val base64RangeCategory = rangeCategory.toVarLenBase64()
writer.appendLine(
"""
val decodedRangeStart: IntArray
val decodedRangeLength: IntArray
val decodedRangeCategory: IntArray
init {
val toBase64 = "$TO_BASE64"
val fromBase64 = IntArray(128)
for (i in toBase64.indices) {
fromBase64[toBase64[i].toInt()] = i
}
// rangeStartDiff.length = ${base64RangeStartDiff.length}
val rangeStartDiff = "$base64RangeStartDiff"
val diff = decodeVarLenBase64(rangeStartDiff, fromBase64, ${rangeStartDiff.size})
val start = IntArray(diff.size)
for (i in diff.indices) {
if (i == 0) start[i] = diff[i]
else start[i] = start[i - 1] + diff[i]
}
decodedRangeStart = start
// rangeLength.length = ${base64RangeLength.length}
val rangeLength = "$base64RangeLength"
decodedRangeLength = decodeVarLenBase64(rangeLength, fromBase64, ${rangeLength.size})
// rangeCategory.length = ${base64RangeCategory.length}
val rangeCategory = "$base64RangeCategory"
decodedRangeCategory = decodeVarLenBase64(rangeCategory, fromBase64, ${rangeCategory.size})
}
""".replaceIndent(strategy.indentation)
)
}
// The int array literals are replaced by the encoded strings written in writeInit.
override fun writeRangeStart(elements: List<Int>, writer: FileWriter) {}
override fun writeRangeLength(elements: List<Int>, writer: FileWriter) {}
override fun writeRangeCategory(elements: List<Int>, writer: FileWriter) {}
// The generated lookups read the decoded arrays instead of the array literals.
override fun indexOf(charCode: String): String {
return "binarySearchRange(${strategy.rangeRef("decodedRangeStart")}, $charCode)"
}
override fun startAt(index: String): String {
return "${strategy.rangeRef("decodedRangeStart")}[$index]"
}
override fun lengthAt(index: String): String {
return "${strategy.rangeRef("decodedRangeLength")}[$index]"
}
override fun categoryAt(index: String): String {
return "${strategy.rangeRef("decodedRangeCategory")}[$index]"
}
}
@@ -0,0 +1,12 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import java.io.FileWriter
/**
 * Writes a set of char ranges, and any code that accompanies them, to a file.
 */
interface RangesWriter {
/**
 * Writes the ranges described by the three parallel lists to the [writer].
 *
 * The element at index `i` of [rangeStart], [rangeEnd] and [rangeCategory] describes the `i`-th range.
 * An implementation may ignore the lists it does not need.
 */
fun write(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter)
}
@@ -0,0 +1,62 @@
/*
* Copyright 2010-2020 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/
package generators.unicode.ranges.writers
import java.io.FileWriter
/**
 * Writes the source of `Char.isWhitespaceImpl`, which tests a char against the whitespace ranges
 * with a chain of comparisons instead of a table lookup.
 */
internal class WhitespaceRangesWriter : RangesWriter {
    /**
     * Writes the generated function to the [writer]. The [rangeCategory] list is not used.
     */
    override fun write(rangeStart: List<Int>, rangeEnd: List<Int>, rangeCategory: List<Int>, writer: FileWriter) {
        writer.appendLine(isWhitespaceImpl(rangeStart, rangeEnd))
    }

    /**
     * Source of the generated `Char.isWhitespaceImpl`.
     *
     * The template is kept flush left: `trimIndent` then has no common indent to strip,
     * so the indentation that [rangeChecks] gives to the continuation lines is emitted unchanged.
     */
    private fun isWhitespaceImpl(rangeStart: List<Int>, rangeEnd: List<Int>): String {
        val checks = rangeChecks(rangeStart, rangeEnd, "ch")
        return """
/**
* Returns `true` if this character is a whitespace.
*/
internal fun Char.isWhitespaceImpl(): Boolean {
val ch = this.toInt()
return $checks
}
""".trimIndent()
    }

    /**
     * Builds a boolean expression that is true if the variable named [ch] belongs to one of the ranges.
     *
     * A range of one char becomes an equality check, a range of two chars becomes two equality checks,
     * a longer range becomes an `in` check. Starting from the first single-char range above 0x1000,
     * the remaining checks are grouped as `ch > 0x1000 && (...)`.
     *
     * @return the expression, or `false` if there are no ranges
     */
    private fun rangeChecks(rangeStart: List<Int>, rangeEnd: List<Int>, ch: String): String {
        // Without ranges no char is a whitespace; an empty expression would not compile.
        if (rangeStart.isEmpty()) return "false"

        val tab = " "
        var tabCount = 5
        var groupOpened = false
        val builder = StringBuilder()

        for (i in rangeStart.indices) {
            if (i != 0) {
                builder.append(tab.repeat(tabCount)).append("|| ")
            }

            val start = rangeStart[i]
            val end = rangeEnd[i]
            when (start) {
                end -> {
                    if (start > 0x1000 && tabCount == 5) {
                        builder.appendLine("$ch > 0x1000 && (")
                        tabCount = 6
                        groupOpened = true
                        builder.append(tab.repeat(tabCount))
                    }
                    builder.appendLine("$ch == ${start.hex()}")
                }
                end - 1 -> {
                    builder.appendLine("$ch == ${start.hex()}")
                    builder.append(tab.repeat(tabCount)).append("|| ")
                    builder.appendLine("$ch == ${end.hex()}")
                }
                else -> {
                    builder.appendLine("$ch in ${start.hex()}..${end.hex()}")
                }
            }
        }

        // Close the `ch > 0x1000 && (` group only if it was opened; an unconditional ")" would leave
        // the expression with an unbalanced parenthesis when no single-char range above 0x1000 exists.
        if (groupOpened) {
            builder.append(tab.repeat(5)).append(")")
        }
        return builder.toString()
    }
}
@@ -11,8 +11,8 @@ import java.io.FileWriter
import java.io.Reader
import javax.xml.xpath.XPathFactory
val COMMON_AUTOGENERATED_WARNING: String = """//
// NOTE: THIS FILE IS AUTO-GENERATED by the GenerateStandardLib.kt
fun autoGeneratedWarning(generator: String): String = """//
// NOTE: THIS FILE IS AUTO-GENERATED by the $generator
// See: https://github.com/JetBrains/kotlin/tree/master/libraries/stdlib
//"""
@@ -92,7 +92,7 @@ fun List<MemberBuilder>.writeTo(file: File, targetedSource: TargetedSourceFile)
}
writer.append("package ${sourceFile.packageName ?: "kotlin"}\n\n")
writer.append("${COMMON_AUTOGENERATED_WARNING}\n\n")
writer.append("${autoGeneratedWarning("GenerateStandardLib.kt")}\n\n")
if (target.platform == Platform.JS) {
writer.appendln("import kotlin.js.*")
if (sourceFile == SourceFile.Arrays) {