SubwayTooter-Android-App/app/src/main/java/jp/juggler/util/CharacterGroup.kt

408 lines
14 KiB
Kotlin
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package jp.juggler.util
import android.text.SpannableStringBuilder
import android.util.SparseBooleanArray
import android.util.SparseIntArray
import java.util.ArrayList
import java.util.regex.Pattern
object CharacterGroup {
// Value returned by Tokenizer.next() to signal that the end of the input range was reached.
const val END = -1
/**
 * Builds a one-character string from [c].
 *
 * [tmp] is a caller-supplied scratch buffer; its first element is overwritten with [c].
 */
private fun c2s(tmp: CharArray, c: Char): String {
    tmp[0] = c
    return tmp.concatToString(0, 1)
}
/**
 * Builds a one-character string from the UTF-16 code unit [c].
 *
 * [tmp] is a caller-supplied scratch buffer; its first element is overwritten.
 */
private fun i2s(tmp: CharArray, c: Int): String {
    val ch = c.toChar()
    tmp[0] = ch
    return tmp.concatToString(0, 1)
}
// Set of code points that this object treats as whitespace (key present => true).
private val mapWhitespace = SparseBooleanArray().also { table ->
    val codePoints = intArrayOf(
        0x0009, // HORIZONTAL TABULATION
        0x000A, // LINE FEED
        0x000B, // VERTICAL TABULATION
        0x000C, // FORM FEED
        0x000D, // CARRIAGE RETURN
        0x001C, // FILE SEPARATOR
        0x001D, // GROUP SEPARATOR
        0x001E, // RECORD SEPARATOR
        0x001F, // UNIT SEPARATOR
        0x0020, // SPACE
        0x0085, // next line (latin-1)
        0x00A0, // no-break space
        0x1680,
        0x180E,
        0x2000,
        0x2001,
        0x2002,
        0x2003,
        0x2004,
        0x2005,
        0x2006,
        0x2007, // no-break space (figure space)
        0x2008,
        0x2009,
        0x200A,
        0x200B,
        0x200C,
        0x200D,
        0x2028, // line separator
        0x2029, // paragraph separator
        0x202F, // no-break space (narrow)
        0x205F,
        0x2060,
        0x3000,
        0x3164,
        0xFEFF
    )
    for (codePoint in codePoints) {
        table.put(codePoint, true)
    }
}
/** Returns true when [cp] is one of the code points registered in the whitespace table. */
fun isWhitespace(cp: Int): Boolean {
    // The single-argument get() yields false for keys that were never put.
    return mapWhitespace.get(cp)
}
// Pattern matching one or more consecutive characters from the whitespace table.
internal val reWhitespace by lazy {
    val count = mapWhitespace.size()
    // Concatenate every registered whitespace code unit into a single string.
    val members = buildString(count) {
        var index = 0
        while (index < count) {
            append(mapWhitespace.keyAt(index).toChar())
            index++
        }
    }
    val quoted = Pattern.quote(members)
    "[$quoted]+".asciiPattern()
}
// Pattern matching one or more consecutive characters that are NOT in the whitespace table.
internal val reNotWhitespace by lazy {
    val members = (0 until mapWhitespace.size())
        .joinToString(separator = "") { index ->
            mapWhitespace.keyAt(index).toChar().toString()
        }
    val quoted = Pattern.quote(members)
    "[^$quoted]+".asciiPattern()
}
// Returns all keys of this array as a list, in index order.
private fun SparseBooleanArray.keys() = List(size()) { index -> keyAt(index) }
// Pattern matching a run of whitespace (other than LINE FEED) that is directly followed by a LINE FEED.
internal val reWhitespaceBeforeLineFeed by lazy {
    val lineFeed = '\n'
    // Character-class body: every registered whitespace character except the line feed itself.
    val charClass = buildString {
        for (codeUnit in mapWhitespace.keys()) {
            val ch = codeUnit.toChar()
            if (ch != lineFeed) append(ch)
        }
    }
    "[$charClass]+\n".asciiPattern()
}
/**
 * Determines the group ID for a list of equivalent strings.
 *
 * The ID is the smallest UTF-16 code unit among the one-character strings in [list].
 * Fails with an IllegalStateException ("missing group id") when [list] has no one-character string.
 */
private fun findGroupId(list: Array<String>): Int =
    list.asSequence()
        .filter { it.length == 1 }
        .minOfOrNull { it[0].code }
        ?: error("missing group id")
// Lookup tables from a string to its group ID.
// One-character strings: UTF-16 code unit => group_id
private val map1 = SparseIntArray()
// Two-character strings: number composed from both code units => group_id. Used for e.g. half-width kana with a voiced sound mark.
private val map2 = SparseIntArray()
/**
 * Normalizes a character to the representative of its group.
 *
 * Returns the group ID registered for [c] in the one-character table, converted to a Char,
 * or [c] itself when nothing is registered. This is a simplified normalization and does not
 * cover every character.
 */
fun getUnifiedCharacter(c: Char): Char {
    val groupId = map1.get(c.code)
    if (groupId == 0) return c
    return groupId.toChar()
}
/**
 * Registers a group of equivalent strings in the lookup tables.
 *
 * One-character strings go into map1 keyed by their code unit; longer strings go into map2
 * keyed by their first two code units packed into one Int. Fails with an IllegalStateException
 * when a key is already registered under a different group.
 */
private fun addGroup(list: Array<String>) {
    val groupId = findGroupId(list)
    for (entry in list) {
        val first = entry[0].code
        val isSingle = entry.length == 1
        val table = if (isSingle) map1 else map2
        // Two-character keys carry the second code unit in the upper 16 bits.
        val key = if (isSingle) first else first or (entry[1].code shl 16)
        val previous = table.get(key)
        if (previous != 0 && previous != groupId) error("group conflict: $entry")
        table.put(key, groupId)
    }
}
/**
 * Walks over a range of a text and yields, one call at a time, either a plain UTF-16
 * character, a group ID, or [END].
 */
class Tokenizer {
    internal var text: CharSequence = ""
    internal var end: Int = 0
    var offset: Int = 0

    /** Points this tokenizer at the range [start, end) of [text] and returns it for chaining. */
    internal fun reset(text: CharSequence, start: Int, end: Int): Tokenizer = apply {
        this.text = text
        this.offset = start
        this.end = end
    }

    /** Returns END, a group_id, or a UTF-16 character, and advances [offset] past what was consumed. */
    operator fun next(): Int {
        // Skip leading whitespace.
        var cursor = offset
        while (cursor < end && isWhitespace(text[cursor].code)) cursor++

        // Only whitespace was left: report END and leave offset untouched, so that
        // trailing whitespace is not counted as consumed.
        if (cursor >= end) return END

        val first = text[cursor].code

        // Registered groups are checked longest first: two-character sequences...
        if (end - cursor >= 2) {
            val pairId = map2.get(first or (text[cursor + 1].code shl 16))
            if (pairId != 0) {
                offset = cursor + 2
                return pairId
            }
        }

        // ...then single characters; an unregistered character is returned as-is.
        val singleId = map1.get(first)
        offset = cursor + 1
        return if (singleId != 0) singleId else first
    }
}
// Registers every character group in the lookup tables when the object is initialized.
// NOTE(review): several string/char literals below appear empty in this copy of the file
// (possibly characters that were not rendered); they are kept exactly as found here.
// Confirm them against the repository before relying on this block.
init {
    val tmp = CharArray(1)
    val array2 = arrayOf("", "")
    val array4 = arrayOf("", "", "", "")
    // Digits. The range must cover all ten digits (0..9); the previous 0..8 left '9' ungrouped.
    for (i in 0..9) {
        array2[0] = c2s(tmp, '0' + i)
        array2[1] = c2s(tmp, '' + i)
        addGroup(array2)
    }
    // Latin letters (26 of them, hence 0..25)
    for (i in 0..25) {
        array4[0] = c2s(tmp, 'a' + i)
        array4[1] = c2s(tmp, 'A' + i)
        array4[2] = c2s(tmp, '' + i)
        array4[3] = c2s(tmp, '' + i)
        addGroup(array4)
    }
    // Hyphens
    addGroup(
        arrayOf(
            i2s(tmp, 0x002D), // ASCII hyphen
            i2s(tmp, 0x30FC), // full-width kana prolonged sound mark (from Shift_JIS)
            i2s(tmp, 0x2010),
            i2s(tmp, 0x2011),
            i2s(tmp, 0x2013),
            i2s(tmp, 0x2014),
            i2s(tmp, 0x2015), // full-width dash (from Shift_JIS)
            i2s(tmp, 0x2212),
            i2s(tmp, 0xFF0d), // full-width form (from MS932)
            i2s(tmp, 0xFF70) // half-width kana prolonged sound mark (from MS932)
        )
    )
    addGroup(arrayOf("", "!"))
    addGroup(arrayOf("", "\""))
    addGroup(arrayOf("", "#"))
    addGroup(arrayOf("", "$"))
    addGroup(arrayOf("", "%"))
    addGroup(arrayOf("", "&"))
    addGroup(arrayOf("", "'"))
    addGroup(arrayOf("", "("))
    addGroup(arrayOf("", ")"))
    addGroup(arrayOf("", "*"))
    addGroup(arrayOf("", "+"))
    addGroup(arrayOf("", ",", "", ""))
    addGroup(arrayOf("", ".", "", ""))
    addGroup(arrayOf("", "/"))
    addGroup(arrayOf("", ":"))
    addGroup(arrayOf("", ";"))
    addGroup(arrayOf("", "<"))
    addGroup(arrayOf("", "="))
    addGroup(arrayOf("", ">"))
    addGroup(arrayOf("", "?"))
    addGroup(arrayOf("", "@"))
    addGroup(arrayOf("", "["))
    addGroup(arrayOf("", "\\", ""))
    addGroup(arrayOf("", "]"))
    addGroup(arrayOf("", "^"))
    addGroup(arrayOf("_", "_"))
    addGroup(arrayOf("", "`"))
    addGroup(arrayOf("", "{"))
    addGroup(arrayOf("", "|", ""))
    addGroup(arrayOf("", "}"))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    // Tilde
    addGroup(
        arrayOf(
            "~",
            i2s(tmp, 0x301C),
            i2s(tmp, 0xFF5E)
        )
    )
    // Half-width kana with a voiced / semi-voiced sound mark take two characters
    addGroup(arrayOf("", "", "ガ"))
    addGroup(arrayOf("", "", "ギ"))
    addGroup(arrayOf("", "", "グ"))
    addGroup(arrayOf("", "", "ゲ"))
    addGroup(arrayOf("", "", "ゴ"))
    addGroup(arrayOf("", "", "ザ"))
    addGroup(arrayOf("", "", "ジ"))
    addGroup(arrayOf("", "", "ズ"))
    addGroup(arrayOf("", "", "ゼ"))
    addGroup(arrayOf("", "", "ゾ"))
    addGroup(arrayOf("", "", "ダ"))
    addGroup(arrayOf("", "", "ヂ"))
    addGroup(arrayOf("", "", "ヅ"))
    addGroup(arrayOf("", "", "デ"))
    addGroup(arrayOf("", "", "ド"))
    addGroup(arrayOf("", "", "バ"))
    addGroup(arrayOf("", "", "ビ"))
    addGroup(arrayOf("", "", "ブ"))
    addGroup(arrayOf("", "", "ベ"))
    addGroup(arrayOf("", "", "ボ"))
    addGroup(arrayOf("", "", "パ"))
    addGroup(arrayOf("", "", "ピ"))
    addGroup(arrayOf("", "", "プ"))
    addGroup(arrayOf("", "", "ペ"))
    addGroup(arrayOf("", "", "ポ"))
    addGroup(arrayOf("", "う゛", "ヴ"))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "ソ", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
}
}
/**
 * Removes trailing whitespace and line breaks, as judged by CharacterGroup.isWhitespace.
 *
 * Mutates the receiver in place and returns it.
 */
fun SpannableStringBuilder.removeEndWhitespaces(): SpannableStringBuilder {
    var keep = length
    while (keep > 0) {
        if (!CharacterGroup.isWhitespace(codePointBefore(keep))) break
        // whitespaces are always 1 == Character.charCount(c)
        keep--
    }
    if (keep != length) delete(keep, length)
    return this
}
/**
 * Removes whitespace at the end of each line and limits consecutive line feeds to two.
 *
 * Mutates the receiver in place and returns it.
 */
fun SpannableStringBuilder.neatSpaces(): SpannableStringBuilder {
    // Step 1: collect every "whitespace run + line feed" match first, then delete
    // back to front so that earlier match positions stay valid.
    val matcher = CharacterGroup.reWhitespaceBeforeLineFeed.matcher(this)
    val ranges = mutableListOf<Pair<Int, Int>>()
    while (matcher.find()) {
        ranges.add(matcher.start() to matcher.end())
    }
    for (index in ranges.indices.reversed()) {
        val (from, to) = ranges[index]
        // Keep the line feed itself (the last matched character).
        delete(from, to - 1)
    }

    // Step 2: scan backwards and drop every line feed beyond the second in a run.
    var lineFeedRun = 0
    for (i in length - 1 downTo 0) {
        if (this[i] == '\n') {
            lineFeedRun++
            if (lineFeedRun >= 3) delete(i, i + 1)
        } else {
            lineFeedRun = 0
        }
    }
    return this
}