SubwayTooter-Android-App/app/src/main/java/jp/juggler/subwaytooter/util/CharacterGroup.kt

307 lines
9.4 KiB
Kotlin
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package jp.juggler.subwaytooter.util
import android.util.SparseIntArray
class CharacterGroup {
companion object {
// Sentinel returned by Tokenizer.next() when the tokenizer has reached the end of its input
const val END = - 1
/**
 * Builds a one-character string from [c].
 *
 * [tmp] is a caller-supplied scratch buffer; its first slot is overwritten with [c].
 */
fun c2s(tmp: CharArray, c: Char): String =
    String(tmp.apply { this[0] = c }, 0, 1)
/**
 * Builds a one-character string from the numeric character code [c].
 *
 * [c] is narrowed with `toChar()`, so only a single UTF-16 code unit is produced.
 * [tmp] is a caller-supplied scratch buffer; its first slot is overwritten.
 */
fun i2s(tmp: CharArray, c: Int): String =
    String(tmp.also { it[0] = c.toChar() }, 0, 1)
/**
 * Returns true when the code [cp] is one of the characters this class treats as whitespace.
 *
 * The set is a fixed, hand-picked list; `Character.isWhitespace` is deliberately not consulted
 * (the original author noted it seemed unnecessary).
 */
fun isWhitespace(cp: Int): Boolean = when (cp) {
    in 0x0009..0x000D, // TAB, LF, VT, FF, CR
    in 0x001C..0x001F, // FILE / GROUP / RECORD / UNIT SEPARATOR
    0x0020,
    0x0085, // next line (latin-1)
    0x00A0, // no-break space
    0x1680,
    0x180E,
    in 0x2000..0x200D, // U+2000 .. U+200D inclusive (U+2007 is a non-breaking space)
    0x2028, // line separator
    0x2029, // paragraph separator
    0x202F, // narrow no-break space
    0x205F,
    0x2060,
    0x3000,
    0x3164,
    0xFEFF -> true

    else -> false
}
/**
 * Determines the group ID for a list of equivalent strings.
 *
 * The ID is the smallest UTF-16 code unit value among the entries whose length is exactly 1;
 * longer (or empty) entries do not take part.
 *
 * @throws RuntimeException if [list] contains no single-character entry.
 */
private fun findGroupId(list: Array<String>): Int {
    val smallest = list.fold(Integer.MAX_VALUE) { best, entry ->
        if (entry.length == 1) minOf(best, entry[0].toInt()) else best
    }
    if (smallest == Integer.MAX_VALUE) throw RuntimeException("missing group id")
    return smallest
}
}
// Lookup tables used to find the group ID for a character sequence.
// Single-character entries: character code => group_id
private val map1 = SparseIntArray()
// Two-character entries: the two character codes packed into one Int => group_id (e.g. half-width kana with a voiced sound mark)
private val map2 = SparseIntArray()
/**
 * Registers every string in [list] as a member of one group.
 *
 * The group ID comes from findGroupId. One-character entries are stored in map1 keyed by the
 * character code; every other entry is stored in map2 keyed by its first two character codes
 * packed as `first or (second shl 16)`.
 *
 * A stored value of 0 is treated as "no entry yet".
 *
 * @throws RuntimeException if a key is already registered under a different group ID.
 */
private fun addGroup(list: Array<String>) {
    val groupId = findGroupId(list)

    for (entry in list) {
        val first = entry[0].toInt()
        val isSingle = entry.length == 1

        val target: SparseIntArray = if (isSingle) map1 else map2
        val key: Int = if (isSingle) first else first or (entry[1].toInt() shl 16)

        // Re-registering the same key for the same group is allowed; only a different group is an error.
        val existing = target.get(key)
        if (!(existing == 0 || existing == groupId)) {
            throw RuntimeException("group conflict: $entry")
        }
        target.put(key, groupId)
    }
}
/**
 * Walks a range of a character sequence and yields, one call at a time,
 * either a group ID, a plain UTF-16 character code, or END.
 */
inner class Tokenizer {

    internal var text: CharSequence = ""
    internal var end: Int = 0

    // Index of the next character to be examined by next()
    var offset: Int = 0

    /** Points this tokenizer at `text[start until end]` and returns the same instance. */
    internal fun reset(text: CharSequence, start: Int, end: Int): Tokenizer = apply {
        this.text = text
        this.offset = start
        this.end = end
    }

    /**
     * Reads the next token.
     *
     * @return END, a group ID, or the UTF-16 character code of an ungrouped character.
     */
    operator fun next(): Int {
        // Skip leading whitespace
        var cursor = offset
        while (cursor < end && isWhitespace(text[cursor].toInt())) cursor++

        // Only whitespace (or nothing) was left. offset is intentionally not advanced,
        // so trailing whitespace is not counted as consumed.
        if (end - cursor <= 0) return END

        val head = text[cursor].toInt()

        // Longest match first: try the two-character table when two characters remain.
        if (end - cursor >= 2) {
            val pairId = map2.get(head or (text[cursor + 1].toInt() shl 16))
            if (pairId != 0) {
                offset = cursor + 2
                return pairId
            }
        }

        // Then the single-character table; fall back to the raw character code.
        val singleId = map1.get(head)
        offset = cursor + 1
        return if (singleId != 0) singleId else head
    }
}
/** Creates a new [Tokenizer] bound to this instance's group tables. */
fun tokenizer(): Tokenizer = Tokenizer()
// Populates the lookup tables with every group of characters that should be treated as equivalent.
//
// NOTE(review): many string/char literals below are empty in this copy of the file. They were
// presumably full-width / half-width / kana characters lost during extraction — restore them
// from the repository before building. An empty char literal does not compile, and an empty
// string makes addGroup fail when it reads the first character.
init {
    val tmp = CharArray(1)
    val array2 = arrayOf("","")
    val array4 = arrayOf("","","","")
    // Digits.
    // The range must be 0..9: with the previous 0..8 the digit '9' (and its counterpart)
    // was never registered, so it was the only digit without a group.
    for(i in 0 .. 9) {
        array2[0] = c2s(tmp,'0' + i)
        array2[1] = c2s(tmp,'' + i)
        addGroup(array2)
    }
    // Latin letters
    for(i in 0 .. 25) {
        array4[0] = c2s(tmp, 'a' + i)
        array4[1] = c2s(tmp, 'A' + i)
        array4[2] = c2s(tmp, '' + i)
        array4[3] = c2s(tmp, '' + i)
        addGroup(array4)
    }
    // Hyphens, dashes and long-vowel marks
    addGroup(arrayOf(
        i2s(tmp, 0x002D), // ASCII hyphen
        i2s(tmp, 0x30FC), // full-width kana prolonged sound mark (Shift_JIS-derived)
        i2s(tmp, 0x2010),
        i2s(tmp, 0x2011),
        i2s(tmp, 0x2013),
        i2s(tmp, 0x2014),
        i2s(tmp, 0x2015), // full-width dash (Shift_JIS-derived)
        i2s(tmp, 0x2212),
        i2s(tmp, 0xFF0d), // U+FF0D FULLWIDTH HYPHEN-MINUS (MS932-derived)
        i2s(tmp, 0xFF70) // half-width kana prolonged sound mark (MS932-derived)
    ))
    // Punctuation and symbols
    addGroup(arrayOf("", "!"))
    addGroup(arrayOf("", "\""))
    addGroup(arrayOf("", "#"))
    addGroup(arrayOf("", "$"))
    addGroup(arrayOf("", "%"))
    addGroup(arrayOf("", "&"))
    addGroup(arrayOf("", "'"))
    addGroup(arrayOf("", "("))
    addGroup(arrayOf("", ")"))
    addGroup(arrayOf("", "*"))
    addGroup(arrayOf("", "+"))
    addGroup(arrayOf("", ",", "", ""))
    addGroup(arrayOf("", ".", "", ""))
    addGroup(arrayOf("", "/"))
    addGroup(arrayOf("", ":"))
    addGroup(arrayOf("", ";"))
    addGroup(arrayOf("", "<"))
    addGroup(arrayOf("", "="))
    addGroup(arrayOf("", ">"))
    addGroup(arrayOf("", "?"))
    addGroup(arrayOf("", "@"))
    addGroup(arrayOf("", "["))
    addGroup(arrayOf("", "\\", ""))
    addGroup(arrayOf("", "]"))
    addGroup(arrayOf("", "^"))
    addGroup(arrayOf("_", "_"))
    addGroup(arrayOf("", "`"))
    addGroup(arrayOf("", "{"))
    addGroup(arrayOf("", "|", ""))
    addGroup(arrayOf("", "}"))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    // Tilde
    addGroup(arrayOf("~", i2s(tmp, 0x301C), i2s(tmp, 0xFF5E)))
    // Half-width kana with a voiced / semi-voiced sound mark take two characters
    addGroup(arrayOf("", "", "ガ"))
    addGroup(arrayOf("", "", "ギ"))
    addGroup(arrayOf("", "", "グ"))
    addGroup(arrayOf("", "", "ゲ"))
    addGroup(arrayOf("", "", "ゴ"))
    addGroup(arrayOf("", "", "ザ"))
    addGroup(arrayOf("", "", "ジ"))
    addGroup(arrayOf("", "", "ズ"))
    addGroup(arrayOf("", "", "ゼ"))
    addGroup(arrayOf("", "", "ゾ"))
    addGroup(arrayOf("", "", "ダ"))
    addGroup(arrayOf("", "", "ヂ"))
    addGroup(arrayOf("", "", "ヅ"))
    addGroup(arrayOf("", "", "デ"))
    addGroup(arrayOf("", "", "ド"))
    addGroup(arrayOf("", "", "バ"))
    addGroup(arrayOf("", "", "ビ"))
    addGroup(arrayOf("", "", "ブ"))
    addGroup(arrayOf("", "", "ベ"))
    addGroup(arrayOf("", "", "ボ"))
    addGroup(arrayOf("", "", "パ"))
    addGroup(arrayOf("", "", "ピ"))
    addGroup(arrayOf("", "", "プ"))
    addGroup(arrayOf("", "", "ペ"))
    addGroup(arrayOf("", "", "ポ"))
    addGroup(arrayOf("", "う゛", "ヴ"))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "ソ", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", "", "", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
    addGroup(arrayOf("", "", ""))
}
}