2018-01-04 19:52:25 +01:00
|
|
|
|
package jp.juggler.subwaytooter.util
|
|
|
|
|
|
2018-11-30 11:54:32 +01:00
|
|
|
|
import android.util.SparseBooleanArray
|
2018-01-04 19:52:25 +01:00
|
|
|
|
import android.util.SparseIntArray
|
2018-11-30 11:54:32 +01:00
|
|
|
|
import java.util.regex.Pattern
|
2018-01-04 19:52:25 +01:00
|
|
|
|
|
|
|
|
|
class CharacterGroup {
|
|
|
|
|
|
|
|
|
|
companion object {
|
|
|
|
|
|
|
|
|
|
// Tokenizerが終端に達したことを示す
|
|
|
|
|
// Sentinel returned by Tokenizer.next() when only whitespace (or nothing) remains before the end position.
const val END = - 1
|
|
|
|
|
|
|
|
|
|
// 文字コードから文字列を作る
|
2018-07-06 17:22:22 +02:00
|
|
|
|
/**
 * Builds a one-character [String] from a single UTF-16 code unit.
 *
 * @param tmp scratch buffer holding at least one element; its first element is overwritten with [c].
 * @param c the character to convert.
 * @return a new String consisting of [c] only.
 */
fun c2s(tmp : CharArray, c : Char) : String =
    String(tmp.also { it[0] = c }, 0, 1)
|
2018-07-06 17:22:22 +02:00
|
|
|
|
|
2018-01-04 19:52:25 +01:00
|
|
|
|
/**
 * Builds a [String] from a numeric character code.
 *
 * Values that fit in one UTF-16 code unit yield a one-character string, exactly as before.
 * Supplementary code points (U+10000..U+10FFFF) used to be truncated to their low 16 bits;
 * they are now encoded as a surrogate pair, so the result has length 2.
 * Any other value keeps the previous behavior (the low 16 bits are used).
 *
 * @param tmp scratch buffer holding at least one element; its first element is overwritten
 *            unless [c] is a supplementary code point.
 * @param c the character code to convert.
 * @return the string form of [c].
 */
fun i2s(tmp : CharArray, c : Int) : String {
    if(Character.isSupplementaryCodePoint(c)) {
        // One Char cannot hold this code point; let the JDK produce the surrogate pair.
        return String(Character.toChars(c))
    }
    tmp[0] = c.toChar()
    return String(tmp, 0, 1)
}
|
|
|
|
|
|
2018-11-30 11:54:32 +01:00
|
|
|
|
// Set of code points that isWhitespace() reports as whitespace (each key is stored with value true).
private val mapWhitespace = SparseBooleanArray().also { table ->
    val codePoints = intArrayOf(
        0x0009, // HORIZONTAL TABULATION
        0x000A, // LINE FEED
        0x000B, // VERTICAL TABULATION
        0x000C, // FORM FEED
        0x000D, // CARRIAGE RETURN
        0x001C, // FILE SEPARATOR
        0x001D, // GROUP SEPARATOR
        0x001E, // RECORD SEPARATOR
        0x001F, // UNIT SEPARATOR
        0x0020,
        0x0085, // next line (latin-1)
        0x00A0, // non-breaking space
        0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
        0x2007, // non-breaking space
        0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
        0x2028, // line separator
        0x2029, // paragraph separator
        0x202F, // non-breaking space
        0x205F, 0x2060, 0x3000, 0x3164, 0xFEFF
    )
    for(cp in codePoints) {
        table.put(cp, true)
    }
}
|
|
|
|
|
|
2018-11-30 11:54:32 +01:00
|
|
|
|
// 空白とみなす文字なら真
|
|
|
|
|
/** Returns true when [cp] is one of the code points registered in the whitespace table, false otherwise. */
fun isWhitespace(cp : Int) : Boolean {
    return mapWhitespace.get(cp, false)
}
|
|
|
|
|
|
2018-11-30 21:58:13 +01:00
|
|
|
|
/**
 * Regular expression matching one or more consecutive whitespace characters.
 *
 * The character class is `\s` plus every code point registered in the whitespace table,
 * so anything isWhitespace() accepts is matched here too. Previously only keys above 0x20
 * were added, which left out the separators U+001C..U+001F that `\s` does not cover.
 * Each key is written as a `\uXXXX` escape rather than a raw character, so no invisible
 * or regex-significant character ends up literally inside the pattern text.
 */
internal val reWhitespace = Pattern.compile(
    StringBuilder().apply {
        append("[\\s\\t\\x0d\\x0a")
        for(i in 0 until mapWhitespace.size()) {
            val k = mapWhitespace.keyAt(i)
            // All registered keys fit in 16 bits, so a four-digit escape is sufficient.
            append("\\u").append(Integer.toHexString(k).padStart(4, '0'))
        }
        append("]+")
    }.toString()
)
|
2018-11-30 11:54:32 +01:00
|
|
|
|
|
|
|
|
|
|
2018-01-04 19:52:25 +01:00
|
|
|
|
// 文字列のリストからグループIDを決定する
|
2018-07-06 17:22:22 +02:00
|
|
|
|
/**
 * Determines the id of a group from its member strings.
 *
 * The id is the smallest UTF-16 code unit among the members that are exactly one character long;
 * longer (or empty) members do not take part.
 *
 * @param list all member strings of the group.
 * @return the group id.
 * @throws RuntimeException if [list] contains no one-character member.
 */
private fun findGroupId(list : Array<String>) : Int {
    val smallest = list
        .filter { it.length == 1 }
        .fold(Integer.MAX_VALUE) { best, member -> minOf(best, member[0].toInt()) }

    // MAX_VALUE can only survive when no single-character member was seen.
    if(smallest == Integer.MAX_VALUE) throw RuntimeException("missing group id")
    return smallest
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Maps used to look up the group id of a string
|
|
|
|
|
|
|
|
|
|
// 文字数1: unicode => group_id
|
|
|
|
|
// One-character members: UTF-16 code unit => group id. Lookups treat a result of 0 as "not registered".
private val map1 = SparseIntArray()
|
|
|
|
|
|
|
|
|
|
// 文字数2: unicode 二つを合成した数値 => group_id。半角カナ+濁音など
|
|
|
|
|
// Two-character members: (first char) or (second char shl 16) => group id. Lookups treat a result of 0 as "not registered".
private val map2 = SparseIntArray()
|
2018-07-08 19:00:47 +02:00
|
|
|
|
|
|
|
|
|
// ユニコード文字を正規化する。
|
|
|
|
|
// 簡易版なので全ての文字には対応していない
|
|
|
|
|
/**
 * Normalizes a character to the id of the group it belongs to.
 *
 * This is a simplified normalization: only characters registered as one-character
 * group members are mapped; every other character is returned unchanged.
 *
 * @param c the character to normalize.
 * @return the group id of [c] as a Char, or [c] itself when it belongs to no group.
 */
fun getUnifiedCharacter(c:Char):Char{
    val groupId = map1.get(c.toInt())
    if(groupId == 0) return c
    return groupId.toChar()
}
|
2018-01-04 19:52:25 +01:00
|
|
|
|
|
|
|
|
|
// グループをmapに登録する
|
2018-07-06 17:22:22 +02:00
|
|
|
|
/**
 * Registers all strings in [list] as members of one group.
 *
 * The group id comes from findGroupId(). One-character members are stored in map1 and
 * two-character members in map2.
 *
 * Members are validated before anything is stored: an empty member used to fail with an
 * index error, and a member longer than two characters used to be registered silently
 * under its first two characters.
 *
 * @param list member strings, each one or two UTF-16 code units long.
 * @throws IllegalArgumentException if a member is empty or longer than two characters.
 * @throws RuntimeException if the group has no one-character member, or if a member is
 *         already registered under a different group id.
 */
private fun addGroup(list : Array<String>) {

    for(s in list) {
        require(s.length == 1 || s.length == 2) {
            "group member must be 1 or 2 chars long: \"$s\""
        }
    }

    val groupId = findGroupId(list)

    for(s in list) {
        val first = s[0].toInt()

        val map : SparseIntArray
        val key : Int
        if(s.length == 1) {
            map = map1
            key = first
        } else {
            // Pack both code units into one Int key: low 16 bits = first char, high 16 bits = second char.
            map = map2
            key = first or (s[1].toInt() shl 16)
        }

        // 0 means "no entry"; re-registering the same id is allowed.
        val old = map.get(key)
        if(old != 0 && old != groupId) {
            throw RuntimeException("group conflict: $s")
        }
        map.put(key, groupId)
    }
}
|
|
|
|
|
|
|
|
|
|
// 入力された文字列から 文字,グループ,終端 のどれかを順に列挙する
|
2018-07-06 17:22:22 +02:00
|
|
|
|
/**
 * Walks over a range of a CharSequence and yields, one call at a time,
 * either a group id, a plain UTF-16 code unit, or [END].
 */
inner class Tokenizer {

    // Text being scanned.
    internal var text : CharSequence = ""

    // Exclusive end index of the scanned range.
    internal var end : Int = 0

    // Index at which the next call to next() starts scanning.
    var offset : Int = 0

    /**
     * Points this tokenizer at a new range.
     *
     * @param text the text to scan.
     * @param start index of the first character to scan.
     * @param end index just past the last character to scan.
     * @return this tokenizer, to allow chaining.
     */
    internal fun reset(text : CharSequence, start : Int, end : Int) : Tokenizer {
        this.end = end
        this.offset = start
        this.text = text
        return this
    }

    /**
     * Reads the next token.
     *
     * Leading whitespace is skipped. A registered two-character sequence is preferred over a
     * registered single character; an unregistered character is returned as its own code unit.
     *
     * @return [END], a group id, or a UTF-16 code unit.
     */
    operator fun next() : Int {

        // Skip whitespace without touching offset yet.
        var cursor = offset
        while(cursor < end) {
            if(! isWhitespace(text[cursor].toInt())) break
            cursor ++
        }

        // Nothing but whitespace was left: report END and leave offset where it was,
        // so trailing whitespace is not counted as consumed.
        val remain = end - cursor
        if(remain <= 0) return END

        val first = text[cursor].toInt()

        // Longest match first: try the two-character table when two characters are available.
        if(remain >= 2) {
            val pairId = map2.get(first or (text[cursor + 1].toInt() shl 16))
            if(pairId != 0) {
                offset = cursor + 2
                return pairId
            }
        }

        // Single character: either its group id or, if unregistered, the character itself.
        val singleId = map1.get(first)
        offset = cursor + 1
        return if(singleId != 0) singleId else first
    }
}
|
|
|
|
|
|
|
|
|
|
/** Creates a new [Tokenizer] bound to this instance; call its reset() before reading tokens. */
fun tokenizer() : Tokenizer = Tokenizer()
|
|
|
|
|
|
|
|
|
|
// Registers all character groups.
//
// Fullwidth and halfwidth variants are written as \u escapes on purpose: written as raw
// characters they are easily collapsed into their standard-width forms by text tooling,
// which turns every group into a list of duplicates and breaks the quote and backslash
// literals.
init {
    val tmp = CharArray(1)

    // Distance from an ASCII character (U+0021..U+007E) to its fullwidth form (U+FF01..U+FF5E).
    val fullwidthOffset = 0xFEE0

    // Digits 0-9 with their fullwidth forms.
    for(i in 0 .. 9) {
        addGroup(
            arrayOf(
                c2s(tmp, '0' + i),
                c2s(tmp, '0' + i + fullwidthOffset)
            )
        )
    }

    // Latin letters: lower case, upper case and the fullwidth forms of both share one group.
    for(i in 0 .. 25) {
        addGroup(
            arrayOf(
                c2s(tmp, 'a' + i),
                c2s(tmp, 'A' + i),
                c2s(tmp, 'a' + i + fullwidthOffset),
                c2s(tmp, 'A' + i + fullwidthOffset)
            )
        )
    }

    // Hyphens, dashes and prolonged sound marks.
    addGroup(
        arrayOf(
            i2s(tmp, 0x002D), // ASCII hyphen
            i2s(tmp, 0x30FC), // fullwidth kana prolonged sound mark (from Shift_JIS)
            i2s(tmp, 0x2010),
            i2s(tmp, 0x2011),
            i2s(tmp, 0x2013),
            i2s(tmp, 0x2014),
            i2s(tmp, 0x2015), // fullwidth dash (from Shift_JIS)
            i2s(tmp, 0x2212),
            i2s(tmp, 0xFF0d), // fullwidth hyphen-minus (from MS932)
            i2s(tmp, 0xFF70) // halfwidth kana prolonged sound mark (from MS932)
        )
    )

    // ASCII punctuation whose only variant is the fullwidth form.
    // ',' '.' '\' '|' follow below with extra members; '-' and '~' have their own groups.
    for(c in "!\"#\$%&'()*+/:;<=>?@[]^_`{}") {
        addGroup(arrayOf(c2s(tmp, c), c2s(tmp, c + fullwidthOffset)))
    }

    // Punctuation with additional equivalents.
    addGroup(arrayOf(",", "\uFF0C", "、", "\uFF64")) // comma, fullwidth comma, ideographic comma, halfwidth ideographic comma
    addGroup(arrayOf(".", "\uFF0E", "。", "\uFF61")) // full stop, fullwidth full stop, ideographic full stop, halfwidth ideographic full stop
    addGroup(arrayOf("\\", "\uFF3C", "\u00A5", "\uFFE5")) // backslash, fullwidth backslash, yen sign, fullwidth yen sign
    addGroup(arrayOf("|", "\uFF5C", "\u00A6", "\uFFE4")) // vertical line, fullwidth vertical line, broken bar, fullwidth broken bar

    addGroup(arrayOf("・", "\uFF65")) // katakana middle dot and its halfwidth form
    addGroup(arrayOf("「", "\uFF62")) // left corner bracket and its halfwidth form
    addGroup(arrayOf("」", "\uFF63")) // right corner bracket and its halfwidth form

    // Tilde
    addGroup(arrayOf("~", i2s(tmp, 0x301C), i2s(tmp, 0xFF5E)))

    // Voiced and semi-voiced kana. The halfwidth form is two characters:
    // base kana followed by U+FF9E (voiced mark) or U+FF9F (semi-voiced mark).
    addGroup(arrayOf("ガ", "が", "\uFF76\uFF9E"))
    addGroup(arrayOf("ギ", "ぎ", "\uFF77\uFF9E"))
    addGroup(arrayOf("グ", "ぐ", "\uFF78\uFF9E"))
    addGroup(arrayOf("ゲ", "げ", "\uFF79\uFF9E"))
    addGroup(arrayOf("ゴ", "ご", "\uFF7A\uFF9E"))
    addGroup(arrayOf("ザ", "ざ", "\uFF7B\uFF9E"))
    addGroup(arrayOf("ジ", "じ", "\uFF7C\uFF9E"))
    addGroup(arrayOf("ズ", "ず", "\uFF7D\uFF9E"))
    addGroup(arrayOf("ゼ", "ぜ", "\uFF7E\uFF9E"))
    addGroup(arrayOf("ゾ", "ぞ", "\uFF7F\uFF9E"))
    addGroup(arrayOf("ダ", "だ", "\uFF80\uFF9E"))
    addGroup(arrayOf("ヂ", "ぢ", "\uFF81\uFF9E"))
    addGroup(arrayOf("ヅ", "づ", "\uFF82\uFF9E"))
    addGroup(arrayOf("デ", "で", "\uFF83\uFF9E"))
    addGroup(arrayOf("ド", "ど", "\uFF84\uFF9E"))
    addGroup(arrayOf("バ", "ば", "\uFF8A\uFF9E"))
    addGroup(arrayOf("ビ", "び", "\uFF8B\uFF9E"))
    addGroup(arrayOf("ブ", "ぶ", "\uFF8C\uFF9E"))
    addGroup(arrayOf("ベ", "べ", "\uFF8D\uFF9E"))
    addGroup(arrayOf("ボ", "ぼ", "\uFF8E\uFF9E"))
    addGroup(arrayOf("パ", "ぱ", "\uFF8A\uFF9F"))
    addGroup(arrayOf("ピ", "ぴ", "\uFF8B\uFF9F"))
    addGroup(arrayOf("プ", "ぷ", "\uFF8C\uFF9F"))
    addGroup(arrayOf("ペ", "ぺ", "\uFF8D\uFF9F"))
    addGroup(arrayOf("ポ", "ぽ", "\uFF8E\uFF9F"))
    addGroup(arrayOf("ヴ", "\u3046\u309B", "\uFF73\uFF9E")) // second member: hiragana U followed by the standalone voiced mark

    // Plain kana: hiragana, katakana, halfwidth katakana, plus the small forms where they exist.
    addGroup(arrayOf("あ", "ア", "\uFF71", "ぁ", "ァ", "\uFF67"))
    addGroup(arrayOf("い", "イ", "\uFF72", "ぃ", "ィ", "\uFF68"))
    addGroup(arrayOf("う", "ウ", "\uFF73", "ぅ", "ゥ", "\uFF69"))
    addGroup(arrayOf("え", "エ", "\uFF74", "ぇ", "ェ", "\uFF6A"))
    addGroup(arrayOf("お", "オ", "\uFF75", "ぉ", "ォ", "\uFF6B"))
    addGroup(arrayOf("か", "カ", "\uFF76"))
    addGroup(arrayOf("き", "キ", "\uFF77"))
    addGroup(arrayOf("く", "ク", "\uFF78"))
    addGroup(arrayOf("け", "ケ", "\uFF79"))
    addGroup(arrayOf("こ", "コ", "\uFF7A"))
    addGroup(arrayOf("さ", "サ", "\uFF7B"))
    addGroup(arrayOf("し", "シ", "\uFF7C"))
    addGroup(arrayOf("す", "ス", "\uFF7D"))
    addGroup(arrayOf("せ", "セ", "\uFF7E"))
    addGroup(arrayOf("そ", "ソ", "\uFF7F"))
    addGroup(arrayOf("た", "タ", "\uFF80"))
    addGroup(arrayOf("ち", "チ", "\uFF81"))
    addGroup(arrayOf("つ", "ツ", "\uFF82", "っ", "ッ", "\uFF6F"))
    addGroup(arrayOf("て", "テ", "\uFF83"))
    addGroup(arrayOf("と", "ト", "\uFF84"))
    addGroup(arrayOf("な", "ナ", "\uFF85"))
    addGroup(arrayOf("に", "ニ", "\uFF86"))
    addGroup(arrayOf("ぬ", "ヌ", "\uFF87"))
    addGroup(arrayOf("ね", "ネ", "\uFF88"))
    addGroup(arrayOf("の", "ノ", "\uFF89"))
    addGroup(arrayOf("は", "ハ", "\uFF8A"))
    addGroup(arrayOf("ひ", "ヒ", "\uFF8B"))
    addGroup(arrayOf("ふ", "フ", "\uFF8C"))
    addGroup(arrayOf("へ", "ヘ", "\uFF8D"))
    addGroup(arrayOf("ほ", "ホ", "\uFF8E"))
    addGroup(arrayOf("ま", "マ", "\uFF8F"))
    addGroup(arrayOf("み", "ミ", "\uFF90"))
    addGroup(arrayOf("む", "ム", "\uFF91"))
    addGroup(arrayOf("め", "メ", "\uFF92"))
    addGroup(arrayOf("も", "モ", "\uFF93"))
    addGroup(arrayOf("や", "ヤ", "\uFF94", "ゃ", "ャ", "\uFF6C"))
    addGroup(arrayOf("ゆ", "ユ", "\uFF95", "ゅ", "ュ", "\uFF6D"))
    addGroup(arrayOf("よ", "ヨ", "\uFF96", "ょ", "ョ", "\uFF6E"))
    addGroup(arrayOf("ら", "ラ", "\uFF97"))
    addGroup(arrayOf("り", "リ", "\uFF98"))
    addGroup(arrayOf("る", "ル", "\uFF99"))
    addGroup(arrayOf("れ", "レ", "\uFF9A"))
    addGroup(arrayOf("ろ", "ロ", "\uFF9B"))
    addGroup(arrayOf("わ", "ワ", "\uFF9C"))
    addGroup(arrayOf("を", "ヲ", "\uFF66"))
    addGroup(arrayOf("ん", "ン", "\uFF9D"))
}
|
|
|
|
|
}
|