package jp.juggler.subwaytooter.util

import android.util.SparseIntArray

/**
 * Simplified character normalizer used for loose text comparison.
 *
 * Characters that should compare as equal (ASCII vs. full-width forms, upper vs. lower case
 * Latin letters, hiragana vs. katakana vs. half-width katakana, assorted hyphens, ...) are
 * registered as a "group". Every group is identified by a group id, which is the smallest
 * UTF-16 code unit among the group's one-character members.
 *
 * This is a simplified table; it does not cover every Unicode character.
 *
 * Full-width and half-width members are written with `\u` escapes (or derived from the ASCII
 * form by [FULLWIDTH_OFFSET]) so that the table survives editors and tools that fold character
 * width.
 */
class CharacterGroup {

    companion object {

        /** Returned by [Tokenizer.next] when the end of the range has been reached. */
        const val END = -1

        /** Distance between an ASCII character (U+0021..U+007E) and its full-width form. */
        private const val FULLWIDTH_OFFSET = 0xFEE0

        /** Half-width katakana voiced sound mark (dakuten). */
        private const val HW_DAKUTEN = "\uFF9E"

        /** Half-width katakana semi-voiced sound mark (handakuten). */
        private const val HW_HANDAKUTEN = "\uFF9F"

        /**
         * Builds a one-character string from [c].
         *
         * @param tmp scratch buffer with at least one element; element 0 is overwritten.
         */
        fun c2s(tmp: CharArray, c: Char): String {
            tmp[0] = c
            return String(tmp, 0, 1)
        }

        /**
         * Builds a one-character string from the UTF-16 code unit [c].
         *
         * @param tmp scratch buffer with at least one element; element 0 is overwritten.
         */
        fun i2s(tmp: CharArray, c: Int): String {
            tmp[0] = c.toChar()
            return String(tmp, 0, 1)
        }

        /**
         * Returns true when [cp] is one of the code points this class treats as whitespace.
         *
         * The list is explicit (control separators, spaces, zero-width characters, ...);
         * falling back to `Character.isWhitespace(cp)` appears to be unnecessary.
         */
        fun isWhitespace(cp: Int): Boolean = when (cp) {
            in 0x0009..0x000D, // TAB, LINE FEED, VERTICAL TAB, FORM FEED, CARRIAGE RETURN
            in 0x001C..0x001F, // FILE / GROUP / RECORD / UNIT SEPARATOR
            0x0020, // SPACE
            0x0085, // NEXT LINE (Latin-1)
            0x00A0, // NO-BREAK SPACE
            0x1680, // OGHAM SPACE MARK
            0x180E, // MONGOLIAN VOWEL SEPARATOR
            in 0x2000..0x200A, // EN QUAD .. HAIR SPACE (includes the non-breaking FIGURE SPACE)
            0x200B, // ZERO WIDTH SPACE
            0x200C, // ZERO WIDTH NON-JOINER
            0x200D, // ZERO WIDTH JOINER
            0x2028, // LINE SEPARATOR
            0x2029, // PARAGRAPH SEPARATOR
            0x202F, // NARROW NO-BREAK SPACE
            0x205F, // MEDIUM MATHEMATICAL SPACE
            0x2060, // WORD JOINER
            0x3000, // IDEOGRAPHIC SPACE
            0x3164, // HANGUL FILLER
            0xFEFF -> true // ZERO WIDTH NO-BREAK SPACE (BOM)
            else -> false
        }

        /**
         * Determines the group id for [list]: the smallest UTF-16 code unit among the
         * members that are exactly one character long.
         *
         * @throws RuntimeException if [list] contains no one-character member.
         */
        private fun findGroupId(list: Array<String>): Int {
            var id = Integer.MAX_VALUE
            for (s in list) {
                if (s.length == 1) {
                    val c = s[0].toInt()
                    if (c < id) id = c
                }
            }
            if (id == Integer.MAX_VALUE) {
                throw RuntimeException("missing group id")
            }
            return id
        }
    }

    // Lookup tables from a string to its group id. A stored value of 0 means "not registered".

    // One-character members: UTF-16 code unit => group id.
    private val map1 = SparseIntArray()

    // Two-character members (e.g. half-width kana + voiced mark):
    // first code unit in the low 16 bits, second code unit in the high 16 bits => group id.
    private val map2 = SparseIntArray()

    /**
     * Normalizes a single character: returns the group id of [c] as a Char when [c] is a
     * registered one-character member, otherwise returns [c] unchanged.
     *
     * Simplified: two-character members are not considered here.
     */
    fun getUnifiedCharacter(c: Char): Char {
        val v1 = map1[c.toInt()]
        return if (v1 != 0) v1.toChar() else c
    }

    /**
     * Registers every string of [list] as a member of one group.
     *
     * Members are expected to be one or two characters long; only the first two characters
     * of a member are used to build the lookup key.
     *
     * @throws RuntimeException if [list] has no one-character member, or if a member is
     * already registered under a different group.
     */
    private fun addGroup(list: Array<String>) {
        val groupId = findGroupId(list)

        for (s in list) {
            val v1 = s[0].toInt()
            val map: SparseIntArray
            val key: Int
            if (s.length == 1) {
                map = map1
                key = v1
            } else {
                map = map2
                key = v1 or (s[1].toInt() shl 16)
            }

            val old = map.get(key)
            if (old != 0 && old != groupId) {
                throw RuntimeException("group conflict: $s")
            }
            map.put(key, groupId)
        }
    }

    /**
     * Walks a range of a character sequence and yields, one call at a time, either a group id,
     * a plain UTF-16 code unit, or [END]. Whitespace (see [isWhitespace]) is skipped.
     */
    inner class Tokenizer {

        internal var text: CharSequence = ""
        internal var end: Int = 0

        /** Index of the next character to be read. */
        var offset: Int = 0

        /** Points this tokenizer at `text[start until end]` and returns this instance. */
        internal fun reset(text: CharSequence, start: Int, end: Int): Tokenizer {
            this.text = text
            this.offset = start
            this.end = end
            return this
        }

        /**
         * Reads the next token.
         *
         * @return [END], a group id, or the UTF-16 code unit of an unregistered character.
         */
        operator fun next(): Int {
            var pos = offset

            // Skip leading whitespace.
            while (pos < end && isWhitespace(text[pos].toInt())) ++pos

            // Only whitespace was left: report END without moving offset, so trailing
            // whitespace is not counted as consumed.
            val remain = end - pos
            if (remain <= 0) return END

            val v1 = text[pos].toInt()

            // Prefer the longest registered member: try the two-character table first.
            if (remain >= 2) {
                val pairId = map2.get(v1 or (text[pos + 1].toInt() shl 16))
                if (pairId != 0) {
                    offset = pos + 2
                    return pairId
                }
            }

            // Either a registered single character or a plain character; both consume one.
            offset = pos + 1
            val singleId = map1.get(v1)
            return if (singleId != 0) singleId else v1
        }
    }

    /** Creates a new, empty [Tokenizer] bound to this instance's tables. */
    fun tokenizer(): Tokenizer {
        return Tokenizer()
    }

    init {
        // Digits: ASCII and full-width. All ten digits, 0 through 9.
        for (i in 0..9) {
            val ascii = '0' + i
            addGroup(arrayOf(ascii.toString(), (ascii + FULLWIDTH_OFFSET).toString()))
        }

        // Latin letters: lower/upper case, ASCII and full-width, all in one group per letter.
        for (i in 0..25) {
            val lower = 'a' + i
            val upper = 'A' + i
            addGroup(
                arrayOf(
                    lower.toString(),
                    upper.toString(),
                    (lower + FULLWIDTH_OFFSET).toString(),
                    (upper + FULLWIDTH_OFFSET).toString()
                )
            )
        }

        // Hyphens, dashes and prolonged sound marks.
        addGroup(
            arrayOf(
                "\u002D", // ASCII hyphen-minus
                "\u30FC", // full-width katakana prolonged sound mark (Shift_JIS origin)
                "\u2010",
                "\u2011",
                "\u2013",
                "\u2014",
                "\u2015", // full-width dash (Shift_JIS origin)
                "\u2212",
                "\uFF0D", // full-width hyphen-minus (MS932 origin)
                "\uFF70" // half-width katakana prolonged sound mark (MS932 origin)
            )
        )

        // ASCII punctuation paired with its full-width form.
        // ',', '.', '\\', '|', '-' and '~' have extra members and are registered separately.
        for (c in "!\"#\$%&'()*+/:;<=>?@[]^_`{}") {
            addGroup(arrayOf(c.toString(), (c + FULLWIDTH_OFFSET).toString()))
        }

        // Comma: ASCII, full-width, ideographic comma, half-width ideographic comma.
        addGroup(arrayOf(",", "\uFF0C", "\u3001", "\uFF64"))
        // Period: ASCII, full-width, ideographic full stop, half-width ideographic full stop.
        addGroup(arrayOf(".", "\uFF0E", "\u3002", "\uFF61"))
        // Backslash: ASCII, full-width, yen sign.
        addGroup(arrayOf("\\", "\uFF3C", "\u00A5"))
        // Vertical bar: ASCII, full-width, broken bar.
        addGroup(arrayOf("|", "\uFF5C", "\u00A6"))

        // Katakana middle dot and corner brackets: full-width and half-width.
        addGroup(arrayOf("\u30FB", "\uFF65"))
        addGroup(arrayOf("\u300C", "\uFF62"))
        addGroup(arrayOf("\u300D", "\uFF63"))

        // Tilde: ASCII, wave dash, full-width tilde.
        addGroup(arrayOf("~", "\u301C", "\uFF5E"))

        // Voiced / semi-voiced kana. The half-width form is two characters:
        // the base half-width kana followed by a half-width sound mark.
        addGroup(arrayOf("ガ", "が", "\uFF76$HW_DAKUTEN"))
        addGroup(arrayOf("ギ", "ぎ", "\uFF77$HW_DAKUTEN"))
        addGroup(arrayOf("グ", "ぐ", "\uFF78$HW_DAKUTEN"))
        addGroup(arrayOf("ゲ", "げ", "\uFF79$HW_DAKUTEN"))
        addGroup(arrayOf("ゴ", "ご", "\uFF7A$HW_DAKUTEN"))
        addGroup(arrayOf("ザ", "ざ", "\uFF7B$HW_DAKUTEN"))
        addGroup(arrayOf("ジ", "じ", "\uFF7C$HW_DAKUTEN"))
        addGroup(arrayOf("ズ", "ず", "\uFF7D$HW_DAKUTEN"))
        addGroup(arrayOf("ゼ", "ぜ", "\uFF7E$HW_DAKUTEN"))
        addGroup(arrayOf("ゾ", "ぞ", "\uFF7F$HW_DAKUTEN"))
        addGroup(arrayOf("ダ", "だ", "\uFF80$HW_DAKUTEN"))
        addGroup(arrayOf("ヂ", "ぢ", "\uFF81$HW_DAKUTEN"))
        addGroup(arrayOf("ヅ", "づ", "\uFF82$HW_DAKUTEN"))
        addGroup(arrayOf("デ", "で", "\uFF83$HW_DAKUTEN"))
        addGroup(arrayOf("ド", "ど", "\uFF84$HW_DAKUTEN"))
        addGroup(arrayOf("バ", "ば", "\uFF8A$HW_DAKUTEN"))
        addGroup(arrayOf("ビ", "び", "\uFF8B$HW_DAKUTEN"))
        addGroup(arrayOf("ブ", "ぶ", "\uFF8C$HW_DAKUTEN"))
        addGroup(arrayOf("ベ", "べ", "\uFF8D$HW_DAKUTEN"))
        addGroup(arrayOf("ボ", "ぼ", "\uFF8E$HW_DAKUTEN"))
        addGroup(arrayOf("パ", "ぱ", "\uFF8A$HW_HANDAKUTEN"))
        addGroup(arrayOf("ピ", "ぴ", "\uFF8B$HW_HANDAKUTEN"))
        addGroup(arrayOf("プ", "ぷ", "\uFF8C$HW_HANDAKUTEN"))
        addGroup(arrayOf("ペ", "ぺ", "\uFF8D$HW_HANDAKUTEN"))
        addGroup(arrayOf("ポ", "ぽ", "\uFF8E$HW_HANDAKUTEN"))
        // VU: katakana, hiragana U + full-width voiced sound mark (U+309B), half-width U + mark.
        addGroup(arrayOf("ヴ", "う\u309B", "\uFF73$HW_DAKUTEN"))

        // Plain kana: hiragana, katakana, half-width katakana (plus the small forms, if any).
        addGroup(arrayOf("あ", "ア", "\uFF71", "ぁ", "ァ", "\uFF67"))
        addGroup(arrayOf("い", "イ", "\uFF72", "ぃ", "ィ", "\uFF68"))
        addGroup(arrayOf("う", "ウ", "\uFF73", "ぅ", "ゥ", "\uFF69"))
        addGroup(arrayOf("え", "エ", "\uFF74", "ぇ", "ェ", "\uFF6A"))
        addGroup(arrayOf("お", "オ", "\uFF75", "ぉ", "ォ", "\uFF6B"))
        addGroup(arrayOf("か", "カ", "\uFF76"))
        addGroup(arrayOf("き", "キ", "\uFF77"))
        addGroup(arrayOf("く", "ク", "\uFF78"))
        addGroup(arrayOf("け", "ケ", "\uFF79"))
        addGroup(arrayOf("こ", "コ", "\uFF7A"))
        addGroup(arrayOf("さ", "サ", "\uFF7B"))
        addGroup(arrayOf("し", "シ", "\uFF7C"))
        addGroup(arrayOf("す", "ス", "\uFF7D"))
        addGroup(arrayOf("せ", "セ", "\uFF7E"))
        addGroup(arrayOf("そ", "ソ", "\uFF7F"))
        addGroup(arrayOf("た", "タ", "\uFF80"))
        addGroup(arrayOf("ち", "チ", "\uFF81"))
        addGroup(arrayOf("つ", "ツ", "\uFF82", "っ", "ッ", "\uFF6F"))
        addGroup(arrayOf("て", "テ", "\uFF83"))
        addGroup(arrayOf("と", "ト", "\uFF84"))
        addGroup(arrayOf("な", "ナ", "\uFF85"))
        addGroup(arrayOf("に", "ニ", "\uFF86"))
        addGroup(arrayOf("ぬ", "ヌ", "\uFF87"))
        addGroup(arrayOf("ね", "ネ", "\uFF88"))
        addGroup(arrayOf("の", "ノ", "\uFF89"))
        addGroup(arrayOf("は", "ハ", "\uFF8A"))
        addGroup(arrayOf("ひ", "ヒ", "\uFF8B"))
        addGroup(arrayOf("ふ", "フ", "\uFF8C"))
        addGroup(arrayOf("へ", "ヘ", "\uFF8D"))
        addGroup(arrayOf("ほ", "ホ", "\uFF8E"))
        addGroup(arrayOf("ま", "マ", "\uFF8F"))
        addGroup(arrayOf("み", "ミ", "\uFF90"))
        addGroup(arrayOf("む", "ム", "\uFF91"))
        addGroup(arrayOf("め", "メ", "\uFF92"))
        addGroup(arrayOf("も", "モ", "\uFF93"))
        addGroup(arrayOf("や", "ヤ", "\uFF94", "ゃ", "ャ", "\uFF6C"))
        addGroup(arrayOf("ゆ", "ユ", "\uFF95", "ゅ", "ュ", "\uFF6D"))
        addGroup(arrayOf("よ", "ヨ", "\uFF96", "ょ", "ョ", "\uFF6E"))
        addGroup(arrayOf("ら", "ラ", "\uFF97"))
        addGroup(arrayOf("り", "リ", "\uFF98"))
        addGroup(arrayOf("る", "ル", "\uFF99"))
        addGroup(arrayOf("れ", "レ", "\uFF9A"))
        addGroup(arrayOf("ろ", "ロ", "\uFF9B"))
        addGroup(arrayOf("わ", "ワ", "\uFF9C"))
        addGroup(arrayOf("を", "ヲ", "\uFF66"))
        addGroup(arrayOf("ん", "ン", "\uFF9D"))
    }
}