package jp.juggler.util

import android.text.SpannableStringBuilder
import android.util.SparseBooleanArray
import android.util.SparseIntArray
import java.util.ArrayList
import java.util.regex.Pattern

/**
 * Simplified character unification used for loose text comparison.
 *
 * Characters that should compare equal (ASCII and fullwidth forms, hiragana / katakana /
 * halfwidth katakana, the various hyphens, ...) are registered as a "group". Every member of a
 * group maps to the same group id, which is the smallest UTF-16 code unit among the
 * single-character members of that group.
 *
 * Width variants are written with code point offsets or `\u` escapes on purpose: a tool that
 * applies Unicode compatibility normalization to this file would otherwise silently fold the
 * variants into their canonical form and empty the table.
 */
object CharacterGroup {

    /** Returned by [Tokenizer.next] when the end of the range has been reached. */
    const val END = -1

    /** Distance from an ASCII character (U+0021..U+007E) to its fullwidth form (U+FF01..U+FF5E). */
    private const val FULLWIDTH_OFFSET = 0xFEE0

    /** Distance from a hiragana (U+3041..U+3096) to the corresponding katakana (U+30A1..U+30F6). */
    private const val KATAKANA_OFFSET = 0x60

    /** Makes a one-character string from a UTF-16 code unit. */
    private fun codeToString(code: Int): String = code.toChar().toString()

    /** Set of UTF-16 code units that are treated as whitespace. */
    private val mapWhitespace = SparseBooleanArray().apply {
        intArrayOf(
            0x0009, // HORIZONTAL TABULATION
            0x000A, // LINE FEED
            0x000B, // VERTICAL TABULATION
            0x000C, // FORM FEED
            0x000D, // CARRIAGE RETURN
            0x001C, // FILE SEPARATOR
            0x001D, // GROUP SEPARATOR
            0x001E, // RECORD SEPARATOR
            0x001F, // UNIT SEPARATOR
            0x0020,
            0x0085, // next line (latin-1)
            0x00A0, // non-breaking space
            0x1680,
            0x180E,
            0x2000,
            0x2001,
            0x2002,
            0x2003,
            0x2004,
            0x2005,
            0x2006,
            0x2007, // non-breaking space
            0x2008,
            0x2009,
            0x200A,
            0x200B,
            0x200C,
            0x200D,
            0x2028, // line separator
            0x2029, // paragraph separator
            0x202F, // non-breaking space
            0x205F,
            0x2060,
            0x3000,
            0x3164,
            0xFEFF
        ).forEach { put(it, true) }
    }

    /** Returns true if the code point [cp] is treated as whitespace. */
    fun isWhitespace(cp: Int): Boolean = mapWhitespace.get(cp, false)

    /** Keys of the array in ascending order. */
    private fun SparseBooleanArray.keys() = (0 until size()).map { keyAt(it) }

    /** All whitespace characters as one `\Q...\E` quoted string, ready to be put inside `[...]`. */
    private fun quotedWhitespaceChars(): String =
        Pattern.quote(mapWhitespace.keys().map { it.toChar() }.joinToString(""))

    /** Matches one or more consecutive whitespace characters. */
    internal val reWhitespace by lazy {
        val quotedKeys = quotedWhitespaceChars()
        "[$quotedKeys]+".asciiPattern()
    }

    /** Matches one or more consecutive characters that are not whitespace. */
    internal val reNotWhitespace by lazy {
        val quotedKeys = quotedWhitespaceChars()
        "[^$quotedKeys]+".asciiPattern()
    }

    /** Matches a run of whitespace (other than line feed) that is directly followed by a line feed. */
    internal val reWhitespaceBeforeLineFeed by lazy {
        val whitespaces = mapWhitespace.keys()
            .map { it.toChar() }
            .filter { it != '\n' }
            .joinToString("")
        "[$whitespaces]+\n".asciiPattern()
    }

    /**
     * Determines the group id for [list]: the smallest UTF-16 code unit among the members whose
     * length is exactly 1.
     *
     * @throws IllegalStateException if the group has no single-character member.
     */
    private fun findGroupId(list: Array<String>): Int =
        list.filter { it.length == 1 }
            .minOfOrNull { it[0].code }
            ?: error("missing group id")

    // Lookup tables from text to group id. 0 means "not registered".
    // One character: UTF-16 code unit => group id.
    private val map1 = SparseIntArray()

    // Two characters: (first or (second shl 16)) => group id.
    // Used for halfwidth katakana followed by a voiced / semi-voiced sound mark, etc.
    private val map2 = SparseIntArray()

    /**
     * Returns the unified form of [c], or [c] itself when it belongs to no group.
     * This is a simplified normalization and does not cover every character.
     */
    fun getUnifiedCharacter(c: Char): Char {
        val groupId = map1[c.code]
        return if (groupId != 0) groupId.toChar() else c
    }

    /**
     * Registers every string of [list] (length 1 or 2) as a member of the same group.
     *
     * @throws IllegalStateException if a member is already registered in a different group,
     * or if the group has no single-character member.
     */
    private fun addGroup(list: Array<String>) {
        val groupId = findGroupId(list)
        for (s in list) {
            val first = s[0].code
            val (map, key) = if (s.length == 1) {
                map1 to first
            } else {
                map2 to (first or (s[1].code shl 16))
            }
            val old = map.get(key)
            if (old != 0 && old != groupId) error("group conflict: $s")
            map.put(key, groupId)
        }
    }

    /** An ASCII character, its fullwidth form, and any [extra] members of the same group. */
    private fun asciiForms(ascii: Char, vararg extra: String): Array<String> =
        arrayOf(ascii.toString(), (ascii + FULLWIDTH_OFFSET).toString(), *extra)

    /** A hiragana, the corresponding katakana, and its halfwidth katakana spelling [halfWidth]. */
    private fun kana(hiragana: Char, halfWidth: String): Array<String> =
        arrayOf(hiragana.toString(), (hiragana + KATAKANA_OFFSET).toString(), halfWidth)

    /**
     * Enumerates, in order, the characters / group ids found in a range of a text,
     * skipping whitespace, and finally [END].
     */
    class Tokenizer {

        internal var text: CharSequence = ""
        internal var end: Int = 0
        var offset: Int = 0

        /** Starts tokenizing [text] from index [start] (inclusive) to [end] (exclusive). */
        internal fun reset(text: CharSequence, start: Int, end: Int): Tokenizer {
            this.text = text
            this.offset = start
            this.end = end
            return this
        }

        /**
         * Reads the next token and advances [offset] past it.
         *
         * @return [END], a group id, or the UTF-16 code unit of an unregistered character.
         * When [END] is returned, [offset] is left unchanged so that trailing whitespace
         * is not counted as consumed.
         */
        operator fun next(): Int {
            var pos = offset

            // skip leading whitespace
            while (pos < end && isWhitespace(text[pos].code)) ++pos

            val remain = end - pos
            if (remain <= 0) return END

            val first = text[pos].code

            // Longest match first: try the two-character table, then the one-character table.
            if (remain >= 2) {
                val pairId = map2.get(first or (text[pos + 1].code shl 16))
                if (pairId != 0) {
                    offset = pos + 2
                    return pairId
                }
            }

            offset = pos + 1
            val singleId = map1.get(first)
            return if (singleId != 0) singleId else first
        }
    }

    init {
        // Digits. The range includes 9: it used to stop at 8, which left '9' and the
        // fullwidth nine in different groups.
        for (i in 0..9) {
            addGroup(asciiForms('0' + i))
        }

        // Latin letters: lower case, upper case, and the fullwidth form of each.
        for (i in 0..25) {
            addGroup(asciiForms('a' + i) + asciiForms('A' + i))
        }

        // Hyphens
        addGroup(
            arrayOf(
                codeToString(0x002D), // ASCII hyphen
                codeToString(0x30FC), // katakana prolonged sound mark (from Shift_JIS)
                codeToString(0x2010),
                codeToString(0x2011),
                codeToString(0x2013),
                codeToString(0x2014),
                codeToString(0x2015), // horizontal bar (from Shift_JIS)
                codeToString(0x2212),
                codeToString(0xFF0D), // fullwidth hyphen-minus (from MS932)
                codeToString(0xFF70) // halfwidth katakana prolonged sound mark (from MS932)
            )
        )

        // ASCII punctuation that is only paired with its fullwidth form.
        for (c in "!\"#\$%&'()*+/:;<=>?@[]^_`{}") {
            addGroup(asciiForms(c))
        }

        // comma: ASCII, fullwidth, ideographic comma, halfwidth ideographic comma
        addGroup(asciiForms(',', "\u3001", "\uFF64"))
        // full stop: ASCII, fullwidth, ideographic full stop, halfwidth ideographic full stop
        addGroup(asciiForms('.', "\u3002", "\uFF61"))

        // NOTE(review): the yen sign and the broken bar are registered in both their Latin-1
        // and fullwidth forms because the width of the original literal could not be
        // determined; confirm against project history if only one form is wanted.
        addGroup(asciiForms('\\', "\u00A5", "\uFFE5"))
        addGroup(asciiForms('|', "\u00A6", "\uFFE4"))

        // katakana middle dot and corner brackets, with their halfwidth forms
        addGroup(arrayOf("\u30FB", "\uFF65"))
        addGroup(arrayOf("\u300C", "\uFF62"))
        addGroup(arrayOf("\u300D", "\uFF63"))

        // Tilde
        addGroup(
            arrayOf(
                "~",
                codeToString(0x301C),
                codeToString(0xFF5E)
            )
        )

        // Voiced / semi-voiced kana. In halfwidth katakana these take two characters:
        // the base letter followed by U+FF9E (voiced) or U+FF9F (semi-voiced).
        addGroup(kana('が', "\uFF76\uFF9E"))
        addGroup(kana('ぎ', "\uFF77\uFF9E"))
        addGroup(kana('ぐ', "\uFF78\uFF9E"))
        addGroup(kana('げ', "\uFF79\uFF9E"))
        addGroup(kana('ご', "\uFF7A\uFF9E"))
        addGroup(kana('ざ', "\uFF7B\uFF9E"))
        addGroup(kana('じ', "\uFF7C\uFF9E"))
        addGroup(kana('ず', "\uFF7D\uFF9E"))
        addGroup(kana('ぜ', "\uFF7E\uFF9E"))
        addGroup(kana('ぞ', "\uFF7F\uFF9E"))
        addGroup(kana('だ', "\uFF80\uFF9E"))
        addGroup(kana('ぢ', "\uFF81\uFF9E"))
        addGroup(kana('づ', "\uFF82\uFF9E"))
        addGroup(kana('で', "\uFF83\uFF9E"))
        addGroup(kana('ど', "\uFF84\uFF9E"))
        addGroup(kana('ば', "\uFF8A\uFF9E"))
        addGroup(kana('び', "\uFF8B\uFF9E"))
        addGroup(kana('ぶ', "\uFF8C\uFF9E"))
        addGroup(kana('べ', "\uFF8D\uFF9E"))
        addGroup(kana('ぼ', "\uFF8E\uFF9E"))
        addGroup(kana('ぱ', "\uFF8A\uFF9F"))
        addGroup(kana('ぴ', "\uFF8B\uFF9F"))
        addGroup(kana('ぷ', "\uFF8C\uFF9F"))
        addGroup(kana('ぺ', "\uFF8D\uFF9F"))
        addGroup(kana('ぽ', "\uFF8E\uFF9F"))
        // katakana VU, hiragana U + U+309B voiced sound mark, halfwidth U + voiced mark
        addGroup(arrayOf("\u30F4", "\u3046\u309B", "\uFF73\uFF9E"))

        // Plain kana. Small letters share the group of the corresponding full-size letter.
        addGroup(kana('あ', "\uFF71") + kana('ぁ', "\uFF67"))
        addGroup(kana('い', "\uFF72") + kana('ぃ', "\uFF68"))
        addGroup(kana('う', "\uFF73") + kana('ぅ', "\uFF69"))
        addGroup(kana('え', "\uFF74") + kana('ぇ', "\uFF6A"))
        addGroup(kana('お', "\uFF75") + kana('ぉ', "\uFF6B"))
        addGroup(kana('か', "\uFF76"))
        addGroup(kana('き', "\uFF77"))
        addGroup(kana('く', "\uFF78"))
        addGroup(kana('け', "\uFF79"))
        addGroup(kana('こ', "\uFF7A"))
        addGroup(kana('さ', "\uFF7B"))
        addGroup(kana('し', "\uFF7C"))
        addGroup(kana('す', "\uFF7D"))
        addGroup(kana('せ', "\uFF7E"))
        addGroup(kana('そ', "\uFF7F"))
        addGroup(kana('た', "\uFF80"))
        addGroup(kana('ち', "\uFF81"))
        addGroup(kana('つ', "\uFF82") + kana('っ', "\uFF6F"))
        addGroup(kana('て', "\uFF83"))
        addGroup(kana('と', "\uFF84"))
        addGroup(kana('な', "\uFF85"))
        addGroup(kana('に', "\uFF86"))
        addGroup(kana('ぬ', "\uFF87"))
        addGroup(kana('ね', "\uFF88"))
        addGroup(kana('の', "\uFF89"))
        addGroup(kana('は', "\uFF8A"))
        addGroup(kana('ひ', "\uFF8B"))
        addGroup(kana('ふ', "\uFF8C"))
        addGroup(kana('へ', "\uFF8D"))
        addGroup(kana('ほ', "\uFF8E"))
        addGroup(kana('ま', "\uFF8F"))
        addGroup(kana('み', "\uFF90"))
        addGroup(kana('む', "\uFF91"))
        addGroup(kana('め', "\uFF92"))
        addGroup(kana('も', "\uFF93"))
        addGroup(kana('や', "\uFF94") + kana('ゃ', "\uFF6C"))
        addGroup(kana('ゆ', "\uFF95") + kana('ゅ', "\uFF6D"))
        addGroup(kana('よ', "\uFF96") + kana('ょ', "\uFF6E"))
        addGroup(kana('ら', "\uFF97"))
        addGroup(kana('り', "\uFF98"))
        addGroup(kana('る', "\uFF99"))
        addGroup(kana('れ', "\uFF9A"))
        addGroup(kana('ろ', "\uFF9B"))
        addGroup(kana('わ', "\uFF9C"))
        addGroup(kana('を', "\uFF66"))
        addGroup(kana('ん', "\uFF9D"))
    }
}

/**
 * Removes trailing whitespace and line breaks, as defined by [CharacterGroup.isWhitespace].
 *
 * @return the receiver, for chaining.
 */
fun SpannableStringBuilder.removeEndWhitespaces(): SpannableStringBuilder {
    var pos = length
    while (pos > 0 && CharacterGroup.isWhitespace(codePointBefore(pos))) {
        // every registered whitespace is a single UTF-16 unit, so stepping back by one is enough
        --pos
    }
    if (pos < length) delete(pos, length)
    return this
}

/**
 * Removes whitespace at the end of each line and limits runs of consecutive line feeds to two.
 *
 * @return the receiver, for chaining.
 */
fun SpannableStringBuilder.neatSpaces(): SpannableStringBuilder {
    // Collect every "whitespace run + line feed" match first, then delete from the back so
    // that earlier match positions stay valid. The line feed itself (last char) is kept.
    val m = CharacterGroup.reWhitespaceBeforeLineFeed.matcher(this)
    val matchList = ArrayList<Pair<Int, Int>>()
    while (m.find()) {
        matchList.add(Pair(m.start(), m.end()))
    }
    for ((start, end) in matchList.reversed()) {
        delete(start, end - 1)
    }

    // Walk backwards and drop the third and any further line feed of a consecutive run.
    var previousBrCount = 0
    for (i in this.indices.reversed()) {
        if (this[i] != '\n') {
            previousBrCount = 0
        } else if (++previousBrCount >= 3) {
            delete(i, i + 1)
        }
    }
    return this
}