package jp.juggler.subwaytooter.util

import android.content.Context
import android.net.Uri
import android.text.Spannable
import android.text.SpannableStringBuilder
import android.text.Spanned

import java.util.ArrayList
import java.util.HashMap
import java.util.HashSet
import java.util.regex.Pattern

import jp.juggler.subwaytooter.App1
import jp.juggler.subwaytooter.Pref
import jp.juggler.subwaytooter.R
import jp.juggler.subwaytooter.api.entity.TootAccount
import jp.juggler.subwaytooter.api.entity.TootAttachmentLike
import jp.juggler.subwaytooter.api.entity.TootMention
import jp.juggler.subwaytooter.span.EmojiImageSpan
import jp.juggler.subwaytooter.span.HighlightSpan
import jp.juggler.subwaytooter.span.MyClickableSpan
import jp.juggler.subwaytooter.table.HighlightWord
import jp.juggler.subwaytooter.table.SavedAccount

/**
 * Small, forgiving HTML-to-Spannable converter.
 *
 * The input is split into text/tag tokens ([TokenParser]), assembled into a tree ([Node]),
 * and then flattened into a [SpannableStringBuilder]: character references are decoded,
 * `<a>` elements become [MyClickableSpan]s, `<br>` and block-level tags produce line breaks,
 * and the contents of `<style>` / `<script>` are dropped.
 */
object HTMLDecoder {

    private val log = LogCategory("HTMLDecoder")

    private const val DEBUG_HTML_PARSER = false

    // How a token relates to the tree: self-contained, opens a subtree, or closes one.
    private const val OPEN_TYPE_OPEN_CLOSE = 1
    private const val OPEN_TYPE_OPEN = 2
    private const val OPEN_TYPE_CLOSE = 3

    // Pseudo tag names. They contain "<>" so they can never collide with a tag matched by reTag.
    private const val TAG_TEXT = "<>text"
    private const val TAG_END = "<>end"

    private val reTag = Pattern.compile("<(/?)(\\w+)")
    private val reTagEnd = Pattern.compile("(/?)>$")
    private val reHref = Pattern.compile("\\bhref=\"([^\"]*)\"")

    // Tags after which encodeSpan guarantees a blank line. Filled by prepareTagInformation().
    private val block_tag = HashSet<String>()

    /** Fills [block_tag] on first use; later calls find the set non-empty and do nothing. */
    private fun prepareTagInformation() {
        synchronized(block_tag) {
            if (block_tag.isEmpty()) {
                block_tag.add("div")
                block_tag.add("p")
                block_tag.add("li")
                block_tag.add("h1")
                block_tag.add("h2")
                block_tag.add("h3")
                block_tag.add("h4")
                block_tag.add("h5")
                block_tag.add("body")
            }
        }
    }

    /** True for anything [CharacterGroup.isWhitespace] accepts, plus LF (0x0a) and CR (0x0d). */
    private fun isWhitespace(codepoint: Int): Boolean {
        return CharacterGroup.isWhitespace(codepoint) || when (codepoint) {
            0x0a, 0x0d -> true
            else -> false
        }
    }

    // group(1) is "#" for numeric references, group(2) is the name or the number.
    private val reEntity = Pattern.compile("&(#?)(\\w+);")

    // Named character references -> character. Filled by init1()..init3() in the init block below.
    private val entity_map = HashMap<String, Char>()

    private fun _addEntity(s: String, c: Char) {
        entity_map.put(s, c)
    }

    private fun chr(num: Int): Char {
        return num.toChar()
    }

    /**
     * Parses the number part of a numeric character reference.
     *
     * @param part the text between `&#` and `;`; never empty because [reEntity] requires `\w+`.
     *  A leading `x` or `X` selects hexadecimal, otherwise the value is decimal.
     * @return the code point, or null when [part] is not a number, overflows an Int,
     *  or is not a valid Unicode code point.
     */
    private fun parseNumericEntity(part: String): Int? {
        val first = part[0]
        val code = if (first == 'x' || first == 'X') {
            part.substring(1).toIntOrNull(16)
        } else {
            part.toIntOrNull(10)
        }
        return code?.takeIf { Character.isValidCodePoint(it) }
    }

    /**
     * Replaces character references (`&name;`, `&#123;`, `&#x1F600;`) in [src] with the
     * characters they stand for.
     *
     * Unknown names and malformed or out-of-range numbers are copied through unchanged.
     * Numeric references are appended as full code points, so values above U+FFFF become
     * a surrogate pair instead of being truncated to a single char.
     *
     * @return the decoded text; [src] itself when it contains nothing shaped like a reference.
     */
    fun decodeEntity(src: String): String {
        var sb: StringBuilder? = null
        val m = reEntity.matcher(src)
        var lastEnd = 0
        while (m.find()) {
            // The builder is only allocated once a candidate reference is actually found.
            val out = sb ?: StringBuilder(src.length).also { sb = it }
            val start = m.start()
            val end = m.end()

            // Copy the literal text between the previous match and this one.
            if (start > lastEnd) out.append(src, lastEnd, start)

            val part = m.group(2)
            if (m.group(1).isEmpty()) {
                val c = entity_map[part]
                if (c != null) out.append(c) else out.append(src, start, end)
            } else {
                val codePoint = parseNumericEntity(part)
                if (codePoint != null) out.appendCodePoint(codePoint) else out.append(src, start, end)
            }
            lastEnd = end
        }

        // Nothing matched at all: return the input untouched.
        val out = sb ?: return src

        if (src.length > lastEnd) out.append(src, lastEnd, src.length)
        return out.toString()
    }

    /**
     * Escapes the five characters that are significant in HTML text and attribute values:
     * `<`, `>`, `"`, `'` and `&`. Every other character is copied as is.
     */
    fun encodeEntity(src: String): String {
        val sb = StringBuilder(src.length + 16)
        for (c in src) {
            when (c) {
                '<' -> sb.append("&lt;")
                '>' -> sb.append("&gt;")
                '"' -> sb.append("&quot;")
                '\'' -> sb.append("&#039;")
                '&' -> sb.append("&amp;")
                else -> sb.append(c)
            }
        }
        return sb.toString()
    }

    //////////////////////////////////////////////////////////////////////////////////////

    /**
     * Pull-style tokenizer over [src]. After construction and after each [eat], the
     * properties [tag], [open_type] and [text] describe the current token.
     */
    private class TokenParser internal constructor(internal val src: String) {

        // Offset in src where the next token starts.
        internal var next: Int = 0
        // Lower-cased tag name, or TAG_TEXT / TAG_END.
        internal var tag: String = ""
        // One of the OPEN_TYPE_* constants.
        internal var open_type: Int = 0
        // Raw source of the current token: the text run, or the tag including its brackets.
        internal var text: String = ""

        init {
            this.next = 0
            eat()
        }

        /** Advances to the next token. At end of input the token is [TAG_END]. */
        internal fun eat() {
            // End of input?
            if (next >= src.length) {
                tag = TAG_END
                open_type = OPEN_TYPE_OPEN_CLOSE
                return
            }

            // Text run: everything up to the next '<' (or the end of input).
            var end = src.indexOf('<', next)
            if (end == -1) end = src.length
            if (end > next) {
                text = src.substring(next, end)
                tag = TAG_TEXT
                open_type = OPEN_TYPE_OPEN_CLOSE
                next = end
                return
            }

            // Tag: from '<' through the matching '>' (or to the end of input when unterminated).
            end = src.indexOf('>', next)
            if (end == -1) {
                end = src.length
            } else {
                ++end
            }
            text = src.substring(next, end)
            next = end

            val m = reTag.matcher(text)
            if (m.find()) {
                val isClose = m.group(1).isNotEmpty()
                tag = m.group(2).toLowerCase()

                val m2 = reTagEnd.matcher(text)
                val isOpenClose = m2.find() && m2.group(1).isNotEmpty()

                open_type = when {
                    isClose -> OPEN_TYPE_CLOSE
                    isOpenClose -> OPEN_TYPE_OPEN_CLOSE
                    else -> OPEN_TYPE_OPEN
                }
                // <br> never has children, whether or not it was written as <br/>.
                if (tag == "br") open_type = OPEN_TYPE_OPEN_CLOSE
            } else {
                // '<' that does not start a recognizable tag: treat the raw text as text.
                tag = TAG_TEXT
                open_type = OPEN_TYPE_OPEN_CLOSE
            }
        }
    }

    /** One node of the parsed tree: a tag (with its raw source in [text]) or a text run. */
    private class Node private constructor(
        internal val tag: String,
        internal val text: String
    ) {

        internal val child_nodes = ArrayList<Node>()

        // Entity-decoded value of the href="..." attribute in this node's raw tag text,
        // or null when there is none or it is empty.
        private val href: String?
            get() {
                val m = reHref.matcher(text)
                if (m.find()) {
                    val href = decodeEntity(m.group(1))
                    if (href.isNotEmpty()) {
                        return href
                    }
                }
                return null
            }

        /** Creates the root node. */
        internal constructor() : this("<>root", "")

        /** Creates a node from the tokenizer's current token. */
        internal constructor(t: TokenParser) : this(t.tag, t.text)

        /**
         * Consumes tokens from [t] as children of this node until a closing tag or the end
         * of input is reached. Closing tags are not matched by name: any closing tag ends
         * the current node.
         *
         * @param indent prefix used only for debug logging.
         */
        internal fun addChild(t: TokenParser, indent: String) {
            if (DEBUG_HTML_PARSER) log.d("parseChild: %s(%s", indent, tag)
            while (true) {
                if (TAG_END == t.tag) break
                if (OPEN_TYPE_CLOSE == t.open_type) {
                    t.eat()
                    break
                }

                val open_type = t.open_type
                val child = Node(t)
                child_nodes.add(child)
                t.eat()

                if (DEBUG_HTML_PARSER) log.d("parseChild: %s|%s %s [%s]", indent, child.tag, open_type, child.text)

                if (OPEN_TYPE_OPEN == open_type) {
                    child.addChild(t, indent + "--")
                }
            }
            if (DEBUG_HTML_PARSER) log.d("parseChild: %s)%s", indent, tag)
        }

        /**
         * Appends the rendering of this node and its subtree to [sb].
         *
         * @param context used for emoji decoding and link shortening; both are skipped when null.
         * @param account when non-null, `<a>` elements with an href get a [MyClickableSpan].
         * @param sb output buffer.
         * @param options decoding switches; `hasHighlight` and `highlight_sound` are written
         *  when a highlight word matches inside a link.
         */
        internal fun encodeSpan(
            context: Context?,
            account: LinkClickContext?,
            sb: SpannableStringBuilder,
            options: DecodeOptions
        ) {
            if (TAG_TEXT == tag) {
                if (context != null && options.decodeEmoji) {
                    sb.append(options.decodeEmoji(context, decodeEntity(text)))
                } else {
                    sb.append(decodeEntity(text))
                }
                return
            }

            if (DEBUG_HTML_PARSER) sb.append("(start ").append(tag).append(")")

            // Links are rendered into a scratch buffer so the text can be rewritten;
            // style/script content goes into a scratch buffer that is simply dropped.
            val sb_tmp: SpannableStringBuilder = if ("a" == tag || "style" == tag || "script" == tag) {
                SpannableStringBuilder()
            } else {
                sb
            }

            var start = sb_tmp.length

            if ("img" == tag) {
                // NOTE(review): appending an empty string is a no-op; a placeholder literal
                // may have been intended here. Confirm against the upstream source.
                sb_tmp.append("")
            } else {
                for (child in child_nodes) {
                    child.encodeSpan(context, account, sb_tmp, options)
                }
            }

            var end = sb_tmp.length

            if ("a" == tag) {
                start = sb.length
                if (context != null) {
                    sb.append(encodeUrl(options.short, context, sb_tmp.toString(), href, options.attachmentList))
                } else {
                    sb.append(sb_tmp.toString())
                }
                end = sb.length
            } else if (sb_tmp !== sb) {
                // style and script content is discarded
            }

            if (end > start && "a" == tag) {
                if (account != null) {
                    val link_url = href
                    if (link_url != null) {
                        val link_text = sb.subSequence(start, end).toString()
                        val span = MyClickableSpan(
                            account,
                            link_text,
                            link_url,
                            account.findAcctColor(link_url),
                            options.linkTag
                        )
                        sb.setSpan(span, start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE)
                    }
                }

                // Highlight colors are applied after the link span has been set.
                val list = options.highlight_trie?.matchList(sb, start, end)
                if (list != null) {
                    for (range in list) {
                        val word = HighlightWord.load(range.word)
                        if (word != null) {
                            options.hasHighlight = true
                            sb.setSpan(
                                HighlightSpan(word.color_fg, word.color_bg),
                                range.start,
                                range.end,
                                Spanned.SPAN_EXCLUSIVE_EXCLUSIVE
                            )
                            if (word.sound_type != HighlightWord.SOUND_TYPE_NONE) {
                                options.highlight_sound = word
                            }
                        }
                    }
                }
            }

            if (DEBUG_HTML_PARSER) sb.append("(end ").append(tag).append(")")

            if ("br" == tag) sb.append('\n')

            if (block_tag.contains(tag)) {
                if (sb.isNotEmpty()) {
                    // Count the line breaks in the trailing run of whitespace.
                    var last_br_count = 0
                    var index = sb.length - 1
                    while (index >= 0) {
                        val c = sb[index]
                        if (c == '\n') {
                            ++last_br_count
                        } else if (!Character.isWhitespace(c)) {
                            break
                        }
                        --index
                    }
                    // Top up to two trailing line breaks so a block ends with a blank line.
                    while (last_br_count < 2) {
                        sb.append('\n')
                        ++last_br_count
                    }
                }
            }
        }

        /** True when [url] is non-null and some attachment in [list_attachment] reports it via `hasUrl`. */
        internal fun is_media_attachment(list_attachment: ArrayList<TootAttachmentLike>?, url: String?): Boolean {
            if (url == null || list_attachment == null) return false
            for (a in list_attachment) {
                if (a.hasUrl(url)) return true
            }
            return false
        }

        /**
         * Chooses the text shown for an `<a>` element.
         *
         * @param bShort when true, http(s) link text is shortened.
         * @param display_url plain text content of the link.
         * @param href decoded href attribute, if any.
         * @param list_attachment attachments of the surrounding status, if any.
         * @return the display text; for links to a media attachment, the href covered by an image span.
         */
        private fun encodeUrl(
            bShort: Boolean,
            context: Context,
            display_url: String,
            href: String?,
            list_attachment: ArrayList<TootAttachmentLike>?
        ): CharSequence {
            if (!display_url.startsWith("http")) {
                if (display_url.startsWith("@") && href != null && App1.pref.getBoolean(Pref.KEY_MENTION_FULL_ACCT, false)) {
                    // Show the mention as a full acct, built from the account URL.
                    val m = TootAccount.reAccountUrl.matcher(href)
                    if (m.find()) {
                        return "@" + m.group(2) + "@" + m.group(1)
                    }
                }
                // Hashtags and mentions are otherwise left unchanged.
                return display_url
            }

            if (!bShort) {
                return display_url
            }

            if (is_media_attachment(list_attachment, href)) {
                val sb = SpannableStringBuilder()
                sb.append(href)
                sb.setSpan(
                    EmojiImageSpan(context, R.drawable.emj_1f5bc),
                    0,
                    sb.length,
                    Spanned.SPAN_EXCLUSIVE_EXCLUSIVE
                )
                return sb
            }

            try {
                val uri = Uri.parse(display_url)

                val sb = StringBuilder()
                // orEmpty(): a missing part must not show up as the literal text "null".
                sb.append(uri.authority.orEmpty())

                val q = uri.encodedQuery
                val f = uri.encodedFragment
                val remain = uri.encodedPath.orEmpty() +
                    (if (q == null) "" else "?" + q) +
                    (if (f == null) "" else "#" + f)

                // Keep at most 10 characters of path/query/fragment, then an ellipsis.
                if (remain.length > 10) {
                    sb.append(remain.substring(0, 10))
                    sb.append("…")
                } else {
                    sb.append(remain)
                }
                return sb
            } catch (ex: Throwable) {
                log.trace(ex)
                return display_url
            }
        }
    }

    /**
     * Converts an HTML fragment into styled text.
     *
     * Trailing whitespace is removed from the result. Any exception raised while parsing or
     * rendering is logged and whatever was built up to that point is returned.
     *
     * @param context used for emoji decoding and link shortening; both are skipped when null.
     * @param account when non-null, links become clickable spans.
     * @param src the HTML; null yields an empty result.
     * @param options decoding switches, also used to report highlight matches.
     */
    fun decodeHTML(
        context: Context?,
        account: LinkClickContext?,
        src: String?,
        options: DecodeOptions
    ): SpannableStringBuilder {

        prepareTagInformation()

        val sb = SpannableStringBuilder()

        try {
            if (src != null) {
                val tracker = TokenParser(src)
                val rootNode = Node()
                // addChild returns on every closing tag, so keep going until input is exhausted.
                while (TAG_END != tracker.tag) {
                    rootNode.addChild(tracker, "")
                }

                rootNode.encodeSpan(context, account, sb, options)

                var end = sb.length
                while (end > 0 && isWhitespace(sb[end - 1].toInt())) --end
                if (end < sb.length) {
                    sb.delete(end, sb.length)
                }
            }
        } catch (ex: Throwable) {
            log.trace(ex)
        }

        return sb
    }

    /**
     * Builds a space-separated line of clickable `@acct` mentions.
     *
     * @param access_info account used to expand accts and as the click context of each span.
     * @param src_list mentions to render.
     * @param link_tag opaque tag handed to every [MyClickableSpan].
     * @return the styled text, or null when [src_list] is null or empty.
     */
    fun decodeMentions(access_info: SavedAccount, src_list: ArrayList<TootMention>?, link_tag: Any?): Spannable? {
        if (src_list == null || src_list.isEmpty()) return null

        val sb = SpannableStringBuilder()
        for (item in src_list) {
            if (sb.isNotEmpty()) sb.append(" ")

            val start = sb.length
            sb.append('@')
            if (App1.pref.getBoolean(Pref.KEY_MENTION_FULL_ACCT, false)) {
                sb.append(access_info.getFullAcct(item.acct))
            } else {
                sb.append(item.acct)
            }
            val end = sb.length

            val url = item.url
            if (end > start) {
                val link_text = sb.subSequence(start, end).toString()
                val span = MyClickableSpan(access_info, link_text, url, access_info.findAcctColor(item.url), link_tag)
                sb.setSpan(span, start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE)
            }
        }
        return sb
    }

    // The named-entity table is registered by three separate functions, init1()..init3().

    private fun init1() {
        _addEntity("amp", '&') // ampersand
        _addEntity("gt", '>') // greater than
        _addEntity("lt", '<') // less than
        _addEntity("quot", '"') // double quote
        _addEntity("apos", '\'') // single quote
        _addEntity("AElig", chr(198)) // capital AE diphthong (ligature)
        _addEntity("Aacute", chr(193)) // capital A, acute accent
        _addEntity("Acirc", chr(194)) // capital A, circumflex accent
        _addEntity("Agrave", chr(192)) // capital A, grave accent
        _addEntity("Aring", chr(197)) // capital A, ring
        _addEntity("Atilde", chr(195)) // capital A, tilde
        _addEntity("Auml", chr(196)) // capital A, dieresis or umlaut mark
        _addEntity("Ccedil", chr(199)) // capital C, cedilla
        _addEntity("ETH", chr(208)) // capital Eth, Icelandic
        _addEntity("Eacute", chr(201)) // capital E, acute accent
        _addEntity("Ecirc", chr(202)) // capital E, circumflex accent
        _addEntity("Egrave", chr(200)) // capital E, grave accent
        _addEntity("Euml", chr(203)) // capital E, dieresis or umlaut mark
        _addEntity("Iacute", chr(205)) // capital I, acute accent
        _addEntity("Icirc", chr(206)) // capital I, circumflex accent
        _addEntity("Igrave", chr(204)) // capital I, grave accent
        _addEntity("Iuml", chr(207)) // capital I, dieresis or umlaut mark
        _addEntity("Ntilde", chr(209)) // capital N, tilde
        _addEntity("Oacute", chr(211)) // capital O, acute accent
        _addEntity("Ocirc", chr(212)) // capital O, circumflex accent
        _addEntity("Ograve", chr(210)) // capital O, grave accent
        _addEntity("Oslash", chr(216)) // capital O, slash
        _addEntity("Otilde", chr(213)) // capital O, tilde
        _addEntity("Ouml", chr(214)) // capital O, dieresis or umlaut mark
        _addEntity("THORN", chr(222)) // capital THORN, Icelandic
        _addEntity("Uacute", chr(218)) // capital U, acute accent
        _addEntity("Ucirc", chr(219)) // capital U, circumflex accent
        _addEntity("Ugrave", chr(217)) // capital U, grave accent
        _addEntity("Uuml", chr(220)) // capital U, dieresis or umlaut mark
        _addEntity("Yacute", chr(221)) // capital Y, acute accent
        _addEntity("aacute", chr(225)) // small a, acute accent
        _addEntity("acirc", chr(226)) // small a, circumflex accent
        _addEntity("aelig", chr(230)) // small ae diphthong (ligature)
        _addEntity("agrave", chr(224)) // small a, grave accent
        _addEntity("aring", chr(229)) // small a, ring
        _addEntity("atilde", chr(227)) // small a, tilde
        _addEntity("auml", chr(228)) // small a, dieresis or umlaut mark
        _addEntity("ccedil", chr(231)) // small c, cedilla
        _addEntity("eacute", chr(233)) // small e, acute accent
        _addEntity("ecirc", chr(234)) // small e, circumflex accent
        _addEntity("egrave", chr(232)) // small e, grave accent
        _addEntity("eth", chr(240)) // small eth, Icelandic
        _addEntity("euml", chr(235)) // small e, dieresis or umlaut mark
        _addEntity("iacute", chr(237)) // small i, acute accent
        _addEntity("icirc", chr(238)) // small i, circumflex accent
        _addEntity("igrave", chr(236)) // small i, grave accent
        _addEntity("iuml", chr(239)) // small i, dieresis or umlaut mark
        _addEntity("ntilde", chr(241)) // small n, tilde
        _addEntity("oacute", chr(243)) // small o, acute accent
        _addEntity("ocirc", chr(244)) // small o, circumflex accent
        _addEntity("ograve", chr(242)) // small o, grave accent
        _addEntity("oslash", chr(248)) // small o, slash
        _addEntity("otilde", chr(245)) // small o, tilde
        _addEntity("ouml", chr(246)) // small o, dieresis or umlaut mark
        _addEntity("szlig", chr(223)) // small sharp s, German (sz ligature)
        _addEntity("thorn", chr(254)) // small thorn, Icelandic
        _addEntity("uacute", chr(250)) // small u, acute accent
        _addEntity("ucirc", chr(251)) // small u, circumflex accent
        _addEntity("ugrave", chr(249)) // small u, grave accent
        _addEntity("uuml", chr(252)) // small u, dieresis or umlaut mark
        _addEntity("yacute", chr(253)) // small y, acute accent
        _addEntity("yuml", chr(255)) // small y, dieresis or umlaut mark
        _addEntity("copy", chr(169)) // copyright sign
        _addEntity("reg", chr(174)) // registered sign
        _addEntity("nbsp", chr(160)) // non breaking space
        _addEntity("iexcl", chr(161))
        _addEntity("cent", chr(162))
        _addEntity("pound", chr(163))
        _addEntity("curren", chr(164))
        _addEntity("yen", chr(165))
        _addEntity("brvbar", chr(166))
        _addEntity("sect", chr(167))
        _addEntity("uml", chr(168))
        _addEntity("ordf", chr(170))
        _addEntity("laquo", chr(171))
        _addEntity("not", chr(172))
        _addEntity("shy", chr(173))
        _addEntity("macr", chr(175))
        _addEntity("deg", chr(176))
        _addEntity("plusmn", chr(177))
        _addEntity("sup1", chr(185))
        _addEntity("sup2", chr(178))
        _addEntity("sup3", chr(179))
        _addEntity("acute", chr(180))
        _addEntity("micro", chr(181))
        _addEntity("para", chr(182))
        _addEntity("middot", chr(183))
        _addEntity("cedil", chr(184))
        _addEntity("ordm", chr(186))
        _addEntity("raquo", chr(187))
        _addEntity("frac14", chr(188))
        _addEntity("frac12", chr(189))
        _addEntity("frac34", chr(190))
        _addEntity("iquest", chr(191))
        _addEntity("times", chr(215))
    }

    private fun init2() {
        _addEntity("divide", chr(247))
        _addEntity("OElig", chr(338))
        _addEntity("oelig", chr(339))
        _addEntity("Scaron", chr(352))
        _addEntity("scaron", chr(353))
        _addEntity("Yuml", chr(376))
        _addEntity("fnof", chr(402))
        _addEntity("circ", chr(710))
        _addEntity("tilde", chr(732))
        _addEntity("Alpha", chr(913))
        _addEntity("Beta", chr(914))
        _addEntity("Gamma", chr(915))
        _addEntity("Delta", chr(916))
        _addEntity("Epsilon", chr(917))
        _addEntity("Zeta", chr(918))
        _addEntity("Eta", chr(919))
        _addEntity("Theta", chr(920))
        _addEntity("Iota", chr(921))
        _addEntity("Kappa", chr(922))
        _addEntity("Lambda", chr(923))
        _addEntity("Mu", chr(924))
        _addEntity("Nu", chr(925))
        _addEntity("Xi", chr(926))
        _addEntity("Omicron", chr(927))
        _addEntity("Pi", chr(928))
        _addEntity("Rho", chr(929))
        _addEntity("Sigma", chr(931))
        _addEntity("Tau", chr(932))
        _addEntity("Upsilon", chr(933))
        _addEntity("Phi", chr(934))
        _addEntity("Chi", chr(935))
        _addEntity("Psi", chr(936))
        _addEntity("Omega", chr(937))
        _addEntity("alpha", chr(945))
        _addEntity("beta", chr(946))
        _addEntity("gamma", chr(947))
        _addEntity("delta", chr(948))
        _addEntity("epsilon", chr(949))
        _addEntity("zeta", chr(950))
        _addEntity("eta", chr(951))
        _addEntity("theta", chr(952))
        _addEntity("iota", chr(953))
        _addEntity("kappa", chr(954))
        _addEntity("lambda", chr(955))
        _addEntity("mu", chr(956))
        _addEntity("nu", chr(957))
        _addEntity("xi", chr(958))
        _addEntity("omicron", chr(959))
        _addEntity("pi", chr(960))
        _addEntity("rho", chr(961))
        _addEntity("sigmaf", chr(962))
        _addEntity("sigma", chr(963))
        _addEntity("tau", chr(964))
        _addEntity("upsilon", chr(965))
        _addEntity("phi", chr(966))
        _addEntity("chi", chr(967))
        _addEntity("psi", chr(968))
        _addEntity("omega", chr(969))
        _addEntity("thetasym", chr(977))
        _addEntity("upsih", chr(978))
        _addEntity("piv", chr(982))
        _addEntity("ensp", chr(8194))
        _addEntity("emsp", chr(8195))
        _addEntity("thinsp", chr(8201))
        _addEntity("zwnj", chr(8204))
        _addEntity("zwj", chr(8205))
        _addEntity("lrm", chr(8206))
        _addEntity("rlm", chr(8207))
        _addEntity("ndash", chr(8211))
        _addEntity("mdash", chr(8212))
        _addEntity("lsquo", chr(8216))
        _addEntity("rsquo", chr(8217))
        _addEntity("sbquo", chr(8218))
        _addEntity("ldquo", chr(8220))
        _addEntity("rdquo", chr(8221))
        _addEntity("bdquo", chr(8222))
        _addEntity("dagger", chr(8224))
        _addEntity("Dagger", chr(8225))
        _addEntity("bull", chr(8226))
        _addEntity("hellip", chr(8230))
        _addEntity("permil", chr(8240))
        _addEntity("prime", chr(8242))
        _addEntity("Prime", chr(8243))
        _addEntity("lsaquo", chr(8249))
        _addEntity("rsaquo", chr(8250))
        _addEntity("oline", chr(8254))
        _addEntity("frasl", chr(8260))
        _addEntity("euro", chr(8364))
        _addEntity("image", chr(8465))
        _addEntity("weierp", chr(8472))
        _addEntity("real", chr(8476))
        _addEntity("trade", chr(8482))
        _addEntity("alefsym", chr(8501))
        _addEntity("larr", chr(8592))
        _addEntity("uarr", chr(8593))
        _addEntity("rarr", chr(8594))
        _addEntity("darr", chr(8595))
        _addEntity("harr", chr(8596))
        _addEntity("crarr", chr(8629))
        _addEntity("lArr", chr(8656))
    }

    private fun init3() {
        _addEntity("uArr", chr(8657))
        _addEntity("rArr", chr(8658))
        _addEntity("dArr", chr(8659))
        _addEntity("hArr", chr(8660))
        _addEntity("forall", chr(8704))
        _addEntity("part", chr(8706))
        _addEntity("exist", chr(8707))
        _addEntity("empty", chr(8709))
        _addEntity("nabla", chr(8711))
        _addEntity("isin", chr(8712))
        _addEntity("notin", chr(8713))
        _addEntity("ni", chr(8715))
        _addEntity("prod", chr(8719))
        _addEntity("sum", chr(8721))
        _addEntity("minus", chr(8722))
        _addEntity("lowast", chr(8727))
        _addEntity("radic", chr(8730))
        _addEntity("prop", chr(8733))
        _addEntity("infin", chr(8734))
        _addEntity("ang", chr(8736))
        _addEntity("and", chr(8743))
        _addEntity("or", chr(8744))
        _addEntity("cap", chr(8745))
        _addEntity("cup", chr(8746))
        _addEntity("int", chr(8747))
        _addEntity("there4", chr(8756))
        _addEntity("sim", chr(8764))
        _addEntity("cong", chr(8773))
        _addEntity("asymp", chr(8776))
        _addEntity("ne", chr(8800))
        _addEntity("equiv", chr(8801))
        _addEntity("le", chr(8804))
        _addEntity("ge", chr(8805))
        _addEntity("sub", chr(8834))
        _addEntity("sup", chr(8835))
        _addEntity("nsub", chr(8836))
        _addEntity("sube", chr(8838))
        _addEntity("supe", chr(8839))
        _addEntity("oplus", chr(8853))
        _addEntity("otimes", chr(8855))
        _addEntity("perp", chr(8869))
        _addEntity("sdot", chr(8901))
        _addEntity("lceil", chr(8968))
        _addEntity("rceil", chr(8969))
        _addEntity("lfloor", chr(8970))
        _addEntity("rfloor", chr(8971))
        _addEntity("lang", chr(9001))
        _addEntity("rang", chr(9002))
        _addEntity("loz", chr(9674))
        _addEntity("spades", chr(9824))
        _addEntity("clubs", chr(9827))
        _addEntity("hearts", chr(9829))
        _addEntity("diams", chr(9830))
    }

    // Must stay below the declaration of entity_map: object initializers run in textual order.
    init {
        init1()
        init2()
        init3()
    }
}