SubwayTooter-Android-App/app/src/main/java/jp/juggler/subwaytooter/util/HTMLDecoder.kt

742 lines
22 KiB
Kotlin

package jp.juggler.subwaytooter.util
import android.content.Context
import android.net.Uri
import android.text.Spannable
import android.text.SpannableStringBuilder
import android.text.Spanned
import java.util.ArrayList
import java.util.HashMap
import java.util.HashSet
import java.util.Locale
import java.util.regex.Pattern
import jp.juggler.subwaytooter.App1
import jp.juggler.subwaytooter.Pref
import jp.juggler.subwaytooter.R
import jp.juggler.subwaytooter.api.entity.TootAccount
import jp.juggler.subwaytooter.api.entity.TootAttachmentLike
import jp.juggler.subwaytooter.api.entity.TootMention
import jp.juggler.subwaytooter.span.EmojiImageSpan
import jp.juggler.subwaytooter.span.HighlightSpan
import jp.juggler.subwaytooter.span.MyClickableSpan
import jp.juggler.subwaytooter.table.HighlightWord
import jp.juggler.subwaytooter.table.SavedAccount
object HTMLDecoder {
// Logger used by the parser and the decoding helpers in this object.
private val log = LogCategory("HTMLDecoder")
// When true, the parser logs every token it consumes and encodeSpan writes
// "(start tag)" / "(end tag)" markers into the generated text.
private const val DEBUG_HTML_PARSER = false
// Token kinds stored in TokenParser.open_type:
// a self-contained token (text, end marker, "<br>", or a tag ending in "/>"),
private const val OPEN_TYPE_OPEN_CLOSE = 1
// an opening tag whose following tokens become its children,
private const val OPEN_TYPE_OPEN = 2
// and a closing tag.
private const val OPEN_TYPE_CLOSE = 3
// Pseudo tag names: a run of text, and the end of the input.
private const val TAG_TEXT = "<>text"
private const val TAG_END = "<>end"
// Matches the start of a tag: group 1 is "/" for a closing tag, group 2 is the tag name.
private val reTag = Pattern.compile("<(/?)(\\w+)")
// Matches the end of a tag: group 1 is "/" when the tag is written as "<.../>".
private val reTagEnd = Pattern.compile("(/?)>$")
// Extracts the value of a double-quoted href attribute (group 1).
private val reHref = Pattern.compile("\\bhref=\"([^\"]*)\"")
// Names of tags after which encodeSpan makes sure the text ends with two newlines.
// Starts empty; filled by prepareTagInformation().
private val block_tag = HashSet<String>()
/**
 * Fills [block_tag] with the names of the block-level tags the first time it is called.
 *
 * The set is used as its own lock, so concurrent first calls populate it only once.
 * All six HTML heading levels are registered (`h6` used to be missing, which left
 * `<h6>` content without the paragraph break the other headings receive).
 */
private fun prepareTagInformation() {
	synchronized(block_tag) {
		if(block_tag.isEmpty()) {
			block_tag.addAll(
				listOf("div", "p", "li", "h1", "h2", "h3", "h4", "h5", "h6", "body")
			)
		}
	}
}
/**
 * Tells whether [codepoint] should be trimmed as whitespace: anything
 * [CharacterGroup.isWhitespace] accepts, plus line feed (0x0a) and carriage return (0x0d).
 */
private fun isWhitespace(codepoint : Int) : Boolean {
	if(CharacterGroup.isWhitespace(codepoint)) return true
	return codepoint == 0x0a || codepoint == 0x0d
}
// Matches a character reference: group 1 is "#" for numeric references,
// group 2 is the entity name or the number (optionally prefixed with "x").
private val reEntity = Pattern.compile("&(#?)(\\w+);")
// Named entity -> character table, filled by the init block through _addEntity().
private val entity_map = HashMap<String, Char>()
/** Registers the named entity [s] (without `&` and `;`) as standing for character [c]. */
private fun _addEntity(s : String, c : Char) {
	entity_map[s] = c
}
/** Converts the UTF-16 code unit value [num] to a [Char]. */
private fun chr(num : Int) : Char = num.toChar()
/**
 * Replaces HTML character references in [src] with the characters they stand for.
 *
 * Named references are looked up in [entity_map]. Numeric references may be decimal
 * (`&#960;`) or hexadecimal (`&#x3c0;` / `&#X3C0;`); code points above U+FFFF are
 * written as a surrogate pair instead of being truncated to a single UTF-16 unit.
 * A reference that cannot be resolved (unknown name, unparsable number, value outside
 * the Unicode range) is copied to the output unchanged.
 *
 * @param src text that may contain character references.
 * @return the decoded text, or [src] itself when nothing in it looks like a reference.
 */
fun decodeEntity(src : String) : String {
	val m = reEntity.matcher(src)
	// Nothing matched at all: hand the input back without copying it.
	if(! m.find()) return src

	val sb = StringBuilder(src.length)
	var last_end = 0
	do {
		val start = m.start()
		val end = m.end()
		// Literal text between the previous reference and this one.
		sb.append(src, last_end, start)
		last_end = end

		val part = m.group(2)
		if(m.group(1).isEmpty()) {
			// Named reference.
			val c = entity_map[part]
			if(c != null) sb.append(c) else sb.append(src, start, end)
		} else {
			// Numeric reference; -1 marks a value that could not be parsed.
			val codepoint = try {
				if(part[0] == 'x' || part[0] == 'X') {
					Integer.parseInt(part.substring(1), 16)
				} else {
					Integer.parseInt(part, 10)
				}
			} catch(ex : NumberFormatException) {
				log.trace(ex)
				-1
			}
			if(Character.isValidCodePoint(codepoint)) {
				// appendCodePoint emits a surrogate pair for supplementary characters.
				sb.appendCodePoint(codepoint)
			} else {
				sb.append(src, start, end)
			}
		}
	} while(m.find())

	// Text after the last reference.
	sb.append(src, last_end, src.length)
	return sb.toString()
}
// static final Pattern reEntityEscape = Pattern.compile( "[<>\"'&]" );
/**
 * Escapes the characters `<`, `>`, `"`, `'` and `&` in [src] as HTML character
 * references; every other character is copied as is.
 *
 * @param src plain text.
 * @return the text with the five special characters replaced.
 */
fun encodeEntity(src : String) : String {
	val out = StringBuilder()
	for(ch in src) {
		val escaped = when(ch) {
			'<' -> "&lt;"
			'>' -> "&gt;"
			'"' -> "&quot;"
			'\'' -> "&#039;"
			'&' -> "&amp;"
			else -> null
		}
		if(escaped != null) out.append(escaped) else out.append(ch)
	}
	return out.toString()
}
//////////////////////////////////////////////////////////////////////////////////////
/**
 * Splits an HTML fragment into a stream of tokens, one token at a time.
 *
 * After construction, and after every call to [eat], the current token is described by
 * [tag] (lower-cased tag name, [TAG_TEXT] for text, [TAG_END] at the end of input),
 * [open_type] (one of the `OPEN_TYPE_*` constants) and [text] (the raw source of the token).
 *
 * @param src the HTML fragment to tokenize.
 */
private class TokenParser internal constructor(internal val src : String) {

	// Offset in src of the first character that has not been consumed yet.
	internal var next : Int = 0
	internal var tag : String = ""
	internal var open_type : Int = 0
	internal var text : String = ""

	init {
		// Load the first token so that callers can inspect it right away.
		eat()
	}

	/** Advances to the next token and updates [tag], [open_type] and [text]. */
	internal fun eat() {
		// End of input: keep reporting the end marker.
		if(next >= src.length) {
			tag = TAG_END
			open_type = OPEN_TYPE_OPEN_CLOSE
			return
		}

		// A run of text extends to the next '<', or to the end of input.
		var end = src.indexOf('<', next)
		if(end == - 1) end = src.length
		if(end > next) {
			text = src.substring(next, end)
			tag = TAG_TEXT
			open_type = OPEN_TYPE_OPEN_CLOSE
			next = end
			return
		}

		// A tag extends up to and including the next '>', or to the end of input.
		end = src.indexOf('>', next)
		end = if(end == - 1) src.length else end + 1
		text = src.substring(next, end)
		next = end

		val m = reTag.matcher(text)
		if(! m.find()) {
			// '<' that does not start a tag name: treat the whole span as text.
			tag = TAG_TEXT
			open_type = OPEN_TYPE_OPEN_CLOSE
			return
		}

		val is_close = m.group(1).isNotEmpty()
		// Tag names are ASCII; lower-case them independently of the device locale so that
		// e.g. "DIV" does not become "dıv" under a Turkish default locale.
		tag = m.group(2).toLowerCase(Locale.ROOT)

		val m2 = reTagEnd.matcher(text)
		val is_openclose = m2.find() && m2.group(1).isNotEmpty()

		open_type = when {
			// <br> never has children, however it is written (including "</br>").
			tag == "br" -> OPEN_TYPE_OPEN_CLOSE
			is_close -> OPEN_TYPE_CLOSE
			is_openclose -> OPEN_TYPE_OPEN_CLOSE
			else -> OPEN_TYPE_OPEN
		}
		// NOTE(review): only "br" is special-cased; other tags written without "/>" (for
		// example "<img ...>") are reported as OPEN_TYPE_OPEN - confirm that is intended.
	}
}
private class Node {
// Children of this node, in document order.
internal val child_nodes = ArrayList<Node>()

// Tag name of this node (or one of the pseudo tag names) and the raw token source.
internal val tag : String
internal val text : String

// Entity-decoded value of the href attribute found in the raw tag text,
// or null when there is no such attribute or its value is empty.
private val href : String?
	get() {
		val matcher = reHref.matcher(text)
		if(! matcher.find()) return null
		val decoded = decodeEntity(matcher.group(1))
		return if(decoded.isEmpty()) null else decoded
	}

// Creates the root node of a tree.
internal constructor() {
	this.tag = "<>root"
	this.text = ""
}

// Creates a node from the token the parser is currently positioned on.
internal constructor(t : TokenParser) {
	tag = t.tag
	text = t.text
}
/**
 * Consumes tokens from [t] and appends them as children of this node until a closing
 * tag (which is consumed too) or the end of input is reached. A child that is an
 * opening tag collects its own children recursively.
 *
 * @param t the token stream, positioned on the first candidate child.
 * @param indent prefix used only for debug logging.
 */
internal fun addChild(t : TokenParser, indent : String) {
	if(DEBUG_HTML_PARSER) log.d("parseChild: %s(%s", indent, tag)
	while(TAG_END != t.tag) {
		if(OPEN_TYPE_CLOSE == t.open_type) {
			// The closing tag ends this node; step past it.
			t.eat()
			break
		}
		// Remember the kind before eat() replaces the current token.
		val kind = t.open_type
		val node = Node(t)
		child_nodes.add(node)
		t.eat()
		if(DEBUG_HTML_PARSER) {
			log.d("parseChild: %s|%s %s [%s]", indent, node.tag, kind, node.text)
		}
		if(OPEN_TYPE_OPEN == kind) node.addChild(t, "$indent--")
	}
	if(DEBUG_HTML_PARSER) log.d("parseChild: %s)%s", indent, tag)
}
/**
 * Appends the rendered form of this node and its descendants to [sb].
 *
 * - Text nodes are entity-decoded and, when [context] is available and emoji decoding
 *   is enabled in [options], passed through the emoji decoder.
 * - "img" is rendered as the literal text "<img/>"; its children are ignored.
 * - "style" and "script" children are rendered into a scratch buffer that is dropped.
 * - "a" children are rendered into a scratch buffer, rewritten by encodeUrl (when
 *   [context] is available), appended to [sb], then covered by a clickable span
 *   (when [account] and an href are available) and by highlight spans.
 * - "br" appends a newline; tags listed in block_tag make sure the text ends with
 *   at least two newlines.
 *
 * @param context used for emoji decoding and URL shortening; may be null.
 * @param account link click context used for the clickable span; may be null.
 * @param sb destination buffer.
 * @param options decoding options; hasHighlight and highlight_sound are updated here.
 */
internal fun encodeSpan(
context : Context?, account : LinkClickContext?, sb : SpannableStringBuilder, options : DecodeOptions
) {
if(TAG_TEXT == tag) {
if(context != null && options.decodeEmoji) {
sb.append(options.decodeEmoji(context, decodeEntity(text)))
} else {
sb.append(decodeEntity(text))
}
return
}
if(DEBUG_HTML_PARSER) sb.append("(start ").append(tag).append(")")
// Links, styles and scripts are rendered into a separate buffer first.
val sb_tmp : SpannableStringBuilder = if("a" == tag || "style" == tag || "script" == tag) {
SpannableStringBuilder()
} else {
sb
}
var start = sb_tmp.length
if("img" == tag) {
sb_tmp.append("<img/>")
} else {
for(child in child_nodes) {
child.encodeSpan(context, account, sb_tmp, options)
}
}
var end = sb_tmp.length
if("a" == tag) {
// From here on start/end refer to positions in sb, not in the scratch buffer.
start = sb.length
if(context != null) {
sb.append(encodeUrl(options.short, context, sb_tmp.toString(), href, options.attachmentList))
} else {
sb.append(sb_tmp.toString())
}
end = sb.length
} else if(sb_tmp !== sb) {
// The contents of style and script are discarded.
}
if(end > start && "a" == tag) {
if(account != null) {
val href = href
if(href != null) {
val link_text = sb.subSequence(start, end).toString()
val span = MyClickableSpan(account, link_text, href, account.findAcctColor(href), options.linkTag)
sb.setSpan(span, start, end, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE)
}
}
// Apply highlight colors after the link span has been set.
// NOTE(review): highlighting runs only inside this "a" branch - confirm that
// plain text outside links is highlighted elsewhere.
val list = options.highlightTrie?.matchList(sb, start, end)
if(list != null) {
for(range in list) {
val word = HighlightWord.load(range.word)
if(word != null) {
options.hasHighlight = true
sb.setSpan(HighlightSpan(word.color_fg, word.color_bg), range.start, range.end, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE)
if(word.sound_type != HighlightWord.SOUND_TYPE_NONE) {
options.highlight_sound = word
}
}
}
}
}
if(DEBUG_HTML_PARSER) sb.append("(end ").append(tag).append(")")
if("br" == tag) sb.append('\n')
if(block_tag.contains(tag)) {
if(sb.isNotEmpty()) {
// Count the newlines at the end of the text, skipping over other trailing whitespace.
var last_br_count = 0
var last = sb.length - 1
while(last >= 0) {
val c = sb[last --]
if(c == '\n') {
++ last_br_count
continue
}
if(Character.isWhitespace(c)) continue
break
}
// Append newlines when the text ends with fewer than two of them.
while(last_br_count ++ < 2) sb.append('\n')
}
}
}
/**
 * Tells whether [url] belongs to one of the attachments in [list_attachment].
 *
 * @return false when either argument is null or no attachment reports the URL.
 */
internal fun is_media_attachment(list_attachment : ArrayList<TootAttachmentLike>?, url : String?) : Boolean {
	if(url == null || list_attachment == null) return false
	return list_attachment.any { it.hasUrl(url) }
}
/**
 * Produces the text shown for a link.
 *
 * - Text that does not start with "http" (hashtags, mentions) is kept as is, except
 *   that a mention is rewritten to "@user@host" when the full-acct preference is on
 *   and [href] matches the account URL pattern.
 * - When [bShort] is false the display text is returned unchanged.
 * - A link to one of the attachments is returned as [href] covered by an image span.
 * - Any other URL is shortened to its authority plus at most 10 characters of
 *   path/query/fragment, followed by an ellipsis when something was cut off.
 *
 * @param bShort whether URLs should be shortened.
 * @param context used to create the image span.
 * @param display_url the link text as written in the HTML.
 * @param href the link target, if any.
 * @param list_attachment attachments of the status being rendered, if any.
 * @return the text to display; [display_url] when the URL cannot be shortened.
 */
private fun encodeUrl(
	bShort : Boolean, context : Context, display_url : String, href : String?, list_attachment : ArrayList<TootAttachmentLike>?
) : CharSequence {
	if(! display_url.startsWith("http")) {
		if(display_url.startsWith("@") && href != null && Pref.bpMentionFullAcct(App1.pref)) {
			// Show the mention as a full acct.
			val m = TootAccount.reAccountUrl.matcher(href)
			if(m.find()) {
				return "@" + m.group(2) + "@" + m.group(1)
			}
		}
		// Hashtags and mentions are left unchanged.
		return display_url
	}

	if(! bShort) return display_url

	if(href != null && is_media_attachment(list_attachment, href)) {
		val sb = SpannableStringBuilder()
		sb.append(href)
		sb.setSpan(EmojiImageSpan(context, R.drawable.emj_1f5bc), 0, sb.length, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE)
		return sb
	}

	try {
		val uri = Uri.parse(display_url)
		// Without an authority there is nothing sensible to shorten to; this also keeps
		// the literal text "null" out of the result.
		val authority = uri.authority ?: return display_url
		val path = uri.encodedPath ?: ""
		val query = uri.encodedQuery?.let { "?$it" } ?: ""
		val fragment = uri.encodedFragment?.let { "#$it" } ?: ""
		val remain = path + query + fragment

		val sb = StringBuilder(authority)
		if(remain.length > 10) {
			// Mark the cut so that the reader can tell the URL was shortened.
			sb.append(remain, 0, 10).append('\u2026')
		} else {
			sb.append(remain)
		}
		return sb
	} catch(ex : Throwable) {
		log.trace(ex)
		return display_url
	}
}
}
/**
 * Converts an HTML fragment into styled text.
 *
 * The fragment is tokenized, built into a node tree and rendered; trailing whitespace
 * is then removed. Any error raised on the way is logged and whatever has been
 * rendered so far is returned.
 *
 * @param context used for emoji decoding and URL shortening; may be null.
 * @param account link click context for clickable spans; may be null.
 * @param src the HTML fragment; null yields an empty result.
 * @param options decoding options.
 * @return the rendered text (never null).
 */
fun decodeHTML(
	context : Context?,
	account : LinkClickContext?,
	src : String?,
	options : DecodeOptions
) : SpannableStringBuilder {
	prepareTagInformation()
	val result = SpannableStringBuilder()
	if(src == null) return result
	try {
		val tokens = TokenParser(src)
		val root = Node()
		// A stray closing tag ends addChild early, so keep going until the input is used up.
		while(TAG_END != tokens.tag) {
			root.addChild(tokens, "")
		}
		root.encodeSpan(context, account, result, options)

		// Drop trailing whitespace.
		val total = result.length
		var keep = total
		while(keep > 0 && isWhitespace(result[keep - 1].toInt())) {
			keep -= 1
		}
		if(keep < total) result.delete(keep, total)
	} catch(ex : Throwable) {
		log.trace(ex)
	}
	return result
}
/**
 * Builds a space-separated line of clickable "@acct" entries from a mention list.
 *
 * Each entry shows the full acct when the corresponding preference is enabled,
 * otherwise the acct as stored in the mention.
 *
 * @param access_info the account the mentions are viewed from.
 * @param src_list the mentions; null or empty yields null.
 * @param link_tag passed through to every clickable span.
 * @return the styled text, or null when there are no mentions.
 */
fun decodeMentions(access_info : SavedAccount, src_list : ArrayList<TootMention>?, link_tag : Any?) : Spannable? {
	if(src_list == null || src_list.isEmpty()) return null

	val out = SpannableStringBuilder()
	for(mention in src_list) {
		if(out.isNotEmpty()) out.append(" ")

		val spanStart = out.length
		out.append('@')
		if(Pref.bpMentionFullAcct(App1.pref)) {
			out.append(access_info.getFullAcct(mention.acct))
		} else {
			out.append(mention.acct)
		}
		val spanEnd = out.length
		if(spanEnd <= spanStart) continue

		val label = out.subSequence(spanStart, spanEnd).toString()
		val clickable = MyClickableSpan(
			access_info, label, mention.url, access_info.findAcctColor(mention.url), link_tag
		)
		out.setSpan(clickable, spanStart, spanEnd, Spannable.SPAN_EXCLUSIVE_EXCLUSIVE)
	}
	return out
}
// Registers the first part of the named-entity table: the basic markup escapes
// (amp, gt, lt, quot, apos) and the entities whose code points are in the
// U+00A0..U+00FF range, up to "times". The table is split across init1/init2/init3;
// all three are called from the init block of this object.
private fun init1() {
_addEntity("amp", '&') // ampersand
_addEntity("gt", '>') // greater than
_addEntity("lt", '<') // less than
_addEntity("quot", '"') // double quote
_addEntity("apos", '\'') // single quote
_addEntity("AElig", chr(198)) // capital AE diphthong (ligature)
_addEntity("Aacute", chr(193)) // capital A, acute accent
_addEntity("Acirc", chr(194)) // capital A, circumflex accent
_addEntity("Agrave", chr(192)) // capital A, grave accent
_addEntity("Aring", chr(197)) // capital A, ring
_addEntity("Atilde", chr(195)) // capital A, tilde
_addEntity("Auml", chr(196)) // capital A, dieresis or umlaut mark
_addEntity("Ccedil", chr(199)) // capital C, cedilla
_addEntity("ETH", chr(208)) // capital Eth, Icelandic
_addEntity("Eacute", chr(201)) // capital E, acute accent
_addEntity("Ecirc", chr(202)) // capital E, circumflex accent
_addEntity("Egrave", chr(200)) // capital E, grave accent
_addEntity("Euml", chr(203)) // capital E, dieresis or umlaut mark
_addEntity("Iacute", chr(205)) // capital I, acute accent
_addEntity("Icirc", chr(206)) // capital I, circumflex accent
_addEntity("Igrave", chr(204)) // capital I, grave accent
_addEntity("Iuml", chr(207)) // capital I, dieresis or umlaut mark
_addEntity("Ntilde", chr(209)) // capital N, tilde
_addEntity("Oacute", chr(211)) // capital O, acute accent
_addEntity("Ocirc", chr(212)) // capital O, circumflex accent
_addEntity("Ograve", chr(210)) // capital O, grave accent
_addEntity("Oslash", chr(216)) // capital O, slash
_addEntity("Otilde", chr(213)) // capital O, tilde
_addEntity("Ouml", chr(214)) // capital O, dieresis or umlaut mark
_addEntity("THORN", chr(222)) // capital THORN, Icelandic
_addEntity("Uacute", chr(218)) // capital U, acute accent
_addEntity("Ucirc", chr(219)) // capital U, circumflex accent
_addEntity("Ugrave", chr(217)) // capital U, grave accent
_addEntity("Uuml", chr(220)) // capital U, dieresis or umlaut mark
_addEntity("Yacute", chr(221)) // capital Y, acute accent
_addEntity("aacute", chr(225)) // small a, acute accent
_addEntity("acirc", chr(226)) // small a, circumflex accent
_addEntity("aelig", chr(230)) // small ae diphthong (ligature)
_addEntity("agrave", chr(224)) // small a, grave accent
_addEntity("aring", chr(229)) // small a, ring
_addEntity("atilde", chr(227)) // small a, tilde
_addEntity("auml", chr(228)) // small a, dieresis or umlaut mark
_addEntity("ccedil", chr(231)) // small c, cedilla
_addEntity("eacute", chr(233)) // small e, acute accent
_addEntity("ecirc", chr(234)) // small e, circumflex accent
_addEntity("egrave", chr(232)) // small e, grave accent
_addEntity("eth", chr(240)) // small eth, Icelandic
_addEntity("euml", chr(235)) // small e, dieresis or umlaut mark
_addEntity("iacute", chr(237)) // small i, acute accent
_addEntity("icirc", chr(238)) // small i, circumflex accent
_addEntity("igrave", chr(236)) // small i, grave accent
_addEntity("iuml", chr(239)) // small i, dieresis or umlaut mark
_addEntity("ntilde", chr(241)) // small n, tilde
_addEntity("oacute", chr(243)) // small o, acute accent
_addEntity("ocirc", chr(244)) // small o, circumflex accent
_addEntity("ograve", chr(242)) // small o, grave accent
_addEntity("oslash", chr(248)) // small o, slash
_addEntity("otilde", chr(245)) // small o, tilde
_addEntity("ouml", chr(246)) // small o, dieresis or umlaut mark
_addEntity("szlig", chr(223)) // small sharp s, German (sz ligature)
_addEntity("thorn", chr(254)) // small thorn, Icelandic
_addEntity("uacute", chr(250)) // small u, acute accent
_addEntity("ucirc", chr(251)) // small u, circumflex accent
_addEntity("ugrave", chr(249)) // small u, grave accent
_addEntity("uuml", chr(252)) // small u, dieresis or umlaut mark
_addEntity("yacute", chr(253)) // small y, acute accent
_addEntity("yuml", chr(255)) // small y, dieresis or umlaut mark
_addEntity("copy", chr(169)) // copyright sign
_addEntity("reg", chr(174)) // registered sign
_addEntity("nbsp", chr(160)) // non breaking space
_addEntity("iexcl", chr(161))
_addEntity("cent", chr(162))
_addEntity("pound", chr(163))
_addEntity("curren", chr(164))
_addEntity("yen", chr(165))
_addEntity("brvbar", chr(166))
_addEntity("sect", chr(167))
_addEntity("uml", chr(168))
_addEntity("ordf", chr(170))
_addEntity("laquo", chr(171))
_addEntity("not", chr(172))
_addEntity("shy", chr(173))
_addEntity("macr", chr(175))
_addEntity("deg", chr(176))
_addEntity("plusmn", chr(177))
_addEntity("sup1", chr(185))
_addEntity("sup2", chr(178))
_addEntity("sup3", chr(179))
_addEntity("acute", chr(180))
_addEntity("micro", chr(181))
_addEntity("para", chr(182))
_addEntity("middot", chr(183))
_addEntity("cedil", chr(184))
_addEntity("ordm", chr(186))
_addEntity("raquo", chr(187))
_addEntity("frac14", chr(188))
_addEntity("frac12", chr(189))
_addEntity("frac34", chr(190))
_addEntity("iquest", chr(191))
_addEntity("times", chr(215))
}
// Registers the second part of the named-entity table: "divide", the Latin
// extended letters, the Greek alphabet, spacing and directional marks, general
// punctuation, letter-like symbols and the first arrows (up to "lArr").
private fun init2() {
_addEntity("divide", chr(247))
_addEntity("OElig", chr(338))
_addEntity("oelig", chr(339))
_addEntity("Scaron", chr(352))
_addEntity("scaron", chr(353))
_addEntity("Yuml", chr(376))
_addEntity("fnof", chr(402))
_addEntity("circ", chr(710))
_addEntity("tilde", chr(732))
_addEntity("Alpha", chr(913))
_addEntity("Beta", chr(914))
_addEntity("Gamma", chr(915))
_addEntity("Delta", chr(916))
_addEntity("Epsilon", chr(917))
_addEntity("Zeta", chr(918))
_addEntity("Eta", chr(919))
_addEntity("Theta", chr(920))
_addEntity("Iota", chr(921))
_addEntity("Kappa", chr(922))
_addEntity("Lambda", chr(923))
_addEntity("Mu", chr(924))
_addEntity("Nu", chr(925))
_addEntity("Xi", chr(926))
_addEntity("Omicron", chr(927))
_addEntity("Pi", chr(928))
_addEntity("Rho", chr(929))
_addEntity("Sigma", chr(931))
_addEntity("Tau", chr(932))
_addEntity("Upsilon", chr(933))
_addEntity("Phi", chr(934))
_addEntity("Chi", chr(935))
_addEntity("Psi", chr(936))
_addEntity("Omega", chr(937))
_addEntity("alpha", chr(945))
_addEntity("beta", chr(946))
_addEntity("gamma", chr(947))
_addEntity("delta", chr(948))
_addEntity("epsilon", chr(949))
_addEntity("zeta", chr(950))
_addEntity("eta", chr(951))
_addEntity("theta", chr(952))
_addEntity("iota", chr(953))
_addEntity("kappa", chr(954))
_addEntity("lambda", chr(955))
_addEntity("mu", chr(956))
_addEntity("nu", chr(957))
_addEntity("xi", chr(958))
_addEntity("omicron", chr(959))
_addEntity("pi", chr(960))
_addEntity("rho", chr(961))
_addEntity("sigmaf", chr(962))
_addEntity("sigma", chr(963))
_addEntity("tau", chr(964))
_addEntity("upsilon", chr(965))
_addEntity("phi", chr(966))
_addEntity("chi", chr(967))
_addEntity("psi", chr(968))
_addEntity("omega", chr(969))
_addEntity("thetasym", chr(977))
_addEntity("upsih", chr(978))
_addEntity("piv", chr(982))
_addEntity("ensp", chr(8194))
_addEntity("emsp", chr(8195))
_addEntity("thinsp", chr(8201))
_addEntity("zwnj", chr(8204))
_addEntity("zwj", chr(8205))
_addEntity("lrm", chr(8206))
_addEntity("rlm", chr(8207))
_addEntity("ndash", chr(8211))
_addEntity("mdash", chr(8212))
_addEntity("lsquo", chr(8216))
_addEntity("rsquo", chr(8217))
_addEntity("sbquo", chr(8218))
_addEntity("ldquo", chr(8220))
_addEntity("rdquo", chr(8221))
_addEntity("bdquo", chr(8222))
_addEntity("dagger", chr(8224))
_addEntity("Dagger", chr(8225))
_addEntity("bull", chr(8226))
_addEntity("hellip", chr(8230))
_addEntity("permil", chr(8240))
_addEntity("prime", chr(8242))
_addEntity("Prime", chr(8243))
_addEntity("lsaquo", chr(8249))
_addEntity("rsaquo", chr(8250))
_addEntity("oline", chr(8254))
_addEntity("frasl", chr(8260))
_addEntity("euro", chr(8364))
_addEntity("image", chr(8465))
_addEntity("weierp", chr(8472))
_addEntity("real", chr(8476))
_addEntity("trade", chr(8482))
_addEntity("alefsym", chr(8501))
_addEntity("larr", chr(8592))
_addEntity("uarr", chr(8593))
_addEntity("rarr", chr(8594))
_addEntity("darr", chr(8595))
_addEntity("harr", chr(8596))
_addEntity("crarr", chr(8629))
_addEntity("lArr", chr(8656))
}
// Registers the last part of the named-entity table: the remaining double arrows,
// mathematical operators, brackets and the card-suit symbols.
private fun init3() {
_addEntity("uArr", chr(8657))
_addEntity("rArr", chr(8658))
_addEntity("dArr", chr(8659))
_addEntity("hArr", chr(8660))
_addEntity("forall", chr(8704))
_addEntity("part", chr(8706))
_addEntity("exist", chr(8707))
_addEntity("empty", chr(8709))
_addEntity("nabla", chr(8711))
_addEntity("isin", chr(8712))
_addEntity("notin", chr(8713))
_addEntity("ni", chr(8715))
_addEntity("prod", chr(8719))
_addEntity("sum", chr(8721))
_addEntity("minus", chr(8722))
_addEntity("lowast", chr(8727))
_addEntity("radic", chr(8730))
_addEntity("prop", chr(8733))
_addEntity("infin", chr(8734))
_addEntity("ang", chr(8736))
_addEntity("and", chr(8743))
_addEntity("or", chr(8744))
_addEntity("cap", chr(8745))
_addEntity("cup", chr(8746))
_addEntity("int", chr(8747))
_addEntity("there4", chr(8756))
_addEntity("sim", chr(8764))
_addEntity("cong", chr(8773))
_addEntity("asymp", chr(8776))
_addEntity("ne", chr(8800))
_addEntity("equiv", chr(8801))
_addEntity("le", chr(8804))
_addEntity("ge", chr(8805))
_addEntity("sub", chr(8834))
_addEntity("sup", chr(8835))
_addEntity("nsub", chr(8836))
_addEntity("sube", chr(8838))
_addEntity("supe", chr(8839))
_addEntity("oplus", chr(8853))
_addEntity("otimes", chr(8855))
_addEntity("perp", chr(8869))
_addEntity("sdot", chr(8901))
_addEntity("lceil", chr(8968))
_addEntity("rceil", chr(8969))
_addEntity("lfloor", chr(8970))
_addEntity("rfloor", chr(8971))
_addEntity("lang", chr(9001))
_addEntity("rang", chr(9002))
_addEntity("loz", chr(9674))
_addEntity("spades", chr(9824))
_addEntity("clubs", chr(9827))
_addEntity("hearts", chr(9829))
_addEntity("diams", chr(9830))
}
// Fills entity_map when the object is initialized. This block is placed after the
// declaration of entity_map, so the map already exists when the entries are added.
init {
init1()
init2()
init3()
}
}