package jp.juggler.subwaytooter.util; import android.text.Spannable; import android.text.SpannableStringBuilder; import android.text.TextUtils; import android.text.style.ClickableSpan; import android.view.View; import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import jp.juggler.subwaytooter.api.entity.TootMention; import jp.juggler.subwaytooter.api.entity.TootTag; public class HTMLDecoder { static final LogCategory log = new LogCategory( "HTMLDecoder" ); static final int OPEN_TYPE_OPENCLOSE = 1; static final int OPEN_TYPE_OPEN = 2; static final int OPEN_TYPE_CLOSE = 3; static final String TAG_TEXT = "<>text"; static final String TAG_END = "<>end"; static final Pattern reTag = Pattern.compile( "<(/?)(\\w+)" ); static final Pattern reTagEnd = Pattern.compile( "(/?)>$" ); static final Pattern reHref = Pattern.compile( "\\bhref=\"([^\"]*)\"" ); static class TokenParser { final String src; int next; String tag; int open_type; String text; public TokenParser( String src ){ this.src = src; this.next = 0; eat(); } void eat(){ // end? if( next >= src.length() ){ tag = TAG_END; open_type = OPEN_TYPE_OPENCLOSE; return; } // text ? int end = src.indexOf( '<', next ); if( end == - 1 ) end = src.length(); if( end > next ){ this.text = src.substring( next, end ); this.tag = TAG_TEXT; this.open_type = OPEN_TYPE_OPENCLOSE; next = end; return; } // tag ? end = src.indexOf( '>', next ); if( end == - 1 ){ end = src.length(); }else{ ++ end; } text = src.substring( next, end ); next = end; Matcher m = reTag.matcher( text ); if( m.find() ){ boolean is_close = ! TextUtils.isEmpty( m.group( 1 ) ); tag = m.group( 2 ).toLowerCase(); Matcher m2 = reTagEnd.matcher( text ); boolean is_openclose = false; if( m2.find() ){ is_openclose = ! TextUtils.isEmpty( m2.group( 1 ) ); } open_type = is_close ? OPEN_TYPE_CLOSE : is_openclose ? OPEN_TYPE_OPENCLOSE : OPEN_TYPE_OPEN; }else{ tag = TAG_TEXT; this.open_type = OPEN_TYPE_OPENCLOSE; } } } public interface LinkClickCallback { void onClickLink( String url ); } public static LinkClickCallback link_callback; static final boolean DEBUG_HTML_PARSER = false; static class Node { final ArrayList< Node > child_nodes = new ArrayList<>(); String tag; String text; public Node(){ tag = "<>root"; text = ""; } public Node( TokenParser t ){ this.tag = t.tag; this.text = t.text; } public void parseChild( TokenParser t, String indent ){ if( DEBUG_HTML_PARSER ) log.d( "parseChild: %s(%s", indent, tag ); for( ; ; ){ if( TAG_END.equals( t.tag ) ) break; if( OPEN_TYPE_CLOSE == t.open_type ){ t.eat(); break; } int open_type = t.open_type; Node child = new Node( t ); child_nodes.add( child ); t.eat(); if( DEBUG_HTML_PARSER ) log.d( "parseChild: %s|%s %s [%s]", indent, child.tag, open_type, child.text ); if( OPEN_TYPE_OPEN == open_type ){ child.parseChild( t, indent + "--" ); } } if( DEBUG_HTML_PARSER ) log.d( "parseChild: %s)%s", indent, tag ); } public void encodeSpan( SpannableStringBuilder sb ){ if( TAG_TEXT.equals( tag ) ){ sb.append( Emojione.decodeEmoji( decodeEntity( text ) ) ); return; } if( DEBUG_HTML_PARSER ) sb.append( "(start " + tag + ")" ); int start = sb.length(); for( Node child : child_nodes ){ child.encodeSpan( sb ); } if( "a".equals( tag ) ){ Matcher m = reHref.matcher( text ); if( m.find() ){ final String href = decodeEntity( m.group( 1 ) ).toString(); if( ! TextUtils.isEmpty( href ) ){ sb.setSpan( new ClickableSpan() { @Override public void onClick( View widget ){ if( link_callback != null ){ link_callback.onClickLink( href ); } } }, start, sb.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE ); } } } if( DEBUG_HTML_PARSER ) sb.append( "(end " + tag + ")" ); if( "br".equals( tag ) ) sb.append( '\n' ); if( "p".equals( tag ) ){ if( sb.charAt( sb.length()-1 ) != '\n' ) sb.append( '\n' ); sb.append( '\n' ); } } } public static SpannableStringBuilder decodeHTML( String src ){ try{ TokenParser tracker = new TokenParser( src ); Node rootNode = new Node(); rootNode.parseChild( tracker, "" ); SpannableStringBuilder sb = new SpannableStringBuilder(); rootNode.encodeSpan( sb ); int end = sb.length(); while( end > 0 && Character.isWhitespace( sb.charAt( end-1 ) ) ) --end; if( end < sb.length() ){ sb.delete( end,sb.length() ); } return sb; }catch( Throwable ex ){ } return null; } public static Spannable decodeTags( TootTag.List src_list ){ if( src_list == null || src_list.isEmpty()) return null; SpannableStringBuilder sb = new SpannableStringBuilder(); for(TootTag item : src_list){ if(sb.length() > 0) sb.append(" "); int start = sb.length(); sb.append('#'); sb.append(item.name); final String item_url = item.url; sb.setSpan( new ClickableSpan() { @Override public void onClick( View widget ){ if( link_callback != null ){ link_callback.onClickLink( item_url ); } } }, start, sb.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE ); } return sb; } public static Spannable decodeMentions( TootMention.List src_list ){ if( src_list == null || src_list.isEmpty()) return null; SpannableStringBuilder sb = new SpannableStringBuilder(); for(TootMention item : src_list){ if(sb.length() > 0) sb.append(" "); int start = sb.length(); sb.append('@'); sb.append( item.acct ); final String item_url = item.url; sb.setSpan( new ClickableSpan() { @Override public void onClick( View widget ){ if( link_callback != null ){ link_callback.onClickLink( item_url ); } } }, start, sb.length(), Spannable.SPAN_EXCLUSIVE_EXCLUSIVE ); } return sb; } ////////////////////////////////////////////////////////////////////////////////////// private static final HashMap< String, Character > entity_map = new HashMap<>(); private static void _addEntity( String s, char c ){ entity_map.put( s, c ); } private static char chr( int num ){ return (char) num; } static final Pattern reEntity = Pattern.compile( "&(#?)(\\w+);" ); public static String decodeEntity( String src ){ StringBuilder sb = null; Matcher m = reEntity.matcher( src ); int last_end = 0; while( m.find() ){ if(sb==null) sb = new StringBuilder(); int start = m.start(); int end = m.end(); try{ if( start > last_end ){ sb.append( src.substring( last_end, start ) ); } boolean is_numeric = m.group( 1 ).length() > 0; String part = m.group( 2 ); if( ! is_numeric ){ Character c = entity_map.get( part ); if( c != null ){ sb.append( (char) c ); continue; } }else{ int c; try{ if( part.charAt( 0 ) == 'x' ){ c = Integer.parseInt( part.substring( 1 ), 16 ); }else{ c = Integer.parseInt( part, 10 ); } sb.append( (char) c ); continue; }catch( Throwable ex ){ ex.printStackTrace(); } } sb.append( src.substring( start, end ) ); }finally{ last_end = end; } } // 全くマッチしなかった if( sb == null ) return src; int end = src.length(); if( end > last_end ){ sb.append( src.substring( last_end, end ) ); } return sb.toString(); } private static void init1(){ _addEntity( "amp", '&' ); // ampersand _addEntity( "gt", '>' ); // greater than _addEntity( "lt", '<' ); // less than _addEntity( "quot", '"' ); // double quote _addEntity( "apos", '\'' ); // single quote _addEntity( "AElig", chr( 198 ) ); // capital AE diphthong (ligature) _addEntity( "Aacute", chr( 193 ) ); // capital A, acute accent _addEntity( "Acirc", chr( 194 ) ); // capital A, circumflex accent _addEntity( "Agrave", chr( 192 ) ); // capital A, grave accent _addEntity( "Aring", chr( 197 ) ); // capital A, ring _addEntity( "Atilde", chr( 195 ) ); // capital A, tilde _addEntity( "Auml", chr( 196 ) ); // capital A, dieresis or umlaut mark _addEntity( "Ccedil", chr( 199 ) ); // capital C, cedilla _addEntity( "ETH", chr( 208 ) ); // capital Eth, Icelandic _addEntity( "Eacute", chr( 201 ) ); // capital E, acute accent _addEntity( "Ecirc", chr( 202 ) ); // capital E, circumflex accent _addEntity( "Egrave", chr( 200 ) ); // capital E, grave accent _addEntity( "Euml", chr( 203 ) ); // capital E, dieresis or umlaut mark _addEntity( "Iacute", chr( 205 ) ); // capital I, acute accent _addEntity( "Icirc", chr( 206 ) ); // capital I, circumflex accent _addEntity( "Igrave", chr( 204 ) ); // capital I, grave accent _addEntity( "Iuml", chr( 207 ) ); // capital I, dieresis or umlaut mark _addEntity( "Ntilde", chr( 209 ) ); // capital N, tilde _addEntity( "Oacute", chr( 211 ) ); // capital O, acute accent _addEntity( "Ocirc", chr( 212 ) ); // capital O, circumflex accent _addEntity( "Ograve", chr( 210 ) ); // capital O, grave accent _addEntity( "Oslash", chr( 216 ) ); // capital O, slash _addEntity( "Otilde", chr( 213 ) ); // capital O, tilde _addEntity( "Ouml", chr( 214 ) ); // capital O, dieresis or umlaut mark _addEntity( "THORN", chr( 222 ) ); // capital THORN, Icelandic _addEntity( "Uacute", chr( 218 ) ); // capital U, acute accent _addEntity( "Ucirc", chr( 219 ) ); // capital U, circumflex accent _addEntity( "Ugrave", chr( 217 ) ); // capital U, grave accent _addEntity( "Uuml", chr( 220 ) ); // capital U, dieresis or umlaut mark _addEntity( "Yacute", chr( 221 ) ); // capital Y, acute accent _addEntity( "aacute", chr( 225 ) ); // small a, acute accent _addEntity( "acirc", chr( 226 ) ); // small a, circumflex accent _addEntity( "aelig", chr( 230 ) ); // small ae diphthong (ligature) _addEntity( "agrave", chr( 224 ) ); // small a, grave accent _addEntity( "aring", chr( 229 ) ); // small a, ring _addEntity( "atilde", chr( 227 ) ); // small a, tilde _addEntity( "auml", chr( 228 ) ); // small a, dieresis or umlaut mark _addEntity( "ccedil", chr( 231 ) ); // small c, cedilla _addEntity( "eacute", chr( 233 ) ); // small e, acute accent _addEntity( "ecirc", chr( 234 ) ); // small e, circumflex accent _addEntity( "egrave", chr( 232 ) ); // small e, grave accent _addEntity( "eth", chr( 240 ) ); // small eth, Icelandic _addEntity( "euml", chr( 235 ) ); // small e, dieresis or umlaut mark _addEntity( "iacute", chr( 237 ) ); // small i, acute accent _addEntity( "icirc", chr( 238 ) ); // small i, circumflex accent _addEntity( "igrave", chr( 236 ) ); // small i, grave accent _addEntity( "iuml", chr( 239 ) ); // small i, dieresis or umlaut mark _addEntity( "ntilde", chr( 241 ) ); // small n, tilde _addEntity( "oacute", chr( 243 ) ); // small o, acute accent _addEntity( "ocirc", chr( 244 ) ); // small o, circumflex accent _addEntity( "ograve", chr( 242 ) ); // small o, grave accent _addEntity( "oslash", chr( 248 ) ); // small o, slash _addEntity( "otilde", chr( 245 ) ); // small o, tilde _addEntity( "ouml", chr( 246 ) ); // small o, dieresis or umlaut mark _addEntity( "szlig", chr( 223 ) ); // small sharp s, German (sz ligature) _addEntity( "thorn", chr( 254 ) ); // small thorn, Icelandic _addEntity( "uacute", chr( 250 ) ); // small u, acute accent _addEntity( "ucirc", chr( 251 ) ); // small u, circumflex accent _addEntity( "ugrave", chr( 249 ) ); // small u, grave accent _addEntity( "uuml", chr( 252 ) ); // small u, dieresis or umlaut mark _addEntity( "yacute", chr( 253 ) ); // small y, acute accent _addEntity( "yuml", chr( 255 ) ); // small y, dieresis or umlaut mark _addEntity( "copy", chr( 169 ) ); // copyright sign _addEntity( "reg", chr( 174 ) ); // registered sign _addEntity( "nbsp", chr( 160 ) ); // non breaking space _addEntity( "iexcl", chr( 161 ) ); _addEntity( "cent", chr( 162 ) ); _addEntity( "pound", chr( 163 ) ); _addEntity( "curren", chr( 164 ) ); _addEntity( "yen", chr( 165 ) ); _addEntity( "brvbar", chr( 166 ) ); _addEntity( "sect", chr( 167 ) ); _addEntity( "uml", chr( 168 ) ); _addEntity( "ordf", chr( 170 ) ); _addEntity( "laquo", chr( 171 ) ); _addEntity( "not", chr( 172 ) ); _addEntity( "shy", chr( 173 ) ); _addEntity( "macr", chr( 175 ) ); _addEntity( "deg", chr( 176 ) ); _addEntity( "plusmn", chr( 177 ) ); _addEntity( "sup1", chr( 185 ) ); _addEntity( "sup2", chr( 178 ) ); _addEntity( "sup3", chr( 179 ) ); _addEntity( "acute", chr( 180 ) ); _addEntity( "micro", chr( 181 ) ); _addEntity( "para", chr( 182 ) ); _addEntity( "middot", chr( 183 ) ); _addEntity( "cedil", chr( 184 ) ); _addEntity( "ordm", chr( 186 ) ); _addEntity( "raquo", chr( 187 ) ); _addEntity( "frac14", chr( 188 ) ); _addEntity( "frac12", chr( 189 ) ); _addEntity( "frac34", chr( 190 ) ); _addEntity( "iquest", chr( 191 ) ); _addEntity( "times", chr( 215 ) ); } private static void init2(){ _addEntity( "divide", chr( 247 ) ); _addEntity( "OElig", chr( 338 ) ); _addEntity( "oelig", chr( 339 ) ); _addEntity( "Scaron", chr( 352 ) ); _addEntity( "scaron", chr( 353 ) ); _addEntity( "Yuml", chr( 376 ) ); _addEntity( "fnof", chr( 402 ) ); _addEntity( "circ", chr( 710 ) ); _addEntity( "tilde", chr( 732 ) ); _addEntity( "Alpha", chr( 913 ) ); _addEntity( "Beta", chr( 914 ) ); _addEntity( "Gamma", chr( 915 ) ); _addEntity( "Delta", chr( 916 ) ); _addEntity( "Epsilon", chr( 917 ) ); _addEntity( "Zeta", chr( 918 ) ); _addEntity( "Eta", chr( 919 ) ); _addEntity( "Theta", chr( 920 ) ); _addEntity( "Iota", chr( 921 ) ); _addEntity( "Kappa", chr( 922 ) ); _addEntity( "Lambda", chr( 923 ) ); _addEntity( "Mu", chr( 924 ) ); _addEntity( "Nu", chr( 925 ) ); _addEntity( "Xi", chr( 926 ) ); _addEntity( "Omicron", chr( 927 ) ); _addEntity( "Pi", chr( 928 ) ); _addEntity( "Rho", chr( 929 ) ); _addEntity( "Sigma", chr( 931 ) ); _addEntity( "Tau", chr( 932 ) ); _addEntity( "Upsilon", chr( 933 ) ); _addEntity( "Phi", chr( 934 ) ); _addEntity( "Chi", chr( 935 ) ); _addEntity( "Psi", chr( 936 ) ); _addEntity( "Omega", chr( 937 ) ); _addEntity( "alpha", chr( 945 ) ); _addEntity( "beta", chr( 946 ) ); _addEntity( "gamma", chr( 947 ) ); _addEntity( "delta", chr( 948 ) ); _addEntity( "epsilon", chr( 949 ) ); _addEntity( "zeta", chr( 950 ) ); _addEntity( "eta", chr( 951 ) ); _addEntity( "theta", chr( 952 ) ); _addEntity( "iota", chr( 953 ) ); _addEntity( "kappa", chr( 954 ) ); _addEntity( "lambda", chr( 955 ) ); _addEntity( "mu", chr( 956 ) ); _addEntity( "nu", chr( 957 ) ); _addEntity( "xi", chr( 958 ) ); _addEntity( "omicron", chr( 959 ) ); _addEntity( "pi", chr( 960 ) ); _addEntity( "rho", chr( 961 ) ); _addEntity( "sigmaf", chr( 962 ) ); _addEntity( "sigma", chr( 963 ) ); _addEntity( "tau", chr( 964 ) ); _addEntity( "upsilon", chr( 965 ) ); _addEntity( "phi", chr( 966 ) ); _addEntity( "chi", chr( 967 ) ); _addEntity( "psi", chr( 968 ) ); _addEntity( "omega", chr( 969 ) ); _addEntity( "thetasym", chr( 977 ) ); _addEntity( "upsih", chr( 978 ) ); _addEntity( "piv", chr( 982 ) ); _addEntity( "ensp", chr( 8194 ) ); _addEntity( "emsp", chr( 8195 ) ); _addEntity( "thinsp", chr( 8201 ) ); _addEntity( "zwnj", chr( 8204 ) ); _addEntity( "zwj", chr( 8205 ) ); _addEntity( "lrm", chr( 8206 ) ); _addEntity( "rlm", chr( 8207 ) ); _addEntity( "ndash", chr( 8211 ) ); _addEntity( "mdash", chr( 8212 ) ); _addEntity( "lsquo", chr( 8216 ) ); _addEntity( "rsquo", chr( 8217 ) ); _addEntity( "sbquo", chr( 8218 ) ); _addEntity( "ldquo", chr( 8220 ) ); _addEntity( "rdquo", chr( 8221 ) ); _addEntity( "bdquo", chr( 8222 ) ); _addEntity( "dagger", chr( 8224 ) ); _addEntity( "Dagger", chr( 8225 ) ); _addEntity( "bull", chr( 8226 ) ); _addEntity( "hellip", chr( 8230 ) ); _addEntity( "permil", chr( 8240 ) ); _addEntity( "prime", chr( 8242 ) ); _addEntity( "Prime", chr( 8243 ) ); _addEntity( "lsaquo", chr( 8249 ) ); _addEntity( "rsaquo", chr( 8250 ) ); _addEntity( "oline", chr( 8254 ) ); _addEntity( "frasl", chr( 8260 ) ); _addEntity( "euro", chr( 8364 ) ); _addEntity( "image", chr( 8465 ) ); _addEntity( "weierp", chr( 8472 ) ); _addEntity( "real", chr( 8476 ) ); _addEntity( "trade", chr( 8482 ) ); _addEntity( "alefsym", chr( 8501 ) ); _addEntity( "larr", chr( 8592 ) ); _addEntity( "uarr", chr( 8593 ) ); _addEntity( "rarr", chr( 8594 ) ); _addEntity( "darr", chr( 8595 ) ); _addEntity( "harr", chr( 8596 ) ); _addEntity( "crarr", chr( 8629 ) ); _addEntity( "lArr", chr( 8656 ) ); } private static void init3(){ _addEntity( "uArr", chr( 8657 ) ); _addEntity( "rArr", chr( 8658 ) ); _addEntity( "dArr", chr( 8659 ) ); _addEntity( "hArr", chr( 8660 ) ); _addEntity( "forall", chr( 8704 ) ); _addEntity( "part", chr( 8706 ) ); _addEntity( "exist", chr( 8707 ) ); _addEntity( "empty", chr( 8709 ) ); _addEntity( "nabla", chr( 8711 ) ); _addEntity( "isin", chr( 8712 ) ); _addEntity( "notin", chr( 8713 ) ); _addEntity( "ni", chr( 8715 ) ); _addEntity( "prod", chr( 8719 ) ); _addEntity( "sum", chr( 8721 ) ); _addEntity( "minus", chr( 8722 ) ); _addEntity( "lowast", chr( 8727 ) ); _addEntity( "radic", chr( 8730 ) ); _addEntity( "prop", chr( 8733 ) ); _addEntity( "infin", chr( 8734 ) ); _addEntity( "ang", chr( 8736 ) ); _addEntity( "and", chr( 8743 ) ); _addEntity( "or", chr( 8744 ) ); _addEntity( "cap", chr( 8745 ) ); _addEntity( "cup", chr( 8746 ) ); _addEntity( "int", chr( 8747 ) ); _addEntity( "there4", chr( 8756 ) ); _addEntity( "sim", chr( 8764 ) ); _addEntity( "cong", chr( 8773 ) ); _addEntity( "asymp", chr( 8776 ) ); _addEntity( "ne", chr( 8800 ) ); _addEntity( "equiv", chr( 8801 ) ); _addEntity( "le", chr( 8804 ) ); _addEntity( "ge", chr( 8805 ) ); _addEntity( "sub", chr( 8834 ) ); _addEntity( "sup", chr( 8835 ) ); _addEntity( "nsub", chr( 8836 ) ); _addEntity( "sube", chr( 8838 ) ); _addEntity( "supe", chr( 8839 ) ); _addEntity( "oplus", chr( 8853 ) ); _addEntity( "otimes", chr( 8855 ) ); _addEntity( "perp", chr( 8869 ) ); _addEntity( "sdot", chr( 8901 ) ); _addEntity( "lceil", chr( 8968 ) ); _addEntity( "rceil", chr( 8969 ) ); _addEntity( "lfloor", chr( 8970 ) ); _addEntity( "rfloor", chr( 8971 ) ); _addEntity( "lang", chr( 9001 ) ); _addEntity( "rang", chr( 9002 ) ); _addEntity( "loz", chr( 9674 ) ); _addEntity( "spades", chr( 9824 ) ); _addEntity( "clubs", chr( 9827 ) ); _addEntity( "hearts", chr( 9829 ) ); _addEntity( "diams", chr( 9830 ) ); } static{ init1(); init2(); init3(); } }