Twidere-App-Android-Twitter.../twidere.extension.twitlonger/src/main/java/com/twitter/Extractor.java

568 lines
17 KiB
Java

package com.twitter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
/**
* A class to extract usernames, lists, hashtags and URLs from Tweet text.
*/
public class Extractor {
protected boolean extractURLWithoutProtocol = true;
/**
* Fullwidth at sign: '@'
*/
private static final char FULLWIDTH_AT_SIGN = '\uff20';
/**
* Fullwidth number sign: '#'
*/
private static final char FULLWIDTH_NUMBER_SIGN = '\uff03';
/**
* Create a new extractor.
*/
public Extractor() {
}
/**
* Extract $cashtag references from Tweet text.
*
* @param text of the tweet from which to extract cashtags
* @return List of cashtags referenced (without the leading $ sign)
*/
public List<String> extractCashtags(final String text) {
if (text == null || text.length() == 0) return Collections.emptyList();
final ArrayList<String> extracted = new ArrayList<String>();
for (final Entity entity : extractCashtagsWithIndices(text)) {
extracted.add(entity.value);
}
return extracted;
}
/**
* Extract $cashtag references from Tweet text.
*
* @param text of the tweet from which to extract cashtags
* @return List of cashtags referenced (without the leading $ sign)
*/
public List<Entity> extractCashtagsWithIndices(final String text) {
if (text == null || text.length() == 0) return Collections.emptyList();
// Performance optimization.
// If text doesn't contain $, text doesn't contain
// cashtag, so we can simply return an empty list.
if (text.indexOf('$') == -1) return Collections.emptyList();
final ArrayList<Entity> extracted = new ArrayList<Entity>();
final Matcher matcher = Regex.VALID_CASHTAG.matcher(text);
while (matcher.find()) {
extracted.add(new Entity(matcher, Entity.Type.CASHTAG, Regex.VALID_CASHTAG_GROUP_CASHTAG_FULL));
}
return extracted;
}
/**
* Extract URLs, @mentions, lists and #hashtag from a given text/tweet.
*
* @param text text of tweet
* @return list of extracted entities
*/
public List<Entity> extractEntitiesWithIndices(final String text) {
final ArrayList<Entity> entities = new ArrayList<Entity>();
entities.addAll(extractURLsWithIndices(text));
entities.addAll(extractHashtagsWithIndices(text, false));
entities.addAll(extractMentionsOrListsWithIndices(text));
entities.addAll(extractCashtagsWithIndices(text));
removeOverlappingEntities(entities);
return entities;
}
/**
* Extract #hashtag references from Tweet text.
*
* @param text of the tweet from which to extract hashtags
* @return List of hashtags referenced (without the leading # sign)
*/
public List<String> extractHashtags(final String text) {
return extractHashtags(text, true);
}
public List<String> extractHashtags(final String text, final boolean exclude_duplicate) {
if (text == null || text.length() == 0) return Collections.emptyList();
final ArrayList<String> extracted = new ArrayList<String>();
for (final Entity entity : extractHashtagsWithIndices(text)) {
if (!exclude_duplicate || !extracted.contains(entity.value)) {
extracted.add(entity.value);
}
}
return extracted;
}
/**
* Extract #hashtag references from Tweet text.
*
* @param text of the tweet from which to extract hashtags
* @return List of hashtags referenced (without the leading # sign)
*/
public List<Entity> extractHashtagsWithIndices(final String text) {
return extractHashtagsWithIndices(text, true);
}
/**
* Extract @username references from Tweet text. A mention is an occurance
* of @username anywhere in a Tweet.
*
* @param text of the tweet from which to extract usernames
* @return List of usernames referenced (without the leading @ sign)
*/
public Set<String> extractMentionedScreennames(final String text) {
return extractMentionedScreennames(text, true);
}
public Set<String> extractMentionedScreennames(final String text, final boolean exclude_duplicate) {
if (text == null || text.length() == 0) return Collections.emptySet();
final Set<String> extracted = new HashSet<String>();
for (final Entity entity : extractMentionedScreennamesWithIndices(text)) {
if (!exclude_duplicate || !extracted.contains(entity.value)) {
extracted.add(entity.value);
}
}
return extracted;
}
/**
* Extract @username references from Tweet text. A mention is an occurance
* of @username anywhere in a Tweet.
*
* @param text of the tweet from which to extract usernames
* @return List of usernames referenced (without the leading @ sign)
*/
public List<Entity> extractMentionedScreennamesWithIndices(final String text) {
final ArrayList<Entity> extracted = new ArrayList<Entity>();
for (final Entity entity : extractMentionsOrListsWithIndices(text)) {
if (entity.listSlug == null) {
extracted.add(entity);
}
}
return extracted;
}
public List<Entity> extractMentionsOrListsWithIndices(final String text) {
if (text == null || text.length() == 0) return Collections.emptyList();
// Performance optimization.
// If text doesn't contain @ at all, the text doesn't
// contain @mention. So we can simply return an empty list.
boolean found = false;
for (final char c : text.toCharArray()) {
if (c == '@' || c == FULLWIDTH_AT_SIGN) {
found = true;
break;
}
}
if (!found) return Collections.emptyList();
final ArrayList<Entity> extracted = new ArrayList<Entity>();
final Matcher matcher = Regex.VALID_MENTION_OR_LIST.matcher(text);
while (matcher.find()) {
final String after = text.substring(matcher.end());
if (!Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) {
extracted.add(new Entity(matcher, Entity.Type.MENTION, Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME));
} else {
extracted.add(new Entity(matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1, matcher
.end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), matcher
.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME), matcher
.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), Entity.Type.MENTION));
}
}
}
return extracted;
}
/**
* Extract a @username reference from the beginning of Tweet text. A reply
* is an occurance of @username at the beginning of a Tweet, preceded by 0
* or more spaces.
*
* @param text of the tweet from which to extract the replied to username
* @return username referenced, if any (without the leading @ sign). Returns
* null if this is not a reply.
*/
public String extractReplyScreenname(final String text) {
if (text == null) return null;
final Matcher matcher = Regex.VALID_REPLY.matcher(text);
if (matcher.find()) {
final String after = text.substring(matcher.end());
if (Regex.INVALID_MENTION_MATCH_END.matcher(after).find())
return null;
else
return matcher.group(Regex.VALID_REPLY_GROUP_USERNAME);
} else
return null;
}
/**
* Extract URL references from Tweet text.
*
* @param text of the tweet from which to extract URLs
* @return List of URLs referenced.
*/
public List<String> extractURLs(final String text) {
if (text == null || text.length() == 0) return Collections.emptyList();
final ArrayList<String> urls = new ArrayList<String>();
for (final Entity entity : extractURLsWithIndices(text)) {
urls.add(entity.value);
}
return urls;
}
/**
* Extract URL references from Tweet text.
*
* @param text of the tweet from which to extract URLs
* @return List of URLs referenced.
*/
public List<Entity> extractURLsWithIndices(final String text) {
if (text == null || text.length() == 0
|| (extractURLWithoutProtocol ? text.indexOf('.') : text.indexOf(':')) == -1) // Performance
// optimization.
// If text doesn't contain '.' or ':' at all, text doesn't contain
// URL,
// so we can simply return an empty list.
return Collections.emptyList();
final ArrayList<Entity> urls = new ArrayList<Entity>();
final Matcher matcher = Regex.VALID_URL.matcher(text);
while (matcher.find()) {
if (matcher.group(Regex.VALID_URL_GROUP_PROTOCOL) == null) {
// skip if protocol is not present and
// 'extractURLWithoutProtocol' is false
// or URL is preceded by invalid character.
if (!extractURLWithoutProtocol
|| Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN.matcher(
matcher.group(Regex.VALID_URL_GROUP_BEFORE)).matches()) {
continue;
}
}
String url = matcher.group(Regex.VALID_URL_GROUP_URL);
final int start = matcher.start(Regex.VALID_URL_GROUP_URL);
int end = matcher.end(Regex.VALID_URL_GROUP_URL);
final Matcher tco_matcher = Regex.VALID_TCO_URL.matcher(url);
if (tco_matcher.find()) {
// In the case of t.co URLs, don't allow additional path
// characters.
url = tco_matcher.group();
end = start + url.length();
}
urls.add(new Entity(start, end, url, Entity.Type.URL));
}
return urls;
}
public boolean isExtractURLWithoutProtocol() {
return extractURLWithoutProtocol;
}
/*
* Modify Unicode-based indices of the entities to UTF-16 based indices.
*
* In UTF-16 based indices, Unicode supplementary characters are counted as
* two characters.
*
* This method requires that the list of entities be in ascending order by
* start index.
*
* @param text original text
*
* @param entities entities with Unicode based indices
*/
public void modifyIndicesFromUnicodeToUTF16(final String text, final List<Entity> entities) {
final IndexConverter convert = new IndexConverter(text);
for (final Entity entity : entities) {
entity.start = convert.codePointsToCodeUnits(entity.start);
entity.end = convert.codePointsToCodeUnits(entity.end);
}
}
/*
* Modify UTF-16-based indices of the entities to Unicode-based indices.
*
* In Unicode-based indices, Unicode supplementary characters are counted as
* single characters.
*
* This method requires that the list of entities be in ascending order by
* start index.
*
* @param text original text
*
* @param entities entities with UTF-16 based indices
*/
public void modifyIndicesFromUTF16ToToUnicode(final String text, final List<Entity> entities) {
final IndexConverter convert = new IndexConverter(text);
for (final Entity entity : entities) {
entity.start = convert.codeUnitsToCodePoints(entity.start);
entity.end = convert.codeUnitsToCodePoints(entity.end);
}
}
public void setExtractURLWithoutProtocol(final boolean extractURLWithoutProtocol) {
this.extractURLWithoutProtocol = extractURLWithoutProtocol;
}
/**
* Extract #hashtag references from Tweet text.
*
* @param text of the tweet from which to extract hashtags
* @param checkUrlOverlap if true, check if extracted hashtags overlap URLs
* and remove overlapping ones
* @return List of hashtags referenced (without the leading # sign)
*/
private List<Entity> extractHashtagsWithIndices(final String text, final boolean checkUrlOverlap) {
if (text == null || text.length() == 0) return Collections.emptyList();
// Performance optimization.
// If text doesn't contain # at all, text doesn't contain
// hashtag, so we can simply return an empty list.
boolean found = false;
for (final char c : text.toCharArray()) {
if (c == '#' || c == FULLWIDTH_NUMBER_SIGN) {
found = true;
break;
}
}
if (!found) return Collections.emptyList();
final ArrayList<Entity> extracted = new ArrayList<Entity>();
final Matcher matcher = Regex.VALID_HASHTAG.matcher(text);
while (matcher.find()) {
final String after = text.substring(matcher.end());
if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
extracted.add(new Entity(matcher, Entity.Type.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG));
}
}
if (checkUrlOverlap) {
// extract URLs
final List<Entity> urls = extractURLsWithIndices(text);
if (!urls.isEmpty()) {
extracted.addAll(urls);
// remove overlap
removeOverlappingEntities(extracted);
// remove URL entities
final Iterator<Entity> it = extracted.iterator();
while (it.hasNext()) {
final Entity entity = it.next();
if (entity.getType() != Entity.Type.HASHTAG) {
it.remove();
}
}
}
}
return extracted;
}
private void removeOverlappingEntities(final List<Entity> entities) {
// sort by index
Collections.<Entity> sort(entities, new Comparator<Entity>() {
@Override
public int compare(final Entity e1, final Entity e2) {
return e1.start - e2.start;
}
});
// Remove overlapping entities.
// Two entities overlap only when one is URL and the other is
// hashtag/mention
// which is a part of the URL. When it happens, we choose URL over
// hashtag/mention
// by selecting the one with smaller start index.
if (!entities.isEmpty()) {
final Iterator<Entity> it = entities.iterator();
Entity prev = it.next();
while (it.hasNext()) {
final Entity cur = it.next();
if (prev.getEnd() > cur.getStart()) {
it.remove();
} else {
prev = cur;
}
}
}
}
public static class Entity {
protected int start;
protected int end;
protected final String value;
// listSlug is used to store the list portion of @mention/list.
protected final String listSlug;
protected final Type type;
protected String displayURL = null;
protected String expandedURL = null;
public Entity(final int start, final int end, final String value, final String listSlug, final Type type) {
this.start = start;
this.end = end;
this.value = value;
this.listSlug = listSlug;
this.type = type;
}
public Entity(final int start, final int end, final String value, final Type type) {
this(start, end, value, null, type);
}
public Entity(final Matcher matcher, final Type type, final int groupNumber) {
// Offset -1 on start index to include @, # symbols for mentions and
// hashtags
this(matcher, type, groupNumber, -1);
}
public Entity(final Matcher matcher, final Type type, final int groupNumber, final int startOffset) {
this(matcher.start(groupNumber) + startOffset, matcher.end(groupNumber), matcher.group(groupNumber), type);
}
@Override
public boolean equals(final Object obj) {
if (this == obj) return true;
if (!(obj instanceof Entity)) return false;
final Entity other = (Entity) obj;
if (type.equals(other.type) && start == other.start && end == other.end && value.equals(other.value))
return true;
else
return false;
}
public String getDisplayURL() {
return displayURL;
}
public Integer getEnd() {
return end;
}
public String getExpandedURL() {
return expandedURL;
}
public String getListSlug() {
return listSlug;
}
public Integer getStart() {
return start;
}
public Type getType() {
return type;
}
public String getValue() {
return value;
}
@Override
public int hashCode() {
return type.hashCode() + value.hashCode() + start + end;
}
public void setDisplayURL(final String displayURL) {
this.displayURL = displayURL;
}
public void setExpandedURL(final String expandedURL) {
this.expandedURL = expandedURL;
}
@Override
public String toString() {
return value + "(" + type + ") [" + start + "," + end + "]";
}
public enum Type {
URL, HASHTAG, MENTION, CASHTAG
}
}
/**
* An efficient converter of indices between code points and code units.
*/
private static final class IndexConverter {
protected final String text;
// Keep track of a single corresponding pair of code unit and code point
// offsets so that we can re-use counting work if the next requested
// entity is near the most recent entity.
protected int codePointIndex = 0;
protected int charIndex = 0;
IndexConverter(final String text) {
this.text = text;
}
/**
* @param codePointIndex Index into the string measured in code points.
* @return the code unit index that corresponds to the specified code
* point index.
*/
int codePointsToCodeUnits(final int codePointIndex) {
// Note that offsetByCodePoints accepts negative indices.
charIndex = text.offsetByCodePoints(charIndex, codePointIndex - this.codePointIndex);
this.codePointIndex = codePointIndex;
return charIndex;
}
/**
* @param charIndex Index into the string measured in code units.
* @return The code point index that corresponds to the specified
* character index.
*/
int codeUnitsToCodePoints(final int charIndex) {
if (charIndex < this.charIndex) {
codePointIndex -= text.codePointCount(charIndex, this.charIndex);
} else {
codePointIndex += text.codePointCount(this.charIndex, charIndex);
}
this.charIndex = charIndex;
// Make sure that charIndex never points to the second code unit of
// a
// surrogate pair.
if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
this.charIndex -= 1;
}
return codePointIndex;
}
}
}