Fixes #3444 - Strip off HTML from podcast descriptions

This commit is contained in:
fossterer 2019-10-24 23:20:31 -04:00
parent 437f3f29c0
commit ca83c59537
4 changed files with 49 additions and 21 deletions

2
.gitignore vendored
View File

@ -13,6 +13,8 @@ bin/
gen/
target/
build/
**/*.project
**/*.classpath
# Local configuration file (sdk path, etc)
local.properties

View File

@ -367,20 +367,14 @@ public class OnlineFeedViewActivity extends AppCompatActivity {
* This method is executed on a background thread
*/
private void beforeShowFeedInformation(Feed feed) {
final HtmlToPlainText formatter = new HtmlToPlainText();
if(Feed.TYPE_ATOM1.equals(feed.getType()) && feed.getDescription() != null) {
// remove HTML tags from descriptions
Log.d(TAG, "Removing HTML from feed description");
Document feedDescription = Jsoup.parse(feed.getDescription());
feed.setDescription(StringUtils.trim(formatter.getPlainText(feedDescription)));
}
Log.d(TAG, "Removing HTML from feed description");
feed.setDescription(HtmlToPlainText.getPlainText(feed.getDescription()));
Log.d(TAG, "Removing HTML from shownotes");
if (feed.getItems() != null) {
for (FeedItem item : feed.getItems()) {
if (item.getDescription() != null) {
Document itemDescription = Jsoup.parse(item.getDescription());
item.setDescription(StringUtils.trim(formatter.getPlainText(itemDescription)));
}
item.setDescription(HtmlToPlainText.getPlainText(item.getDescription()));
}
}
}

View File

@ -167,16 +167,8 @@ public class FeedInfoFragment extends Fragment {
txtvTitle.setText(feed.getTitle());
String description = feed.getDescription();
if(description != null) {
if(Feed.TYPE_ATOM1.equals(feed.getType())) {
HtmlToPlainText formatter = new HtmlToPlainText();
Document feedDescription = Jsoup.parse(feed.getDescription());
description = StringUtils.trim(formatter.getPlainText(feedDescription));
}
} else {
description = "";
}
String description = HtmlToPlainText.getPlainText(feed.getDescription());
txtvDescription.setText(description);
if (!TextUtils.isEmpty(feed.getAuthor())) {

View File

@ -1,12 +1,19 @@
package de.danoeh.antennapod.core.util.syndication;
import android.text.TextUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import java.util.regex.Pattern;
/**
* This class is based on <code>HtmlToPlainText</code> from jsoup's examples package.
*
@ -26,6 +33,39 @@ import org.jsoup.select.NodeVisitor;
*/
public class HtmlToPlainText {
/**
 * Strips HTML markup from the given text, if it contains any.
 * <p>
 * Bullet points are replaced with *; colors, bold and similar styling are ignored.
 * A null or empty input is turned into an empty string, and text in which no HTML tag
 * is detected is handed back exactly as it was received.
 *
 * @param str text that may or may not contain HTML markup
 * @return human readable text with minimal HTML formatting
 */
public static String getPlainText(String str) {
    // Nothing to convert: normalize null/empty to the empty string.
    if (TextUtils.isEmpty(str)) {
        return "";
    }
    // Text without any tag is passed through untouched (it is not trimmed).
    if (!isHtml(str)) {
        return str;
    }
    HtmlToPlainText converter = new HtmlToPlainText();
    Document parsedHtml = Jsoup.parse(str);
    String plainText = converter.getPlainText(parsedHtml);
    return StringUtils.trim(plainText);
}
/**
 * Matches a single HTML tag: an opening angle bracket, any mix of quoted strings and
 * non-quote characters, and a closing angle bracket. Quoted attribute values may
 * themselves contain a closing angle bracket. Compiled once because the pattern never changes.
 */
private static final Pattern HTML_TAG_PATTERN =
        Pattern.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>");

/**
 * Determines whether the given text contains at least one HTML tag.
 *
 * @param str text to be tested for the presence of HTML content; {@code null} is treated
 *            as containing no HTML
 * @return {@code true} if the text contains any HTML tag, {@code false} if no HTML tag is found
 */
private static boolean isHtml(String str) {
    // Guard against null so the matcher is never handed a null input.
    if (str == null) {
        return false;
    }
    return HTML_TAG_PATTERN.matcher(str).find();
}
/**
* Format an Element to plain-text
* @param element the root element to format