Fixes #3444 - Strip off HTML from podcast descriptions

2025-02-01 19:26:46 +01:00 · 2019-10-24 23:20:31 -04:00 · 2019-10-24 23:20:31 -04:00 · ca83c59537
commit ca83c59537
parent 437f3f29c0
4 changed files with 49 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,8 @@ bin/
 gen/
 target/
 build/
+**/*.project
+**/*.classpath

 # Local configuration file (sdk path, etc)
 local.properties
--- a/app/src/main/java/de/danoeh/antennapod/activity/OnlineFeedViewActivity.java
+++ b/app/src/main/java/de/danoeh/antennapod/activity/OnlineFeedViewActivity.java
@@ -367,20 +367,14 @@ public class OnlineFeedViewActivity extends AppCompatActivity {
     * This method is executed on a background thread
     */
    private void beforeShowFeedInformation(Feed feed) {
-        final HtmlToPlainText formatter = new HtmlToPlainText();
-        if(Feed.TYPE_ATOM1.equals(feed.getType()) && feed.getDescription() != null) {
-            // remove HTML tags from descriptions
-            Log.d(TAG, "Removing HTML from feed description");
-            Document feedDescription = Jsoup.parse(feed.getDescription());
-            feed.setDescription(StringUtils.trim(formatter.getPlainText(feedDescription)));
-        }
+        Log.d(TAG, "Removing HTML from feed description");
+        // getPlainText() returns "" for a null/empty description and leaves tag-free text as is.
+        feed.setDescription(HtmlToPlainText.getPlainText(feed.getDescription()));
+
        Log.d(TAG, "Removing HTML from shownotes");
        if (feed.getItems() != null) {
            for (FeedItem item : feed.getItems()) {
-                if (item.getDescription() != null) {
-                    Document itemDescription = Jsoup.parse(item.getDescription());
-                    item.setDescription(StringUtils.trim(formatter.getPlainText(itemDescription)));
-                }
+                item.setDescription(HtmlToPlainText.getPlainText(item.getDescription()));
            }
        }
    }
--- a/app/src/main/java/de/danoeh/antennapod/fragment/FeedInfoFragment.java
+++ b/app/src/main/java/de/danoeh/antennapod/fragment/FeedInfoFragment.java
@@ -167,16 +167,8 @@ public class FeedInfoFragment extends Fragment {

        txtvTitle.setText(feed.getTitle());

-        String description = feed.getDescription();
-        if(description != null) {
-            if(Feed.TYPE_ATOM1.equals(feed.getType())) {
-                HtmlToPlainText formatter = new HtmlToPlainText();
-                Document feedDescription = Jsoup.parse(feed.getDescription());
-                description = StringUtils.trim(formatter.getPlainText(feedDescription));
-            }
-        } else {
-            description = "";
-        }
+        String description = HtmlToPlainText.getPlainText(feed.getDescription());
+
        txtvDescription.setText(description);

        if (!TextUtils.isEmpty(feed.getAuthor())) {
--- a/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java
+++ b/core/src/main/java/de/danoeh/antennapod/core/util/syndication/HtmlToPlainText.java
@@ -1,12 +1,19 @@
 package de.danoeh.antennapod.core.util.syndication;

+import android.text.TextUtils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
 import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;

+import java.util.regex.Pattern;
+
 /**
 * This class is based on <code>HtmlToPlainText</code> from jsoup's examples package.
 *
@@ -26,6 +33,39 @@ import org.jsoup.select.NodeVisitor;
 */
 public class HtmlToPlainText {

+    /**
+     * Converts text that may contain HTML markup into plain text.
+     * Null or empty input yields an empty string, and input without any HTML tag is
+     * returned unchanged; otherwise the text is parsed with Jsoup, formatted and trimmed.
+     *
+     * @param str text that may or may not contain HTML
+     * @return plain-text rendering of {@code str}
+     */
+    public static String getPlainText(String str) {
+        if (TextUtils.isEmpty(str)) {
+            return "";
+        }
+        if (!isHtml(str)) {
+            return str;
+        }
+        HtmlToPlainText htmlFormatter = new HtmlToPlainText();
+        Document parsedHtml = Jsoup.parse(str);
+        return StringUtils.trim(htmlFormatter.getPlainText(parsedHtml));
+    }
+
+    /** Matches anything shaped like an HTML tag; quoted attribute values may contain '>'. */
+    private static final Pattern HTML_TAG_PATTERN = Pattern.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>");
+
+    /**
+     * Checks whether the given text contains at least one HTML tag.
+     *
+     * @param str text to test for HTML content; must not be {@code null}
+     * @return {@code true} if a tag-like sequence is found, {@code false} if none is found
+     */
+    private static boolean isHtml(String str) {
+        return HTML_TAG_PATTERN.matcher(str).find();
+    }
+
    /**
     * Format an Element to plain-text
     * @param element the root element to format