From 8720fe663d20f59935156e0bb41da65d80136467 Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Mon, 16 Oct 2023 15:05:42 +0200 Subject: [PATCH] support for sitemaps --- README.md | 2 +- docs/source/supported-readers.md | 2 +- .../standard/gui/standardfeeddetails.cpp | 2 - .../services/standard/parsers/atomparser.cpp | 2 +- .../standard/parsers/sitemapparser.cpp | 90 ++++++++++++------- .../services/standard/parsers/sitemapparser.h | 5 +- .../services/standard/standardfeed.cpp | 3 - .../services/standard/standardfeed.h | 3 +- .../standardfeedsimportexportmodel.cpp | 4 - .../services/standard/standardserviceroot.cpp | 22 +++++ 10 files changed, 86 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 6f2439af5..aa15e5033 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ RSS Guard ### [Discord server](https://discord.gg/7xbVMPPNqH) | [Downloads](https://github.com/martinrotter/rssguard/releases) | [Development builds](https://github.com/martinrotter/rssguard/releases/tag/devbuild) | [Documentation](https://rssguard.readthedocs.io) -RSS Guard is a simple RSS/ATOM feed reader for Windows, Linux, BSD, OS/2 or macOS which can work with RSS/ATOM/JSON feeds as well as many online feed services: +RSS Guard is a simple RSS/ATOM feed reader for Windows, Linux, BSD, OS/2 or macOS which can work with RSS/ATOM/JSON/Sitemap feeds as well as many online feed services: * [Feedly](https://feedly.com) * [Gmail](https://developers.google.com/gmail/api) * Google Reader API ([Bazqux](https://bazqux.com), [FreshRSS](https://freshrss.org), [Inoreader](https://www.inoreader.com), [Miniflux](https://miniflux.app), [Reedah](http://reedah.com), [The Old Reader](https://theoldreader.com) and more) diff --git a/docs/source/supported-readers.md b/docs/source/supported-readers.md index f9c567284..f29f49cbb 100644 --- a/docs/source/supported-readers.md +++ b/docs/source/supported-readers.md @@ -1,6 +1,6 @@ Supported Feed Readers ====================== -RSS Guard is 
multi-account application and supports many web-based feed readers via built-in plugins. One of the plugins, of course, provides the support for standard list of `RSS/ATOM/JSON` feeds with the set of features everyone would expect from classic feed reader. +RSS Guard is multi-account application and supports many web-based feed readers via built-in plugins. One of the plugins, of course, provides the support for standard list of `RSS/ATOM/JSON/Sitemap` feeds with the set of features everyone would expect from classic feed reader. I organized the supported web-based feed readers into an elegant table: diff --git a/src/librssguard/services/standard/gui/standardfeeddetails.cpp b/src/librssguard/services/standard/gui/standardfeeddetails.cpp index 5224bd79c..5979555d6 100644 --- a/src/librssguard/services/standard/gui/standardfeeddetails.cpp +++ b/src/librssguard/services/standard/gui/standardfeeddetails.cpp @@ -54,8 +54,6 @@ StandardFeedDetails::StandardFeedDetails(QWidget* parent) : QWidget(parent) { QVariant::fromValue(int(StandardFeed::Type::Json))); m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::Sitemap), QVariant::fromValue(int(StandardFeed::Type::Sitemap))); - m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::SitemapIndex), - QVariant::fromValue(int(StandardFeed::Type::SitemapIndex))); // Load available encodings. 
const QList encodings = QTextCodec::availableCodecs(); diff --git a/src/librssguard/services/standard/parsers/atomparser.cpp b/src/librssguard/services/standard/parsers/atomparser.cpp index e849c5f7d..0ad57360a 100644 --- a/src/librssguard/services/standard/parsers/atomparser.cpp +++ b/src/librssguard/services/standard/parsers/atomparser.cpp @@ -180,7 +180,7 @@ QString AtomParser::xmlMessageUrl(const QDomElement& msg_element) const { QList AtomParser::xmlMessageEnclosures(const QDomElement& msg_element) const { QList enclosures; - QDomNodeList elem_links = msg_element.toElement().elementsByTagNameNS(m_atomNamespace, QSL("link")); + QDomNodeList elem_links = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("link")); for (int i = 0; i < elem_links.size(); i++) { QDomElement link = elem_links.at(i).toElement(); diff --git a/src/librssguard/services/standard/parsers/sitemapparser.cpp b/src/librssguard/services/standard/parsers/sitemapparser.cpp index 52c5bc101..b11fc5a9a 100644 --- a/src/librssguard/services/standard/parsers/sitemapparser.cpp +++ b/src/librssguard/services/standard/parsers/sitemapparser.cpp @@ -9,8 +9,8 @@ #include "definitions/definitions.h" #include "exceptions/applicationexception.h" #include "exceptions/feedrecognizedbutfailedexception.h" +#include "miscellaneous/textfactory.h" #include "services/standard/definitions.h" -#include "services/standard/standardfeed.h" #include #include @@ -68,7 +68,11 @@ QPair> SitemapParser::guessFeed(const QByteAr QDomElement root_element = xml_document.documentElement(); - if (root_element.tagName() != QSL("urlset") && root_element.tagName() != QSL("sitemapindex")) { + if (root_element.tagName() == QSL("sitemapindex")) { + throw FeedRecognizedButFailedException(QObject::tr("sitemap indices are not supported")); + } + + if (root_element.tagName() != QSL("urlset")) { throw ApplicationException(QObject::tr("not a Sitemap")); } @@ -76,17 +80,8 @@ QPair> SitemapParser::guessFeed(const QByteAr QList 
icon_possible_locations; feed->setEncoding(xml_schema_encoding); - - if (root_element.tagName() == QSL("urlset")) { - // Sitemap. - feed->setType(StandardFeed::Type::Sitemap); - feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::Sitemap)); - } - else { - // Sitemap index. - feed->setType(StandardFeed::Type::SitemapIndex); - feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::SitemapIndex)); - } + feed->setType(StandardFeed::Type::Sitemap); + feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::Sitemap)); return {feed, icon_possible_locations}; } @@ -108,45 +103,76 @@ QString SitemapParser::sitemapVideoNamespace() const { } QDomNodeList SitemapParser::xmlMessageElements() { - return {}; + return m_xml.elementsByTagNameNS(sitemapNamespace(), QSL("url")); } -// TODO: implement - QString SitemapParser::xmlMessageTitle(const QDomElement& msg_element) const { - return {}; + QString str_title = msg_element.elementsByTagNameNS(sitemapNewsNamespace(), QSL("title")).at(0).toElement().text(); + + if (str_title.isEmpty()) { + str_title = msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("title")).at(0).toElement().text(); + } + + return str_title; } QString SitemapParser::xmlMessageUrl(const QDomElement& msg_element) const { - return {}; + return msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("loc")).at(0).toElement().text(); } QString SitemapParser::xmlMessageDescription(const QDomElement& msg_element) const { - return {}; -} - -QString SitemapParser::xmlMessageAuthor(const QDomElement& msg_element) const { - return {}; + return xmlRawChild(msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("description")).at(0).toElement()); } QDateTime SitemapParser::xmlMessageDateCreated(const QDomElement& msg_element) const { - return {}; + QString str_date = msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("lastmod")).at(0).toElement().text(); + + if (str_date.isEmpty()) { + str_date = + 
msg_element.elementsByTagNameNS(sitemapNewsNamespace(), QSL("publication_date")).at(0).toElement().text(); + } + + return TextFactory::parseDateTime(str_date); } QString SitemapParser::xmlMessageId(const QDomElement& msg_element) const { - return {}; + return xmlMessageUrl(msg_element); } QList SitemapParser::xmlMessageEnclosures(const QDomElement& msg_element) const { - return {}; -} + QList enclosures; -QList SitemapParser::xmlMessageCategories(const QDomElement& msg_element) const { - return {}; -} + // sitemap-image + QDomNodeList elem_links = msg_element.elementsByTagNameNS(sitemapImageNamespace(), QSL("image")); -QString SitemapParser::xmlMessageRawContents(const QDomElement& msg_element) const { - return {}; + for (int i = 0; i < elem_links.size(); i++) { + QDomElement link = elem_links.at(i).toElement(); + QString loc = link.elementsByTagNameNS(sitemapImageNamespace(), QSL("loc")).at(0).toElement().text(); + + if (!loc.isEmpty()) { + // NOTE: The MIME is made up. + enclosures.append(Enclosure(loc, QSL("image/png"))); + } + } + + // sitemap-video + elem_links = msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("video")); + + for (int i = 0; i < elem_links.size(); i++) { + QDomElement link = elem_links.at(i).toElement(); + QString loc = link.elementsByTagNameNS(sitemapVideoNamespace(), QSL("player_loc")).at(0).toElement().text(); + + if (loc.isEmpty()) { + loc = link.elementsByTagNameNS(sitemapVideoNamespace(), QSL("content_loc")).at(0).toElement().text(); + } + + if (!loc.isEmpty()) { + // NOTE: The MIME is made up. 
+ enclosures.append(Enclosure(loc, QSL("video/mpeg"))); + } + } + + return enclosures; } bool SitemapParser::isGzip(const QByteArray& content) { diff --git a/src/librssguard/services/standard/parsers/sitemapparser.h b/src/librssguard/services/standard/parsers/sitemapparser.h index 7279ab951..c0df070a4 100644 --- a/src/librssguard/services/standard/parsers/sitemapparser.h +++ b/src/librssguard/services/standard/parsers/sitemapparser.h @@ -5,6 +5,8 @@ #include "services/standard/parsers/feedparser.h" +#include "services/standard/standardfeed.h" + class SitemapParser : public FeedParser { public: explicit SitemapParser(const QString& data); @@ -20,12 +22,9 @@ class SitemapParser : public FeedParser { virtual QString xmlMessageTitle(const QDomElement& msg_element) const; virtual QString xmlMessageUrl(const QDomElement& msg_element) const; virtual QString xmlMessageDescription(const QDomElement& msg_element) const; - virtual QString xmlMessageAuthor(const QDomElement& msg_element) const; virtual QDateTime xmlMessageDateCreated(const QDomElement& msg_element) const; virtual QString xmlMessageId(const QDomElement& msg_element) const; virtual QList xmlMessageEnclosures(const QDomElement& msg_element) const; - virtual QList xmlMessageCategories(const QDomElement& msg_element) const; - virtual QString xmlMessageRawContents(const QDomElement& msg_element) const; private: QString sitemapNamespace() const; diff --git a/src/librssguard/services/standard/standardfeed.cpp b/src/librssguard/services/standard/standardfeed.cpp index 894edda22..24035a1ad 100644 --- a/src/librssguard/services/standard/standardfeed.cpp +++ b/src/librssguard/services/standard/standardfeed.cpp @@ -159,9 +159,6 @@ QString StandardFeed::typeToString(StandardFeed::Type type) { case Type::Sitemap: return QSL("Sitemap"); - case Type::SitemapIndex: - return QSL("Sitemap Index"); - case Type::Rss2X: default: return QSL("RSS 2.0/2.0.1"); diff --git a/src/librssguard/services/standard/standardfeed.h 
b/src/librssguard/services/standard/standardfeed.h index e56cd6477..de51ceed2 100644 --- a/src/librssguard/services/standard/standardfeed.h +++ b/src/librssguard/services/standard/standardfeed.h @@ -36,8 +36,7 @@ class StandardFeed : public Feed { Rdf = 2, // Sometimes denoted as RSS 1.0. Atom10 = 3, Json = 4, - SitemapIndex = 5, - Sitemap = 6 + Sitemap = 5 }; explicit StandardFeed(RootItem* parent_item = nullptr); diff --git a/src/librssguard/services/standard/standardfeedsimportexportmodel.cpp b/src/librssguard/services/standard/standardfeedsimportexportmodel.cpp index 1e07970e3..62f52c96b 100644 --- a/src/librssguard/services/standard/standardfeedsimportexportmodel.cpp +++ b/src/librssguard/services/standard/standardfeedsimportexportmodel.cpp @@ -161,10 +161,6 @@ bool FeedsImportExportModel::exportToOMPL20(QByteArray& result, bool export_icon outline_feed.setAttribute(QSL("version"), QSL("Sitemap")); break; - case StandardFeed::Type::SitemapIndex: - outline_feed.setAttribute(QSL("version"), QSL("SitemapIndex")); - break; - default: break; } diff --git a/src/librssguard/services/standard/standardserviceroot.cpp b/src/librssguard/services/standard/standardserviceroot.cpp index 79bdb387c..aa714a4ba 100644 --- a/src/librssguard/services/standard/standardserviceroot.cpp +++ b/src/librssguard/services/standard/standardserviceroot.cpp @@ -22,11 +22,16 @@ #include "services/standard/parsers/jsonparser.h" #include "services/standard/parsers/rdfparser.h" #include "services/standard/parsers/rssparser.h" +#include "services/standard/parsers/sitemapparser.h" #include "services/standard/standardcategory.h" #include "services/standard/standardfeed.h" #include "services/standard/standardfeedsimportexportmodel.h" #include "services/standard/standardserviceentrypoint.h" +#if defined(ENABLE_COMPRESSED_SITEMAP) +#include "3rd-party/qcompressor/qcompressor.h" +#endif + #include #include #include @@ -180,6 +185,20 @@ QList StandardServiceRoot::obtainNewMessages(Feed* feed, throw 
FeedFetchException(Feed::Status::NetworkError, NetworkFactory::networkErrorText(network_result)); } + // Sitemap parser supports gzip-encoded data too. + if (SitemapParser::isGzip(feed_contents)) { +#if defined(ENABLE_COMPRESSED_SITEMAP) + qWarningNN << LOGSEC_CORE << "Decompressing gzipped feed data."; + + QByteArray uncompressed_feed_contents; + QCompressor::gzipDecompress(feed_contents, uncompressed_feed_contents); + + feed_contents = uncompressed_feed_contents; +#else + qWarningNN << LOGSEC_CORE << "This feed is gzipped."; +#endif + } + // Encode downloaded data for further parsing. QTextCodec* codec = QTextCodec::codecForName(f->encoding().toLocal8Bit()); @@ -243,6 +262,10 @@ QList StandardServiceRoot::obtainNewMessages(Feed* feed, messages = JsonParser(formatted_feed_contents).messages(); break; + case StandardFeed::Type::Sitemap: + messages = SitemapParser(formatted_feed_contents).messages(); + break; + default: break; }