support for sitemaps

This commit is contained in:
Martin Rotter 2023-10-16 15:05:42 +02:00
parent fe72cd1564
commit 8720fe663d
10 changed files with 86 additions and 49 deletions

View File

@ -10,7 +10,7 @@ RSS Guard
### [Discord server](https://discord.gg/7xbVMPPNqH) | [Downloads](https://github.com/martinrotter/rssguard/releases) | [Development builds](https://github.com/martinrotter/rssguard/releases/tag/devbuild) | [Documentation](https://rssguard.readthedocs.io)
RSS Guard is a simple RSS/ATOM feed reader for Windows, Linux, BSD, OS/2 or macOS which can work with RSS/ATOM/JSON feeds as well as many online feed services:
RSS Guard is a simple RSS/ATOM feed reader for Windows, Linux, BSD, OS/2 or macOS which can work with RSS/ATOM/JSON/Sitemap feeds as well as many online feed services:
* [Feedly](https://feedly.com)
* [Gmail](https://developers.google.com/gmail/api)
* Google Reader API ([Bazqux](https://bazqux.com), [FreshRSS](https://freshrss.org), [Inoreader](https://www.inoreader.com), [Miniflux](https://miniflux.app), [Reedah](http://reedah.com), [The Old Reader](https://theoldreader.com) and more)

View File

@ -1,6 +1,6 @@
Supported Feed Readers
======================
RSS Guard is multi-account application and supports many web-based feed readers via built-in plugins. One of the plugins, of course, provides the support for standard list of `RSS/ATOM/JSON` feeds with the set of features everyone would expect from classic feed reader.
RSS Guard is a multi-account application and supports many web-based feed readers via built-in plugins. One of the plugins, of course, provides support for a standard list of `RSS/ATOM/JSON/Sitemap` feeds with the set of features everyone would expect from a classic feed reader.
I organized the supported web-based feed readers into an elegant table:

View File

@ -54,8 +54,6 @@ StandardFeedDetails::StandardFeedDetails(QWidget* parent) : QWidget(parent) {
QVariant::fromValue(int(StandardFeed::Type::Json)));
m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::Sitemap),
QVariant::fromValue(int(StandardFeed::Type::Sitemap)));
m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::SitemapIndex),
QVariant::fromValue(int(StandardFeed::Type::SitemapIndex)));
// Load available encodings.
const QList<QByteArray> encodings = QTextCodec::availableCodecs();

View File

@ -180,7 +180,7 @@ QString AtomParser::xmlMessageUrl(const QDomElement& msg_element) const {
QList<Enclosure> AtomParser::xmlMessageEnclosures(const QDomElement& msg_element) const {
QList<Enclosure> enclosures;
QDomNodeList elem_links = msg_element.toElement().elementsByTagNameNS(m_atomNamespace, QSL("link"));
QDomNodeList elem_links = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("link"));
for (int i = 0; i < elem_links.size(); i++) {
QDomElement link = elem_links.at(i).toElement();

View File

@ -9,8 +9,8 @@
#include "definitions/definitions.h"
#include "exceptions/applicationexception.h"
#include "exceptions/feedrecognizedbutfailedexception.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QDomDocument>
#include <QTextCodec>
@ -68,7 +68,11 @@ QPair<StandardFeed*, QList<IconLocation>> SitemapParser::guessFeed(const QByteAr
QDomElement root_element = xml_document.documentElement();
if (root_element.tagName() != QSL("urlset") && root_element.tagName() != QSL("sitemapindex")) {
if (root_element.tagName() == QSL("sitemapindex")) {
throw FeedRecognizedButFailedException(QObject::tr("sitemap indices are not supported"));
}
if (root_element.tagName() != QSL("urlset")) {
throw ApplicationException(QObject::tr("not a Sitemap"));
}
@ -76,17 +80,8 @@ QPair<StandardFeed*, QList<IconLocation>> SitemapParser::guessFeed(const QByteAr
QList<IconLocation> icon_possible_locations;
feed->setEncoding(xml_schema_encoding);
if (root_element.tagName() == QSL("urlset")) {
// Sitemap.
feed->setType(StandardFeed::Type::Sitemap);
feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::Sitemap));
}
else {
// Sitemap index.
feed->setType(StandardFeed::Type::SitemapIndex);
feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::SitemapIndex));
}
feed->setType(StandardFeed::Type::Sitemap);
feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::Sitemap));
return {feed, icon_possible_locations};
}
@ -108,45 +103,76 @@ QString SitemapParser::sitemapVideoNamespace() const {
}
/// Returns every `url` element in the sitemap namespace that is found in the
/// parsed document held by `m_xml`.
QDomNodeList SitemapParser::xmlMessageElements() {
  return m_xml.elementsByTagNameNS(sitemapNamespace(), QSL("url"));
}
// TODO: implement
/// Returns the title of a single sitemap entry.
///
/// The text of the first `title` element in the sitemap-news namespace is
/// preferred; when that text is empty, the first `title` element in the
/// sitemap-video namespace is used instead. The result may be empty when
/// neither element is present.
QString SitemapParser::xmlMessageTitle(const QDomElement& msg_element) const {
  QString str_title = msg_element.elementsByTagNameNS(sitemapNewsNamespace(), QSL("title")).at(0).toElement().text();

  if (str_title.isEmpty()) {
    // Fall back to the video extension's title.
    str_title = msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("title")).at(0).toElement().text();
  }

  return str_title;
}
/// Returns the text of the first `loc` element (sitemap namespace) found
/// below `msg_element`, or an empty string when there is none.
QString SitemapParser::xmlMessageUrl(const QDomElement& msg_element) const {
  return msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("loc")).at(0).toElement().text();
}
/// Returns the description of a single sitemap entry, taken from the first
/// `description` element in the sitemap-video namespace.
///
/// NOTE(review): xmlRawChild() is defined outside this view; presumably it
/// serialises the content of the given element - confirm.
QString SitemapParser::xmlMessageDescription(const QDomElement& msg_element) const {
  return xmlRawChild(msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("description")).at(0).toElement());
}

/// No author is extracted from sitemap entries; always returns an empty string.
QString SitemapParser::xmlMessageAuthor(const QDomElement& msg_element) const {
  return {};
}
/// Returns the creation date of a single sitemap entry.
///
/// The text of the first `lastmod` element (sitemap namespace) is used; when
/// it is empty, the first `publication_date` element of the sitemap-news
/// namespace is used instead. The chosen text is handed to
/// TextFactory::parseDateTime().
QDateTime SitemapParser::xmlMessageDateCreated(const QDomElement& msg_element) const {
  QString str_date = msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("lastmod")).at(0).toElement().text();

  if (str_date.isEmpty()) {
    // Fall back to the news extension's publication date.
    str_date =
      msg_element.elementsByTagNameNS(sitemapNewsNamespace(), QSL("publication_date")).at(0).toElement().text();
  }

  return TextFactory::parseDateTime(str_date);
}
/// Returns the identifier of a single sitemap entry, which is the same value
/// that xmlMessageUrl() yields for it.
QString SitemapParser::xmlMessageId(const QDomElement& msg_element) const {
  return xmlMessageUrl(msg_element);
}
/// Collects the enclosures of a single sitemap entry.
///
/// Two sitemap extensions are inspected:
///  - sitemap-image: each `image` element contributes the text of its first
///    `loc` child,
///  - sitemap-video: each `video` element contributes the text of its first
///    `player_loc` child or, when that is empty, of its first `content_loc`
///    child.
/// Elements that yield an empty location are ignored.
QList<Enclosure> SitemapParser::xmlMessageEnclosures(const QDomElement& msg_element) const {
  QList<Enclosure> enclosures;

  // sitemap-image
  QDomNodeList elem_links = msg_element.elementsByTagNameNS(sitemapImageNamespace(), QSL("image"));

  for (int i = 0; i < elem_links.size(); i++) {
    QDomElement link = elem_links.at(i).toElement();
    QString loc = link.elementsByTagNameNS(sitemapImageNamespace(), QSL("loc")).at(0).toElement().text();

    if (!loc.isEmpty()) {
      // NOTE: The MIME is made up.
      enclosures.append(Enclosure(loc, QSL("image/png")));
    }
  }

  // sitemap-video
  elem_links = msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("video"));

  for (int i = 0; i < elem_links.size(); i++) {
    QDomElement link = elem_links.at(i).toElement();
    QString loc = link.elementsByTagNameNS(sitemapVideoNamespace(), QSL("player_loc")).at(0).toElement().text();

    if (loc.isEmpty()) {
      // No player URL, try the raw media URL instead.
      loc = link.elementsByTagNameNS(sitemapVideoNamespace(), QSL("content_loc")).at(0).toElement().text();
    }

    if (!loc.isEmpty()) {
      // NOTE: The MIME is made up.
      enclosures.append(Enclosure(loc, QSL("video/mpeg")));
    }
  }

  return enclosures;
}

/// No categories are extracted from sitemap entries; always returns an empty list.
QList<MessageCategory> SitemapParser::xmlMessageCategories(const QDomElement& msg_element) const {
  return {};
}

/// No raw contents are extracted from sitemap entries; always returns an empty string.
QString SitemapParser::xmlMessageRawContents(const QDomElement& msg_element) const {
  return {};
}
bool SitemapParser::isGzip(const QByteArray& content) {

View File

@ -5,6 +5,8 @@
#include "services/standard/parsers/feedparser.h"
#include "services/standard/standardfeed.h"
class SitemapParser : public FeedParser {
public:
explicit SitemapParser(const QString& data);
@ -20,12 +22,9 @@ class SitemapParser : public FeedParser {
virtual QString xmlMessageTitle(const QDomElement& msg_element) const;
virtual QString xmlMessageUrl(const QDomElement& msg_element) const;
virtual QString xmlMessageDescription(const QDomElement& msg_element) const;
virtual QString xmlMessageAuthor(const QDomElement& msg_element) const;
virtual QDateTime xmlMessageDateCreated(const QDomElement& msg_element) const;
virtual QString xmlMessageId(const QDomElement& msg_element) const;
virtual QList<Enclosure> xmlMessageEnclosures(const QDomElement& msg_element) const;
virtual QList<MessageCategory> xmlMessageCategories(const QDomElement& msg_element) const;
virtual QString xmlMessageRawContents(const QDomElement& msg_element) const;
private:
QString sitemapNamespace() const;

View File

@ -159,9 +159,6 @@ QString StandardFeed::typeToString(StandardFeed::Type type) {
case Type::Sitemap:
return QSL("Sitemap");
case Type::SitemapIndex:
return QSL("Sitemap Index");
case Type::Rss2X:
default:
return QSL("RSS 2.0/2.0.1");

View File

@ -36,8 +36,7 @@ class StandardFeed : public Feed {
Rdf = 2, // Sometimes denoted as RSS 1.0.
Atom10 = 3,
Json = 4,
SitemapIndex = 5,
Sitemap = 6
Sitemap = 5
};
explicit StandardFeed(RootItem* parent_item = nullptr);

View File

@ -161,10 +161,6 @@ bool FeedsImportExportModel::exportToOMPL20(QByteArray& result, bool export_icon
outline_feed.setAttribute(QSL("version"), QSL("Sitemap"));
break;
case StandardFeed::Type::SitemapIndex:
outline_feed.setAttribute(QSL("version"), QSL("SitemapIndex"));
break;
default:
break;
}

View File

@ -22,11 +22,16 @@
#include "services/standard/parsers/jsonparser.h"
#include "services/standard/parsers/rdfparser.h"
#include "services/standard/parsers/rssparser.h"
#include "services/standard/parsers/sitemapparser.h"
#include "services/standard/standardcategory.h"
#include "services/standard/standardfeed.h"
#include "services/standard/standardfeedsimportexportmodel.h"
#include "services/standard/standardserviceentrypoint.h"
#if defined(ENABLE_COMPRESSED_SITEMAP)
#include "3rd-party/qcompressor/qcompressor.h"
#endif
#include <QAction>
#include <QClipboard>
#include <QSqlTableModel>
@ -180,6 +185,20 @@ QList<Message> StandardServiceRoot::obtainNewMessages(Feed* feed,
throw FeedFetchException(Feed::Status::NetworkError, NetworkFactory::networkErrorText(network_result));
}
// Sitemap parser supports gzip-encoded data too.
if (SitemapParser::isGzip(feed_contents)) {
#if defined(ENABLE_COMPRESSED_SITEMAP)
qWarningNN << LOGSEC_CORE << "Decompressing gzipped feed data.";
QByteArray uncompressed_feed_contents;
QCompressor::gzipDecompress(feed_contents, uncompressed_feed_contents);
feed_contents = uncompressed_feed_contents;
#else
qWarningNN << LOGSEC_CORE << "This feed is gzipped.";
#endif
}
// Encode downloaded data for further parsing.
QTextCodec* codec = QTextCodec::codecForName(f->encoding().toLocal8Bit());
@ -243,6 +262,9 @@ QList<Message> StandardServiceRoot::obtainNewMessages(Feed* feed,
messages = JsonParser(formatted_feed_contents).messages();
break;
case StandardFeed::Type::Sitemap:
messages = SitemapParser(formatted_feed_contents).messages();
default:
break;
}