Refactoring of parsers, initial work on the sitemap parser

This commit is contained in:
Martin Rotter 2023-10-13 13:57:23 +02:00
parent 7eac82b841
commit efea17f3aa
30 changed files with 849 additions and 187 deletions

View File

@ -117,6 +117,7 @@ option(REVISION_FROM_GIT "Get revision using `git rev-parse`" ON)
option(NO_UPDATE_CHECK "Disable automatic checking for new application updates" OFF)
option(IS_FLATPAK_BUILD "Set to 'ON' when building RSS Guard with Flatpak." OFF)
option(FORCE_BUNDLE_ICONS "Forcibly bundle icon themes into RSS Guard." OFF)
option(ENABLE_COMPRESSED_SITEMAP "Enable support for gzip-compressed sitemap feeds. Requires zlib." OFF)
# Import Qt libraries.
set(QT6_MIN_VERSION 6.3.0)

View File

@ -0,0 +1,196 @@
#include "qcompressor.h"
/**
* @brief Compresses the given buffer using the standard GZIP algorithm
* @param input The buffer to be compressed
* @param output The result of the compression; it is cleared first, so an empty input yields an empty output
* @param level The compression level to be used (@c 0 = no compression, @c 9 = max, @c -1 = default);
*              values outside of the range [-1, 9] are clamped into it
* @return @c true if the compression was successful (or the input was empty), @c false otherwise
*/
bool QCompressor::gzipCompress(QByteArray input, QByteArray &output, int level)
{
    // Prepare output
    output.clear();

    // Nothing to do for an empty buffer
    if (input.isEmpty())
        return true;

    // Prepare deflater status
    z_stream strm;
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;

    // Initialize deflater
    int ret = deflateInit2(&strm, qMax(-1, qMin(9, level)), Z_DEFLATED, GZIP_WINDOWS_BIT, 8, Z_DEFAULT_STRATEGY);
    if (ret != Z_OK)
        return false;

    // Read-only access to the input. constData() never detaches the implicitly shared
    // buffer, whereas the non-const data() would deep-copy the whole input first.
    const char *input_data = input.constData();
    int input_data_left = input.length();
    int flush = Z_NO_FLUSH;

    // Feed the input to the deflater chunk by chunk
    do {
        // Determine current chunk size
        const int chunk_size = qMin(GZIP_CHUNK_SIZE, input_data_left);

        // Set deflater references; the input buffer is only read from here,
        // so dropping constness for the zlib interface is safe.
        strm.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input_data));
        strm.avail_in = chunk_size;

        // Update interval variables
        input_data += chunk_size;
        input_data_left -= chunk_size;

        // The last chunk must be deflated with Z_FINISH so that the gzip trailer is written
        flush = (input_data_left <= 0 ? Z_FINISH : Z_NO_FLUSH);

        // Deflate chunk and cumulate output; repeat while the output buffer was filled completely
        do {
            char out[GZIP_CHUNK_SIZE];

            // Set deflater references
            strm.next_out = reinterpret_cast<Bytef*>(out);
            strm.avail_out = GZIP_CHUNK_SIZE;

            // Try to deflate chunk
            ret = deflate(&strm, flush);

            // Check errors
            if (ret == Z_STREAM_ERROR)
            {
                deflateEnd(&strm);
                return false;
            }

            // Determine compressed size
            const int have = (GZIP_CHUNK_SIZE - strm.avail_out);

            // Cumulate result
            if (have > 0)
                output.append(out, have);
        } while (strm.avail_out == 0);
    } while (flush != Z_FINISH);

    // Clean-up
    (void)deflateEnd(&strm);

    // The stream is complete only if the final deflate() call reported the end of the stream
    return (ret == Z_STREAM_END);
}
/**
* @brief Decompresses the given buffer using the standard GZIP algorithm
* @param input The buffer to be decompressed
* @param output The result of the decompression; it is cleared first and may hold
*               partially decompressed data when the function fails
* @return @c true if the decompression was successful (or the input was empty), @c false otherwise
*/
bool QCompressor::gzipDecompress(QByteArray input, QByteArray &output)
{
    // Prepare output
    output.clear();

    // Nothing to do for an empty buffer
    if (input.length() <= 0)
        return true;

    // Prepare inflater status
    z_stream strm;
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;

    // Initialize inflater
    int ret = inflateInit2(&strm, GZIP_WINDOWS_BIT);
    if (ret != Z_OK)
        return false;

    // Read-only access to the input. constData() never detaches the implicitly shared
    // buffer, whereas the non-const data() would deep-copy the whole input first.
    const char *input_data = input.constData();
    int input_data_left = input.length();

    // Feed the input to the inflater chunk by chunk
    do {
        // Determine current chunk size
        const int chunk_size = qMin(GZIP_CHUNK_SIZE, input_data_left);

        // Input exhausted before the end of the stream was reached (truncated data)
        if (chunk_size <= 0)
            break;

        // Set inflater references; the input buffer is only read from here,
        // so dropping constness for the zlib interface is safe.
        strm.next_in = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(input_data));
        strm.avail_in = chunk_size;

        // Update interval variables
        input_data += chunk_size;
        input_data_left -= chunk_size;

        // Inflate chunk and cumulate output; repeat while the output buffer was filled completely
        do {
            char out[GZIP_CHUNK_SIZE];

            // Set inflater references
            strm.next_out = reinterpret_cast<Bytef*>(out);
            strm.avail_out = GZIP_CHUNK_SIZE;

            // Try to inflate chunk
            ret = inflate(&strm, Z_NO_FLUSH);

            switch (ret) {
            case Z_NEED_DICT:
                // No preset dictionary is ever supplied here, so handle it like corrupt data
                ret = Z_DATA_ERROR;
                // fall through
            case Z_DATA_ERROR:
            case Z_MEM_ERROR:
            case Z_STREAM_ERROR:
                inflateEnd(&strm);
                return false;
            default:
                break;
            }

            // Determine decompressed size
            const int have = (GZIP_CHUNK_SIZE - strm.avail_out);

            // Cumulate result
            if (have > 0)
                output.append(out, have);
        } while (strm.avail_out == 0);
    } while (ret != Z_STREAM_END);

    // Clean-up
    inflateEnd(&strm);

    // Success only if the inflater saw the regular end of the gzip stream
    return (ret == Z_STREAM_END);
}

View File

@ -0,0 +1,17 @@
#ifndef QCOMPRESSOR_H
#define QCOMPRESSOR_H
#include <zlib.h>
#include <QByteArray>
// "windowBits" value handed to deflateInit2()/inflateInit2(): 15 is the window size and
// adding 16 selects the gzip wrapper. Parenthesized so that the macro expands
// safely inside larger expressions.
#define GZIP_WINDOWS_BIT (15 + 16)
// Size (in bytes) of the chunks in which data are passed to and read from zlib.
// Parenthesized so that the macro expands safely inside larger expressions.
#define GZIP_CHUNK_SIZE (32 * 1024)
// Static helpers for GZIP compression and decompression of in-memory buffers.
class QCompressor
{
public:
    // Compresses "input" into "output"; "level" is the compression level (-1 = default).
    // Returns true on success.
    static bool gzipCompress(QByteArray input, QByteArray &output, int level = -1);
    // Decompresses gzip-encoded "input" into "output". Returns true on success.
    static bool gzipDecompress(QByteArray input, QByteArray &output);
};
#endif // QCOMPRESSOR_H

View File

@ -49,6 +49,8 @@ set(SOURCES
dynamic-shortcuts/shortcutcatcher.h
exceptions/applicationexception.cpp
exceptions/applicationexception.h
exceptions/feedrecognizedbutfailedexception.cpp
exceptions/feedrecognizedbutfailedexception.h
exceptions/feedfetchexception.cpp
exceptions/feedfetchexception.h
exceptions/filteringexception.cpp
@ -383,6 +385,8 @@ set(SOURCES
services/standard/parsers/rdfparser.h
services/standard/parsers/rssparser.cpp
services/standard/parsers/rssparser.h
services/standard/parsers/sitemapparser.cpp
services/standard/parsers/sitemapparser.h
services/standard/standardcategory.cpp
services/standard/standardcategory.h
services/standard/standardfeed.cpp
@ -536,9 +540,26 @@ else()
3rd-party/sqlite/sqlite3.h
)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_COLUMN_METADATA=1")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_COLUMN_METADATA=1")
endif()
# Add ZLIB.
#
# zlib is only needed for gzip-compressed sitemaps. No machine-specific search path is
# hard-coded here; if zlib lives in a non-standard location, pass it at configure time,
# e.g. "-DZLIB_ROOT=<path-to-zlib>".
if(ENABLE_COMPRESSED_SITEMAP)
  find_package(ZLIB REQUIRED)

  # Add qcompressor.
  list(APPEND SOURCES
    3rd-party/qcompressor/qcompressor.cpp
    3rd-party/qcompressor/qcompressor.h
  )

  if(ZLIB_FOUND)
    message(STATUS "Using system zlib ${ZLIB_VERSION_STRING}.")
  endif(ZLIB_FOUND)
endif(ENABLE_COMPRESSED_SITEMAP)
# Add SimpleCrypt.
list(APPEND SOURCES
3rd-party/sc/simplecrypt.cpp
@ -676,6 +697,23 @@ if(SQLite3_FOUND)
)
endif()
# Wire zlib into the target only when compressed sitemaps were requested. ZLIB_FOUND alone
# is not sufficient: it may presumably also be set by a zlib lookup triggered elsewhere
# (verify), and then ENABLE_COMPRESSED_SITEMAP would be defined while qcompressor.cpp
# is not part of SOURCES.
if(ENABLE_COMPRESSED_SITEMAP AND ZLIB_FOUND)
  target_include_directories(rssguard AFTER
    PRIVATE
    ${ZLIB_INCLUDE_DIRS}
  )
  target_compile_definitions(rssguard
    PRIVATE
    ENABLE_COMPRESSED_SITEMAP
  )
  target_link_libraries(rssguard PRIVATE
    ${ZLIB_LIBRARIES}
  )
endif()
# Qt.
target_link_libraries(rssguard PUBLIC
Qt${QT_VERSION_MAJOR}::Core

View File

@ -24,4 +24,12 @@ struct UpdatedArticles {
QList<Message> m_all;
};
// Describes one candidate location from which an icon can be obtained.
struct IconLocation {
// URL of the candidate location.
QString m_url;
// If true, the URL is direct and the icon is downloaded straight from it. If false,
// only the domain of the URL is used and the icon is downloaded via 3rd-party service.
bool m_isDirect;
};
#endif // TYPEDEFS_H

View File

@ -2,7 +2,7 @@
#include "exceptions/feedfetchexception.h"
FeedFetchException::FeedFetchException(Feed::Status feed_status, QString message)
FeedFetchException::FeedFetchException(Feed::Status feed_status, const QString& message)
: ApplicationException(message), m_feedStatus(feed_status) {}
Feed::Status FeedFetchException::feedStatus() const {

View File

@ -8,7 +8,7 @@
class FeedFetchException : public ApplicationException {
public:
explicit FeedFetchException(Feed::Status feed_status, QString message = {});
explicit FeedFetchException(Feed::Status feed_status, const QString& message = {});
Feed::Status feedStatus() const;

View File

@ -0,0 +1,5 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#include "exceptions/feedrecognizedbutfailedexception.h"
// Creates the exception; the message is stored by the ApplicationException base class.
FeedRecognizedButFailedException::FeedRecognizedButFailedException(const QString& message)
  : ApplicationException(message) {
}

View File

@ -0,0 +1,13 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#ifndef FEEDRECOGNIZEDBUTFAILEDEXCEPTION_H
#define FEEDRECOGNIZEDBUTFAILEDEXCEPTION_H
#include "exceptions/applicationexception.h"
// Exception carrying a message for the case when input was recognized as a particular
// feed format, but its processing failed nonetheless.
class FeedRecognizedButFailedException : public ApplicationException {
public:
  explicit FeedRecognizedButFailedException(const QString& message = {});
};
#endif // FEEDRECOGNIZEDBUTFAILEDEXCEPTION_H

View File

@ -4,12 +4,12 @@
#include "definitions/definitions.h"
ScriptException::ScriptException(Reason reason, QString message) : ApplicationException(message), m_reason(reason) {
ScriptException::ScriptException(Reason reason, const QString& message)
: ApplicationException(message), m_reason(reason) {
if (message.isEmpty()) {
setMessage(messageForReason(reason));
}
else if (reason == ScriptException::Reason::InterpreterError ||
reason == ScriptException::Reason::OtherError) {
else if (reason == ScriptException::Reason::InterpreterError || reason == ScriptException::Reason::OtherError) {
setMessage(messageForReason(reason) + QSL(": '%1'").arg(message));
}
}

View File

@ -8,7 +8,7 @@
#include <QCoreApplication>
class ScriptException : public ApplicationException {
Q_DECLARE_TR_FUNCTIONS(ScriptException)
Q_DECLARE_TR_FUNCTIONS(ScriptException)
public:
enum class Reason {
@ -19,7 +19,7 @@ class ScriptException : public ApplicationException {
OtherError
};
explicit ScriptException(Reason reason = Reason::OtherError, QString message = QString());
explicit ScriptException(Reason reason = Reason::OtherError, const QString& message = QString());
Reason reason() const;

View File

@ -29,9 +29,9 @@ class ArticleListNotification : public BaseToastNotification {
private slots:
void openArticleInArticleList();
void openArticleInWebBrowser();
void onMessageSelected(const QModelIndex& current, const QModelIndex& previous);
void showFeed(int index);
void openArticleInWebBrowser();
void markAllRead();
private:

View File

@ -160,7 +160,7 @@ QString NetworkFactory::sanitizeUrl(const QString& url) {
return QString(url).replace(QRegularExpression(QSL("[^\\w\\-.~:\\/?#\\[\\]@!$&'()*+,;=% \\|]")), {});
}
QNetworkReply::NetworkError NetworkFactory::downloadIcon(const QList<QPair<QString, bool>>& urls,
QNetworkReply::NetworkError NetworkFactory::downloadIcon(const QList<IconLocation>& urls,
int timeout,
QPixmap& output,
const QList<QPair<QByteArray, QByteArray>>& additional_headers,
@ -168,15 +168,15 @@ QNetworkReply::NetworkError NetworkFactory::downloadIcon(const QList<QPair<QStri
QNetworkReply::NetworkError network_result = QNetworkReply::NetworkError::UnknownNetworkError;
for (const auto& url : urls) {
if (url.first.isEmpty()) {
if (url.m_url.isEmpty()) {
continue;
}
QByteArray icon_data;
if (url.second) {
if (url.m_isDirect) {
// Download directly.
network_result = performNetworkOperation(url.first,
network_result = performNetworkOperation(url.m_url,
timeout,
{},
icon_data,
@ -206,7 +206,7 @@ QNetworkReply::NetworkError NetworkFactory::downloadIcon(const QList<QPair<QStri
}
else {
// Duck Duck Go.
QUrl url_full = QUrl(url.first);
QUrl url_full = QUrl(url.m_url);
QString host = url_full.host();
if (host.startsWith(QSL("www."))) {

View File

@ -5,6 +5,7 @@
#include "network-web/httpresponse.h"
#include "definitions/typedefs.h"
#include "services/abstract/feed.h"
#include <QCoreApplication>
@ -38,7 +39,11 @@ class NetworkFactory {
explicit NetworkFactory() = default;
public:
enum class NetworkAuthentication { NoAuthentication = 0, Basic = 1, Token = 2 };
enum class NetworkAuthentication {
NoAuthentication = 0,
Basic = 1,
Token = 2
};
static QStringList extractFeedLinksFromHtmlPage(const QUrl& url, const QString& html);
static QPair<QByteArray, QByteArray> generateBasicAuthHeader(NetworkAuthentication protection,
@ -51,7 +56,7 @@ class NetworkFactory {
// Performs SYNCHRONOUS favicon download for the site,
// given URL belongs to.
static QNetworkReply::NetworkError downloadIcon(const QList<QPair<QString, bool>>& urls,
static QNetworkReply::NetworkError downloadIcon(const QList<IconLocation>& urls,
int timeout,
QPixmap& output,
const QList<QPair<QByteArray, QByteArray>>& additional_headers,

View File

@ -703,7 +703,7 @@ RootItem* GreaderNetwork::decodeTagsSubscriptions(const QString& categories,
if (obtain_icons) {
QString icon_url = subscription[QSL("iconUrl")].toString();
QList<QPair<QString, bool>> icon_urls;
QList<IconLocation> icon_urls;
if (!icon_url.isEmpty()) {
if (icon_url.startsWith(QSL("//"))) {

View File

@ -52,6 +52,10 @@ StandardFeedDetails::StandardFeedDetails(QWidget* parent) : QWidget(parent) {
QVariant::fromValue(int(StandardFeed::Type::Rss2X)));
m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::Json),
QVariant::fromValue(int(StandardFeed::Type::Json)));
m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::Sitemap),
QVariant::fromValue(int(StandardFeed::Type::Sitemap)));
m_ui.m_cmbType->addItem(StandardFeed::typeToString(StandardFeed::Type::SitemapIndex),
QVariant::fromValue(int(StandardFeed::Type::SitemapIndex)));
// Load available encodings.
const QList<QByteArray> encodings = QTextCodec::availableCodecs();

View File

@ -2,7 +2,13 @@
#include "services/standard/parsers/atomparser.h"
#include "definitions/definitions.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QTextCodec>
AtomParser::AtomParser(const QString& data) : FeedParser(data) {
QString version = m_xml.documentElement().attribute(QSL("version"));
@ -15,6 +21,70 @@ AtomParser::AtomParser(const QString& data) : FeedParser(data) {
}
}
// Nothing to release explicitly.
AtomParser::~AtomParser() = default;
// Tries to interpret raw downloaded data as an ATOM feed.
//
// Returns a newly allocated feed (encoding, type, title and description are filled in)
// together with the candidate icon locations, homepage link first. Throws
// ApplicationException if the data are not well-formed XML or the root element
// does not belong to the ATOM namespace. The "content_type" parameter is not used.
QPair<StandardFeed*, QList<IconLocation>> AtomParser::guessFeed(const QByteArray& content,
                                                                const QString& content_type) const {
  // Use the "encoding" attribute if the data declare one, default encoding otherwise.
  const QRegularExpression encoding_rx(QSL("encoding=\"([A-Z0-9\\-]+)\""),
                                       QRegularExpression::PatternOption::CaseInsensitiveOption);
  const QString declared_encoding = encoding_rx.match(content).captured(1);
  const QString encoding_name = declared_encoding.isEmpty() ? QSL(DEFAULT_FEED_ENCODING) : declared_encoding;

  // Decode the raw bytes, falling back to UTF-8 when no codec of that name exists.
  QTextCodec* codec = QTextCodec::codecForName(encoding_name.toLocal8Bit());
  const QString decoded_xml = (codec != nullptr) ? codec->toUnicode(content) : QString::fromUtf8(content);

  QDomDocument document;
  QString parse_error;
  int parse_error_line, parse_error_column;

  if (!document.setContent(decoded_xml, true, &parse_error, &parse_error_line, &parse_error_column)) {
    throw ApplicationException(QObject::tr("XML is not well-formed, %1").arg(parse_error));
  }

  const QDomElement root = document.documentElement();

  if (root.namespaceURI() != atomNamespace()) {
    throw ApplicationException(QObject::tr("not an ATOM feed"));
  }

  auto* feed = new StandardFeed();

  feed->setEncoding(encoding_name);
  feed->setType(StandardFeed::Type::Atom10);
  feed->setTitle(root.namedItem(QSL("title")).toElement().text());
  feed->setDescription(root.namedItem(QSL("subtitle")).toElement().text());

  QList<IconLocation> icon_locations;

  // Homepage link goes first and is not downloaded directly.
  const QString home_page = root.namedItem(QSL("link")).toElement().attribute(QSL("href"));

  if (!home_page.isEmpty()) {
    icon_locations.append({home_page, false});
  }

  // Explicit icon link follows and is downloaded directly.
  const QString icon_link = root.namedItem(QSL("icon")).toElement().text();

  if (!icon_link.isEmpty()) {
    icon_locations.append({icon_link, true});
  }

  return {feed, icon_locations};
}
QString AtomParser::feedAuthor() const {
auto authors = m_xml.documentElement().elementsByTagNameNS(m_atomNamespace, QSL("author"));

View File

@ -13,8 +13,10 @@
class AtomParser : public FeedParser {
public:
explicit AtomParser(const QString& data);
virtual ~AtomParser();
QString atomNamespace() const;
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const;
protected:
virtual QString xmlMessageTitle(const QDomElement& msg_element) const;
@ -29,6 +31,8 @@ class AtomParser : public FeedParser {
virtual QString feedAuthor() const;
private:
QString atomNamespace() const;
QString m_atomNamespace;
};

View File

@ -15,6 +15,9 @@
FeedParser::FeedParser(QString data, bool is_xml)
: m_isXml(is_xml), m_data(std::move(data)), m_mrssNamespace(QSL("http://search.yahoo.com/mrss/")) {
if (m_data.isEmpty()) {
return;
}
if (m_isXml) {
// XML.
@ -36,6 +39,8 @@ FeedParser::FeedParser(QString data, bool is_xml)
}
}
// Nothing to release explicitly.
FeedParser::~FeedParser() = default;
QString FeedParser::xmlMessageRawContents(const QDomElement& msg_element) const {
QString raw_contents;
QTextStream str(&raw_contents);

View File

@ -10,12 +10,18 @@
#include <QString>
#include "core/message.h"
#include "definitions/typedefs.h"
class StandardFeed;
// Base class for all XML-based feed parsers.
class FeedParser {
public:
explicit FeedParser(QString data, bool is_xml = true);
virtual ~FeedParser();
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const = 0;
virtual QList<Message> messages();
protected:

View File

@ -2,7 +2,13 @@
#include "services/standard/parsers/jsonparser.h"
#include "definitions/definitions.h"
#include "definitions/typedefs.h"
#include "exceptions/applicationexception.h"
#include "exceptions/feedrecognizedbutfailedexception.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QJsonArray>
#include <QJsonDocument>
@ -10,6 +16,51 @@
// Passes the data to the base parser with its "is_xml" flag set to false.
JsonParser::JsonParser(const QString& data)
  : FeedParser(data, false) {
}

// Nothing to release explicitly.
JsonParser::~JsonParser() = default;
// Tries to interpret raw downloaded data as a JSON feed.
//
// Data are considered JSON if the content type mentions "json" or if the data
// (with surrounding whitespace removed) start with '{'. Returns a newly allocated feed
// plus the candidate icon locations, homepage link first. Throws ApplicationException
// if the data do not look like JSON and FeedRecognizedButFailedException if they do,
// but cannot be parsed.
QPair<StandardFeed*, QList<IconLocation>> JsonParser::guessFeed(const QByteArray& content,
                                                                const QString& content_type) const {
  const bool looks_like_json = content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive) ||
                               content.simplified().startsWith('{');

  if (!looks_like_json) {
    throw ApplicationException(QObject::tr("not a JSON feed"));
  }

  QJsonParseError json_err;
  const QJsonDocument json = QJsonDocument::fromJson(content, &json_err);

  if (json.isNull() && !json_err.errorString().isEmpty()) {
    throw FeedRecognizedButFailedException(QObject::tr("JSON error '%1'").arg(json_err.errorString()));
  }

  const auto root = json.object();
  auto* feed = new StandardFeed();

  feed->setEncoding(QSL(DEFAULT_FEED_ENCODING));
  feed->setType(StandardFeed::Type::Json);
  feed->setTitle(root[QSL("title")].toString());
  feed->setDescription(root[QSL("description")].toString());

  QList<IconLocation> icon_locations;

  // Homepage goes first and is not downloaded directly.
  const QString home_page = root[QSL("home_page_url")].toString();

  if (!home_page.isEmpty()) {
    icon_locations.append({home_page, false});
  }

  // "favicon" is preferred, "icon" is the fallback.
  QString icon = root[QSL("favicon")].toString();

  if (icon.isEmpty()) {
    icon = root[QSL("icon")].toString();
  }

  if (!icon.isEmpty()) {
    // Low priority, download directly.
    icon_locations.append({icon, true});
  }

  return {feed, icon_locations};
}
QString JsonParser::feedAuthor() const {
QString global_author = m_json.object()[QSL("author")].toObject()[QSL("name")].toString();

View File

@ -10,6 +10,10 @@
class JsonParser : public FeedParser {
public:
explicit JsonParser(const QString& data);
virtual ~JsonParser();
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const;
protected:
virtual QString feedAuthor() const;

View File

@ -2,15 +2,81 @@
#include "services/standard/parsers/rdfparser.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QDomDocument>
#include <QTextCodec>
// Passes the data to the base parser and stores the XML namespaces used by the RDF parser.
RdfParser::RdfParser(const QString& data)
  : FeedParser(data),
    m_rdfNamespace(QSL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")),
    m_rssNamespace(QSL("http://purl.org/rss/1.0/")),
    m_rssCoNamespace(QSL("http://purl.org/rss/1.0/modules/content/")),
    m_dcElNamespace(QSL("http://purl.org/dc/elements/1.1/")) {
}

// Nothing to release explicitly.
RdfParser::~RdfParser() = default;
// Tries to interpret raw downloaded data as an RDF feed.
//
// Returns a newly allocated feed (encoding, type, title and description are filled in)
// together with the candidate icon locations. Throws ApplicationException if the data
// are not well-formed XML or the root element does not belong to the RDF namespace.
// The "content_type" parameter is not used.
QPair<StandardFeed*, QList<IconLocation>> RdfParser::guessFeed(const QByteArray& content,
                                                               const QString& content_type) const {
  // Use the "encoding" attribute if the data declare one, default encoding otherwise.
  const QRegularExpression encoding_rx(QSL("encoding=\"([A-Z0-9\\-]+)\""),
                                       QRegularExpression::PatternOption::CaseInsensitiveOption);
  const QString declared_encoding = encoding_rx.match(content).captured(1);
  const QString encoding_name = declared_encoding.isEmpty() ? QSL(DEFAULT_FEED_ENCODING) : declared_encoding;

  // Decode the raw bytes, falling back to UTF-8 when no codec of that name exists.
  QTextCodec* codec = QTextCodec::codecForName(encoding_name.toLocal8Bit());
  const QString decoded_xml = (codec != nullptr) ? codec->toUnicode(content) : QString::fromUtf8(content);

  QDomDocument document;
  QString parse_error;
  int parse_error_line, parse_error_column;

  if (!document.setContent(decoded_xml, true, &parse_error, &parse_error_line, &parse_error_column)) {
    throw ApplicationException(QObject::tr("XML is not well-formed, %1").arg(parse_error));
  }

  const QDomElement root = document.documentElement();

  if (root.namespaceURI() != rdfNamespace()) {
    throw ApplicationException(QObject::tr("not an RDF feed"));
  }

  // Feed metadata are read from the first "channel" element of the RSS namespace.
  const QString rss_ns = rssNamespace();
  const QDomElement channel = root.elementsByTagNameNS(rss_ns, QSL("channel")).at(0).toElement();
  const QString title = channel.elementsByTagNameNS(rss_ns, QSL("title")).at(0).toElement().text();
  const QString description = channel.elementsByTagNameNS(rss_ns, QSL("description")).at(0).toElement().text();
  const QString home_page = channel.elementsByTagNameNS(rss_ns, QSL("link")).at(0).toElement().text();

  auto* feed = new StandardFeed();

  feed->setEncoding(encoding_name);
  feed->setType(StandardFeed::Type::Rdf);
  feed->setTitle(title);
  feed->setDescription(description);

  QList<IconLocation> icon_locations;

  // Homepage link is not downloaded directly.
  if (!home_page.isEmpty()) {
    icon_locations.append({home_page, false});
  }

  return {feed, icon_locations};
}
QDomNodeList RdfParser::xmlMessageElements() {
return m_xml.elementsByTagNameNS(m_rssNamespace, QSL("item"));
}

View File

@ -12,9 +12,10 @@
class RdfParser : public FeedParser {
public:
explicit RdfParser(const QString& data);
virtual ~RdfParser();
QString rdfNamespace() const;
QString rssNamespace() const;
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const;
protected:
virtual QString xmlMessageTitle(const QDomElement& msg_element) const;
@ -27,6 +28,9 @@ class RdfParser : public FeedParser {
virtual QDomNodeList xmlMessageElements();
private:
QString rdfNamespace() const;
QString rssNamespace() const;
QString m_rdfNamespace;
QString m_rssNamespace;
QString m_rssCoNamespace;

View File

@ -2,13 +2,97 @@
#include "services/standard/parsers/rssparser.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QDomDocument>
#include <QTextCodec>
#include <QTextStream>
// Passes the data to the base parser.
RssParser::RssParser(const QString& data)
  : FeedParser(data) {
}

// Nothing to release explicitly.
RssParser::~RssParser() = default;
// Tries to interpret raw downloaded data as an RSS feed.
//
// Returns a newly allocated feed (encoding, type, title and description are filled in)
// together with the candidate icon locations, homepage link first. Throws
// ApplicationException if the data are not well-formed XML or the root element
// is not "rss". The "content_type" parameter is not used.
QPair<StandardFeed*, QList<IconLocation>> RssParser::guessFeed(const QByteArray& content,
                                                               const QString& content_type) const {
  // Use the "encoding" attribute if the data declare one, default encoding otherwise.
  const QRegularExpression encoding_rx(QSL("encoding=\"([A-Z0-9\\-]+)\""),
                                       QRegularExpression::PatternOption::CaseInsensitiveOption);
  const QString declared_encoding = encoding_rx.match(content).captured(1);
  const QString encoding_name = declared_encoding.isEmpty() ? QSL(DEFAULT_FEED_ENCODING) : declared_encoding;

  // Decode the raw bytes, falling back to UTF-8 when no codec of that name exists.
  QTextCodec* codec = QTextCodec::codecForName(encoding_name.toLocal8Bit());
  const QString decoded_xml = (codec != nullptr) ? codec->toUnicode(content) : QString::fromUtf8(content);

  QDomDocument document;
  QString parse_error;
  int parse_error_line, parse_error_column;

  if (!document.setContent(decoded_xml, true, &parse_error, &parse_error_line, &parse_error_column)) {
    throw ApplicationException(QObject::tr("XML is not well-formed, %1").arg(parse_error));
  }

  const QDomElement root = document.documentElement();

  if (root.tagName() != QL1S("rss")) {
    throw ApplicationException(QObject::tr("not a RSS feed"));
  }

  // Versions 0.91 - 0.93 are RSS 0.X, everything else (incl. missing version) is RSS 2.X.
  const QString rss_version = root.attribute(QSL("version"), QSL("2.0"));
  const bool is_rss_0x =
    rss_version == QL1S("0.91") || rss_version == QL1S("0.92") || rss_version == QL1S("0.93");
  const QDomElement channel = root.namedItem(QSL("channel")).toElement();

  auto* feed = new StandardFeed();

  feed->setEncoding(encoding_name);
  feed->setType(is_rss_0x ? StandardFeed::Type::Rss0X : StandardFeed::Type::Rss2X);
  feed->setTitle(channel.namedItem(QSL("title")).toElement().text());
  feed->setDescription(channel.namedItem(QSL("description")).toElement().text());

  QList<IconLocation> icon_locations;

  // First non-empty "link" below the channel is the homepage; it goes first
  // and is not downloaded directly.
  const QDomNodeList links = channel.elementsByTagName(QSL("link"));

  for (int idx = 0; idx < links.size(); idx++) {
    const QString home_page = links.at(idx).toElement().text();

    if (home_page.isEmpty()) {
      continue;
    }

    icon_locations.append({home_page, false});
    break;
  }

  // URL of the channel image follows and is downloaded directly.
  const QString image_url = channel.namedItem(QSL("image")).namedItem(QSL("url")).toElement().text();

  if (!image_url.isEmpty()) {
    icon_locations.append({image_url, true});
  }

  return {feed, icon_locations};
}
QDomNodeList RssParser::xmlMessageElements() {
QDomNode channel_elem = m_xml.namedItem(QSL("rss")).namedItem(QSL("channel"));

View File

@ -12,6 +12,10 @@
class RssParser : public FeedParser {
public:
explicit RssParser(const QString& data);
virtual ~RssParser();
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const;
protected:
virtual QDomNodeList xmlMessageElements();

View File

@ -0,0 +1,154 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#include "services/standard/parsers/sitemapparser.h"
#if defined(ENABLE_COMPRESSED_SITEMAP)
#include "3rd-party/qcompressor/qcompressor.h"
#endif
#include "definitions/definitions.h"
#include "exceptions/applicationexception.h"
#include "exceptions/feedrecognizedbutfailedexception.h"
#include "services/standard/definitions.h"
#include "services/standard/standardfeed.h"
#include <QDomDocument>
#include <QTextCodec>
#include <QTextStream>
// Passes the data to the base parser.
SitemapParser::SitemapParser(const QString& data)
  : FeedParser(data) {
}

// Nothing to release explicitly.
SitemapParser::~SitemapParser() = default;
// Tries to interpret raw downloaded data as a sitemap or a sitemap index.
//
// Gzip-compressed data are decompressed first when the application is built with
// ENABLE_COMPRESSED_SITEMAP. Returns a newly allocated feed (encoding, type and title are
// filled in) and an empty list of icon locations.
//
// Throws FeedRecognizedButFailedException if the data are gzipped and either gzip support
// is not compiled in or the data cannot be decompressed. Throws ApplicationException if
// the data are not well-formed XML or the root element is neither "urlset"
// nor "sitemapindex". The "content_type" parameter is not used.
QPair<StandardFeed*, QList<IconLocation>> SitemapParser::guessFeed(const QByteArray& content,
                                                                   const QString& content_type) const {
  QByteArray uncompressed_content;

  if (isGzip(content)) {
#if defined(ENABLE_COMPRESSED_SITEMAP)
    // Do not continue with empty/partial output of a failed decompression, it would only
    // be reported later as a misleading "XML is not well-formed" error.
    if (!QCompressor::gzipDecompress(content, uncompressed_content)) {
      throw FeedRecognizedButFailedException(QObject::tr("gzipped sitemap could not be decompressed"));
    }
#else
    throw FeedRecognizedButFailedException(QObject::tr("support for gzipped sitemaps is not enabled"));
#endif
  }
  else {
    uncompressed_content = content;
  }

  QString xml_schema_encoding = QSL(DEFAULT_FEED_ENCODING);
  QString xml_contents_encoded;
  QString enc =
    QRegularExpression(QSL("encoding=\"([A-Z0-9\\-]+)\""), QRegularExpression::PatternOption::CaseInsensitiveOption)
      .match(uncompressed_content)
      .captured(1);

  if (!enc.isEmpty()) {
    // Some "encoding" attribute was found, get the encoding
    // out of it.
    xml_schema_encoding = enc;
  }

  QTextCodec* custom_codec = QTextCodec::codecForName(xml_schema_encoding.toLocal8Bit());

  if (custom_codec != nullptr) {
    xml_contents_encoded = custom_codec->toUnicode(uncompressed_content);
  }
  else {
    // No codec of that name is available, fall back to UTF-8.
    xml_contents_encoded = QString::fromUtf8(uncompressed_content);
  }

  // Feed XML was obtained, guess it now.
  QDomDocument xml_document;
  QString error_msg;
  int error_line = 0;
  int error_column = 0;

  if (!xml_document.setContent(xml_contents_encoded, true, &error_msg, &error_line, &error_column)) {
    throw ApplicationException(QObject::tr("XML is not well-formed, %1").arg(error_msg));
  }

  const QDomElement root_element = xml_document.documentElement();
  const QString root_tag = root_element.tagName();
  const bool is_sitemap = root_tag == QSL("urlset");

  if (!is_sitemap && root_tag != QSL("sitemapindex")) {
    throw ApplicationException(QObject::tr("not a Sitemap"));
  }

  auto* feed = new StandardFeed();
  QList<IconLocation> icon_possible_locations;

  feed->setEncoding(xml_schema_encoding);

  // "urlset" root means plain sitemap, "sitemapindex" root means sitemap index.
  // The title is simply the textual name of the detected type.
  const StandardFeed::Type feed_type = is_sitemap ? StandardFeed::Type::Sitemap : StandardFeed::Type::SitemapIndex;

  feed->setType(feed_type);
  feed->setTitle(StandardFeed::typeToString(feed_type));

  return {feed, icon_possible_locations};
}
// Returns the namespace URI of the sitemaps.org sitemap schema, version 0.9.
QString SitemapParser::sitemapNamespace() const {
return QSL("http://www.sitemaps.org/schemas/sitemap/0.9");
}
// Returns the namespace URI of the Google "sitemap-news" schema, version 0.9.
QString SitemapParser::sitemapNewsNamespace() const {
return QSL("http://www.google.com/schemas/sitemap-news/0.9");
}
// Returns the namespace URI of the Google "sitemap-image" schema, version 1.1.
QString SitemapParser::sitemapImageNamespace() const {
return QSL("http://www.google.com/schemas/sitemap-image/1.1");
}
// Returns the namespace URI of the Google "sitemap-video" schema, version 1.1.
QString SitemapParser::sitemapVideoNamespace() const {
return QSL("http://www.google.com/schemas/sitemap-video/1.1");
}
// TODO: implement
//
// All methods below are placeholders for now, each of them returns
// a default-constructed (empty) value.

// Placeholder, returns empty node list.
QDomNodeList SitemapParser::xmlMessageElements() {
  return QDomNodeList();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageTitle(const QDomElement& msg_element) const {
  return QString();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageUrl(const QDomElement& msg_element) const {
  return QString();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageDescription(const QDomElement& msg_element) const {
  return QString();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageAuthor(const QDomElement& msg_element) const {
  return QString();
}

// Placeholder, returns default-constructed date/time.
QDateTime SitemapParser::xmlMessageDateCreated(const QDomElement& msg_element) const {
  return QDateTime();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageId(const QDomElement& msg_element) const {
  return QString();
}

// Placeholder, returns empty list.
QList<Enclosure> SitemapParser::xmlMessageEnclosures(const QDomElement& msg_element) const {
  return QList<Enclosure>();
}

// Placeholder, returns empty list.
QList<MessageCategory> SitemapParser::xmlMessageCategories(const QDomElement& msg_element) const {
  return QList<MessageCategory>();
}

// Placeholder, returns empty string.
QString SitemapParser::xmlMessageRawContents(const QDomElement& msg_element) const {
  return QString();
}
bool SitemapParser::isGzip(const QByteArray& content) {
return ((content[0] & 0xFF) == 0x1f) && ((content[1] & 0xFF) == 0x8b);
}

View File

@ -0,0 +1,37 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#ifndef SITEMAPPARSER_H
#define SITEMAPPARSER_H
#include "services/standard/parsers/feedparser.h"
// Feed parser for sitemaps and sitemap indexes.
class SitemapParser : public FeedParser {
public:
explicit SitemapParser(const QString& data);
virtual ~SitemapParser();
// Tries to recognize the given raw data as a sitemap or a sitemap index and returns
// a newly allocated feed together with the list of candidate icon locations.
virtual QPair<StandardFeed*, QList<IconLocation>> guessFeed(const QByteArray& content,
const QString& content_type) const;
// Checks the first two bytes of the data for the gzip signature.
static bool isGzip(const QByteArray& content);
protected:
// Extraction of articles and their individual properties.
// NOTE(review): the definitions visible in this change are placeholders returning empty values.
virtual QDomNodeList xmlMessageElements();
virtual QString xmlMessageTitle(const QDomElement& msg_element) const;
virtual QString xmlMessageUrl(const QDomElement& msg_element) const;
virtual QString xmlMessageDescription(const QDomElement& msg_element) const;
virtual QString xmlMessageAuthor(const QDomElement& msg_element) const;
virtual QDateTime xmlMessageDateCreated(const QDomElement& msg_element) const;
virtual QString xmlMessageId(const QDomElement& msg_element) const;
virtual QList<Enclosure> xmlMessageEnclosures(const QDomElement& msg_element) const;
virtual QList<MessageCategory> xmlMessageCategories(const QDomElement& msg_element) const;
virtual QString xmlMessageRawContents(const QDomElement& msg_element) const;
private:
// Namespace URIs of the sitemap schema and of its news/image/video schemas.
QString sitemapNamespace() const;
QString sitemapNewsNamespace() const;
QString sitemapImageNamespace() const;
QString sitemapVideoNamespace() const;
};
#endif // SITEMAPPARSER_H

View File

@ -5,16 +5,20 @@
#include "database/databasequeries.h"
#include "definitions/definitions.h"
#include "exceptions/applicationexception.h"
#include "exceptions/feedrecognizedbutfailedexception.h"
#include "exceptions/networkexception.h"
#include "exceptions/scriptexception.h"
#include "miscellaneous/settings.h"
#include "miscellaneous/textfactory.h"
#include "services/standard/definitions.h"
#include "services/standard/gui/formstandardfeeddetails.h"
#include "services/standard/parsers/atomparser.h"
#include "services/standard/parsers/rdfparser.h"
#include "services/standard/standardserviceroot.h"
#include "services/standard/parsers/atomparser.h"
#include "services/standard/parsers/jsonparser.h"
#include "services/standard/parsers/rdfparser.h"
#include "services/standard/parsers/rssparser.h"
#include "services/standard/parsers/sitemapparser.h"
#include <QCommandLineParser>
#include <QDomDocument>
#include <QDomElement>
@ -24,6 +28,7 @@
#include <QPointer>
#include <QProcess>
#include <QProcessEnvironment>
#include <QScopedPointer>
#include <QTextCodec>
#include <QVariant>
#include <QXmlStreamReader>
@ -151,6 +156,12 @@ QString StandardFeed::typeToString(StandardFeed::Type type) {
case Type::Json:
return QSL("JSON 1.0/1.1");
case Type::Sitemap:
return QSL("Sitemap");
case Type::SitemapIndex:
return QSL("Sitemap Index");
case Type::Rss2X:
default:
return QSL("RSS 2.0/2.0.1");
@ -270,172 +281,36 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
}
StandardFeed* feed = nullptr;
QList<IconLocation> icon_possible_locations;
QList<QSharedPointer<FeedParser>> parsers;
// Now we need to obtain list of URLs of icons.
// Priority of links:
// 1. Links of "homepage" obtained from feed files which will be processed via DuckDuckGo.
// 2. Direct links of "favicon", "icon", "logo" obtained from feed files which will be downloaded directly.
// 3. Link of the feed file itself which will be processed via DuckDuckGo.
// The "bool" if true means that the URL is direct and download directly, if false then
// only use its domain and download via DuckDuckGo.
QList<QPair<QString, bool>> icon_possible_locations;
parsers.append(QSharedPointer<FeedParser>(new AtomParser({})));
parsers.append(QSharedPointer<FeedParser>(new RssParser({})));
parsers.append(QSharedPointer<FeedParser>(new RdfParser({})));
parsers.append(QSharedPointer<FeedParser>(new JsonParser({})));
parsers.append(QSharedPointer<FeedParser>(new SitemapParser({})));
if (content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive) || feed_contents.startsWith('{')) {
feed = new StandardFeed();
for (const QSharedPointer<FeedParser>& parser : parsers) {
try {
QPair<StandardFeed*, QList<IconLocation>> res = parser->guessFeed(feed_contents, content_type);
// We have JSON feed.
feed->setEncoding(QSL(DEFAULT_FEED_ENCODING));
feed->setType(Type::Json);
QJsonParseError json_err;
QJsonDocument json = QJsonDocument::fromJson(feed_contents, &json_err);
if (json.isNull() && !json_err.errorString().isEmpty()) {
throw ApplicationException(tr("JSON error '%1'").arg(json_err.errorString()));
feed = res.first;
icon_possible_locations = res.second;
break;
}
feed->setTitle(json.object()[QSL("title")].toString());
feed->setDescription(json.object()[QSL("description")].toString());
auto home_page = json.object()[QSL("home_page_url")].toString();
if (!home_page.isEmpty()) {
icon_possible_locations.prepend({home_page, false});
catch (const FeedRecognizedButFailedException& format_ex) {
// Parser reports that it is right parser for this feed
// but its support is not enabled or available or it is broken.
// In this case abort.
throw format_ex;
}
auto icon = json.object()[QSL("favicon")].toString();
if (icon.isEmpty()) {
icon = json.object()[QSL("icon")].toString();
}
if (!icon.isEmpty()) {
// Low priority, download directly.
icon_possible_locations.append({icon, true});
catch (const ApplicationException& ex) {
qWarningNN << LOGSEC_CORE << "Feed guessing error:" << QUOTE_W_SPACE_DOT(ex.message());
}
}
else {
// Feed XML was obtained, now we need to try to guess
// its encoding before we can read further data.
QString xml_schema_encoding;
QString xml_contents_encoded;
QString enc =
QRegularExpression(QSL("encoding=\"([A-Z0-9\\-]+)\""), QRegularExpression::PatternOption::CaseInsensitiveOption)
.match(feed_contents)
.captured(1);
if (!enc.isEmpty()) {
// Some "encoding" attribute was found get the encoding
// out of it.
xml_schema_encoding = enc;
}
QTextCodec* custom_codec = QTextCodec::codecForName(xml_schema_encoding.toLocal8Bit());
QString encod;
if (custom_codec != nullptr) {
// Feed encoding was probably guessed.
xml_contents_encoded = custom_codec->toUnicode(feed_contents);
encod = xml_schema_encoding;
}
else {
// Feed encoding probably not guessed, set it as
// default.
xml_contents_encoded = feed_contents;
encod = QSL(DEFAULT_FEED_ENCODING);
}
// Feed XML was obtained, guess it now.
QDomDocument xml_document;
QString error_msg;
int error_line, error_column;
if (!xml_document.setContent(xml_contents_encoded, true, &error_msg, &error_line, &error_column)) {
throw ApplicationException(tr("XML is not well-formed, %1").arg(error_msg));
}
feed = new StandardFeed();
feed->setEncoding(encod);
QDomElement root_element = xml_document.documentElement();
RdfParser rdf(QSL("<a/>"));
AtomParser atom(QSL("<a/>"));
if (root_element.namespaceURI() == rdf.rdfNamespace()) {
// We found RDF feed.
QDomElement channel_element =
root_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("channel")).at(0).toElement();
feed->setType(Type::Rdf);
feed->setTitle(channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("title")).at(0).toElement().text());
feed->setDescription(channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("description"))
.at(0)
.toElement()
.text());
QString home_page = channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("link")).at(0).toElement().text();
if (!home_page.isEmpty()) {
icon_possible_locations.prepend({home_page, false});
}
}
else if (root_element.tagName() == QL1S("rss")) {
// We found RSS 0.91/0.92/0.93/2.0/2.0.1 feed.
QString rss_type = root_element.attribute(QSL("version"), QSL("2.0"));
if (rss_type == QL1S("0.91") || rss_type == QL1S("0.92") || rss_type == QL1S("0.93")) {
feed->setType(Type::Rss0X);
}
else {
feed->setType(Type::Rss2X);
}
QDomElement channel_element = root_element.namedItem(QSL("channel")).toElement();
feed->setTitle(channel_element.namedItem(QSL("title")).toElement().text());
feed->setDescription(channel_element.namedItem(QSL("description")).toElement().text());
QString icon_url_link = channel_element.namedItem(QSL("image")).namedItem(QSL("url")).toElement().text();
if (!icon_url_link.isEmpty()) {
icon_possible_locations.append({icon_url_link, true});
}
auto channel_links = channel_element.elementsByTagName(QSL("link"));
for (int i = 0; i < channel_links.size(); i++) {
QString home_page = channel_links.at(i).toElement().text();
if (!home_page.isEmpty()) {
icon_possible_locations.prepend({home_page, false});
break;
}
}
}
else if (root_element.namespaceURI() == atom.atomNamespace()) {
// We found ATOM feed.
feed->setType(Type::Atom10);
feed->setTitle(root_element.namedItem(QSL("title")).toElement().text());
feed->setDescription(root_element.namedItem(QSL("subtitle")).toElement().text());
QString icon_link = root_element.namedItem(QSL("icon")).toElement().text();
if (!icon_link.isEmpty()) {
icon_possible_locations.append({icon_link, true});
}
QString home_page = root_element.namedItem(QSL("link")).toElement().attribute(QSL("href"));
if (!home_page.isEmpty()) {
icon_possible_locations.prepend({home_page, false});
}
}
else {
// File was downloaded and it really was XML file
// but feed format was NOT recognized.
feed->deleteLater();
throw ApplicationException(tr("XML feed file format unrecognized"));
}
if (feed == nullptr) {
throw ApplicationException(tr("feed format not recognized"));
}
if (source_type == SourceType::Url && icon_possible_locations.isEmpty()) {
@ -474,7 +349,9 @@ bool StandardFeed::performDragDropChange(RootItem* target_item) {
qApp->showGuiMessage(Notification::Event::GeneralEvent,
{tr("Cannot move feed"),
tr("Cannot move feed, detailed information was logged via debug log."),
tr("Cannot move feed, detailed "
"information was logged via "
"debug log."),
QSystemTrayIcon::MessageIcon::Critical});
return false;
}
@ -550,7 +427,10 @@ QString StandardFeed::runScriptProcess(const QStringList& cmd_args,
if (!raw_error.simplified().isEmpty()) {
qWarningNN << LOGSEC_CORE
<< "Received error output from custom script even if it reported that it exited normally:"
<< "Received error output from "
"custom script even if it "
"reported that it exited "
"normally:"
<< QUOTE_W_SPACE_DOT(raw_error);
}

View File

@ -24,14 +24,20 @@ class StandardFeed : public Feed {
friend class StandardCategory;
public:
enum class SourceType { Url = 0, Script = 1, LocalFile = 2 };
enum class SourceType {
Url = 0,
Script = 1,
LocalFile = 2
};
enum class Type {
Rss0X = 0,
Rss2X = 1,
Rdf = 2, // Sometimes denoted as RSS 1.0.
Atom10 = 3,
Json = 4
Json = 4,
SitemapIndex = 5,
Sitemap = 6
};
explicit StandardFeed(RootItem* parent_item = nullptr);
@ -79,8 +85,8 @@ class StandardFeed : public Feed {
const QString& post_process_script,
NetworkFactory::NetworkAuthentication protection,
bool fetch_icons = true,
const QString& username = QString(),
const QString& password = QString(),
const QString& username = {},
const QString& password = {},
const QNetworkProxy& custom_proxy = QNetworkProxy::ProxyType::DefaultProxy);
// Converts particular feed type to string.