rssguard/src/librssguard-standard/src/parsers/sitemapparser.cpp

320 lines
11 KiB
C++

// For license of this file, see <project-root-folder>/LICENSE.md.
#include "src/parsers/sitemapparser.h"
#if defined(ENABLE_COMPRESSED_SITEMAP)
#include "src/3rd-party/qcompressor/qcompressor.h"
#endif
#include "src/definitions.h"
#include <librssguard/definitions/definitions.h>
#include <librssguard/exceptions/applicationexception.h>
#include <librssguard/exceptions/feedrecognizedbutfailedexception.h>
#include <librssguard/miscellaneous/settings.h>
#include <librssguard/miscellaneous/textfactory.h>
#include <QDomDocument>
#include <QTextCodec>
#include <QTextStream>
// Builds a sitemap parser; the raw feed data is handed over to the FeedParser base class.
SitemapParser::SitemapParser(const QString& data)
  : FeedParser(data) {
}
// The destructor body performs no work of its own.
SitemapParser::~SitemapParser() = default;
// Discovers sitemap feeds reachable from the given URL.
//
// The base-class discovery runs first. In non-greedy mode a non-empty base result is
// returned right away; in greedy mode its feeds are kept and sitemap probing continues.
//
// Candidate sitemap URLs are probed in this order:
//   1. The URL itself. If it is a sitemap index, its children are queued too.
//   2. Sitemaps announced by "URL/robots.txt".
//   3. Sitemaps announced by "URLHOST/robots.txt".
//   4. The "URL/sitemap.xml" endpoint.
//   5. The "URL/sitemap.xml.gz" endpoint.
//
// Parameters:
//   root   - service root; this function only uses its network proxy.
//   url    - address the discovery starts from.
//   greedy - when false, probing stops at the first sitemap that is recognized.
//
// Returns the list of discovered feeds, keyed internally by source URL so that each
// source appears at most once. Feeds found here are allocated by guessFeed() and are
// not deleted by this function.
QList<StandardFeed*> SitemapParser::discoverFeeds(ServiceRoot* root, const QUrl& url, bool greedy) const {
  auto base_result = FeedParser::discoverFeeds(root, url, greedy);
  QHash<QString, StandardFeed*> feeds;

  if (!base_result.isEmpty()) {
    if (!greedy) {
      return base_result;
    }

    for (StandardFeed* base_fd : base_result) {
      feeds.insert(base_fd->source(), base_fd);
    }
  }

  QStringList to_process_sitemaps;

  // URLs which were already downloaded, so that a sitemap index pointing back to
  // an already tested address does not trigger another download.
  QStringList processed_sitemaps;

  // Maximum number of sitemap indices whose children get queued.
  int sitemap_index_limit = 2;
  const int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt();

  // Input URL without trailing slash, shared by the robots.txt and sitemap.xml candidates.
  const QString base_url =
    url.toString(QUrl::UrlFormattingOption::StripTrailingSlash).replace(QRegularExpression(QSL("\\/$")), QString());

  // 1.
  to_process_sitemaps.append(url.toString());

  // 2.
  // 3.
  QStringList to_process_robots = {
    base_url + QSL("/robots.txt"),
    url.toString(QUrl::UrlFormattingOption::RemovePath | QUrl::UrlFormattingOption::RemoveQuery) + QSL("/robots.txt")};

  to_process_robots.removeDuplicates();

  // The pattern does not depend on the robots file, build it once.
  const QRegularExpression sitemap_rx(QSL("Sitemap: ?([^\\r\\n]+)"),
                                      QRegularExpression::PatternOption::CaseInsensitiveOption |
                                        QRegularExpression::PatternOption::MultilineOption);

  for (const QString& robots_url : to_process_robots) {
    // Download URL.
    QByteArray data;
    auto res = NetworkFactory::performNetworkOperation(robots_url,
                                                       timeout,
                                                       {},
                                                       data,
                                                       QNetworkAccessManager::Operation::GetOperation,
                                                       {},
                                                       {},
                                                       {},
                                                       {},
                                                       root->networkProxy());

    if (res.m_networkError != QNetworkReply::NetworkError::NoError) {
      continue;
    }

    QRegularExpressionMatchIterator it_rx = sitemap_rx.globalMatch(QString::fromUtf8(data));

    while (it_rx.hasNext()) {
      // The capture runs to the end of the line, so it may carry extra blanks
      // after the colon or trailing whitespace. Those must not end up in the URL.
      const QString sitemap_link = it_rx.next().captured(1).trimmed();

      if (!sitemap_link.isEmpty()) {
        to_process_sitemaps.append(sitemap_link);
      }
    }
  }

  // 4.
  to_process_sitemaps.append(base_url + QSL("/sitemap.xml"));

  // 5.
  to_process_sitemaps.append(base_url + QSL("/sitemap.xml.gz"));

  while (!to_process_sitemaps.isEmpty()) {
    to_process_sitemaps.removeDuplicates();

    const QString my_url = to_process_sitemaps.takeFirst();

    if (feeds.contains(my_url) || processed_sitemaps.contains(my_url)) {
      continue;
    }

    processed_sitemaps.append(my_url);

    // Download URL.
    QByteArray data;
    auto res = NetworkFactory::performNetworkOperation(my_url,
                                                       timeout,
                                                       {},
                                                       data,
                                                       QNetworkAccessManager::Operation::GetOperation,
                                                       {},
                                                       {},
                                                       {},
                                                       {},
                                                       root->networkProxy());

    if (res.m_networkError != QNetworkReply::NetworkError::NoError) {
      continue;
    }

    try {
      auto guessed_feed = guessFeed(data, res.m_contentType);

      guessed_feed.first->setSource(my_url);
      guessed_feed.first->setTitle(my_url);
      feeds.insert(my_url, guessed_feed.first);

      if (!greedy) {
        break;
      }
    }
    catch (const FeedRecognizedButFailedException& ex) {
      // This is index (or a recognized file which carries no child locations).
      // Only an index which really offers children consumes the index budget.
      const QStringList child_sitemaps = ex.arbitraryData().toStringList();

      if (!child_sitemaps.isEmpty() && sitemap_index_limit-- > 0) {
        to_process_sitemaps.append(child_sitemaps);
      }
    }
    catch (const ApplicationException&) {
      qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct sitemap file.";
    }
  }

  return feeds.values();
}
// Decides whether the given raw data is a sitemap and builds a feed object for it.
//
// Parameters:
//   content      - downloaded data, either plain XML or gzip-compressed XML.
//   content_type - content type reported for the data; not used by this parser.
//
// Returns a pair of a newly allocated StandardFeed of type Sitemap and an empty
// list of icon locations.
//
// Throws:
//   FeedRecognizedButFailedException - data is gzipped but gzip support is compiled out,
//                                      or data is a sitemap index (the exception then
//                                      carries the list of child sitemap locations).
//   ApplicationException             - XML is not well-formed or its root element is
//                                      not "urlset".
QPair<StandardFeed*, QList<IconLocation>> SitemapParser::guessFeed(const QByteArray& content,
                                                                   const QString& content_type) const {
  Q_UNUSED(content_type)

  QByteArray uncompressed_content;

  if (isGzip(content)) {
#if defined(ENABLE_COMPRESSED_SITEMAP)
    // NOTE(review): the outcome of the decompression is not inspected here; broken
    // archives presumably surface below as an XML parsing error - confirm.
    QCompressor::gzipDecompress(content, uncompressed_content);
#else
    throw FeedRecognizedButFailedException(QObject::tr("support for gzipped sitemaps is not enabled"));
#endif
  }
  else {
    uncompressed_content = content;
  }

  QString xml_schema_encoding = QSL(DEFAULT_FEED_ENCODING);

  // XML allows both quote styles around the encoding name and the name itself may
  // contain "_" and "." too (for example "Shift_JIS").
  const QString enc = QRegularExpression(QSL("encoding=[\"']([A-Z0-9._\\-]+)[\"']"),
                                         QRegularExpression::PatternOption::CaseInsensitiveOption)
                        .match(uncompressed_content)
                        .captured(1);

  if (!enc.isEmpty()) {
    // Some "encoding" attribute was found get the encoding
    // out of it.
    xml_schema_encoding = enc;
  }

  // Decode with the declared codec if it is known, otherwise treat data as UTF-8.
  QTextCodec* custom_codec = QTextCodec::codecForName(xml_schema_encoding.toLocal8Bit());
  const QString xml_contents_encoded = custom_codec != nullptr ? custom_codec->toUnicode(uncompressed_content)
                                                               : QString::fromUtf8(uncompressed_content);

  // Feed XML was obtained, guess it now.
  QDomDocument xml_document;
  QString error_msg;
  int error_line = 0;
  int error_column = 0;

  if (!xml_document.setContent(xml_contents_encoded, true, &error_msg, &error_line, &error_column)) {
    throw ApplicationException(QObject::tr("XML is not well-formed, %1").arg(error_msg));
  }

  QDomElement root_element = xml_document.documentElement();

  if (root_element.tagName() == QSL("sitemapindex")) {
    // Index is reported via exception which carries locations of child sitemaps.
    QStringList locs;
    const QDomNodeList loc_nodes = root_element.elementsByTagNameNS(sitemapNamespace(), QSL("loc"));

    for (int i = 0; i < loc_nodes.size(); i++) {
      // Pretty-printed XML puts newlines and indentation around the address.
      const QString loc = loc_nodes.at(i).toElement().text().trimmed();

      if (!loc.isEmpty()) {
        locs << loc;
      }
    }

    throw FeedRecognizedButFailedException(QObject::tr("sitemap indices are not supported"), locs);
  }

  if (root_element.tagName() != QSL("urlset")) {
    throw ApplicationException(QObject::tr("not a Sitemap"));
  }

  auto* feed = new StandardFeed();
  QList<IconLocation> icon_possible_locations;

  feed->setEncoding(xml_schema_encoding);
  feed->setType(StandardFeed::Type::Sitemap);
  feed->setTitle(StandardFeed::typeToString(StandardFeed::Type::Sitemap));

  return {feed, icon_possible_locations};
}
// Namespace URI of the base sitemap elements ("url", "loc", "lastmod") queried by this parser.
QString SitemapParser::sitemapNamespace() const {
  const QString base_ns = QSL("http://www.sitemaps.org/schemas/sitemap/0.9");

  return base_ns;
}
// Namespace URI of the news extension elements ("title", "publication_date") queried by this parser.
QString SitemapParser::sitemapNewsNamespace() const {
  const QString news_ns = QSL("http://www.google.com/schemas/sitemap-news/0.9");

  return news_ns;
}
// Namespace URI of the image extension elements ("image", "loc", "title") queried by this parser.
QString SitemapParser::sitemapImageNamespace() const {
  const QString image_ns = QSL("http://www.google.com/schemas/sitemap-image/1.1");

  return image_ns;
}
// Namespace URI of the video extension elements ("video", "title", "description",
// "player_loc", "content_loc") queried by this parser.
QString SitemapParser::sitemapVideoNamespace() const {
  const QString video_ns = QSL("http://www.google.com/schemas/sitemap-video/1.1");

  return video_ns;
}
// Collects all "url" elements of the base sitemap namespace from the parsed document;
// each of them is treated as one message.
QDomNodeList SitemapParser::xmlMessageElements() {
  const QString entry_tag = QSL("url");

  return m_xml.elementsByTagNameNS(sitemapNamespace(), entry_tag);
}
// Picks the message title from the first "title" element found in the news, video
// and image extension namespaces, tried in exactly that order. The text of the last
// attempted lookup is returned when none of them has a non-empty title.
QString SitemapParser::xmlMessageTitle(const QDomElement& msg_element) const {
  const QStringList title_namespaces = {sitemapNewsNamespace(), sitemapVideoNamespace(), sitemapImageNamespace()};
  QString found_title;

  for (const QString& title_ns : title_namespaces) {
    found_title = msg_element.elementsByTagNameNS(title_ns, QSL("title")).at(0).toElement().text();

    if (!found_title.isEmpty()) {
      break;
    }
  }

  return found_title;
}
// Message URL is the text of the first "loc" element of the base sitemap namespace.
QString SitemapParser::xmlMessageUrl(const QDomElement& msg_element) const {
  const QDomNodeList loc_nodes = msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("loc"));
  const QDomElement first_loc = loc_nodes.at(0).toElement();

  return first_loc.text();
}
// Message description is taken from the first "description" element of the video
// extension namespace, converted with xmlRawChild().
QString SitemapParser::xmlMessageDescription(const QDomElement& msg_element) const {
  const QDomNodeList description_nodes = msg_element.elementsByTagNameNS(sitemapVideoNamespace(), QSL("description"));
  const QDomElement first_description = description_nodes.at(0).toElement();

  return xmlRawChild(first_description);
}
// Message date comes from the base "lastmod" element; when that one is empty, the
// "publication_date" element of the news extension is used instead. The text is then
// parsed by TextFactory::parseDateTime().
QDateTime SitemapParser::xmlMessageDateCreated(const QDomElement& msg_element) const {
  const QString last_modified =
    msg_element.elementsByTagNameNS(sitemapNamespace(), QSL("lastmod")).at(0).toElement().text();

  const QString raw_date =
    last_modified.isEmpty()
      ? msg_element.elementsByTagNameNS(sitemapNewsNamespace(), QSL("publication_date")).at(0).toElement().text()
      : last_modified;

  return TextFactory::parseDateTime(raw_date);
}
// The message URL doubles as the message identifier.
QString SitemapParser::xmlMessageId(const QDomElement& msg_element) const {
  const QString id_from_url = xmlMessageUrl(msg_element);

  return id_from_url;
}
// Builds the list of enclosures of a message: one per "image" element of the image
// extension (address from its "loc") followed by one per "video" element of the video
// extension (address from "player_loc", falling back to "content_loc"). Elements
// without any address are skipped.
QList<Enclosure> SitemapParser::xmlMessageEnclosures(const QDomElement& msg_element) const {
  // Text of the first element with given name and namespace found below "parent".
  const auto first_text = [](const QDomElement& parent, const QString& ns, const QString& name) {
    return parent.elementsByTagNameNS(ns, name).at(0).toElement().text();
  };

  QList<Enclosure> result;

  // sitemap-image
  const QString image_ns = sitemapImageNamespace();
  const QDomNodeList images = msg_element.elementsByTagNameNS(image_ns, QSL("image"));

  for (int idx = 0; idx < images.size(); idx++) {
    const QString image_url = first_text(images.at(idx).toElement(), image_ns, QSL("loc"));

    if (image_url.isEmpty()) {
      continue;
    }

    // NOTE: The MIME is made up.
    result.append(Enclosure(image_url, QSL("image/png")));
  }

  // sitemap-video
  const QString video_ns = sitemapVideoNamespace();
  const QDomNodeList videos = msg_element.elementsByTagNameNS(video_ns, QSL("video"));

  for (int idx = 0; idx < videos.size(); idx++) {
    const QDomElement video = videos.at(idx).toElement();
    QString video_url = first_text(video, video_ns, QSL("player_loc"));

    if (video_url.isEmpty()) {
      video_url = first_text(video, video_ns, QSL("content_loc"));
    }

    if (video_url.isEmpty()) {
      continue;
    }

    // NOTE: The MIME is made up.
    result.append(Enclosure(video_url, QSL("video/mpeg")));
  }

  return result;
}
bool SitemapParser::isGzip(const QByteArray& content) {
return content.size() >= 2 && ((content[0] & 0xFF) == 0x1f) && ((content[1] & 0xFF) == 0x8b);
}