make RDF parser inherit from FeedParser, enhance ATOM and RDF detection reliability

This commit is contained in:
Martin Rotter 2021-07-14 08:41:42 +02:00
parent b0a11288cb
commit 3b9cdb477b
10 changed files with 101 additions and 88 deletions

View File

@ -20,8 +20,6 @@ AtomParser::AtomParser(const QString& data) : FeedParser(data) {
} }
} }
AtomParser::~AtomParser() = default;
QString AtomParser::feedAuthor() const { QString AtomParser::feedAuthor() const {
QDomNodeList top_level_nodes = m_xml.documentElement().childNodes(); QDomNodeList top_level_nodes = m_xml.documentElement().childNodes();
QStringList author_str; QStringList author_str;
@ -148,6 +146,10 @@ QString AtomParser::messageAuthor(const QDomElement& msg_element) const {
return author_str.join(", "); return author_str.join(", ");
} }
QString AtomParser::atomNamespace() const {
return m_atomNamespace;
}
QDomNodeList AtomParser::messageElements() { QDomNodeList AtomParser::messageElements() {
return m_xml.elementsByTagNameNS(m_atomNamespace, QSL("entry")); return m_xml.elementsByTagNameNS(m_atomNamespace, QSL("entry"));
} }

View File

@ -13,7 +13,8 @@
class AtomParser : public FeedParser { class AtomParser : public FeedParser {
public: public:
explicit AtomParser(const QString& data); explicit AtomParser(const QString& data);
virtual ~AtomParser();
QString atomNamespace() const;
private: private:
QDomNodeList messageElements(); QDomNodeList messageElements();

View File

@ -19,8 +19,6 @@ FeedParser::FeedParser(QString data) : m_xmlData(std::move(data)), m_mrssNamespa
} }
} }
FeedParser::~FeedParser() = default;
QList<Message> FeedParser::messages() { QList<Message> FeedParser::messages() {
QString feed_author = feedAuthor(); QString feed_author = feedAuthor();
QList<Message> messages; QList<Message> messages;

View File

@ -12,7 +12,6 @@
class FeedParser { class FeedParser {
public: public:
explicit FeedParser(QString data); explicit FeedParser(QString data);
virtual ~FeedParser();
virtual QList<Message> messages(); virtual QList<Message> messages();

View File

@ -2,6 +2,7 @@
#include "services/standard/rdfparser.h" #include "services/standard/rdfparser.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/application.h" #include "miscellaneous/application.h"
#include "miscellaneous/textfactory.h" #include "miscellaneous/textfactory.h"
#include "network-web/webfactory.h" #include "network-web/webfactory.h"
@ -9,33 +10,32 @@
#include <QDomDocument> #include <QDomDocument>
RdfParser::RdfParser() = default; RdfParser::RdfParser(const QString& data)
: FeedParser(data),
m_rdfNamespace(QSL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")),
m_rssNamespace(QSL("http://purl.org/rss/1.0/")) {}
RdfParser::~RdfParser() = default; QDomNodeList RdfParser::messageElements() {
QList<Message> RdfParser::parseXmlData(const QString& data) {
QList<Message> messages;
QDomDocument xml_file; QDomDocument xml_file;
QDateTime current_time = QDateTime::currentDateTime();
xml_file.setContent(data, true); xml_file.setContent(m_xmlData, true);
// Pull out all messages. // Pull out all messages.
QDomNodeList messages_in_xml = xml_file.elementsByTagName(QSL("item")); return xml_file.elementsByTagName(QSL("item"));
}
for (int i = 0; i < messages_in_xml.size(); i++) { Message RdfParser::extractMessage(const QDomElement& msg_element, QDateTime current_time) const {
QDomNode message_item = messages_in_xml.item(i);
Message new_message; Message new_message;
// Deal with title and description. // Deal with title and description.
QString elem_title = message_item.namedItem(QSL("title")).toElement().text().simplified(); QString elem_title = msg_element.namedItem(QSL("title")).toElement().text().simplified();
QString elem_description = message_item.namedItem(QSL("description")).toElement().text(); QString elem_description = rawXmlChild(msg_element.namedItem(QSL("description")).toElement());
// Now we obtained maximum of information for title & description. // Now we obtained maximum of information for title & description.
if (elem_title.isEmpty()) { if (elem_title.isEmpty()) {
if (elem_description.isEmpty()) { if (elem_description.isEmpty()) {
// BOTH title and description are empty, skip this message. // BOTH title and description are empty, skip this message.
continue; throw ApplicationException(QSL("Not enough data for the message."));
} }
else { else {
// Title is empty but description is not. // Title is empty but description is not.
@ -54,18 +54,18 @@ QList<Message> RdfParser::parseXmlData(const QString& data) {
str.setCodec(DEFAULT_FEED_ENCODING); str.setCodec(DEFAULT_FEED_ENCODING);
message_item.save(str, 0, QDomNode::EncodingPolicy::EncodingFromTextStream); msg_element.save(str, 0, QDomNode::EncodingPolicy::EncodingFromTextStream);
new_message.m_rawContents = raw_contents; new_message.m_rawContents = raw_contents;
// Deal with link and author. // Deal with link and author.
new_message.m_url = message_item.namedItem(QSL("link")).toElement().text(); new_message.m_url = msg_element.namedItem(QSL("link")).toElement().text();
new_message.m_author = message_item.namedItem(QSL("creator")).toElement().text(); new_message.m_author = msg_element.namedItem(QSL("creator")).toElement().text();
// Deal with creation date. // Deal with creation date.
QString elem_updated = message_item.namedItem(QSL("date")).toElement().text(); QString elem_updated = msg_element.namedItem(QSL("date")).toElement().text();
if (elem_updated.isEmpty()) { if (elem_updated.isEmpty()) {
elem_updated = message_item.namedItem(QSL("dc:date")).toElement().text(); elem_updated = msg_element.namedItem(QSL("dc:date")).toElement().text();
} }
// Deal with creation date. // Deal with creation date.
@ -85,8 +85,13 @@ QList<Message> RdfParser::parseXmlData(const QString& data) {
new_message.m_url = ""; new_message.m_url = "";
} }
messages.append(new_message); return new_message;
} }
return messages; QString RdfParser::rssNamespace() const {
return m_rssNamespace;
}
QString RdfParser::rdfNamespace() const {
return m_rdfNamespace;
} }

View File

@ -3,16 +3,25 @@
#ifndef RDFPARSER_H #ifndef RDFPARSER_H
#define RDFPARSER_H #define RDFPARSER_H
#include "services/standard/feedparser.h"
#include "core/message.h" #include "core/message.h"
#include <QList> #include <QList>
class RdfParser { class RdfParser : public FeedParser {
public: public:
explicit RdfParser(); explicit RdfParser(const QString& data);
virtual ~RdfParser();
QList<Message> parseXmlData(const QString& data); QString rdfNamespace() const;
QString rssNamespace() const;
private:
QDomNodeList messageElements();
Message extractMessage(const QDomElement& msg_element, QDateTime current_time) const;
QString m_rdfNamespace;
QString m_rssNamespace;
}; };
#endif // RDFPARSER_H #endif // RDFPARSER_H

View File

@ -14,8 +14,6 @@
RssParser::RssParser(const QString& data) : FeedParser(data) {} RssParser::RssParser(const QString& data) : FeedParser(data) {}
RssParser::~RssParser() = default;
QDomNodeList RssParser::messageElements() { QDomNodeList RssParser::messageElements() {
QDomNode channel_elem = m_xml.namedItem(QSL("rss")).namedItem(QSL("channel")); QDomNode channel_elem = m_xml.namedItem(QSL("rss")).namedItem(QSL("channel"));

View File

@ -12,7 +12,6 @@
class RssParser : public FeedParser { class RssParser : public FeedParser {
public: public:
explicit RssParser(const QString& data); explicit RssParser(const QString& data);
virtual ~RssParser();
private: private:
QDomNodeList messageElements(); QDomNodeList messageElements();

View File

@ -352,6 +352,7 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
int error_line, error_column; int error_line, error_column;
if (!xml_document.setContent(xml_contents_encoded, if (!xml_document.setContent(xml_contents_encoded,
true,
&error_msg, &error_msg,
&error_line, &error_line,
&error_column)) { &error_column)) {
@ -362,23 +363,24 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
feed->setEncoding(encod); feed->setEncoding(encod);
QDomElement root_element = xml_document.documentElement(); QDomElement root_element = xml_document.documentElement();
QString root_tag_name = root_element.tagName(); RdfParser rdf(QSL("<a/>"));
AtomParser atom(QSL("<a/>"));
if (root_tag_name == QL1S("rdf:RDF")) { if (root_element.namespaceURI() == rdf.rdfNamespace()) {
// We found RDF feed. // We found RDF feed.
QDomElement channel_element = root_element.namedItem(QSL("channel")).toElement(); QDomElement channel_element = root_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("channel")).at(0).toElement();
feed->setType(Type::Rdf); feed->setType(Type::Rdf);
feed->setTitle(channel_element.namedItem(QSL("title")).toElement().text()); feed->setTitle(channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("title")).at(0).toElement().text());
feed->setDescription(channel_element.namedItem(QSL("description")).toElement().text()); feed->setDescription(channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("description")).at(0).toElement().text());
QString home_page = channel_element.namedItem(QSL("link")).toElement().text(); QString home_page = channel_element.elementsByTagNameNS(rdf.rssNamespace(), QSL("link")).at(0).toElement().text();
if (!home_page.isEmpty()) { if (!home_page.isEmpty()) {
icon_possible_locations.prepend({ home_page, false }); icon_possible_locations.prepend({ home_page, false });
} }
} }
else if (root_tag_name == QL1S("rss")) { else if (root_element.tagName() == QL1S("rss")) {
// We found RSS 0.91/0.92/0.93/2.0/2.0.1 feed. // We found RSS 0.91/0.92/0.93/2.0/2.0.1 feed.
QString rss_type = root_element.attribute("version", "2.0"); QString rss_type = root_element.attribute("version", "2.0");
@ -410,7 +412,7 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
icon_possible_locations.prepend({ home_page, false }); icon_possible_locations.prepend({ home_page, false });
} }
} }
else if (root_tag_name == QL1S("feed")) { else if (root_element.namespaceURI() == atom.atomNamespace()) {
// We found ATOM feed. // We found ATOM feed.
feed->setType(Type::Atom10); feed->setType(Type::Atom10);
feed->setTitle(root_element.namedItem(QSL("title")).toElement().text()); feed->setTitle(root_element.namedItem(QSL("title")).toElement().text());

View File

@ -244,7 +244,7 @@ QList<Message> StandardServiceRoot::obtainNewMessages(const QList<Feed*>& feeds)
break; break;
case StandardFeed::Type::Rdf: case StandardFeed::Type::Rdf:
messages = RdfParser().parseXmlData(formatted_feed_contents); messages = RdfParser(formatted_feed_contents).messages();
break; break;
case StandardFeed::Type::Atom10: case StandardFeed::Type::Atom10: