unified abstraction in RSS/RDF/ATOM parsers, improved support for RDF/modules

This commit is contained in:
Martin Rotter 2022-02-05 18:56:38 +01:00
parent 3052b45726
commit 91573ec11d
8 changed files with 255 additions and 283 deletions

View File

@ -21,119 +21,17 @@ AtomParser::AtomParser(const QString& data) : FeedParser(data) {
}
QString AtomParser::feedAuthor() const {
QDomNodeList top_level_nodes = m_xml.documentElement().childNodes();
QStringList author_str;
auto authors = m_xml.documentElement().elementsByTagNameNS(m_atomNamespace, QSL("author"));
for (int i = 0; i < top_level_nodes.size(); i++) {
auto elem = top_level_nodes.at(i).toElement();
for (int i = 0; i < authors.size(); i++) {
QDomNode auth = authors.at(i);
if (elem.localName() != QSL("author") || elem.namespaceURI() != m_atomNamespace) {
continue;
}
QDomNodeList names = elem.elementsByTagNameNS(m_atomNamespace, QSL("name"));
if (!names.isEmpty()) {
const QString name = names.at(0).toElement().text();
if (!name.isEmpty() && !author_str.contains(name)) {
author_str.append(name);
}
if (auth.parentNode() == m_xml.documentElement()) {
return auth.toElement().elementsByTagNameNS(m_atomNamespace, QSL("name")).at(0).toElement().text();
}
}
return author_str.join(QSL(", "));
}
Message AtomParser::extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const {
Message new_message;
QString title = textsFromPath(msg_element, m_atomNamespace, QSL("title"), true).join(QSL(", "));
QString summary = rawXmlChild(msg_element.elementsByTagNameNS(m_atomNamespace, QSL("content")).at(0).toElement());
if (summary.isEmpty()) {
summary = rawXmlChild(msg_element.elementsByTagNameNS(m_atomNamespace, QSL("summary")).at(0).toElement());
if (summary.isEmpty()) {
summary = rawXmlChild(msg_element.elementsByTagNameNS(m_mrssNamespace, QSL("description")).at(0).toElement());
}
}
// Now we obtained maximum of information for title & description.
if (title.isEmpty() && summary.isEmpty()) {
// BOTH title and description are empty, skip this message.
throw ApplicationException(QSL("Not enough data for the message."));
}
// Title is not empty, description does not matter.
new_message.m_title = qApp->web()->unescapeHtml(qApp->web()->stripTags(title));
new_message.m_contents = summary;
new_message.m_author = qApp->web()->unescapeHtml(messageAuthor(msg_element));
new_message.m_customId = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("id")).at(0).toElement().text();
QString raw_contents;
QTextStream str(&raw_contents);
msg_element.save(str, 0, QDomNode::EncodingPolicy::EncodingFromTextStream);
new_message.m_rawContents = raw_contents;
QString updated = textsFromPath(msg_element, m_atomNamespace, QSL("updated"), true).join(QSL(", "));
if (updated.isEmpty()) {
updated = textsFromPath(msg_element, m_atomNamespace, QSL("modified"), true).join(QSL(", "));
}
// Deal with creation date.
new_message.m_created = TextFactory::parseDateTime(updated);
new_message.m_createdFromFeed = !new_message.m_created.isNull();
if (!new_message.m_createdFromFeed) {
// Date was NOT obtained from the feed, set current date as creation date for the message.
new_message.m_created = current_time;
}
// Deal with links
QDomNodeList elem_links = msg_element.toElement().elementsByTagNameNS(m_atomNamespace, QSL("link"));
QString last_link_alternate, last_link_other;
for (int i = 0; i < elem_links.size(); i++) {
QDomElement link = elem_links.at(i).toElement();
QString attribute = link.attribute(QSL("rel"));
if (attribute == QSL("enclosure")) {
QString enclosure_type = link.attribute(QSL("type"));
if (enclosure_type.isEmpty()) {
enclosure_type = QSL(DEFAULT_ENCLOSURE_MIME_TYPE);
}
new_message.m_enclosures.append(Enclosure(link.attribute(QSL("href")), enclosure_type));
qDebugNN << LOGSEC_CORE
<< "Found enclosure"
<< QUOTE_W_SPACE(new_message.m_enclosures.last().m_url)
<< "for the message.";
}
else if (attribute.isEmpty() || attribute == QSL("alternate")) {
last_link_alternate = link.attribute(QSL("href"));
}
else {
last_link_other = link.attribute(QSL("href"));
}
}
// Obtain MRSS enclosures.
new_message.m_enclosures.append(mrssGetEnclosures(msg_element));
if (!last_link_alternate.isEmpty()) {
new_message.m_url = last_link_alternate;
}
else if (!last_link_other.isEmpty()) {
new_message.m_url = last_link_other;
}
else if (!new_message.m_enclosures.isEmpty()) {
new_message.m_url = new_message.m_enclosures.first().m_url;
}
return new_message;
return {};
}
QString AtomParser::messageAuthor(const QDomElement& msg_element) const {
@ -158,3 +56,75 @@ QString AtomParser::atomNamespace() const {
// Returns all "entry" elements in the Atom namespace that are found in m_xml.
QDomNodeList AtomParser::messageElements() {
  const QString entry_tag = QSL("entry");

  return m_xml.elementsByTagNameNS(m_atomNamespace, entry_tag);
}
// Returns the title of the given entry: the texts delivered by textsFromPath()
// for the Atom "title" element (called with only_first = true), joined by ", ".
QString AtomParser::messageTitle(const QDomElement& msg_element) const {
  const auto title_parts = textsFromPath(msg_element, m_atomNamespace, QSL("title"), true);

  return title_parts.join(QSL(", "));
}
// Returns the body of the given entry. Sources are tried in this order and the
// first non-empty one wins:
//   1. Atom "content",
//   2. Atom "summary",
//   3. "description" from the MRSS namespace (returned even when empty).
QString AtomParser::messageDescription(const QDomElement& msg_element) const {
  const QString content = rawXmlChild(msg_element.elementsByTagNameNS(m_atomNamespace, QSL("content")).at(0).toElement());

  if (!content.isEmpty()) {
    return content;
  }

  const QString atom_summary = rawXmlChild(msg_element.elementsByTagNameNS(m_atomNamespace, QSL("summary")).at(0).toElement());

  if (!atom_summary.isEmpty()) {
    return atom_summary;
  }

  // Last resort, nothing usable was found in the Atom namespace.
  return rawXmlChild(msg_element.elementsByTagNameNS(m_mrssNamespace, QSL("description")).at(0).toElement());
}
// Returns the date of the given entry as parsed by TextFactory::parseDateTime().
// The Atom "updated" element is preferred; when it holds only whitespace (or
// nothing), the "modified" element is used instead.
QDateTime AtomParser::messageDateCreated(const QDomElement& msg_element) const {
  QString date_text = textsFromPath(msg_element, m_atomNamespace, QSL("updated"), true).join(QSL(", "));
  const bool has_updated = !date_text.simplified().isEmpty();

  if (!has_updated) {
    date_text = textsFromPath(msg_element, m_atomNamespace, QSL("modified"), true).join(QSL(", "));
  }

  return TextFactory::parseDateTime(date_text);
}
// Returns the text of the first Atom "id" element found below the given entry.
QString AtomParser::messageId(const QDomElement& msg_element) const {
  const QDomNodeList id_nodes = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("id"));
  const QDomElement first_id = id_nodes.at(0).toElement();

  return first_id.text();
}
// Returns the URL of the given entry taken from its Atom "link" elements.
//
// The "href" of the first link with no "rel" attribute or with rel="alternate"
// is returned immediately. Otherwise the "href" of the last link whose "rel" is
// anything but "enclosure" is returned. A null string is returned when no such
// link provides a non-empty "href".
QString AtomParser::messageUrl(const QDomElement& msg_element) const {
  const QDomNodeList links = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("link"));
  QString fallback_href;

  for (int idx = 0; idx < links.size(); idx++) {
    const QDomElement link_elem = links.at(idx).toElement();
    const QString rel = link_elem.attribute(QSL("rel"));

    if (rel.isEmpty() || rel == QSL("alternate")) {
      return link_elem.attribute(QSL("href"));
    }

    // Enclosure links are never used as the message URL here.
    if (rel != QSL("enclosure")) {
      fallback_href = link_elem.attribute(QSL("href"));
    }
  }

  return fallback_href.isEmpty() ? QString() : fallback_href;
}
// Builds one Enclosure for every Atom "link" element of the given entry which
// has rel="enclosure". The "href" and "type" attributes are passed to the
// Enclosure unchanged, so either of them may be empty.
QList<Enclosure> AtomParser::messageEnclosures(const QDomElement& msg_element) const {
  const QDomNodeList links = msg_element.elementsByTagNameNS(m_atomNamespace, QSL("link"));
  QList<Enclosure> found;

  for (int idx = 0; idx < links.size(); idx++) {
    const QDomElement link_elem = links.at(idx).toElement();

    if (link_elem.attribute(QSL("rel")) != QSL("enclosure")) {
      continue;
    }

    found.append(Enclosure(link_elem.attribute(QSL("href")), link_elem.attribute(QSL("type"))));
  }

  return found;
}

View File

@ -16,11 +16,16 @@ class AtomParser : public FeedParser {
QString atomNamespace() const;
private:
QDomNodeList messageElements();
QString feedAuthor() const;
Message extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const;
QString messageAuthor(const QDomElement& msg_element) const;
protected:
virtual QString messageTitle(const QDomElement& msg_element) const;
virtual QString messageDescription(const QDomElement& msg_element) const;
virtual QDateTime messageDateCreated(const QDomElement& msg_element) const;
virtual QString messageId(const QDomElement& msg_element) const;
virtual QString messageUrl(const QDomElement& msg_element) const;
virtual QList<Enclosure> messageEnclosures(const QDomElement& msg_element) const;
virtual QDomNodeList messageElements();
virtual QString messageAuthor(const QDomElement& msg_element) const;
virtual QString feedAuthor() const;
private:
QString m_atomNamespace;

View File

@ -20,6 +20,14 @@ FeedParser::FeedParser(QString data) : m_xmlData(std::move(data)), m_mrssNamespa
}
}
// Serializes the given message element (via QDomNode::save() with indentation 0
// and EncodingFromTextStream policy) into a string and returns it.
QString FeedParser::messageRawContents(const QDomElement& msg_element) const {
  QString serialized;
  QTextStream out(&serialized);

  msg_element.save(out, 0, QDomNode::EncodingPolicy::EncodingFromTextStream);

  return serialized;
}
QList<Message> FeedParser::messages() {
QString feed_author = feedAuthor();
QList<Message> messages;
@ -29,15 +37,50 @@ QList<Message> FeedParser::messages() {
QDomNodeList messages_in_xml = messageElements();
for (int i = 0; i < messages_in_xml.size(); i++) {
QDomNode message_item = messages_in_xml.item(i);
QDomElement message_item = messages_in_xml.item(i).toElement();
try {
Message new_message = extractMessage(message_item.toElement(), current_time);
Message new_message;
// Fill available data.
new_message.m_title = qApp->web()->unescapeHtml(messageTitle(message_item));
new_message.m_contents = messageDescription(message_item);
new_message.m_author = qApp->web()->unescapeHtml(messageAuthor(message_item));
new_message.m_url = messageUrl(message_item);
new_message.m_created = messageDateCreated(message_item);
new_message.m_customId = messageId(message_item);
new_message.m_rawContents = messageRawContents(message_item);
new_message.m_enclosures = messageEnclosures(message_item);
new_message.m_enclosures.append(mrssGetEnclosures(message_item));
// Fixup missing data.
//
// NOTE: Message must have "title" field, otherwise it is skipped.
// Author.
if (new_message.m_author.isEmpty() && !feed_author.isEmpty()) {
new_message.m_author = feed_author;
}
// Created date.
new_message.m_createdFromFeed = !new_message.m_created.isNull();
if (!new_message.m_createdFromFeed) {
// Date was NOT obtained from the feed, set current date as creation date for the message.
// NOTE: Date is lessened by 1 second for each message to allow for more
// stable sorting.
new_message.m_created = current_time.addSecs(-1);
current_time = new_message.m_created;
}
// Enclosures.
for (Enclosure& enc : new_message.m_enclosures) {
if (enc.m_mimeType.simplified().isEmpty()) {
enc.m_mimeType = QSL(DEFAULT_ENCLOSURE_MIME_TYPE);
}
}
// Url.
new_message.m_url = new_message.m_url.replace(QRegularExpression(QSL("[\\t\\n]")), QString());
messages.append(new_message);

View File

@ -15,14 +15,23 @@ class FeedParser {
virtual QList<Message> messages();
protected:
virtual QString feedAuthor() const;
virtual QDomNodeList messageElements() = 0;
virtual QString messageTitle(const QDomElement& msg_element) const = 0;
virtual QString messageUrl(const QDomElement& msg_element) const = 0;
virtual QString messageDescription(const QDomElement& msg_element) const = 0;
virtual QString messageAuthor(const QDomElement& msg_element) const = 0;
virtual QDateTime messageDateCreated(const QDomElement& msg_element) const = 0;
virtual QString messageId(const QDomElement& msg_element) const = 0;
virtual QList<Enclosure> messageEnclosures(const QDomElement& msg_element) const = 0;
virtual QString messageRawContents(const QDomElement& msg_element) const;
protected:
QList<Enclosure> mrssGetEnclosures(const QDomElement& msg_element) const;
QString mrssTextFromPath(const QDomElement& msg_element, const QString& xml_path) const;
QString rawXmlChild(const QDomElement& container) const;
QStringList textsFromPath(const QDomElement& element, const QString& namespace_uri, const QString& xml_path, bool only_first) const;
virtual QDomNodeList messageElements() = 0;
virtual QString feedAuthor() const;
virtual Message extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const = 0;
protected:
QString m_xmlData;

View File

@ -13,77 +13,12 @@
RdfParser::RdfParser(const QString& data)
: FeedParser(data),
m_rdfNamespace(QSL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")),
m_rssNamespace(QSL("http://purl.org/rss/1.0/")) {}
m_rssNamespace(QSL("http://purl.org/rss/1.0/")),
m_rssCoNamespace(QSL("http://purl.org/rss/1.0/modules/content/")),
m_dcElNamespace(QSL("http://purl.org/dc/elements/1.1/")) {}
QDomNodeList RdfParser::messageElements() {
QDomDocument xml_file;
xml_file.setContent(m_xmlData, true);
// Pull out all messages.
return xml_file.elementsByTagName(QSL("item"));
}
Message RdfParser::extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const {
Message new_message;
// Deal with title and description.
QString elem_title = msg_element.namedItem(QSL("title")).toElement().text().simplified();
QString elem_description = rawXmlChild(msg_element.namedItem(QSL("description")).toElement());
// Now we obtained maximum of information for title & description.
if (elem_title.isEmpty()) {
if (elem_description.isEmpty()) {
// BOTH title and description are empty, skip this message.
throw ApplicationException(QSL("Not enough data for the message."));
}
else {
// Title is empty but description is not.
new_message.m_title = qApp->web()->unescapeHtml(qApp->web()->stripTags(elem_description.simplified()));
new_message.m_contents = elem_description;
}
}
else {
// Title is really not empty, description does not matter.
new_message.m_title = qApp->web()->unescapeHtml(qApp->web()->stripTags(elem_title));
new_message.m_contents = elem_description;
}
QString raw_contents;
QTextStream str(&raw_contents);
msg_element.save(str, 0, QDomNode::EncodingPolicy::EncodingFromTextStream);
new_message.m_rawContents = raw_contents;
// Deal with link and author.
new_message.m_url = msg_element.namedItem(QSL("link")).toElement().text();
new_message.m_author = msg_element.namedItem(QSL("creator")).toElement().text();
// Deal with creation date.
QString elem_updated = msg_element.namedItem(QSL("date")).toElement().text();
if (elem_updated.isEmpty()) {
elem_updated = msg_element.namedItem(QSL("dc:date")).toElement().text();
}
// Deal with creation date.
new_message.m_created = TextFactory::parseDateTime(elem_updated);
new_message.m_createdFromFeed = !new_message.m_created.isNull();
if (!new_message.m_createdFromFeed) {
// Date was NOT obtained from the feed, set current date as creation date for the message.
new_message.m_created = current_time;
}
if (new_message.m_author.isNull()) {
new_message.m_author = QL1S("");
}
if (new_message.m_url.isNull()) {
new_message.m_url = QL1S("");
}
return new_message;
return m_xml.elementsByTagNameNS(m_rssNamespace, QSL("item"));
}
QString RdfParser::rssNamespace() const {
@ -93,3 +28,37 @@ QString RdfParser::rssNamespace() const {
// Returns the RDF namespace URI stored in m_rdfNamespace.
QString RdfParser::rdfNamespace() const {
return m_rdfNamespace;
}
// Returns the text of the first "title" element (RSS namespace) found below
// the given item.
QString RdfParser::messageTitle(const QDomElement& msg_element) const {
  const QDomNodeList titles = msg_element.elementsByTagNameNS(m_rssNamespace, QSL("title"));
  const QDomElement first_title = titles.at(0).toElement();

  return first_title.text();
}
// Returns the body of the given item. The text of the "encoded" element from
// the content-module namespace (m_rssCoNamespace) is preferred; when it is
// empty or whitespace-only, the text of the RSS "description" element is
// returned instead.
QString RdfParser::messageDescription(const QDomElement& msg_element) const {
  const QString encoded = msg_element.elementsByTagNameNS(m_rssCoNamespace, QSL("encoded")).at(0).toElement().text();

  if (!encoded.simplified().isEmpty()) {
    return encoded;
  }

  return msg_element.elementsByTagNameNS(m_rssNamespace, QSL("description")).at(0).toElement().text();
}
// Returns the text of the first "creator" element from the namespace stored in
// m_dcElNamespace found below the given item.
QString RdfParser::messageAuthor(const QDomElement& msg_element) const {
  const QDomNodeList creators = msg_element.elementsByTagNameNS(m_dcElNamespace, QSL("creator"));
  const QDomElement first_creator = creators.at(0).toElement();

  return first_creator.text();
}
// Returns the date of the given item: the text of the first "date" element from
// the namespace stored in m_dcElNamespace, parsed by TextFactory::parseDateTime().
QDateTime RdfParser::messageDateCreated(const QDomElement& msg_element) const {
  const QDomNodeList dates = msg_element.elementsByTagNameNS(m_dcElNamespace, QSL("date"));
  const QString date_text = dates.at(0).toElement().text();

  return TextFactory::parseDateTime(date_text);
}
// Returns the text of the first "identifier" element from the namespace stored
// in m_dcElNamespace found below the given item.
QString RdfParser::messageId(const QDomElement& msg_element) const {
  const QDomNodeList identifiers = msg_element.elementsByTagNameNS(m_dcElNamespace, QSL("identifier"));
  const QDomElement first_identifier = identifiers.at(0).toElement();

  return first_identifier.text();
}
// Returns the text of the first "link" element (RSS namespace) found below the
// given item.
QString RdfParser::messageUrl(const QDomElement& msg_element) const {
  const QDomNodeList links = msg_element.elementsByTagNameNS(m_rssNamespace, QSL("link"));
  const QDomElement first_link = links.at(0).toElement();

  return first_link.text();
}
// Always returns an empty list; the given element is not inspected by this
// implementation.
QList<Enclosure> RdfParser::messageEnclosures(const QDomElement& msg_element) const {
  return QList<Enclosure>();
}

View File

@ -16,12 +16,21 @@ class RdfParser : public FeedParser {
QString rdfNamespace() const;
QString rssNamespace() const;
private:
QDomNodeList messageElements();
Message extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const;
protected:
virtual QString messageTitle(const QDomElement& msg_element) const;
virtual QString messageDescription(const QDomElement& msg_element) const;
virtual QString messageAuthor(const QDomElement& msg_element) const;
virtual QDateTime messageDateCreated(const QDomElement& msg_element) const;
virtual QString messageId(const QDomElement& msg_element) const;
virtual QString messageUrl(const QDomElement& msg_element) const;
virtual QList<Enclosure> messageEnclosures(const QDomElement& msg_element) const;
virtual QDomNodeList messageElements();
private:
QString m_rdfNamespace;
QString m_rssNamespace;
QString m_rssCoNamespace;
QString m_dcElNamespace;
};
#endif // RDFPARSER_H

View File

@ -25,102 +25,63 @@ QDomNodeList RssParser::messageElements() {
}
}
Message RssParser::extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const {
Message new_message;
// Returns the text of the "title" item of the given message element, as looked
// up with namedItem().
QString RssParser::messageTitle(const QDomElement& msg_element) const {
  const QDomElement title_elem = msg_element.namedItem(QSL("title")).toElement();

  return title_elem.text();
}
// Deal with titles & descriptions.
QString elem_title = msg_element.namedItem(QSL("title")).toElement().text().simplified();
QString elem_description = rawXmlChild(msg_element.elementsByTagName(QSL("encoded")).at(0).toElement());
// Returns the body of the given message element. The raw XML of the first
// "encoded" element is preferred; when it is empty, the raw XML of the first
// "description" element is returned instead.
QString RssParser::messageDescription(const QDomElement& msg_element) const {
  const QString encoded = rawXmlChild(msg_element.elementsByTagName(QSL("encoded")).at(0).toElement());

  if (!encoded.isEmpty()) {
    return encoded;
  }

  return rawXmlChild(msg_element.elementsByTagName(QSL("description")).at(0).toElement());
}
// Returns the author of the given message element: the text of its "author"
// item, or the text of its "creator" item when "author" yields nothing.
QString RssParser::messageAuthor(const QDomElement& msg_element) const {
  const QString from_author = msg_element.namedItem(QSL("author")).toElement().text();

  if (!from_author.isEmpty()) {
    return from_author;
  }

  return msg_element.namedItem(QSL("creator")).toElement().text();
}
// Returns the date of the given message element. The "pubDate" item is parsed
// first; when TextFactory::parseDateTime() returns a null date for it, the
// result of parsing the "date" item is returned instead.
QDateTime RssParser::messageDateCreated(const QDomElement& msg_element) const {
  const QDateTime from_pub_date = TextFactory::parseDateTime(msg_element.namedItem(QSL("pubDate")).toElement().text());

  if (!from_pub_date.isNull()) {
    return from_pub_date;
  }

  return TextFactory::parseDateTime(msg_element.namedItem(QSL("date")).toElement().text());
}
// Returns the text of the "guid" item of the given message element.
QString RssParser::messageId(const QDomElement& msg_element) const {
  const QDomElement guid_elem = msg_element.namedItem(QSL("guid")).toElement();

  return guid_elem.text();
}
// Returns the URL of the given message element: the text of its "link" item,
// or the value of that item's "href" attribute when the text is empty.
QString RssParser::messageUrl(const QDomElement& msg_element) const {
  const QDomElement link_elem = msg_element.namedItem(QSL("link")).toElement();
  const QString link_text = link_elem.text();

  // Fall back to the "href" attribute when the element carries no text.
  return link_text.isEmpty() ? link_elem.attribute(QSL("href")) : link_text;
}
QList<Enclosure> RssParser::messageEnclosures(const QDomElement& msg_element) const {
QString elem_enclosure = msg_element.namedItem(QSL("enclosure")).toElement().attribute(QSL("url"));
QString elem_enclosure_type = msg_element.namedItem(QSL("enclosure")).toElement().attribute(QSL("type"));
new_message.m_customId = msg_element.namedItem(QSL("guid")).toElement().text();
new_message.m_url = msg_element.namedItem(QSL("link")).toElement().text();
if (new_message.m_url.isEmpty() && !new_message.m_enclosures.isEmpty()) {
new_message.m_url = new_message.m_enclosures.first().m_url;
}
if (new_message.m_url.isEmpty()) {
// Try to get "href" attribute.
new_message.m_url = msg_element.namedItem(QSL("link")).toElement().attribute(QSL("href"));
}
if (elem_description.isEmpty()) {
elem_description = rawXmlChild(msg_element.elementsByTagName(QSL("description")).at(0).toElement());
}
if (elem_description.isEmpty()) {
elem_description = new_message.m_url;
}
// Now we obtained maximum of information for title & description.
if (elem_title.isEmpty()) {
if (elem_description.isEmpty()) {
// BOTH title and description are empty, skip this message.
throw ApplicationException(QSL("Not enough data for the message."));
}
else {
// Title is empty but description is not.
new_message.m_title = qApp->web()->unescapeHtml(qApp->web()->stripTags(elem_description.simplified()));
new_message.m_contents = elem_description;
}
}
else {
// Title is really not empty, description does not matter.
new_message.m_title = qApp->web()->unescapeHtml(qApp->web()->stripTags(elem_title));
new_message.m_contents = elem_description;
}
if (!elem_enclosure.isEmpty()) {
if (elem_enclosure_type.isEmpty()) {
elem_enclosure_type = QSL(DEFAULT_ENCLOSURE_MIME_TYPE);
}
new_message.m_enclosures.append(Enclosure(elem_enclosure, elem_enclosure_type));
qDebugNN << LOGSEC_CORE
<< "Found enclosure"
<< QUOTE_W_SPACE(elem_enclosure)
<< "for the message.";
return { Enclosure(elem_enclosure, elem_enclosure_type) };
}
else {
new_message.m_enclosures.append(mrssGetEnclosures(msg_element));
return {};
}
QString raw_contents;
QTextStream str(&raw_contents);
msg_element.save(str, 0, QDomNode::EncodingPolicy::EncodingFromTextStream);
new_message.m_rawContents = raw_contents;
new_message.m_author = msg_element.namedItem(QSL("author")).toElement().text();
if (new_message.m_author.isEmpty()) {
new_message.m_author = msg_element.namedItem(QSL("creator")).toElement().text();
}
// Deal with creation date.
new_message.m_created = TextFactory::parseDateTime(msg_element.namedItem(QSL("pubDate")).toElement().text());
if (new_message.m_created.isNull()) {
new_message.m_created = TextFactory::parseDateTime(msg_element.namedItem(QSL("date")).toElement().text());
}
if (!(new_message.m_createdFromFeed = !new_message.m_created.isNull())) {
// Date was NOT obtained from the feed,
// set current date as creation date for the message.
new_message.m_created = current_time;
}
if (new_message.m_author.isNull()) {
new_message.m_author = QL1S("");
}
new_message.m_author = qApp->web()->unescapeHtml(new_message.m_author);
if (new_message.m_url.isNull()) {
new_message.m_url = QL1S("");
}
return new_message;
}

View File

@ -13,9 +13,15 @@ class RssParser : public FeedParser {
public:
explicit RssParser(const QString& data);
private:
QDomNodeList messageElements();
Message extractMessage(const QDomElement& msg_element, const QDateTime& current_time) const;
protected:
virtual QDomNodeList messageElements();
virtual QString messageTitle(const QDomElement& msg_element) const;
virtual QString messageDescription(const QDomElement& msg_element) const;
virtual QString messageAuthor(const QDomElement& msg_element) const;
virtual QDateTime messageDateCreated(const QDomElement& msg_element) const;
virtual QString messageId(const QDomElement& msg_element) const;
virtual QString messageUrl(const QDomElement& msg_element) const;
virtual QList<Enclosure> messageEnclosures(const QDomElement& msg_element) const;
};
#endif // RSSPARSER_H