diff --git a/src/librssguard/exceptions/feedrecognizedbutfailedexception.cpp b/src/librssguard/exceptions/feedrecognizedbutfailedexception.cpp index 50762e80c..ae35cb720 100644 --- a/src/librssguard/exceptions/feedrecognizedbutfailedexception.cpp +++ b/src/librssguard/exceptions/feedrecognizedbutfailedexception.cpp @@ -2,4 +2,10 @@ #include "exceptions/feedrecognizedbutfailedexception.h" -FeedRecognizedButFailedException::FeedRecognizedButFailedException(const QString &message) : ApplicationException(message) {} +FeedRecognizedButFailedException::FeedRecognizedButFailedException(const QString& message, + const QVariant& arbitrary_data) + : ApplicationException(message), m_arbitraryData(arbitrary_data) {} + +QVariant FeedRecognizedButFailedException::arbitraryData() const { + return m_arbitraryData; +} diff --git a/src/librssguard/exceptions/feedrecognizedbutfailedexception.h b/src/librssguard/exceptions/feedrecognizedbutfailedexception.h index 71f8d65f5..af3228693 100644 --- a/src/librssguard/exceptions/feedrecognizedbutfailedexception.h +++ b/src/librssguard/exceptions/feedrecognizedbutfailedexception.h @@ -5,9 +5,16 @@ #include "exceptions/applicationexception.h" +#include + class FeedRecognizedButFailedException : public ApplicationException { public: - explicit FeedRecognizedButFailedException(const QString& message = {}); + explicit FeedRecognizedButFailedException(const QString& message = {}, const QVariant& arbitrary_data = {}); + + QVariant arbitraryData() const; + + private: + QVariant m_arbitraryData; }; #endif // UNRECOGNIZEDFEEDFORMATEXCEPTION_H diff --git a/src/librssguard/network-web/basenetworkaccessmanager.cpp b/src/librssguard/network-web/basenetworkaccessmanager.cpp index e1d18cd3a..7338c9c83 100644 --- a/src/librssguard/network-web/basenetworkaccessmanager.cpp +++ b/src/librssguard/network-web/basenetworkaccessmanager.cpp @@ -66,15 +66,18 @@ QNetworkReply* BaseNetworkAccessManager::createRequest(QNetworkAccessManager::Op // new_request.setMaximumRedirectsAllowed(0); new_request.setRawHeader(HTTP_HEADERS_COOKIE, QSL("JSESSIONID= ").toLocal8Bit()); + /* + auto custom_ua = qApp->web()->customUserAgent(); - auto custom_ua = qApp->web()->customUserAgent(); + if (custom_ua.isEmpty()) { + new_request.setRawHeader(HTTP_HEADERS_USER_AGENT, HTTP_COMPLETE_USERAGENT); + } + else { + new_request.setRawHeader(HTTP_HEADERS_USER_AGENT, custom_ua.toLocal8Bit()); + } + */ - if (custom_ua.isEmpty()) { - new_request.setRawHeader(HTTP_HEADERS_USER_AGENT, HTTP_COMPLETE_USERAGENT); - } - else { - new_request.setRawHeader(HTTP_HEADERS_USER_AGENT, custom_ua.toLocal8Bit()); - } + new_request.setRawHeader(HTTP_HEADERS_USER_AGENT, " "); auto reply = QNetworkAccessManager::createRequest(op, new_request, outgoingData); return reply; diff --git a/src/librssguard/services/standard/definitions.h b/src/librssguard/services/standard/definitions.h index ca83c539b..a35c27487 100644 --- a/src/librssguard/services/standard/definitions.h +++ b/src/librssguard/services/standard/definitions.h @@ -6,4 +6,17 @@ #define FEED_INITIAL_OPML_PATTERN "feeds-%1.opml" #define DEFAULT_ENCLOSURE_MIME_TYPE "image/jpg" +#define ADVANCED_FEED_ADD_DIALOG_CODE 64 + +#define RSS_REGEX_MATCHER "]+type=\"application\\/(?:rss\\+xml)\"[^>]*>" +#define RSS_HREF_REGEX_MATCHER "href=\"([^\"]+)\"" + +#define JSON_REGEX_MATCHER "]+type=\"application\\/(?:feed\\+json|json)\"[^>]*>" +#define JSON_HREF_REGEX_MATCHER "href=\"([^\"]+)\"" + +#define ATOM_REGEX_MATCHER "]+type=\"application\\/(?:atom\\+xml|rss\\+xml)\"[^>]*>" +#define ATOM_HREF_REGEX_MATCHER "href=\"([^\"]+)\"" + +#define GITHUB_URL_REGEX "github\\.com\\/(\\w+)\\/(\\w+)" + #endif // STANDARD_DEFINITIONS_H diff --git a/src/librssguard/services/standard/gui/formdiscoverfeeds.cpp b/src/librssguard/services/standard/gui/formdiscoverfeeds.cpp index 778f68133..326b5573c 100644 --- a/src/librssguard/services/standard/gui/formdiscoverfeeds.cpp +++ b/src/librssguard/services/standard/gui/formdiscoverfeeds.cpp @@ -2,11 +2,13 @@ #include "services/standard/gui/formdiscoverfeeds.h" +#include "3rd-party/boolinq/boolinq.h" #include "gui/guiutilities.h" #include "miscellaneous/application.h" #include "miscellaneous/iconfactory.h" #include "services/abstract/category.h" #include "services/abstract/serviceroot.h" +#include "services/standard/definitions.h" #include "services/standard/standardfeed.h" #include "services/standard/parsers/atomparser.h" @@ -28,14 +30,20 @@ FormDiscoverFeeds::FormDiscoverFeeds(ServiceRoot* service_root, m_parsers = {new AtomParser({}), new RssParser({}), new RdfParser({}), new JsonParser({}), new SitemapParser({})}; + m_btnGoAdvanced = m_ui.m_buttonBox->addButton(tr("Close && &advanced mode"), QDialogButtonBox::ButtonRole::NoRole); m_btnImportSelectedFeeds = m_ui.m_buttonBox->addButton(tr("Import selected feeds"), QDialogButtonBox::ButtonRole::ActionRole); + m_btnGoAdvanced + ->setToolTip(tr("Close this dialog and display dialog for adding individual feeds with advanced options.")); + + m_btnGoAdvanced->setIcon(qApp->icons()->fromTheme(QSL("system-upgrade"))); m_btnImportSelectedFeeds->setIcon(qApp->icons()->fromTheme(QSL("document-import"))); m_ui.m_btnDiscover->setIcon(qApp->icons()->fromTheme(QSL("system-search"))); connect(m_ui.m_txtUrl->lineEdit(), &QLineEdit::textChanged, this, &FormDiscoverFeeds::onUrlChanged); connect(m_btnImportSelectedFeeds, &QPushButton::clicked, this, &FormDiscoverFeeds::importSelectedFeeds); + connect(m_btnGoAdvanced, &QPushButton::clicked, this, &FormDiscoverFeeds::userWantsAdvanced); connect(m_ui.m_btnDiscover, &QPushButton::clicked, this, &FormDiscoverFeeds::discoverFeeds); connect(&m_watcherLookup, &QFutureWatcher>::progressValueChanged, this, [=](int prog) { @@ -145,6 +153,11 @@ void FormDiscoverFeeds::addSingleFeed(StandardFeed* feed) { void FormDiscoverFeeds::importSelectedFeeds() {} +void FormDiscoverFeeds::userWantsAdvanced() { + setResult(ADVANCED_FEED_ADD_DIALOG_CODE); + close(); +} + void FormDiscoverFeeds::loadDiscoveredFeeds(const QList& feeds) { m_ui.m_pbDiscovery->setVisible(false); m_discoveredModel->setDiscoveredFeeds(feeds); @@ -166,16 +179,27 @@ QVariant DiscoveredFeedsModel::data(const QModelIndex& index, int role) const { switch (role) { case Qt::ItemDataRole::DisplayRole: { if (index.column() == 0) { - return m_discoveredFeeds.at(index.row())->title(); + return m_discoveredFeeds.at(index.row()).m_feed->title(); } else { - return StandardFeed::typeToString(m_discoveredFeeds.at(index.row())->type()); + return StandardFeed::typeToString(m_discoveredFeeds.at(index.row()).m_feed->type()); } } + case Qt::ItemDataRole::CheckStateRole: { + if (index.column() == 0) { + return m_discoveredFeeds.at(index.row()).m_isChecked ? Qt::CheckState::Checked : Qt::CheckState::Unchecked; + } + else { + return {}; + } + + break; + } + case Qt::ItemDataRole::DecorationRole: { if (index.column() == 0) { - return m_discoveredFeeds.at(index.row())->fullIcon(); + return m_discoveredFeeds.at(index.row()).m_feed->fullIcon(); } } @@ -184,12 +208,18 @@ QVariant DiscoveredFeedsModel::data(const QModelIndex& index, int role) const { } } -QList DiscoveredFeedsModel::discoveredFeeds() const { +QList DiscoveredFeedsModel::discoveredFeeds() const { return m_discoveredFeeds; } -void DiscoveredFeedsModel::setDiscoveredFeeds(const QList& newDiscoveredFeeds) { - m_discoveredFeeds = newDiscoveredFeeds; +void DiscoveredFeedsModel::setDiscoveredFeeds(const QList& feeds) { + auto std_feeds = boolinq::from(feeds) + .select([](StandardFeed* fd) { + return FeedItem{false, fd}; + }) + .toStdList(); + + m_discoveredFeeds = FROM_STD_LIST(QList, std_feeds); emit layoutAboutToBeChanged(); emit layoutChanged(); @@ -208,3 +238,17 @@ QVariant DiscoveredFeedsModel::headerData(int section, Qt::Orientation orientati return {}; } + +Qt::ItemFlags DiscoveredFeedsModel::flags(const QModelIndex& index) const { + return index.column() == 0 ? Qt::ItemFlag::ItemIsUserCheckable | QAbstractListModel::flags(index) + : QAbstractListModel::flags(index); +} + +bool DiscoveredFeedsModel::setData(const QModelIndex& index, const QVariant& value, int role) { + if (role == Qt::ItemDataRole::CheckStateRole && index.column() == 0) { + m_discoveredFeeds[index.row()].m_isChecked = value.value() == Qt::CheckState::Checked; + return true; + } + + return QAbstractListModel::setData(index, value, role); +} diff --git a/src/librssguard/services/standard/gui/formdiscoverfeeds.h b/src/librssguard/services/standard/gui/formdiscoverfeeds.h index 2fdb1e98e..4d28b1beb 100644 --- a/src/librssguard/services/standard/gui/formdiscoverfeeds.h +++ b/src/librssguard/services/standard/gui/formdiscoverfeeds.h @@ -19,18 +19,25 @@ class DiscoveredFeedsModel : public QAbstractListModel { Q_OBJECT public: + struct FeedItem { + bool m_isChecked; + StandardFeed* m_feed; + }; + explicit DiscoveredFeedsModel(QObject* parent = {}); virtual QVariant headerData(int section, Qt::Orientation orientation, int role) const; virtual int rowCount(const QModelIndex& parent) const; virtual int columnCount(const QModelIndex& parent) const; virtual QVariant data(const QModelIndex& index, int role) const; + virtual bool setData(const QModelIndex& index, const QVariant& value, int role); + virtual Qt::ItemFlags flags(const QModelIndex& index) const; - QList discoveredFeeds() const; - void setDiscoveredFeeds(const QList& newDiscoveredFeeds); + QList discoveredFeeds() const; + void setDiscoveredFeeds(const QList& feeds); private: - QList m_discoveredFeeds; + QList m_discoveredFeeds; }; class FormDiscoverFeeds : public QDialog { @@ -50,12 +57,14 @@ class FormDiscoverFeeds : public QDialog { void importSelectedFeeds(); private: + void userWantsAdvanced(); void loadDiscoveredFeeds(const QList& feeds); void loadCategories(const QList& categories, RootItem* root_item); private: Ui::FormDiscoverFeeds m_ui; QPushButton* m_btnImportSelectedFeeds; + QPushButton* m_btnGoAdvanced; ServiceRoot* m_serviceRoot; QList m_parsers; QFutureWatcher> m_watcherLookup; diff --git a/src/librssguard/services/standard/gui/formdiscoverfeeds.ui b/src/librssguard/services/standard/gui/formdiscoverfeeds.ui index 4d4e3f278..358fe774e 100644 --- a/src/librssguard/services/standard/gui/formdiscoverfeeds.ui +++ b/src/librssguard/services/standard/gui/formdiscoverfeeds.ui @@ -6,8 +6,8 @@ 0 0 - 406 - 334 + 513 + 360 diff --git a/src/librssguard/services/standard/parsers/atomparser.cpp b/src/librssguard/services/standard/parsers/atomparser.cpp index e77602393..474709524 100644 --- a/src/librssguard/services/standard/parsers/atomparser.cpp +++ b/src/librssguard/services/standard/parsers/atomparser.cpp @@ -4,6 +4,8 @@ #include "definitions/definitions.h" #include "exceptions/applicationexception.h" +#include "miscellaneous/application.h" +#include "miscellaneous/settings.h" #include "miscellaneous/textfactory.h" #include "services/standard/definitions.h" #include "services/standard/standardfeed.h" @@ -24,7 +26,178 @@ AtomParser::AtomParser(const QString& data) : FeedParser(data) { AtomParser::~AtomParser() {} QList AtomParser::discoverFeeds(ServiceRoot* root, const QUrl& url) const { - return {}; + QString my_url = url.toString(); + QList feeds; + + // 1. Test direct URL for a feed. + // 2. Test embedded ATOM feed links from HTML data. + // 3. Test "URL/feed" endpoint. + // 4. Test "URL/atom" endpoint. + // 5. If URL is Github repository, test for: + // https://github.com/:owner/:repo/releases.atom + // https://github.com/:owner/:repo/commits.atom + // https://github.com/:user/:repo/tags.atom + + // Download URL. + int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + // 1. + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + + return {guessed_feed.first}; + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + + // 2. + QRegularExpression rx(QSL(ATOM_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + QRegularExpression rx_href(QSL(ATOM_HREF_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + + rx_href.optimize(); + + QRegularExpressionMatchIterator it_rx = rx.globalMatch(QString::fromUtf8(data)); + + while (it_rx.hasNext()) { + QRegularExpressionMatch mat_tx = it_rx.next(); + QString link_tag = mat_tx.captured(); + QString feed_link = rx_href.match(link_tag).captured(1); + + if (feed_link.startsWith(QL1S("//"))) { + feed_link = QSL(URI_SCHEME_HTTP) + feed_link.mid(2); + } + else if (feed_link.startsWith(QL1C('/'))) { + feed_link = url.toString(QUrl::UrlFormattingOption::RemovePath | QUrl::UrlFormattingOption::RemoveQuery | + QUrl::UrlFormattingOption::StripTrailingSlash) + + feed_link; + } + + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(feed_link, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(feed_link); + feeds.append(guessed_feed.first); + } + catch (const ApplicationException& ex) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(feed_link) + << " should be direct link to feed file but was not recognized:" << QUOTE_W_SPACE_DOT(ex.message()); + } + } + } + } + + // 3. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/feed"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + + // 4. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/atom"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + + // 5. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash); + + if (QRegularExpression(QSL(GITHUB_URL_REGEX)).match(my_url).isValid()) { + QStringList github_feeds = {QSL("releases.atom"), QSL("commits.atom"), QSL("tags.atom")}; + + for (const QString& github_feed : github_feeds) { + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QL1C('/') + github_feed; + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + } + } + + return feeds; } QPair> AtomParser::guessFeed(const QByteArray& content, diff --git a/src/librssguard/services/standard/parsers/jsonparser.cpp b/src/librssguard/services/standard/parsers/jsonparser.cpp index 539c0ec6c..08df7c872 100644 --- a/src/librssguard/services/standard/parsers/jsonparser.cpp +++ b/src/librssguard/services/standard/parsers/jsonparser.cpp @@ -6,6 +6,7 @@ #include "definitions/typedefs.h" #include "exceptions/applicationexception.h" #include "exceptions/feedrecognizedbutfailedexception.h" +#include "miscellaneous/settings.h" #include "miscellaneous/textfactory.h" #include "services/standard/definitions.h" #include "services/standard/standardfeed.h" @@ -19,7 +20,89 @@ JsonParser::JsonParser(const QString& data) : FeedParser(data, false) {} JsonParser::~JsonParser() {} QList JsonParser::discoverFeeds(ServiceRoot* root, const QUrl& url) const { - return {}; + QString my_url = url.toString(); + QList feeds; + + // 1. Test direct URL for a feed. + // 2. Test embedded JSON feed links from HTML data. + + // Download URL. + int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + // 1. + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + + return {guessed_feed.first}; + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + + // 2. + QRegularExpression rx(QSL(JSON_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + QRegularExpression rx_href(QSL(JSON_HREF_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + + rx_href.optimize(); + + QRegularExpressionMatchIterator it_rx = rx.globalMatch(QString::fromUtf8(data)); + + while (it_rx.hasNext()) { + QRegularExpressionMatch mat_tx = it_rx.next(); + QString link_tag = mat_tx.captured(); + QString feed_link = rx_href.match(link_tag).captured(1); + + if (feed_link.startsWith(QL1S("//"))) { + feed_link = QSL(URI_SCHEME_HTTP) + feed_link.mid(2); + } + else if (feed_link.startsWith(QL1C('/'))) { + feed_link = url.toString(QUrl::UrlFormattingOption::RemovePath | QUrl::UrlFormattingOption::RemoveQuery | + QUrl::UrlFormattingOption::StripTrailingSlash) + + feed_link; + } + + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(feed_link, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(feed_link); + feeds.append(guessed_feed.first); + } + catch (const ApplicationException& ex) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(feed_link) + << " should be direct link to feed file but was not recognized:" << QUOTE_W_SPACE_DOT(ex.message()); + } + } + } + } + + return feeds; } QPair> JsonParser::guessFeed(const QByteArray& content, diff --git a/src/librssguard/services/standard/parsers/rdfparser.cpp b/src/librssguard/services/standard/parsers/rdfparser.cpp index 1adcd9c3a..7a475b65a 100644 --- a/src/librssguard/services/standard/parsers/rdfparser.cpp +++ b/src/librssguard/services/standard/parsers/rdfparser.cpp @@ -3,6 +3,7 @@ #include "services/standard/parsers/rdfparser.h" #include "exceptions/applicationexception.h" +#include "miscellaneous/settings.h" #include "miscellaneous/textfactory.h" #include "services/standard/definitions.h" #include "services/standard/standardfeed.h" @@ -18,7 +19,141 @@ RdfParser::RdfParser(const QString& data) RdfParser::~RdfParser() {} QList RdfParser::discoverFeeds(ServiceRoot* root, const QUrl& url) const { - return {}; + QString my_url = url.toString(); + QList feeds; + + // 1. Test direct URL for a feed. + // 2. Test embedded RDF feed links from HTML data. + // 3. Test "URL/feed" endpoint. + // 4. Test "URL/rdf" endpoint. + + // Download URL. + int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + // 1. + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + + return {guessed_feed.first}; + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + + // 2. + QRegularExpression rx(QSL(RSS_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + QRegularExpression rx_href(QSL(RSS_HREF_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + + rx_href.optimize(); + + QRegularExpressionMatchIterator it_rx = rx.globalMatch(QString::fromUtf8(data)); + + while (it_rx.hasNext()) { + QRegularExpressionMatch mat_tx = it_rx.next(); + QString link_tag = mat_tx.captured(); + QString feed_link = rx_href.match(link_tag).captured(1); + + if (feed_link.startsWith(QL1S("//"))) { + feed_link = QSL(URI_SCHEME_HTTP) + feed_link.mid(2); + } + else if (feed_link.startsWith(QL1C('/'))) { + feed_link = url.toString(QUrl::UrlFormattingOption::RemovePath | QUrl::UrlFormattingOption::RemoveQuery | + QUrl::UrlFormattingOption::StripTrailingSlash) + + feed_link; + } + + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(feed_link, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(feed_link); + feeds.append(guessed_feed.first); + } + catch (const ApplicationException& ex) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(feed_link) + << " should be direct link to feed file but was not recognized:" << QUOTE_W_SPACE_DOT(ex.message()); + } + } + } + } + + // 3. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/feed"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + + // 4. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/rdf"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + + return feeds; } QPair> RdfParser::guessFeed(const QByteArray& content, diff --git a/src/librssguard/services/standard/parsers/rssparser.cpp b/src/librssguard/services/standard/parsers/rssparser.cpp index faa2f7aba..8bfce44e8 100644 --- a/src/librssguard/services/standard/parsers/rssparser.cpp +++ b/src/librssguard/services/standard/parsers/rssparser.cpp @@ -19,12 +19,18 @@ RssParser::RssParser(const QString& data) : FeedParser(data) {} RssParser::~RssParser() {} QList RssParser::discoverFeeds(ServiceRoot* root, const QUrl& url) const { + QString my_url = url.toString(); QList feeds; + // 1. Test direct URL for a feed. + // 2. Test embedded RSS feed links from HTML data. + // 3. Test "URL/feed" endpoint. + // 4. Test "URL/rss" endpoint. + // Download URL. int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); QByteArray data; - auto res = NetworkFactory::performNetworkOperation(url.toString(), + auto res = NetworkFactory::performNetworkOperation(my_url, timeout, {}, data, @@ -36,20 +42,21 @@ QList RssParser::discoverFeeds(ServiceRoot* root, const QUrl& url root->networkProxy()); if (res.m_networkError == QNetworkReply::NetworkError::NoError) { - // Parse result, might be HTML or directly the feed file. try { + // 1. auto guessed_feed = guessFeed(data, res.m_contentType); - guessed_feed.first->setSource(url.toString()); + guessed_feed.first->setSource(my_url); return {guessed_feed.first}; } catch (...) { - qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(url) << "is not a direct feed file."; + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; } - QRegularExpression rx(QSL(FEED_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); - QRegularExpression rx_href(QSL(FEED_HREF_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + // 2. + QRegularExpression rx(QSL(RSS_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); + QRegularExpression rx_href(QSL(RSS_HREF_REGEX_MATCHER), QRegularExpression::PatternOption::CaseInsensitiveOption); rx_href.optimize(); @@ -82,22 +89,70 @@ QList RssParser::discoverFeeds(ServiceRoot* root, const QUrl& url root->networkProxy()); if (res.m_networkError == QNetworkReply::NetworkError::NoError) { - // Parse result, might be HTML or directly the feed file. try { auto guessed_feed = guessFeed(data, res.m_contentType); - guessed_feed.first->setSource(url.toString()); - + guessed_feed.first->setSource(feed_link); feeds.append(guessed_feed.first); } catch (const ApplicationException& ex) { - qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(url) + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(feed_link) << " should be direct link to feed file but was not recognized:" << QUOTE_W_SPACE_DOT(ex.message()); } } } } + // 3. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/feed"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + + // 4. + my_url = url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + QSL("/rss"); + res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.append(guessed_feed.first); + } + catch (...) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct feed file."; + } + } + return feeds; } diff --git a/src/librssguard/services/standard/parsers/sitemapparser.cpp b/src/librssguard/services/standard/parsers/sitemapparser.cpp index 79dbdd3de..4d3ec78b8 100644 --- a/src/librssguard/services/standard/parsers/sitemapparser.cpp +++ b/src/librssguard/services/standard/parsers/sitemapparser.cpp @@ -9,6 +9,7 @@ #include "definitions/definitions.h" #include "exceptions/applicationexception.h" #include "exceptions/feedrecognizedbutfailedexception.h" +#include "miscellaneous/settings.h" #include "miscellaneous/textfactory.h" #include "services/standard/definitions.h" @@ -21,7 +22,108 @@ SitemapParser::SitemapParser(const QString& data) : FeedParser(data) {} SitemapParser::~SitemapParser() {} QList SitemapParser::discoverFeeds(ServiceRoot* root, const QUrl& url) const { - return {}; + QHash feeds; + QStringList to_process_sitemaps; + + // 1. Process "URL/robots.txt" file. + // 2. Process "URLHOST/robots.txt" file. + // 3. Direct URL test. If sitemap index, process its children. + // 4. Test "URL/sitemap.xml" endpoint. + // 5. Test "URL/sitemap.xml.gz" endpoint. + + // 1. + // 2. + QStringList to_process_robots = { + url.toString(QUrl::UrlFormattingOption::StripTrailingSlash).replace(QRegularExpression(QSL("\\/$")), QString()) + + QSL("/robots.txt"), + url.toString(QUrl::UrlFormattingOption::RemovePath | QUrl::UrlFormattingOption::RemoveQuery) + QSL("/robots.txt")}; + + to_process_robots.removeDuplicates(); + + for (const QString& robots_url : to_process_robots) { + // Download URL. + int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(robots_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + QRegularExpression rx(QSL("Sitemap: ?([^\\r\\n]+)"), + QRegularExpression::PatternOption::CaseInsensitiveOption | + QRegularExpression::PatternOption::MultilineOption); + QRegularExpressionMatchIterator it_rx = rx.globalMatch(QString::fromUtf8(data)); + + while (it_rx.hasNext()) { + QString sitemap_link = it_rx.next().captured(1); + + to_process_sitemaps.append(sitemap_link); + } + } + } + + // 3. + to_process_sitemaps.append(url.toString()); + + // 4. + to_process_sitemaps.append(url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + .replace(QRegularExpression(QSL("\\/$")), QString()) + + QSL("/sitemap.xml")); + + // 5. + to_process_sitemaps.append(url.toString(QUrl::UrlFormattingOption::StripTrailingSlash) + .replace(QRegularExpression(QSL("\\/$")), QString()) + + QSL("/sitemap.xml.gz")); + + while (!to_process_sitemaps.isEmpty()) { + to_process_sitemaps.removeDuplicates(); + + QString my_url = to_process_sitemaps.takeFirst(); + + if (feeds.contains(my_url)) { + continue; + } + + // Download URL. + int timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt(); + QByteArray data; + auto res = NetworkFactory::performNetworkOperation(my_url, + timeout, + {}, + data, + QNetworkAccessManager::Operation::GetOperation, + {}, + {}, + {}, + {}, + root->networkProxy()); + + if (res.m_networkError == QNetworkReply::NetworkError::NoError) { + try { + // 1. + auto guessed_feed = guessFeed(data, res.m_contentType); + + guessed_feed.first->setSource(my_url); + feeds.insert(my_url, guessed_feed.first); + } + catch (const FeedRecognizedButFailedException& ex) { + // This is index. + to_process_sitemaps.append(ex.arbitraryData().toStringList()); + } + catch (const ApplicationException&) { + qDebugNN << LOGSEC_CORE << QUOTE_W_SPACE(my_url) << "is not a direct sitemap file."; + } + } + } + + return feeds.values(); } QPair> SitemapParser::guessFeed(const QByteArray& content, @@ -73,7 +175,14 @@ QPair> SitemapParser::guessFeed(const QByteAr QDomElement root_element = xml_document.documentElement(); if (root_element.tagName() == QSL("sitemapindex")) { - throw FeedRecognizedButFailedException(QObject::tr("sitemap indices are not supported")); + QStringList locs; + int i = 0; + + for (QDomNodeList ndl = root_element.elementsByTagNameNS(sitemapNamespace(), QSL("loc")); i < ndl.size(); i++) { + locs << ndl.at(i).toElement().text(); + } + + throw FeedRecognizedButFailedException(QObject::tr("sitemap indices are not supported"), locs); } if (root_element.tagName() != QSL("urlset")) { @@ -180,5 +289,5 @@ QList SitemapParser::xmlMessageEnclosures(const QDomElement& msg_elem } bool SitemapParser::isGzip(const QByteArray& content) { - return ((content[0] & 0xFF) == 0x1f) && ((content[1] & 0xFF) == 0x8b); + return content.size() >= 2 && ((content[0] & 0xFF) == 0x1f) && ((content[1] & 0xFF) == 0x8b); } diff --git a/src/librssguard/services/standard/standardserviceroot.cpp b/src/librssguard/services/standard/standardserviceroot.cpp index c1754d2f2..22e8bc855 100644 --- a/src/librssguard/services/standard/standardserviceroot.cpp +++ b/src/librssguard/services/standard/standardserviceroot.cpp @@ -140,16 +140,14 @@ void StandardServiceRoot::addNewFeed(RootItem* selected_item, const QString& url url, qApp->mainFormWidget())); - form_discover->exec(); + if (form_discover->exec() == ADVANCED_FEED_ADD_DIALOG_CODE) { + QScopedPointer form_pointer(new FormStandardFeedDetails(this, + selected_item, + url, + qApp->mainFormWidget())); - /* - QScopedPointer form_pointer(new FormStandardFeedDetails(this, - selected_item, - url, - qApp->mainFormWidget())); - - form_pointer->addEditFeed(); - */ + form_pointer->addEditFeed(); + } qApp->feedUpdateLock()->unlock(); } diff --git a/src/librssguard/services/standard/standardserviceroot.h b/src/librssguard/services/standard/standardserviceroot.h index 35e03868b..be9a8f2a6 100644 --- a/src/librssguard/services/standard/standardserviceroot.h +++ b/src/librssguard/services/standard/standardserviceroot.h @@ -15,10 +15,10 @@ class FeedsImportExportModel; class QMenu; class StandardServiceRoot : public ServiceRoot { - Q_OBJECT + Q_OBJECT - friend class FormStandardFeedDetails; - friend class FormStandardImportExport; + friend class FormStandardFeedDetails; + friend class FormStandardImportExport; public: explicit StandardServiceRoot(RootItem* parent = nullptr); @@ -48,7 +48,6 @@ class StandardServiceRoot : public ServiceRoot { void exportFeeds(); private: - // Takes structure residing under given root item and adds feeds/categories from // it to active structure. // NOTE: This is used for import/export of the model.