From c7c069d889004dfdfd5a4b7e7c01fc415526a327 Mon Sep 17 00:00:00 2001
From: Martin Rotter
Date: Mon, 22 Apr 2024 14:37:49 +0200
Subject: [PATCH] starting to work on fetching full article content

---
Review notes:
 - extract-article.mjs reports failed extraction with non-zero exit code.
 - ArticleParse::parseArticle() stops when Node.js is misconfigured and also
   handles extractor process which fails to start.
 - WebBrowser re-enables "Load full source article" action after every answer.
 - Text in angle brackets (include targets, template arguments, XML tags) is
   missing in unchanged context lines of this patch as received. Context lines
   are kept as found and likely need to be restored from the target tree
   before the patch applies.

 resources/rssguard.qrc                        |   1 +
 .../article-extractor/extract-article.mjs     |  22 +++
 src/librssguard/CMakeLists.txt                |   2 +
 src/librssguard/gui/webbrowser.cpp            |  65 ++++++-
 src/librssguard/gui/webbrowser.h              |   6 +
 src/librssguard/network-web/articleparse.cpp  | 165 ++++++++++++++++++
 src/librssguard/network-web/articleparse.h    |  42 +++++
 src/librssguard/network-web/webfactory.cpp    |   6 +
 src/librssguard/network-web/webfactory.h      |   3 +
 9 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100644 resources/scripts/article-extractor/extract-article.mjs
 create mode 100644 src/librssguard/network-web/articleparse.cpp
 create mode 100644 src/librssguard/network-web/articleparse.h

diff --git a/resources/rssguard.qrc b/resources/rssguard.qrc
index b43cf2970..d73ac7411 100644
--- a/resources/rssguard.qrc
+++ b/resources/rssguard.qrc
@@ -26,6 +26,7 @@
 
     scripts/adblock/adblock-server.js
     scripts/readability/readabilize-article.js
+    scripts/article-extractor/extract-article.mjs
 
     scripts/filters/blacklist.js
     scripts/filters/whitelist.js
diff --git a/resources/scripts/article-extractor/extract-article.mjs b/resources/scripts/article-extractor/extract-article.mjs
new file mode 100644
index 000000000..f118769a8
--- /dev/null
+++ b/resources/scripts/article-extractor/extract-article.mjs
@@ -0,0 +1,22 @@
+// Extracts the main article from the web page whose URL is passed as the first
+// command-line argument and prints it to stdout as a single JSON document.
+// On failure the error goes to stderr and the process ends with non-zero exit code.
+import { extract } from '@extractus/article-extractor'
+
+const input = process.argv[2];
+
+try {
+  const article = await extract(input);
+
+  if (!article) {
+    throw new Error(`No article could be extracted from '${input}'.`);
+  }
+
+  console.log(JSON.stringify(article));
+}
+catch (err) {
+  console.error(err);
+
+  // Tell the parent process about the failure, exit code would be 0 otherwise.
+  process.exitCode = 1;
+}
diff --git a/src/librssguard/CMakeLists.txt b/src/librssguard/CMakeLists.txt
index 7df9f3a98..5412dac88 100644
--- a/src/librssguard/CMakeLists.txt
+++ b/src/librssguard/CMakeLists.txt
@@ -253,6 +253,8 @@ set(SOURCES
   network-web/adblock/adblockrequestinfo.h
   network-web/apiserver.cpp
   network-web/apiserver.h
+  network-web/articleparse.cpp
+  network-web/articleparse.h
   network-web/basenetworkaccessmanager.cpp
   network-web/basenetworkaccessmanager.h
   network-web/cookiejar.cpp
diff --git a/src/librssguard/gui/webbrowser.cpp b/src/librssguard/gui/webbrowser.cpp
index dc4540277..611bd8d5c 100644
--- a/src/librssguard/gui/webbrowser.cpp
+++ b/src/librssguard/gui/webbrowser.cpp
@@ -12,10 +12,12 @@
 #include "miscellaneous/application.h"
 #include "miscellaneous/iconfactory.h"
 #include "miscellaneous/settings.h"
+#include "network-web/articleparse.h"
 #include "network-web/networkfactory.h"
 #include "network-web/readability.h"
 #include "network-web/webfactory.h"
 
+#include <QJsonDocument>
 #include
 #include
 #include
@@ -37,7 +39,10 @@ WebBrowser::WebBrowser(WebViewer* viewer, QWidget* parent)
 #endif
     m_actionReadabilePage(new QAction(qApp->icons()->fromTheme(QSL("text-html")),
                                       tr("View website in reader mode"),
-                                      this)) {
+                                      this)),
+    m_actionGetFullArticle(new QAction(qApp->icons()->fromTheme(QSL("download"), QSL("browser-download")),
+                                       tr("Load full source article"),
+                                       this)) {
   if (m_webView == nullptr) {
     m_webView = qApp->createWebView();
     dynamic_cast(m_webView)->setParent(this);
@@ -87,6 +92,7 @@ void WebBrowser::createConnections() {
 
   connect(m_actionOpenInSystemBrowser, &QAction::triggered, this, &WebBrowser::openCurrentSiteInSystemBrowser);
   connect(m_actionReadabilePage, &QAction::triggered, this, &WebBrowser::readabilePage);
+  connect(m_actionGetFullArticle, &QAction::triggered, this, &WebBrowser::getFullArticle);
 
 #if defined(ENABLE_MEDIAPLAYER)
   connect(m_actionPlayPageInMediaPlayer, &QAction::triggered, this, &WebBrowser::playCurrentSiteInMediaPlayer);
@@ -99,6 +105,9 @@ void WebBrowser::createConnections() {
 
   connect(qApp->web()->readability(), &Readability::htmlReadabled, this, &WebBrowser::setReadabledHtml);
   connect(qApp->web()->readability(), &Readability::errorOnHtmlReadabiliting, this, &WebBrowser::readabilityFailed);
+
+  connect(qApp->web()->articleParse(), &ArticleParse::articleParsed, this, &WebBrowser::setFullArticleHtml);
+  connect(qApp->web()->articleParse(), &ArticleParse::errorOnArticlePArsing, this, &WebBrowser::fullArticleFailed);
 }
 
 void WebBrowser::updateUrl(const QUrl& url) {
@@ -182,6 +191,13 @@ void WebBrowser::readabilePage() {
   qApp->web()->readability()->makeHtmlReadable(this, m_webView->html(), m_webView->url().toString());
 }
 
+// Asks the article extractor for full content of the currently displayed page.
+// The action stays disabled until the extractor answers.
+void WebBrowser::getFullArticle() {
+  m_actionGetFullArticle->setEnabled(false);
+  qApp->web()->articleParse()->parseArticle(this, m_webView->url().toString());
+}
+
 bool WebBrowser::eventFilter(QObject* watched, QEvent* event) {
   Q_UNUSED(watched)
 
@@ -292,6 +308,48 @@ void WebBrowser::readabilityFailed(QObject* sndr, const QString& error) {
   }
 }
 
+// Displays article extracted for this browser, "json_answer" is the JSON printed by
+// the extractor script. Answers with null sender are status broadcasts of ArticleParse.
+void WebBrowser::setFullArticleHtml(QObject* sndr, const QString& json_answer) {
+  if (sndr != nullptr && sndr != this) {
+    // Answer for another browser instance.
+    return;
+  }
+
+  // Action was disabled in getFullArticle(), allow user to trigger it again.
+  const auto url = m_webView->url();
+
+  m_actionGetFullArticle->setEnabled(url.isValid() && !url.host().isEmpty());
+
+  if (sndr != this || json_answer.isEmpty()) {
+    // Status broadcast or empty answer, there is nothing to display.
+    return;
+  }
+
+  const QJsonDocument json_doc = QJsonDocument::fromJson(json_answer.toUtf8());
+  const QString better_html = json_doc[QSL("content")].toString();
+
+  // Keep current page if the answer is not valid JSON or has no "content".
+  if (!better_html.isEmpty()) {
+    m_webView->setReadabledHtml(better_html, url);
+  }
+}
+
+// Displays error reported by the extractor for this browser and re-enables the action.
+void WebBrowser::fullArticleFailed(QObject* sndr, const QString& error) {
+  if (sndr != this) {
+    return;
+  }
+
+  const auto url = m_webView->url();
+
+  m_actionGetFullArticle->setEnabled(url.isValid() && !url.host().isEmpty());
+
+  if (!error.isEmpty()) {
+    m_webView->setReadabledHtml(error, url);
+  }
+}
+
 void WebBrowser::initializeLayout() {
   m_toolBar->setFloatable(false);
   m_toolBar->setMovable(false);
@@ -324,9 +382,11 @@ void WebBrowser::initializeLayout() {
 
   m_actionOpenInSystemBrowser->setEnabled(false);
   m_actionReadabilePage->setEnabled(false);
+  m_actionGetFullArticle->setEnabled(false);
 
   // Add needed actions into toolbar.
   m_toolBar->addAction(m_actionOpenInSystemBrowser);
+  m_toolBar->addAction(m_actionGetFullArticle);
   m_toolBar->addAction(m_actionReadabilePage);
 
 #if defined(ENABLE_MEDIAPLAYER)
@@ -358,6 +418,7 @@ void WebBrowser::onLoadingStarted() {
   m_loadingProgress->show();
   m_actionOpenInSystemBrowser->setEnabled(false);
   m_actionReadabilePage->setEnabled(false);
+  m_actionGetFullArticle->setEnabled(false);
 
 #if defined(ENABLE_MEDIAPLAYER)
   m_actionPlayPageInMediaPlayer->setEnabled(false);
@@ -375,6 +436,7 @@ void WebBrowser::onLoadingFinished(bool success) {
 
   if (url.isValid() && !url.host().isEmpty()) {
     m_actionOpenInSystemBrowser->setEnabled(true);
+    m_actionGetFullArticle->setEnabled(true);
     m_actionReadabilePage->setEnabled(true);
 
 #if defined(ENABLE_MEDIAPLAYER)
@@ -384,6 +446,7 @@ void WebBrowser::onLoadingFinished(bool success) {
   else {
     m_actionOpenInSystemBrowser->setEnabled(false);
     m_actionReadabilePage->setEnabled(false);
+    m_actionGetFullArticle->setEnabled(false);
 
 #if defined(ENABLE_MEDIAPLAYER)
     m_actionPlayPageInMediaPlayer->setEnabled(false);
diff --git a/src/librssguard/gui/webbrowser.h b/src/librssguard/gui/webbrowser.h
index 5692cc326..9ede837e5 100644
--- a/src/librssguard/gui/webbrowser.h
+++ b/src/librssguard/gui/webbrowser.h
@@ -74,9 +74,14 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
     void newWindowRequested(WebViewer* viewer);
 
     void readabilePage();
+    void getFullArticle();
+
     void setReadabledHtml(QObject* sndr, const QString& better_html);
     void readabilityFailed(QObject* sndr, const QString& error);
 
+    void setFullArticleHtml(QObject* sndr, const QString& json_answer);
+    void fullArticleFailed(QObject* sndr, const QString& error);
+
   signals:
     void windowCloseRequested();
     void iconChanged(int index, const QIcon& icon);
@@ -106,6 +111,7 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
 #endif
 
     QAction* m_actionReadabilePage;
+    QAction* m_actionGetFullArticle;
 
     QList m_messages;
     QPointer m_root;
diff --git a/src/librssguard/network-web/articleparse.cpp b/src/librssguard/network-web/articleparse.cpp
new file mode 100644
index 000000000..6e83553c1
--- /dev/null
+++ b/src/librssguard/network-web/articleparse.cpp
@@ -0,0 +1,165 @@
+// For license of this file, see /LICENSE.md.
+
+#include "network-web/articleparse.h"
+
+#include "3rd-party/boolinq/boolinq.h"
+#include "exceptions/applicationexception.h"
+#include "miscellaneous/application.h"
+
+#include <QDir>
+
+#define EXTRACTOR_PACKAGE "@extractus/article-extractor"
+#define EXTRACTOR_VERSION "8.0.7"
+
+// Subscribes to package notifications of Node.js integration.
+ArticleParse::ArticleParse(QObject* parent) : QObject{parent}, m_modulesInstalling(false), m_modulesInstalled(false) {
+  connect(qApp->nodejs(), &NodeJs::packageInstalledUpdated, this, &ArticleParse::onPackageReady);
+  connect(qApp->nodejs(), &NodeJs::packageError, this, &ArticleParse::onPackageError);
+}
+
+// Called when Node.js packages were installed or updated. Notifications which do
+// not mention the article-extractor package are ignored.
+void ArticleParse::onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date) {
+  Q_UNUSED(already_up_to_date)
+
+  bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
+    return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
+  });
+
+  if (!concerns_extractor) {
+    return;
+  }
+
+  m_modulesInstalled = true;
+  m_modulesInstalling = false;
+
+  qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                       {tr("Packages for article-extractor are installed"),
+                        tr("Press the button once more!"),
+                        QSystemTrayIcon::MessageIcon::Information},
+                       {true, true, false});
+
+  // Broadcast (null sender) just to allow the feature again for user.
+  emit articleParsed(nullptr, tr("Packages for article-extractor are installed. You can now use this feature!"));
+}
+
+// Called when installation of Node.js packages failed. Notifications which do
+// not mention the article-extractor package are ignored.
+void ArticleParse::onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error) {
+  bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
+    return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
+  });
+
+  if (!concerns_extractor) {
+    return;
+  }
+
+  m_modulesInstalled = m_modulesInstalling = false;
+
+  qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                       {tr("Packages for article-extractor are NOT installed"),
+                        tr("There is error: %1").arg(error),
+                        QSystemTrayIcon::MessageIcon::Critical},
+                       {true, true, false});
+
+  // Broadcast (null sender) just to allow the feature again for user.
+  emit articleParsed(nullptr, tr("Packages for article-extractor are NOT installed. There is error: %1").arg(error));
+}
+
+// Extracts full article from "url" and reports the result via articleParsed() or
+// errorOnArticlePArsing(), both tagged with "sndr". If the extractor package is
+// not up to date, its installation is requested instead and nothing is extracted.
+void ArticleParse::parseArticle(QObject* sndr, const QString& url) {
+  if (!m_modulesInstalled) {
+    // NOTE: Here we use MJS file directly placed in its NODE package folder
+    // because NODE_PATH is not supported for MJS files.
+    m_scriptFilename = qApp->nodejs()->processedPackageFolder() + QDir::separator() + QSL("extract-article.mjs");
+
+    if (!IOFactory::copyFile(QSL(":/scripts/article-extractor/extract-article.mjs"), m_scriptFilename)) {
+      qCriticalNN << LOGSEC_CORE << "Failed to copy article-extractor script to TEMP.";
+    }
+
+    try {
+      NodeJs::PackageStatus st_extractor =
+        qApp->nodejs()->packageStatus({QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)});
+
+      if (st_extractor != NodeJs::PackageStatus::UpToDate) {
+        if (!m_modulesInstalling) {
+          // We make sure to update modules.
+          m_modulesInstalling = true;
+
+          qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                               {tr("Node.js libraries not installed"),
+                                tr("%1 will now install some needed libraries, this will take only a few seconds. "
+                                   "You will be notified when installation is complete.")
+                                  .arg(QSL(APP_NAME)),
+                                QSystemTrayIcon::MessageIcon::Warning},
+                               {true, true, false});
+          qApp->nodejs()->installUpdatePackages({{QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)}});
+        }
+
+        return;
+      }
+      else {
+        m_modulesInstalled = true;
+      }
+    }
+    catch (const ApplicationException& ex) {
+      qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
+                           {tr("Node.js libraries not installed"),
+                            tr("Node.js is not configured properly. Go to \"Settings\" -> \"Node.js\" and check "
+                               "if your Node.js is properly configured."),
+                            QSystemTrayIcon::MessageIcon::Critical},
+                           {true, true, false});
+
+      qCriticalNN << LOGSEC_CORE << "Failed to check for Node.js package status:" << QUOTE_W_SPACE_DOT(ex.message());
+
+      // Report the problem as error and stop, package status could not even be checked.
+      emit errorOnArticlePArsing(sndr,
+                                 tr("Node.js is not configured properly. Go to \"Settings\" -> \"Node.js\" and check "
+                                    "if your Node.js is properly configured."));
+      return;
+    }
+  }
+
+  QProcess* proc = new QProcess(this);
+
+  connect(proc,
+          QOverload<int, QProcess::ExitStatus>::of(&QProcess::finished),
+          this,
+          [=](int exit_code, QProcess::ExitStatus exit_status) {
+            onParsingFinished(sndr, exit_code, exit_status);
+          });
+
+  // finished() is not emitted when the process fails to start, handle that separately.
+  connect(proc, &QProcess::errorOccurred, this, [this, sndr, proc](QProcess::ProcessError error) {
+    if (error == QProcess::ProcessError::FailedToStart) {
+      emit errorOnArticlePArsing(sndr, proc->errorString());
+      proc->deleteLater();
+    }
+  });
+
+  qApp->nodejs()->runScript(proc, m_scriptFilename, {url});
+}
+
+// Forwards output of finished extractor process: stdout as the article on clean
+// exit with EXIT_SUCCESS, stderr as the error otherwise.
+void ArticleParse::onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status) {
+  QProcess* proc = qobject_cast<QProcess*>(sender());
+
+  if (proc == nullptr) {
+    // Not invoked by signal of QProcess, there is no output to read.
+    emit errorOnArticlePArsing(sndr, QString());
+    return;
+  }
+
+  if (exit_status == QProcess::ExitStatus::NormalExit && exit_code == EXIT_SUCCESS) {
+    emit articleParsed(sndr, QString::fromUtf8(proc->readAllStandardOutput()));
+  }
+  else {
+    QString err = QString::fromUtf8(proc->readAllStandardError());
+    emit errorOnArticlePArsing(sndr, err);
+  }
+
+  proc->deleteLater();
+}
diff --git a/src/librssguard/network-web/articleparse.h b/src/librssguard/network-web/articleparse.h
new file mode 100644
index 000000000..672273593
--- /dev/null
+++ b/src/librssguard/network-web/articleparse.h
@@ -0,0 +1,42 @@
+// For license of this file, see /LICENSE.md.
+
+#ifndef ARTICLEPARSE_H
+#define ARTICLEPARSE_H
+
+#include "miscellaneous/nodejs.h"
+
+#include <QObject>
+#include <QProcess>
+
+// Fetches full content of web articles by running the bundled "extract-article.mjs"
+// script via Node.js integration. Results are delivered via signals.
+class ArticleParse : public QObject {
+    Q_OBJECT
+
+  public:
+    explicit ArticleParse(QObject* parent = nullptr);
+
+    // Starts extraction of article from "url". "sndr" is passed back in the
+    // result signals so that receivers can recognize their own requests.
+    void parseArticle(QObject* sndr, const QString& url);
+
+  private slots:
+    void onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status);
+    void onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date);
+    void onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error);
+
+  signals:
+    // Emitted with stdout of the extractor script. Also emitted with null "sndr"
+    // and plain status text when installation of the extractor package ends.
+    void articleParsed(QObject* sndr, const QString& better_html);
+
+    // Emitted with error text when extraction requested by "sndr" failed.
+    void errorOnArticlePArsing(QObject* sndr, const QString& error);
+
+  private:
+    bool m_modulesInstalling;
+    bool m_modulesInstalled;
+    QString m_scriptFilename;
+};
+
+#endif // ARTICLEPARSE_H
diff --git a/src/librssguard/network-web/webfactory.cpp b/src/librssguard/network-web/webfactory.cpp
index 5ab3af5f5..14f8637a0 100644
--- a/src/librssguard/network-web/webfactory.cpp
+++ b/src/librssguard/network-web/webfactory.cpp
@@ -8,6 +8,7 @@
 #include "miscellaneous/settings.h"
 #include "network-web/adblock/adblockmanager.h"
 #include "network-web/apiserver.h"
+#include "network-web/articleparse.h"
 #include "network-web/cookiejar.h"
 #include "network-web/readability.h"
 
@@ -54,6 +55,7 @@ WebFactory::WebFactory(QObject* parent) : QObject(parent), m_apiServer(nullptr),
 
   m_cookieJar = new CookieJar(this);
   m_readability = new Readability(this);
+  m_articleParse = new ArticleParse(this);
 
 #if defined(NO_LITE)
 #if QT_VERSION >= 0x050D00 // Qt >= 5.13.0
@@ -568,6 +570,10 @@ Readability* WebFactory::readability() const {
   return m_readability;
 }
 
+ArticleParse* WebFactory::articleParse() const {
+  return m_articleParse;
+}
+
 void WebFactory::startApiServer() {
   m_apiServer = new ApiServer(this);
   m_apiServer->setListenAddressPort(QSL("http://localhost:54123"), true);
diff --git a/src/librssguard/network-web/webfactory.h b/src/librssguard/network-web/webfactory.h
index acc55a15d..b1db2e1ef 100644
--- a/src/librssguard/network-web/webfactory.h
+++ b/src/librssguard/network-web/webfactory.h
@@ -20,6 +20,7 @@ class AdBlockManager;
 class CookieJar;
 class ApiServer;
 class Readability;
+class ArticleParse;
 
 class RSSGUARD_DLLSPEC WebFactory : public QObject {
     Q_OBJECT
@@ -50,6 +51,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
 
     CookieJar* cookieJar() const;
     Readability* readability() const;
+    ArticleParse* articleParse() const;
 
     void startApiServer();
     void stopApiServer();
@@ -95,6 +97,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
     ApiServer* m_apiServer;
     CookieJar* m_cookieJar;
     Readability* m_readability;
+    ArticleParse* m_articleParse;
     QString m_customUserAgent;
 };
 