starting to work on fetching full article content

This commit is contained in:
Martin Rotter 2024-04-22 14:37:49 +02:00
parent d8525c3a93
commit c7c069d889
9 changed files with 238 additions and 1 deletions

View File

@ -26,6 +26,7 @@
<file>scripts/adblock/adblock-server.js</file>
<file>scripts/readability/readabilize-article.js</file>
<file>scripts/article-extractor/extract-article.mjs</file>
<file>scripts/filters/blacklist.js</file>
<file>scripts/filters/whitelist.js</file>

View File

@ -0,0 +1,11 @@
// Extracts the main article from the URL passed as the first command line
// argument and prints it to stdout as a single JSON document.
//
// On failure the error is printed to stderr and the process exits with a
// non-zero code so that the parent process can tell failure from success.
import { extract } from '@extractus/article-extractor'

const input = process.argv[2];

try {
  const article = await extract(input);

  // The extractor resolves to null when no article could be recognized;
  // report that as a failure instead of printing the literal "null".
  if (article == null) {
    throw new Error(`No article could be extracted from "${input}".`);
  }

  console.log(JSON.stringify(article));
}
catch (err) {
  console.error(err);

  // Use exitCode (not process.exit) so pending stdout/stderr writes are flushed.
  process.exitCode = 1;
}

View File

@ -253,6 +253,8 @@ set(SOURCES
network-web/adblock/adblockrequestinfo.h
network-web/apiserver.cpp
network-web/apiserver.h
network-web/articleparse.cpp
network-web/articleparse.h
network-web/basenetworkaccessmanager.cpp
network-web/basenetworkaccessmanager.h
network-web/cookiejar.cpp

View File

@ -12,10 +12,12 @@
#include "miscellaneous/application.h"
#include "miscellaneous/iconfactory.h"
#include "miscellaneous/settings.h"
#include "network-web/articleparse.h"
#include "network-web/networkfactory.h"
#include "network-web/readability.h"
#include "network-web/webfactory.h"
#include <QJsonObject>
#include <QKeyEvent>
#include <QProgressBar>
#include <QScrollBar>
@ -37,7 +39,10 @@ WebBrowser::WebBrowser(WebViewer* viewer, QWidget* parent)
#endif
m_actionReadabilePage(new QAction(qApp->icons()->fromTheme(QSL("text-html")),
tr("View website in reader mode"),
this)) {
this)),
m_actionGetFullArticle(new QAction(qApp->icons()->fromTheme(QSL("download"), QSL("browser-download")),
tr("Load full source article"),
this)) {
if (m_webView == nullptr) {
m_webView = qApp->createWebView();
dynamic_cast<QWidget*>(m_webView)->setParent(this);
@ -87,6 +92,7 @@ void WebBrowser::createConnections() {
connect(m_actionOpenInSystemBrowser, &QAction::triggered, this, &WebBrowser::openCurrentSiteInSystemBrowser);
connect(m_actionReadabilePage, &QAction::triggered, this, &WebBrowser::readabilePage);
connect(m_actionGetFullArticle, &QAction::triggered, this, &WebBrowser::getFullArticle);
#if defined(ENABLE_MEDIAPLAYER)
connect(m_actionPlayPageInMediaPlayer, &QAction::triggered, this, &WebBrowser::playCurrentSiteInMediaPlayer);
@ -99,6 +105,9 @@ void WebBrowser::createConnections() {
connect(qApp->web()->readability(), &Readability::htmlReadabled, this, &WebBrowser::setReadabledHtml);
connect(qApp->web()->readability(), &Readability::errorOnHtmlReadabiliting, this, &WebBrowser::readabilityFailed);
connect(qApp->web()->articleParse(), &ArticleParse::articleParsed, this, &WebBrowser::setFullArticleHtml);
connect(qApp->web()->articleParse(), &ArticleParse::errorOnArticlePArsing, this, &WebBrowser::fullArticleFailed);
}
void WebBrowser::updateUrl(const QUrl& url) {
@ -182,6 +191,11 @@ void WebBrowser::readabilePage() {
qApp->web()->readability()->makeHtmlReadable(this, m_webView->html(), m_webView->url().toString());
}
void WebBrowser::getFullArticle() {
m_actionGetFullArticle->setEnabled(false);
qApp->web()->articleParse()->parseArticle(this, m_webView->url().toString());
}
bool WebBrowser::eventFilter(QObject* watched, QEvent* event) {
Q_UNUSED(watched)
@ -292,6 +306,21 @@ void WebBrowser::readabilityFailed(QObject* sndr, const QString& error) {
}
}
// Shows the article delivered by ArticleParse in the web view.
//
// sndr is the object which requested the extraction; answers meant for other
// requesters are ignored. json_answer is expected to be a JSON object whose
// "content" member holds the article HTML.
void WebBrowser::setFullArticleHtml(QObject* sndr, const QString& json_answer) {
  if (sndr != this) {
    return;
  }

  // A null/invalid document yields an empty object and therefore an empty string,
  // so malformed input, non-object JSON and a missing "content" member all end up here.
  const QJsonDocument json_doc = QJsonDocument::fromJson(json_answer.toUtf8());
  const QString better_html = json_doc.object().value(QSL("content")).toString();

  if (better_html.isEmpty()) {
    // Nothing usable arrived. Keep the current page untouched and re-enable
    // the action which getFullArticle() disabled, so that the user can retry.
    m_actionGetFullArticle->setEnabled(true);
    return;
  }

  m_webView->setReadabledHtml(better_html, m_webView->url());
}
// Shows the error reported by ArticleParse in the web view.
//
// sndr is the object which requested the extraction; errors meant for other
// requesters are ignored. error is plain text (not HTML).
void WebBrowser::fullArticleFailed(QObject* sndr, const QString& error) {
  if (sndr != this) {
    return;
  }

  if (error.isEmpty()) {
    // There is nothing to display, but the action disabled by getFullArticle()
    // must not stay disabled, otherwise the user cannot retry.
    m_actionGetFullArticle->setEnabled(true);
    return;
  }

  // The text is rendered as HTML, so it has to be escaped. Otherwise parts of
  // the message which look like tags (for example "<anonymous>" in a stack
  // trace) would be dropped from the output.
  m_webView->setReadabledHtml(error.toHtmlEscaped(), m_webView->url());
}
void WebBrowser::initializeLayout() {
m_toolBar->setFloatable(false);
m_toolBar->setMovable(false);
@ -324,9 +353,11 @@ void WebBrowser::initializeLayout() {
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);
// Add needed actions into toolbar.
m_toolBar->addAction(m_actionOpenInSystemBrowser);
m_toolBar->addAction(m_actionGetFullArticle);
m_toolBar->addAction(m_actionReadabilePage);
#if defined(ENABLE_MEDIAPLAYER)
@ -358,6 +389,7 @@ void WebBrowser::onLoadingStarted() {
m_loadingProgress->show();
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);
#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);
@ -375,6 +407,7 @@ void WebBrowser::onLoadingFinished(bool success) {
if (url.isValid() && !url.host().isEmpty()) {
m_actionOpenInSystemBrowser->setEnabled(true);
m_actionGetFullArticle->setEnabled(true);
m_actionReadabilePage->setEnabled(true);
#if defined(ENABLE_MEDIAPLAYER)
@ -384,6 +417,7 @@ void WebBrowser::onLoadingFinished(bool success) {
else {
m_actionOpenInSystemBrowser->setEnabled(false);
m_actionReadabilePage->setEnabled(false);
m_actionGetFullArticle->setEnabled(false);
#if defined(ENABLE_MEDIAPLAYER)
m_actionPlayPageInMediaPlayer->setEnabled(false);

View File

@ -74,9 +74,14 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
void newWindowRequested(WebViewer* viewer);
void readabilePage();
void getFullArticle();
void setReadabledHtml(QObject* sndr, const QString& better_html);
void readabilityFailed(QObject* sndr, const QString& error);
void setFullArticleHtml(QObject* sndr, const QString& json_answer);
void fullArticleFailed(QObject* sndr, const QString& error);
signals:
void windowCloseRequested();
void iconChanged(int index, const QIcon& icon);
@ -106,6 +111,7 @@ class RSSGUARD_DLLSPEC WebBrowser : public TabContent {
#endif
QAction* m_actionReadabilePage;
QAction* m_actionGetFullArticle;
QList<Message> m_messages;
QPointer<RootItem> m_root;

View File

@ -0,0 +1,140 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#include "network-web/articleparse.h"
#include "3rd-party/boolinq/boolinq.h"
#include "exceptions/applicationexception.h"
#include "miscellaneous/application.h"
#include <QDir>
#define EXTRACTOR_PACKAGE "@extractus/article-extractor"
#define EXTRACTOR_VERSION "8.0.7"
// Creates the parser with both installation flags cleared and subscribes it
// to the package notifications of the application-wide Node.js helper.
ArticleParse::ArticleParse(QObject* parent)
  : QObject(parent), m_modulesInstalling(false), m_modulesInstalled(false) {
  auto* node = qApp->nodejs();

  connect(node, &NodeJs::packageError, this, &ArticleParse::onPackageError);
  connect(node, &NodeJs::packageInstalledUpdated, this, &ArticleParse::onPackageReady);
}
void ArticleParse::onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date) {
Q_UNUSED(already_up_to_date)
bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
});
if (!concerns_extractor) {
return;
}
m_modulesInstalled = true;
m_modulesInstalling = false;
qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
{tr("Packages for article-extractor are installed"),
tr("Press the button once more!"),
QSystemTrayIcon::MessageIcon::Information},
{true, true, false});
// Emit this just to allow readability again for user.
emit articleParsed(nullptr, tr("Packages for article-extractor are installed. You can now use this feature!"));
}
void ArticleParse::onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error) {
bool concerns_extractor = boolinq::from(pkgs).any([](const NodeJs::PackageMetadata& pkg) {
return pkg.m_name == QSL(EXTRACTOR_PACKAGE);
});
if (!concerns_extractor) {
return;
}
m_modulesInstalled = m_modulesInstalling = false;
qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
{tr("Packages for article-extractor are NOT installed"),
tr("There is error: %1").arg(error),
QSystemTrayIcon::MessageIcon::Critical},
{true, true, false});
// Emit this just to allow readability again for user.
emit articleParsed(nullptr, tr("Packages for article-extractor are NOT installed. There is error: %1").arg(error));
}
// Runs the Node.js extractor script for the given URL.
//
// sndr identifies the requester and is passed back unchanged with the emitted
// signals (it is never dereferenced here). url is handed to the script as its
// only argument.
//
// Until the extractor package is known to be installed, each call copies the
// script into the Node.js package folder and checks the package status. If the
// package is missing or outdated, its installation is started and the call
// returns without parsing anything.
void ArticleParse::parseArticle(QObject* sndr, const QString& url) {
  if (!m_modulesInstalled) {
    // NOTE: Here we use MJS file directly placed in its NODE package folder
    // because NODE_PATH is not supported for MJS files.
    m_scriptFilename = qApp->nodejs()->processedPackageFolder() + QDir::separator() + QSL("extract-article.mjs");

    if (!IOFactory::copyFile(QSL(":/scripts/article-extractor/extract-article.mjs"), m_scriptFilename)) {
      qCriticalNN << LOGSEC_CORE << "Failed to copy article-extractor script to TEMP.";
    }

    try {
      const NodeJs::PackageStatus st_extractor =
        qApp->nodejs()->packageStatus({QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)});

      if (st_extractor != NodeJs::PackageStatus::UpToDate) {
        if (!m_modulesInstalling) {
          // We make sure to update modules.
          m_modulesInstalling = true;

          qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
                               {tr("Node.js libraries not installed"),
                                tr("%1 will now install some needed libraries, this will take only a few seconds. "
                                   "You will be notified when installation is complete.")
                                  .arg(QSL(APP_NAME)),
                                QSystemTrayIcon::MessageIcon::Warning},
                               {true, true, false});
          qApp->nodejs()->installUpdatePackages({{QSL(EXTRACTOR_PACKAGE), QSL(EXTRACTOR_VERSION)}});
        }

        return;
      }

      m_modulesInstalled = true;
    }
    catch (const ApplicationException& ex) {
      const QString not_configured = tr("Node.js is not configured properly. Go to \"Settings\" -> \"Node.js\" and check "
                                        "if your Node.js is properly configured.");

      qApp->showGuiMessage(Notification::Event::NodePackageUpdated,
                           {tr("Node.js libraries not installed"),
                            not_configured,
                            QSystemTrayIcon::MessageIcon::Critical},
                           {true, true, false});

      qCriticalNN << LOGSEC_CORE << "Failed to check for Node.js package status:" << QUOTE_W_SPACE_DOT(ex.message());

      // Report the problem through the error signal, because the payload of
      // articleParsed() is the JSON output of the script. Then stop: launching
      // the script makes no sense when Node.js is not usable.
      emit errorOnArticlePArsing(sndr, not_configured);
      return;
    }
  }

  QProcess* proc = new QProcess(this);

  connect(proc,
          QOverload<int, QProcess::ExitStatus>::of(&QProcess::finished),
          this,
          [this, sndr](int exit_code, QProcess::ExitStatus exit_status) {
            onParsingFinished(sndr, exit_code, exit_status);
          });

  // finished() is not emitted when the process cannot be started at all, so
  // that case is handled here to notify the requester and release the process.
  connect(proc, &QProcess::errorOccurred, this, [this, sndr, proc](QProcess::ProcessError error) {
    if (error == QProcess::ProcessError::FailedToStart) {
      emit errorOnArticlePArsing(sndr, proc->errorString());
      proc->deleteLater();
    }
  });

  qApp->nodejs()->runScript(proc, m_scriptFilename, {url});
}
// Handles the end of the extractor process which emitted the signal.
//
// sndr is the requester of the extraction and is passed through to the emitted
// signal. articleParsed() carries the standard output of the process,
// errorOnArticlePArsing() carries its standard error output.
void ArticleParse::onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status) {
  QProcess* proc = qobject_cast<QProcess*>(sender());

  if (proc == nullptr) {
    // Without the emitting process there is no output to read.
    qCriticalNN << LOGSEC_CORE << "Article parsing finished, but the process which emitted the signal is not known.";
    return;
  }

  const bool finished_ok = exit_status == QProcess::ExitStatus::NormalExit && exit_code == EXIT_SUCCESS;
  const QString output = finished_ok ? QString::fromUtf8(proc->readAllStandardOutput()) : QString();

  // A clean exit with no output is treated as a failure too, so that the
  // requester gets whatever the script printed to its error output.
  if (finished_ok && !output.trimmed().isEmpty()) {
    emit articleParsed(sndr, output);
  }
  else {
    emit errorOnArticlePArsing(sndr, QString::fromUtf8(proc->readAllStandardError()));
  }

  proc->deleteLater();
}

View File

@ -0,0 +1,34 @@
// For license of this file, see <project-root-folder>/LICENSE.md.
#ifndef ARTICLEPARSE_H
#define ARTICLEPARSE_H
#include "miscellaneous/nodejs.h"
#include <QObject>
#include <QProcess>
// Extracts full article content for a URL by running a Node.js script
// (extract-article.mjs) which uses the "@extractus/article-extractor" package.
// Results are delivered asynchronously through signals.
class ArticleParse : public QObject {
Q_OBJECT
public:
explicit ArticleParse(QObject* parent = nullptr);
// Starts extraction of the article at "url". "sndr" identifies the requester
// and is passed back unchanged in the emitted signals. If the needed Node.js
// package is not up to date, its installation is started instead and nothing
// is parsed by this call.
void parseArticle(QObject* sndr, const QString& url);
private slots:
// Invoked when the extractor process finishes; emits one of the signals below
// and schedules the process for deletion.
void onParsingFinished(QObject* sndr, int exit_code, QProcess::ExitStatus exit_status);
// Invoked when Node.js packages are installed/updated; only reacts when the
// extractor package is among "pkgs".
void onPackageReady(const QList<NodeJs::PackageMetadata>& pkgs, bool already_up_to_date);
// Invoked when installation of Node.js packages fails; only reacts when the
// extractor package is among "pkgs".
void onPackageError(const QList<NodeJs::PackageMetadata>& pkgs, const QString& error);
signals:
// Emitted with the standard output of the extractor script when it exits
// successfully. Also emitted with sndr == nullptr and a status message after
// package installation events.
void articleParsed(QObject* sndr, const QString& better_html);
// Emitted with the standard error output of the extractor script when it does
// not exit successfully.
void errorOnArticlePArsing(QObject* sndr, const QString& error);
private:
// True while installation of the extractor package is in progress.
bool m_modulesInstalling;
// True once the extractor package was found to be up to date or was installed.
bool m_modulesInstalled;
// Path of the extractor script copied into the Node.js package folder.
QString m_scriptFilename;
};
#endif // ARTICLEPARSE_H

View File

@ -8,6 +8,7 @@
#include "miscellaneous/settings.h"
#include "network-web/adblock/adblockmanager.h"
#include "network-web/apiserver.h"
#include "network-web/articleparse.h"
#include "network-web/cookiejar.h"
#include "network-web/readability.h"
@ -54,6 +55,7 @@ WebFactory::WebFactory(QObject* parent) : QObject(parent), m_apiServer(nullptr),
m_cookieJar = new CookieJar(this);
m_readability = new Readability(this);
m_articleParse = new ArticleParse(this);
#if defined(NO_LITE)
#if QT_VERSION >= 0x050D00 // Qt >= 5.13.0
@ -568,6 +570,10 @@ Readability* WebFactory::readability() const {
return m_readability;
}
// Returns the article parser stored in this factory.
ArticleParse* WebFactory::articleParse() const {
  return this->m_articleParse;
}
void WebFactory::startApiServer() {
m_apiServer = new ApiServer(this);
m_apiServer->setListenAddressPort(QSL("http://localhost:54123"), true);

View File

@ -20,6 +20,7 @@ class AdBlockManager;
class CookieJar;
class ApiServer;
class Readability;
class ArticleParse;
class RSSGUARD_DLLSPEC WebFactory : public QObject {
Q_OBJECT
@ -50,6 +51,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
CookieJar* cookieJar() const;
Readability* readability() const;
ArticleParse* articleParse() const;
void startApiServer();
void stopApiServer();
@ -95,6 +97,7 @@ class RSSGUARD_DLLSPEC WebFactory : public QObject {
ApiServer* m_apiServer;
CookieJar* m_cookieJar;
Readability* m_readability;
ArticleParse* m_articleParse;
QString m_customUserAgent;
};