From 35b378d8b1c26bb3177ce13fa3d6b8f84842664b Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Fri, 8 Mar 2024 08:36:32 +0100 Subject: [PATCH] make new scraping feature more robust --- resources/scripts/builtin_js/observer.js | 2 +- .../webviewers/webengine/webengineviewer.cpp | 47 ++++++++++++------- .../webviewers/webengine/webengineviewer.h | 2 +- .../services/standard/standardfeed.cpp | 2 +- .../services/standard/standardserviceroot.cpp | 2 +- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/resources/scripts/builtin_js/observer.js b/resources/scripts/builtin_js/observer.js index b374dcc61..857eff57d 100644 --- a/resources/scripts/builtin_js/observer.js +++ b/resources/scripts/builtin_js/observer.js @@ -3,7 +3,7 @@ const targetNode = document; const waitTimeMs = 5000; const idleIdString = "iiddllee"; -var scrollings = 0; // Change to higher number to force some scrollings. +var scrollings = 1; // Change to higher number to force some scrollings. var lastResourceTime = new Date(); // Setup DOM observer and observe for changes in elements only. diff --git a/src/librssguard/gui/webviewers/webengine/webengineviewer.cpp b/src/librssguard/gui/webviewers/webengine/webengineviewer.cpp index 8714b7dfb..5a5e0341e 100644 --- a/src/librssguard/gui/webviewers/webengine/webengineviewer.cpp +++ b/src/librssguard/gui/webviewers/webengine/webengineviewer.cpp @@ -202,37 +202,50 @@ QUrl WebEngineViewer::url() const { return QWebEngineView::url(); } -QByteArray WebEngineViewer::getJsEnabledHtml(const QString& url) { +QByteArray WebEngineViewer::getJsEnabledHtml(const QString& url, bool worker_thread) { WebEnginePage* page = new WebEnginePage(); WebEngineViewer* viewer = nullptr; - QMetaObject::invokeMethod( - qApp, - [&] { - // NOTE: Must be created on main thread. - viewer = new WebEngineViewer(); - }, - Qt::ConnectionType::BlockingQueuedConnection); + if (worker_thread) { + QMetaObject::invokeMethod( + qApp, + [&] { + // NOTE: Must be created on main thread. + viewer = new WebEngineViewer(); + }, + Qt::ConnectionType::BlockingQueuedConnection); - viewer->moveToThread(qApp->thread()); - page->moveToThread(qApp->thread()); + viewer->moveToThread(qApp->thread()); + page->moveToThread(qApp->thread()); + } + else { + viewer = new WebEngineViewer(); + } viewer->setPage(page); viewer->setAttribute(Qt::WidgetAttribute::WA_DontShowOnScreen, true); viewer->setAttribute(Qt::WidgetAttribute::WA_DeleteOnClose, true); - QMetaObject::invokeMethod(viewer, "show", Qt::ConnectionType::BlockingQueuedConnection); - QString html; - QMetaObject::invokeMethod(page, - "pageHtml", - Qt::ConnectionType::BlockingQueuedConnection, - Q_RETURN_ARG(QString, html), - Q_ARG(QString, url)); + + if (worker_thread) { + QMetaObject::invokeMethod(viewer, "show", Qt::ConnectionType::BlockingQueuedConnection); + QMetaObject::invokeMethod(page, + "pageHtml", + Qt::ConnectionType::BlockingQueuedConnection, + Q_RETURN_ARG(QString, html), + Q_ARG(QString, url)); + } + else { + viewer->show(); + html = page->pageHtml(url); + } page->deleteLater(); viewer->close(); + IOFactory::writeFile("a.html", html.toUtf8()); + return html.toUtf8(); } diff --git a/src/librssguard/gui/webviewers/webengine/webengineviewer.h b/src/librssguard/gui/webviewers/webengine/webengineviewer.h index 73ccace29..e18e813a9 100644 --- a/src/librssguard/gui/webviewers/webengine/webengineviewer.h +++ b/src/librssguard/gui/webviewers/webengine/webengineviewer.h @@ -37,7 +37,7 @@ class WebEngineViewer : public QWebEngineView, public WebViewer { virtual QString html() const; virtual QUrl url() const; - static QByteArray getJsEnabledHtml(const QString& url); + static QByteArray getJsEnabledHtml(const QString& url, bool worker_thread); signals: void pageTitleChanged(const QString& new_title); diff --git a/src/librssguard/services/standard/standardfeed.cpp b/src/librssguard/services/standard/standardfeed.cpp index 1f4454b0e..f1a0c0b9f 100644 --- a/src/librssguard/services/standard/standardfeed.cpp +++ b/src/librssguard/services/standard/standardfeed.cpp @@ -297,7 +297,7 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type, } else if (source_type == StandardFeed::SourceType::EmbeddedBrowser) { #if defined(NO_LITE) - feed_contents = WebEngineViewer::getJsEnabledHtml(source); + feed_contents = WebEngineViewer::getJsEnabledHtml(source, false); #else throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); #endif diff --git a/src/librssguard/services/standard/standardserviceroot.cpp b/src/librssguard/services/standard/standardserviceroot.cpp index 7f94c8ab2..29288770d 100644 --- a/src/librssguard/services/standard/standardserviceroot.cpp +++ b/src/librssguard/services/standard/standardserviceroot.cpp @@ -242,7 +242,7 @@ QList StandardServiceRoot::obtainNewMessages(Feed* feed, } else if (f->sourceType() == StandardFeed::SourceType::EmbeddedBrowser) { #if defined(NO_LITE) - feed_contents = WebEngineViewer::getJsEnabledHtml(f->source()); + feed_contents = WebEngineViewer::getJsEnabledHtml(f->source(), true); #else throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); #endif