make new scraping feature more robust

This commit is contained in:
Martin Rotter 2024-03-08 08:36:32 +01:00
parent 47714ac590
commit 35b378d8b1
5 changed files with 34 additions and 21 deletions

View File

@ -3,7 +3,7 @@ const targetNode = document;
const waitTimeMs = 5000; const waitTimeMs = 5000;
const idleIdString = "iiddllee"; const idleIdString = "iiddllee";
var scrollings = 0; // Change to higher number to force some scrollings. var scrollings = 1; // Change to higher number to force some scrollings.
var lastResourceTime = new Date(); var lastResourceTime = new Date();
// Setup DOM observer and observe for changes in elements only. // Setup DOM observer and observe for changes in elements only.

View File

@ -202,37 +202,50 @@ QUrl WebEngineViewer::url() const {
return QWebEngineView::url(); return QWebEngineView::url();
} }
QByteArray WebEngineViewer::getJsEnabledHtml(const QString& url) { QByteArray WebEngineViewer::getJsEnabledHtml(const QString& url, bool worker_thread) {
WebEnginePage* page = new WebEnginePage(); WebEnginePage* page = new WebEnginePage();
WebEngineViewer* viewer = nullptr; WebEngineViewer* viewer = nullptr;
QMetaObject::invokeMethod( if (worker_thread) {
qApp, QMetaObject::invokeMethod(
[&] { qApp,
// NOTE: Must be created on main thread. [&] {
viewer = new WebEngineViewer(); // NOTE: Must be created on main thread.
}, viewer = new WebEngineViewer();
Qt::ConnectionType::BlockingQueuedConnection); },
Qt::ConnectionType::BlockingQueuedConnection);
viewer->moveToThread(qApp->thread()); viewer->moveToThread(qApp->thread());
page->moveToThread(qApp->thread()); page->moveToThread(qApp->thread());
}
else {
viewer = new WebEngineViewer();
}
viewer->setPage(page); viewer->setPage(page);
viewer->setAttribute(Qt::WidgetAttribute::WA_DontShowOnScreen, true); viewer->setAttribute(Qt::WidgetAttribute::WA_DontShowOnScreen, true);
viewer->setAttribute(Qt::WidgetAttribute::WA_DeleteOnClose, true); viewer->setAttribute(Qt::WidgetAttribute::WA_DeleteOnClose, true);
QMetaObject::invokeMethod(viewer, "show", Qt::ConnectionType::BlockingQueuedConnection);
QString html; QString html;
QMetaObject::invokeMethod(page,
"pageHtml", if (worker_thread) {
Qt::ConnectionType::BlockingQueuedConnection, QMetaObject::invokeMethod(viewer, "show", Qt::ConnectionType::BlockingQueuedConnection);
Q_RETURN_ARG(QString, html), QMetaObject::invokeMethod(page,
Q_ARG(QString, url)); "pageHtml",
Qt::ConnectionType::BlockingQueuedConnection,
Q_RETURN_ARG(QString, html),
Q_ARG(QString, url));
}
else {
viewer->show();
html = page->pageHtml(url);
}
page->deleteLater(); page->deleteLater();
viewer->close(); viewer->close();
IOFactory::writeFile("a.html", html.toUtf8());
return html.toUtf8(); return html.toUtf8();
} }

View File

@ -37,7 +37,7 @@ class WebEngineViewer : public QWebEngineView, public WebViewer {
virtual QString html() const; virtual QString html() const;
virtual QUrl url() const; virtual QUrl url() const;
static QByteArray getJsEnabledHtml(const QString& url); static QByteArray getJsEnabledHtml(const QString& url, bool worker_thread);
signals: signals:
void pageTitleChanged(const QString& new_title); void pageTitleChanged(const QString& new_title);

View File

@ -297,7 +297,7 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
} }
else if (source_type == StandardFeed::SourceType::EmbeddedBrowser) { else if (source_type == StandardFeed::SourceType::EmbeddedBrowser) {
#if defined(NO_LITE) #if defined(NO_LITE)
feed_contents = WebEngineViewer::getJsEnabledHtml(source); feed_contents = WebEngineViewer::getJsEnabledHtml(source, false);
#else #else
throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME)));
#endif #endif

View File

@ -242,7 +242,7 @@ QList<Message> StandardServiceRoot::obtainNewMessages(Feed* feed,
} }
else if (f->sourceType() == StandardFeed::SourceType::EmbeddedBrowser) { else if (f->sourceType() == StandardFeed::SourceType::EmbeddedBrowser) {
#if defined(NO_LITE) #if defined(NO_LITE)
feed_contents = WebEngineViewer::getJsEnabledHtml(f->source()); feed_contents = WebEngineViewer::getJsEnabledHtml(f->source(), true);
#else #else
throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME)));
#endif #endif