Very experimental proof of concept (PoC) for scraping JS-enabled websites

This commit is contained in:
Martin Rotter 2024-03-06 07:29:11 +01:00
parent 0cbd356c4a
commit accb478bee
4 changed files with 34 additions and 3 deletions

View File

@ -108,9 +108,9 @@ void CookieJar::saveCookies() {
sett->endGroup(); sett->endGroup();
for (const QNetworkCookie& cookie : cookies) { for (const QNetworkCookie& cookie : cookies) {
if (cookie.isSessionCookie()) { /*if (cookie.isSessionCookie()) {
continue; continue;
} }*/
sett->setPassword(GROUP(Cookies), sett->setPassword(GROUP(Cookies),
QSL("%1-%2").arg(QString::number(i++), QString::fromUtf8(cookie.name())), QSL("%1-%2").arg(QString::number(i++), QString::fromUtf8(cookie.name())),
cookie.toRawForm(QNetworkCookie::RawForm::Full)); cookie.toRawForm(QNetworkCookie::RawForm::Full));

View File

@ -12,6 +12,7 @@
#include <QString> #include <QString>
#include <QStringList> #include <QStringList>
#include <QTimer>
#include <QUrl> #include <QUrl>
#include <QUrlQuery> #include <QUrlQuery>
#include <QWebEngineScript> #include <QWebEngineScript>
@ -33,8 +34,12 @@ WebEngineViewer* WebEnginePage::view() const {
QString WebEnginePage::pageHtml(const QString& url) { QString WebEnginePage::pageHtml(const QString& url) {
QEventLoop loop; QEventLoop loop;
QString html; QString html;
QTimer tmr;
connect(this, &WebEnginePage::loadFinished, &loop, &QEventLoop::quit); tmr.setInterval(15000);
connect(&tmr, &QTimer::timeout, &loop, &QEventLoop::quit);
connect(this, &WebEnginePage::loadFinished, &tmr, QOverload<>::of(&QTimer::start));
load(url); load(url);
loop.exec(); loop.exec();

View File

@ -14,6 +14,7 @@
#include "services/standard/standardserviceroot.h" #include "services/standard/standardserviceroot.h"
#if defined(NO_LITE) #if defined(NO_LITE)
#include "gui/webviewers/webengine/webengineviewer.h"
#include "network-web/webengine/webenginepage.h" #include "network-web/webengine/webenginepage.h"
#endif #endif
@ -297,8 +298,16 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
else if (source_type == StandardFeed::SourceType::EmbeddedBrowser) { else if (source_type == StandardFeed::SourceType::EmbeddedBrowser) {
#if defined(NO_LITE) #if defined(NO_LITE)
WebEnginePage page; WebEnginePage page;
WebEngineViewer viewer;
// NOTE: Viewer must be present or JavaScript just does not run.
viewer.setPage(&page);
viewer.setAttribute(Qt::WA_DontShowOnScreen);
viewer.show();
feed_contents = page.pageHtml(source).toUtf8(); feed_contents = page.pageHtml(source).toUtf8();
// IOFactory::writeFile("a.html", feed_contents);
#else #else
throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME)));
#endif #endif

View File

@ -31,6 +31,7 @@
#include "services/standard/standardserviceentrypoint.h" #include "services/standard/standardserviceentrypoint.h"
#if defined(NO_LITE) #if defined(NO_LITE)
#include "gui/webviewers/webengine/webengineviewer.h"
#include "network-web/webengine/webenginepage.h" #include "network-web/webengine/webenginepage.h"
#endif #endif
@ -242,9 +243,24 @@ QList<Message> StandardServiceRoot::obtainNewMessages(Feed* feed,
else if (f->sourceType() == StandardFeed::SourceType::EmbeddedBrowser) { else if (f->sourceType() == StandardFeed::SourceType::EmbeddedBrowser) {
#if defined(NO_LITE) #if defined(NO_LITE)
WebEnginePage* page = new WebEnginePage(); WebEnginePage* page = new WebEnginePage();
WebEngineViewer* viewer = nullptr;
QMetaObject::invokeMethod(
qApp,
[&] {
// NOTE: Must be created on the main thread.
viewer = new WebEngineViewer();
},
Qt::ConnectionType::BlockingQueuedConnection);
viewer->moveToThread(qApp->thread());
page->moveToThread(qApp->thread()); page->moveToThread(qApp->thread());
viewer->setPage(page);
viewer->setAttribute(Qt::WA_DontShowOnScreen);
QMetaObject::invokeMethod(viewer, "show", Qt::ConnectionType::BlockingQueuedConnection);
QString html; QString html;
QMetaObject::invokeMethod(page, QMetaObject::invokeMethod(page,
"pageHtml", "pageHtml",
@ -255,6 +271,7 @@ QList<Message> StandardServiceRoot::obtainNewMessages(Feed* feed,
feed_contents = html.toUtf8(); feed_contents = html.toUtf8();
page->deleteLater(); page->deleteLater();
viewer->deleteLater();
#else #else
throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME))); throw ApplicationException(tr("this source type cannot be used on 'lite' %1 build").arg(QSL(APP_NAME)));
#endif #endif