experimental post-processing filters.

This commit is contained in:
Martin Rotter 2021-02-02 20:41:17 +01:00
parent 301c87c40c
commit b7b48ae5c1
3 changed files with 92 additions and 5 deletions

View File

@ -37,7 +37,7 @@ RSS Guard 3.9.0+ offers extra advanced features which were inspired by [Liferea]
You can select source type of each feed. If you select `URL`, then RSS Guard simply downloads feed file from given location.
However, if you choose `Script` option, then you cannot provide URL of your feed and you rely on custom script to obtain your script and provide its contents to **standard output**. Resulting data written to standard output **MUST** be valid feed file, for example RSS or ATOM XML file.
However, if you choose `Script` option, then you cannot provide URL of your feed and you rely on custom script to obtain your script and provide its contents to **standard output**. Resulting data written to standard output should be valid feed file, for example RSS or ATOM XML file.
<img src="images/scrape-source-type.png" width="50%">
@ -59,8 +59,12 @@ RSS Guard offers placeholder `%data%` which is automatically replaced with full
Also, working directory of process executing the script is set to RSS Guard's user data folder.
After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take raw source data as input and must produce processed feed data to **standard output** while printing all error messages to **error output**.
Typical post-processing filter might do things like advanced CSS formatting of feed file entries, removing some ads etc.
After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take **raw source data as input** and must produce processed valid feed data to **standard output** while printing all error messages to **error output**.
Format of post-process script execution line is the same as above.
Typical post-processing filter might do things like advanced CSS formatting of feed file entries, removing some ads or simply pretty-printing XML data:
| Command | Explanation |
|---------|-------------|
| `bash.exe#-c "xmllint --format -"` | Pretty-print input XML feed data. |

View File

@ -474,6 +474,11 @@ QList<Message> StandardFeed::obtainNewMessages(bool* error_during_obtaining) {
int download_timeout = qApp->settings()->value(GROUP(Feeds), SETTING(Feeds::UpdateTimeout)).toInt();
if (sourceType() == SourceType::Url) {
qDebugNN << LOGSEC_CORE
<< "Downloading URL"
<< QUOTE_W_SPACE(url())
<< "to obtain feed data.";
QByteArray feed_contents;
QList<QPair<QByteArray, QByteArray>> headers;
@ -516,8 +521,45 @@ QList<Message> StandardFeed::obtainNewMessages(bool* error_during_obtaining) {
}
}
else {
qDebugNN << LOGSEC_CORE
<< "Running custom script"
<< QUOTE_W_SPACE(url())
<< "to obtain feed data.";
// Use script to generate feed file.
formatted_feed_contents = generateFeedFileWithScript(url(), download_timeout);
try {
formatted_feed_contents = generateFeedFileWithScript(url(), download_timeout);
}
catch (const ApplicationException& ex) {
qCriticalNN << LOGSEC_CORE
<< "Custom script for generating feed file failed:"
<< QUOTE_W_SPACE_DOT(ex.message());
setStatus(Status::OtherError);
*error_during_obtaining = true;
return {};
}
}
if (!postProcessScript().simplified().isEmpty()) {
qDebugNN << LOGSEC_CORE
<< "Post-processing obtained feed data with custom script"
<< QUOTE_W_SPACE_DOT(postProcessScript());
try {
formatted_feed_contents = postProcessFeedFileWithScript(postProcessScript(),
formatted_feed_contents,
download_timeout);
}
catch (const ApplicationException& ex) {
qCriticalNN << LOGSEC_CORE
<< "Post-processing script failed:"
<< QUOTE_W_SPACE_DOT(ex.message());
setStatus(Status::OtherError);
*error_during_obtaining = true;
return {};
}
}
// Feed data are downloaded and encoded.
@ -593,6 +635,46 @@ QString StandardFeed::generateFeedFileWithScript(const QString& execution_line,
}
}
QString StandardFeed::postProcessFeedFileWithScript(const QString& execution_line, const QString raw_feed_data, int run_timeout) {
auto prepared_query = prepareExecutionLine(execution_line);
QProcess process;
process.setInputChannelMode(QProcess::InputChannelMode::ManagedInputChannel);
process.setWorkingDirectory(qApp->userDataFolder());
process.setProgram(prepared_query.first);
#if defined(Q_OS_WIN)
process.setNativeArguments(prepared_query.second);
#else
process.setArguments({ prepared_query.second });
#endif
if (!process.open() || process.error() == QProcess::ProcessError::FailedToStart) {
throw ApplicationException(QSL("process failed to start"));
}
process.write(raw_feed_data.toUtf8());
process.closeWriteChannel();
if (process.waitForFinished(run_timeout)) {
auto raw_output = process.readAllStandardOutput();
return raw_output;
}
else {
process.kill();
auto raw_error = process.readAllStandardError();
if (raw_error.simplified().isEmpty()) {
throw ApplicationException(QSL("process failed to finish properly"));
}
else {
throw ApplicationException(QString(raw_error));
}
}
}
QNetworkReply::NetworkError StandardFeed::networkError() const {
return m_networkError;
}

View File

@ -80,6 +80,7 @@ class StandardFeed : public Feed {
static QPair<QString, QString> prepareExecutionLine(const QString& execution_line);
static QString generateFeedFileWithScript(const QString& execution_line, int run_timeout);
static QString postProcessFeedFileWithScript(const QString& execution_line, const QString raw_feed_data, int run_timeout);
// Tries to guess feed hidden under given URL
// and uses given credentials.