Commit ccf65027 authored by Volker Krause's avatar Volker Krause
Browse files

Rebase the script extractor onto the new extractor engine

This no longer contains type-specific wrapping code for interfacing with
JS, that's provided by the document model now. This also no longer treats
script extractors specially, but bases them on the same abstract base
class.

This keeps full source compatibility on the JS side for now, although it
contains preparations to eventually phase out the awkward Context API in
favor of exposing the full document tree directly.
parent 03820798
......@@ -27,6 +27,9 @@ set(kitinerary_lib_srcs
engine/extractordocumentprocessor.cpp
engine/extractorfilter.cpp
engine/extractorresult.cpp
engine/extractorscriptengine.cpp
engine/scriptextractor.cpp
era/ssbticket.cpp
extractors/iatabcbpextractor.cpp
......@@ -233,6 +236,7 @@ ecm_generate_headers(KItinerary_Engine_FORWARDING_HEADERS
ExtractorDocumentProcessor
ExtractorFilter
ExtractorResult
ScriptExtractor
PREFIX KItinerary
REQUIRED_HEADERS KItinerary_Engine_HEADERS
RELATIVE engine
......
/*
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "extractorscriptengine_p.h"
#include "extractordocumentnode.h"
#include "extractordocumentprocessor.h"
#include "extractorresult.h"
#include "scriptextractor.h"
#include "logging.h"
#include "jsapi/barcode.h"
#include "jsapi/context.h"
#include "jsapi/jsonld.h"
#include <QFile>
#include <QJSEngine>
#include <QJSValueIterator>
#include <QScopeGuard>
using namespace KItinerary;
namespace KItinerary {
/** Private state of ExtractorScriptEngine.
 *  Instances are created on demand by ExtractorScriptEngine::ensureInitialized().
 */
class ExtractorScriptEnginePrivate {
public:
/** Reads the script file @p fileName and evaluates it in m_engine.
 *  @returns @c false if the file name is empty, the file cannot be opened or evaluation reports an error.
 */
bool loadScript(const QString &fileName);
// API objects exposed to scripts as the global "Barcode", "Context" and "JsonLd" properties
JsApi::Barcode *m_barcodeApi = nullptr;
JsApi::Context *m_context = nullptr;
JsApi::JsonLd *m_jsonLdApi = nullptr;
// the JS engine all extractor scripts are evaluated in
QJSEngine m_engine;
};
}
// Defined here rather than in the header, as the header only forward-declares
// ExtractorScriptEnginePrivate, which is held in a std::unique_ptr member.
ExtractorScriptEngine::ExtractorScriptEngine() = default;
ExtractorScriptEngine::~ExtractorScriptEngine() = default;
void ExtractorScriptEngine::ensureInitialized()
{
if (d) {
return;
}
d = std::make_unique<ExtractorScriptEnginePrivate>();
d->m_context = new JsApi::Context; // will be deleted by QJSEngine taking ownership
d->m_engine.installExtensions(QJSEngine::ConsoleExtension);
d->m_jsonLdApi = new JsApi::JsonLd(&d->m_engine);
d->m_engine.globalObject().setProperty(QStringLiteral("JsonLd"), d->m_engine.newQObject(d->m_jsonLdApi));
d->m_barcodeApi = new JsApi::Barcode;
d->m_engine.globalObject().setProperty(QStringLiteral("Barcode"), d->m_engine.newQObject(d->m_barcodeApi));
d->m_engine.globalObject().setProperty(QStringLiteral("Context"), d->m_engine.newQObject(d->m_context));
}
/** Sets the barcode decoder used by the "Barcode" script API. */
void ExtractorScriptEngine::setBarcodeDecoder(BarcodeDecoder *barcodeDecoder)
{
    // the script API objects only exist once the engine has been set up
    ensureInitialized();

    auto *barcodeApi = d->m_barcodeApi;
    barcodeApi->setDecoder(barcodeDecoder);
}
static void printScriptError(const QJSValue &result)
{
// don't change the formatting without adjusting KItinerary Workbench too!
qCWarning(Log).noquote().nospace() << "JS ERROR: [" << result.property(QStringLiteral("fileName")).toString()
<< "]:" << result.property(QStringLiteral("lineNumber")).toInt() << ": " << result.toString();
}
bool ExtractorScriptEnginePrivate::loadScript(const QString &fileName)
{
// TODO we could skip this is if the right script is already loaded
// we cannot do this unconditionally however without breaking KItinerary Workbench's live editing
if (fileName.isEmpty()) {
return false;
}
QFile f(fileName);
if (!f.open(QFile::ReadOnly)) {
qCWarning(Log) << "Failed to open extractor script" << f.fileName() << f.errorString();
return false;
}
auto result = m_engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName());
if (result.isError()) {
printScriptError(result);
return false;
}
return true;
}
/** Runs the script of @p extractor on @p node.
 *
 *  The script file is (re-)evaluated, then the extractor's entry point is called with
 *  the content of @p node as first and @p node itself as second argument. Content and
 *  location of @p triggerNode are made available via the legacy "Context" script API.
 *
 *  @returns the objects returned by the script (a single object or the object elements
 *  of an array), or a default-constructed result if the script could not be loaded, the
 *  entry point is not callable or the script call ended in an error.
 */
ExtractorResult ExtractorScriptEngine::execute(const ScriptExtractor *extractor, const ExtractorDocumentNode &node, const ExtractorDocumentNode &triggerNode) const
{
    // lazy setup has to be possible from this const method as well
    const_cast<ExtractorScriptEngine*>(this)->ensureInitialized();

    const auto scriptFile = extractor->scriptFileName();
    if (!d->loadScript(scriptFile)) {
        return {};
    }

    const auto entryPoint = extractor->scriptFunction();
    auto mainFunc = d->m_engine.globalObject().property(entryPoint);
    if (!mainFunc.isCallable()) {
        qCWarning(Log) << "Script entry point not found!" << entryPoint;
        return {};
    }
    qCDebug(Log) << "Running script extractor" << scriptFile << entryPoint;

    // the node only refers to the script engine for the duration of the script run
    node.setScriptEngine(&d->m_engine);
    const auto engineReset = qScopeGuard([&node]{ node.setScriptEngine(nullptr); });

    // populate the Context API and the context date of the other script APIs
    auto *context = d->m_context;
    context->m_data = d->m_engine.toScriptValue(node.result().jsonLdResult());
    context->m_barcode = triggerNode.content();
    context->m_pdfPageNum = triggerNode.location().toInt();
    context->m_senderDate = node.contextDateTime();
    d->m_jsonLdApi->setContextDate(node.contextDateTime());
    d->m_barcodeApi->setContextDate(node.contextDateTime());

    const auto nodeArg = d->m_engine.toScriptValue(node);
    const auto dataArg = nodeArg.property(QLatin1String("content"));
    const QJSValueList callArgs{ dataArg, nodeArg };
    const auto result = mainFunc.call(callArgs);
    if (result.isError()) {
        printScriptError(result);
        return {};
    }

    QJsonArray extracted;
    const auto appendObject = [&extracted](const QJSValue &value) {
        extracted.push_back(QJsonValue::fromVariant(value.toVariant()));
    };

    if (result.isArray()) {
        // only object elements are taken over, anything else in the array is skipped
        for (QJSValueIterator it(result); it.hasNext();) {
            it.next();
            const auto element = it.value();
            if (!element.isObject()) {
                continue;
            }
            appendObject(element);
        }
    } else if (result.isObject()) {
        appendObject(result);
    } else {
        qCWarning(Log) << "Invalid result type from script";
    }

    return extracted;
}
/*
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_EXTRACTORSCRIPTENGINE_P_H
#define KITINERARY_EXTRACTORSCRIPTENGINE_P_H
#include <memory>
namespace KItinerary {
class BarcodeDecoder;
class ExtractorDocumentNode;
class ExtractorResult;
class ScriptExtractor;
class ExtractorScriptEnginePrivate;
/** JavaScript execution environment for KItinerary::ScriptExtractor instances.
 *  The private state is created on first use rather than in the constructor.
 */
class ExtractorScriptEngine
{
public:
explicit ExtractorScriptEngine();
~ExtractorScriptEngine();
/** Sets the barcode decoder used by the barcode API exposed to scripts. */
void setBarcodeDecoder(BarcodeDecoder *barcodeDecoder);
/** Loads the script of @p extractor and calls its entry point for @p node.
 *  @param triggerNode the node whose content and location are exposed via the Context script API.
 *  @returns a default-constructed result if the script cannot be loaded, its entry point
 *  is not callable or the script call results in an error.
 */
ExtractorResult execute(const ScriptExtractor *extractor, const ExtractorDocumentNode &node, const ExtractorDocumentNode &triggerNode) const;
private:
/** Creates the private state on first use, does nothing on later calls. */
void ensureInitialized();
std::unique_ptr<ExtractorScriptEnginePrivate> d;
};
}
#endif // KITINERARY_EXTRACTORSCRIPTENGINE_P_H
/*
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "scriptextractor.h"
#include "extractorscriptengine_p.h"
#include "logging.h"
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/ExtractorFilter>
#include <KItinerary/ExtractorResult>
#include <QFile>
#include <QFileInfo>
#include <QJsonArray>
#include <QJsonObject>
using namespace KItinerary;
namespace KItinerary {
/** Private data of ScriptExtractor. */
class ScriptExtractorPrivate
{
public:
// MIME type a document node needs to have for this extractor to be applicable
QString m_mimeType;
// file name of the meta data file this extractor was loaded from
QString m_fileName;
// path of the script file
QString m_scriptName;
// name of the entry point function in the script
QString m_scriptFunction;
std::vector<ExtractorFilter> m_filters;
// index passed to ScriptExtractor::load(), negative values are left out of ScriptExtractor::name()
int m_index = -1;
};
}
// Creates an empty extractor, to be populated by load() or the setter methods.
ScriptExtractor::ScriptExtractor()
: d(std::make_unique<ScriptExtractorPrivate>())
{
}
// Defined here, as the header only forward-declares ScriptExtractorPrivate.
ScriptExtractor::~ScriptExtractor() = default;
/** Loads the extractor meta data from @p obj.
 *
 *  Any filters and script path from a previous load() or setter call are replaced.
 *
 *  @param obj JSON object with the keys @c mimeType, @c filter, @c script and @c function.
 *  @param fileName path of the meta data file @p obj was read from, relative script
 *  paths are resolved against its location.
 *  @param index position of @p obj in the meta data file, a negative value is left out of name().
 *  @returns @c false if a filter is invalid, the script file does not exist,
 *  no filter is specified or the MIME type is empty.
 */
bool ScriptExtractor::load(const QJsonObject &obj, const QString &fileName, int index)
{
    // start from a clean slate, so that re-loading neither accumulates filters
    // nor keeps the script path of a previous definition
    d->m_filters.clear();
    d->m_scriptName.clear();

    d->m_fileName = fileName;
    d->m_index = index;
    d->m_mimeType = obj.value(QLatin1String("mimeType")).toString();

    const auto filterArray = obj.value(QLatin1String("filter")).toArray();
    for (const auto &filterValue : filterArray) {
        ExtractorFilter f;
        if (!f.load(filterValue.toObject())) {
            qCDebug(Log) << "invalid filter expression:" << fileName;
            return false;
        }
        d->m_filters.push_back(std::move(f));
    }

    const auto scriptName = obj.value(QLatin1String("script")).toString();
    if (!scriptName.isEmpty()) {
        if (QFileInfo(scriptName).isAbsolute()) {
            // toJson() writes the full script path for scripts outside of the
            // meta data directory, such a path must not be prefixed again
            d->m_scriptName = scriptName;
        } else {
            d->m_scriptName = QFileInfo(fileName).path() + QLatin1Char('/') + scriptName;
        }
    }

    if (!d->m_scriptName.isEmpty() && !QFile::exists(d->m_scriptName)) {
        qCWarning(Log) << "Script file not found:" << d->m_scriptName;
        return false;
    }

    d->m_scriptFunction = obj.value(QLatin1String("function")).toString(QStringLiteral("main"));
    return !d->m_filters.empty() && !d->m_mimeType.isEmpty();
}
/** Serializes the extractor meta data to a JSON object.
 *  The script is written as plain file name if it is located in the same
 *  directory as the meta data file, and as the stored script path otherwise.
 */
QJsonObject ScriptExtractor::toJson() const
{
    const QFileInfo metaInfo(d->m_fileName);
    const QFileInfo scriptInfo(d->m_scriptName);
    const bool sameDirectory = metaInfo.canonicalPath() == scriptInfo.canonicalPath();

    QJsonArray filterArray;
    for (const auto &filter : d->m_filters) {
        filterArray.push_back(filter.toJson());
    }

    QJsonObject result;
    result.insert(QStringLiteral("mimeType"), d->m_mimeType);
    result.insert(QStringLiteral("script"), sameDirectory ? scriptInfo.fileName() : d->m_scriptName);
    result.insert(QStringLiteral("function"), d->m_scriptFunction);
    result.insert(QStringLiteral("filter"), filterArray);
    return result;
}
/** Name of this extractor: the base name of the meta data file,
 *  followed by ':' and the index if the index is not negative.
 */
QString ScriptExtractor::name() const
{
    QString result = QFileInfo(d->m_fileName).baseName();
    if (d->m_index >= 0) {
        result += QLatin1Char(':');
        result += QString::number(d->m_index);
    }
    return result;
}
// MIME type a document node has to have for canHandle() to accept it.
QString ScriptExtractor::mimeType() const
{
return d->m_mimeType;
}
void ScriptExtractor::setMimeType(const QString &mimeType)
{
d->m_mimeType = mimeType;
}
// Path of the script file; stored as given, existence is only checked by load().
QString ScriptExtractor::scriptFileName() const
{
return d->m_scriptName;
}
void ScriptExtractor::setScriptFileName(const QString &script)
{
d->m_scriptName = script;
}
// Name of the script function used as entry point.
QString ScriptExtractor::scriptFunction() const
{
return d->m_scriptFunction;
}
void ScriptExtractor::setScriptFunction(const QString &func)
{
d->m_scriptFunction = func;
}
// File name of the meta data file, as passed to load().
QString ScriptExtractor::fileName() const
{
return d->m_fileName;
}
// Filters evaluated by canHandle() and extract().
const std::vector<ExtractorFilter>& ScriptExtractor::filters() const
{
return d->m_filters;
}
// Replaces the current filters, taking over the content of @p filters.
void ScriptExtractor::setFilters(std::vector<ExtractorFilter> &&filters)
{
d->m_filters = std::move(filters);
}
/** Checks whether this extractor is applicable to @p node.
 *  That is the case if the MIME type of @p node equals the one of this extractor,
 *  and there is either no filter at all or at least one filter matches.
 */
bool ScriptExtractor::canHandle(const ExtractorDocumentNode &node) const
{
    if (node.mimeType() != d->m_mimeType) {
        return false;
    }

    // no filters matches always
    const auto &filters = d->m_filters;
    if (filters.empty()) {
        return true;
    }

    for (const auto &filter : filters) {
        if (!filter.matches(node).isNull()) {
            return true;
        }
    }
    return false;
}
/** Runs the extractor script on @p node using the script engine of @p engine.
 *  The node returned by the first matching filter is passed along as trigger node,
 *  which remains a null node if no filter matches.
 */
ExtractorResult ScriptExtractor::extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const
{
    ExtractorDocumentNode triggerNode;

    // stop at the first filter that yields a non-null node
    const auto &filters = d->m_filters;
    auto it = filters.begin();
    while (it != filters.end()) {
        triggerNode = it->matches(node);
        if (!triggerNode.isNull()) {
            break;
        }
        ++it;
    }

    const auto *scriptEngine = engine->scriptEngine();
    return scriptEngine->execute(this, node, triggerNode);
}
/*
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_SCRIPTEXTRACTOR_H
#define KITINERARY_SCRIPTEXTRACTOR_H
#include "abstractextractor.h"
#include <memory>
#include <vector>
class QJsonObject;
class QString;
namespace KItinerary {
class ExtractorFilter;
class ScriptExtractorPrivate;
/** A single unstructured data extraction rule set.
*
* These rules are loaded from JSON meta-data files in a compiled-in qrc file,
* or from $XDG_DATA_DIRS/kitinerary/extractors.
*
* @section extractor_metadata Meta Data Format
*
* The meta-data files either contain a single JSON object or an array of JSON objects
* with the following content:
* - \c mimeType: The MIME type of the document nodes this extractor can process. This is mandatory.
* - \c filter: An array of filters that are used to select this extractor for a given input file.
*   At least one filter is required.
* - \c script: A JavaScript file to execute, relative to the location of the meta-data file.
* - \c function: The entry point in the above mentioned script, @c main if not specified.
*
* The script function is called with two arguments: the content of the document node
* that is being extracted, and the document node itself.
*
* NOTE(review): the following list describes the arguments of the previous \c type based
* meta-data format (see also ExtractorInput::Type), confirm against the document model
* which content type is passed for which MIME type:
* - \c Text: plain text, the argument to the script function is a single string.
* - \c Html: HTML documents, the argument to the script function is a HtmlDocument instance.
* - \c Pdf: PDF documents, the argument to the script function is a PdfDocument instance.
* - \c PkPass: Apple Wallet passes, the argument to the script function is a KPkPass::BoardingPass instance.
* - \c ICal: iCalendar events, the argument to the script function is a KCalendarCore::Event instance.
*
* Filter definitions have the following field:
* - \c mimeType: The MIME type of the document part this filter can match against.
* - \c field: The name of the field to match against. This can be a field id in an Apple Wallet pass,
* a MIME message header name, a property on a Json-LD object or an iCal calendar or event.
* For plain text or binary content, this is ignored.
* - \c match: A regular expression that is matched against the specified value (see QRegularExpression).
* - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted.
* One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default).
*
* Example:
* @code
* [
* {
* "mimeType": "application/pdf",
* "filter": [ { "field": "From", "match": "@swiss.com", "mimeType": "message/rfc822", "scope": "Ancestors" } ],
* "script": "swiss.js",
* "function": "parsePdf"
* },
* {
* "mimeType": "application/vnd.apple.pkpass",
* "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "mimeType": "application/vnd.apple.pkpass", "scope": "Current" } ],
* "script": "swiss.js",
* "function": "parsePkPass"
* }
* ]
* @endcode
*
* @section extractor_development Development
*
* For development it's convenient to symlink the extractors source folder to
* $XDG_DATA_DIRS/kitinerary/extractors, so you can re-run a changed extractor
* script without recompiling or restarting the application.
*
*/
class KITINERARY_EXPORT ScriptExtractor : public AbstractExtractor
{
public:
explicit ScriptExtractor();
~ScriptExtractor();
/** Base name of the meta-data file, followed by ':' and the index passed to load() if that is not negative. */
QString name() const override;
/** Returns @c true if @p node has the MIME type of this extractor and, if there are any filters, at least one of them matches. */
bool canHandle(const ExtractorDocumentNode &node) const override;
/** Runs the extractor script on @p node, using the script engine provided by @p engine. */
ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
/** The JS script containing the code of the extractor. */
QString scriptFileName() const;
/** The JS function entry point for this extractor, load() sets this to @c main if not specified. */
QString scriptFunction() const;
/** Mime type this script extractor supports. */
QString mimeType() const;
/** Returns the filters deciding whether this extractor should be applied. */
const std::vector<ExtractorFilter> &filters() const;
///@cond internal
/** Load meta data from the given JSON object.
 *  @returns @c false if the meta data is invalid or the script file does not exist.
 */
bool load(const QJsonObject &obj, const QString &fileName, int index = -1);
/** Save extractor meta data to a JSON object. */
QJsonObject toJson() const;
/** Source file name. */
QString fileName() const;
/** Sets the MIME type this extractor supports. */
void setMimeType(const QString &mimeType);
/** Sets the path of the JS script, this is stored as-is. */
void setScriptFileName(const QString &script);
/** Sets the name of the JS entry point function. */
void setScriptFunction(const QString &func);
/** Replaces the filters of this extractor. */
void setFilters(std::vector<ExtractorFilter> &&filters);
///@endcond
private:
std::unique_ptr<ScriptExtractorPrivate> d;
};
}
#endif // KITINERARY_SCRIPTEXTRACTOR_H
......@@ -24,6 +24,7 @@
#include "vdv/vdvticketparser.h"
#include "engine/extractordocumentnodefactory.h"
#include "engine/extractorscriptengine_p.h"
#include "jsapi/barcode.h"
#include "jsapi/context.h"
#include "jsapi/jsonld.h"
......@@ -106,6 +107,7 @@ public:
BarcodeDecoder m_barcodeDecoder;
QString m_externalExtractor;
QString m_usedExtractor;
ExtractorScriptEngine m_scriptEngine;
};
template <typename T>
......@@ -787,3 +789,8 @@ const BarcodeDecoder* ExtractorEngine::barcodeDecoder() const
return &d->m_barcodeDecoder;
}
// Returns the script engine of this extractor engine.
// The barcode decoder of this engine is handed to the script engine on every call.
const ExtractorScriptEngine* ExtractorEngine::scriptEngine() const
{
d->m_scriptEngine.setBarcodeDecoder(&d->m_barcodeDecoder);
return &d->m_scriptEngine;
}
......@@ -40,6 +40,7 @@ class BarcodeDecoder;
class Extractor;
class ExtractorDocumentNodeFactory;
class ExtractorEnginePrivate;
class ExtractorScriptEngine;
class HtmlDocument;
class PdfDocument;
......@@ -221,6 +222,11 @@ public:
*/
const BarcodeDecoder* barcodeDecoder() const;
///@cond internal
/** JavaScript execution engine for script extractors. */
const ExtractorScriptEngine* scriptEngine() const;
///@endcond
private:
std::unique_ptr<ExtractorEnginePrivate> d;
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment