Members of the KDE Community are encouraged to subscribe to the kde-community mailing list at https://mail.kde.org/mailman/listinfo/kde-community so that they can participate in important discussions and receive other important announcements.

Commit 5b6a3ebe authored by Volker Krause

Replace the declarative extractor definitions with JavaScript

The approach worked technically, but turned out fairly hard to work with.
JavaScript is a bit more verbose, but easier to work with as it doesn't
enforce a very specific way of modeling the extractors. Being able to do
printf debugging inside the extractor code rather than reviewing a full
extractor rule execution trace to find mistakes is also convenient.
parent e93bcaf5
......@@ -17,7 +17,7 @@
"departureTime": "2027-11-15T06:46:00",
"trainNumber": "EC 171"
},
"reservationNumber": ""
"reservationNumber": "XXX007"
},
{
"@type": "TrainReservation",
......@@ -37,6 +37,6 @@
"departureTime": "2027-11-17T13:39:00",
"trainNumber": "EC 170"
},
"reservationNumber": ""
"reservationNumber": "XXX007"
}
]
......@@ -68,7 +68,7 @@ private Q_SLOTS:
QVERIFY(f.open(QFile::ReadOnly));
Extractor extractor;
QVERIFY(extractor.load(QLatin1String(":/org.kde.messageviewer/semantic/rules/") + extractorName + QLatin1String(".xml")));
QVERIFY(extractor.load(QLatin1String(":/org.kde.messageviewer/semantic/rules/") + extractorName + QLatin1String(".json")));
ExtractorEngine engine;
engine.setText(QString::fromUtf8(f.readAll()));
......@@ -115,7 +115,7 @@ private Q_SLOTS:
QVERIFY(f.open(QFile::ReadOnly));
Extractor extractor;
QVERIFY(extractor.load(QLatin1String(":/org.kde.messageviewer/semantic/rules/") + extractorName + QLatin1String(".xml")));
QVERIFY(extractor.load(QLatin1String(":/org.kde.messageviewer/semantic/rules/") + extractorName + QLatin1String(".json")));
ExtractorPreprocessor preproc;
preproc.preprocessHtml(QString::fromUtf8(f.readAll()));
......@@ -137,6 +137,6 @@ private Q_SLOTS:
}
};
QTEST_APPLESS_MAIN(UnstructuredDataExtractorTest)
QTEST_MAIN(UnstructuredDataExtractorTest)
#include "unstructureddataextractortest.moc"
......@@ -10,13 +10,11 @@ set(semantic_lib_srcs
airportdb/airportdb.cpp
datatypes.cpp
extractor.cpp
extractorcontext.cpp
extractorengine.cpp
extractorfilter.cpp
extractorpreprocessor.cpp
extractorpostprocessor.cpp
extractorrepository.cpp
extractorrule.cpp
jsonlddocument.cpp
structureddataextractor.cpp
)
......@@ -24,7 +22,7 @@ qt5_add_resources(semantic_lib_srcs rules/rules.qrc)
ecm_qt_declare_logging_category(semantic_lib_srcs HEADER semantic_debug.h IDENTIFIER SEMANTIC_LOG CATEGORY_NAME org.kde.pim.messageviewer.semantic)
add_library(semantic_extractor STATIC ${semantic_lib_srcs})
set_target_properties(semantic_extractor PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(semantic_extractor PUBLIC Qt5::Core KF5::Mime)
target_link_libraries(semantic_extractor PUBLIC Qt5::Core KF5::Mime PRIVATE Qt5::Qml)
if (HAVE_POPPLER)
target_link_libraries(semantic_extractor PRIVATE Poppler::Qt5)
endif()
......
......@@ -18,66 +18,52 @@
*/
#include "extractor.h"
#include "extractorrule.h"
#include "semantic_debug.h"
#include <QFile>
#include <QXmlStreamReader>
#include <QFileInfo>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonParseError>
#include <memory>
Extractor::Extractor() = default;
Extractor::Extractor(Extractor &&) = default;
Extractor::~Extractor()
{
qDeleteAll(m_rules);
}
Extractor::~Extractor() = default;
bool Extractor::load(const QString &fileName)
{
qCDebug(SEMANTIC_LOG) << "loading" << fileName;
QFile file(fileName);
if (!file.open(QFile::ReadOnly)) {
return false;
}
QXmlStreamReader reader(&file);
while (!reader.atEnd()) {
reader.readNext();
if (reader.tokenType() != QXmlStreamReader::StartElement) {
continue;
}
if (reader.name() == QLatin1String("extractor")) {
continue;
}
if (reader.name() == QLatin1String("filter")) {
ExtractorFilter f;
if (!f.load(reader)) {
return false;
}
m_filters.push_back(std::move(f));
continue;
}
auto rule = ExtractorRule::fromXml(reader);
if (rule) {
m_rules.push_back(rule);
}
}
if (reader.hasError()) {
qCWarning(SEMANTIC_LOG) << "Loading error:" << fileName << reader.errorString();
QJsonParseError error;
const auto doc = QJsonDocument::fromJson(file.readAll(), &error);
if (doc.isNull()) {
qCWarning(SEMANTIC_LOG) << "Extractor loading error:" << fileName << error.errorString();
return false;
}
qCDebug(SEMANTIC_LOG) << fileName << "loaded!";
return true;
const auto obj = doc.object();
for (const auto &filterValue : obj.value(QLatin1String("filter")).toArray()) {
ExtractorFilter f;
if (!f.load(filterValue.toObject()))
return false;
m_filters.push_back(std::move(f));
}
const auto scriptName = obj.value(QLatin1String("script")).toString();
QFileInfo fi(fileName);
m_scriptName = fi.absolutePath() + QLatin1Char('/') + scriptName;
return !m_filters.empty() && !m_scriptName.isEmpty() && QFile::exists(m_scriptName);
}
QVector<ExtractorRule *> Extractor::rules() const
QString Extractor::scriptFileName() const
{
return m_rules;
return m_scriptName;
}
const std::vector<ExtractorFilter> &Extractor::filters() const
......
......@@ -22,7 +22,6 @@
#include "extractorfilter.h"
#include <QVector>
#include <vector>
class ExtractorRule;
......@@ -39,11 +38,11 @@ public:
bool load(const QString &fileName);
QVector<ExtractorRule *> rules() const;
QString scriptFileName() const;
const std::vector<ExtractorFilter> &filters() const;
private:
QVector<ExtractorRule *> m_rules;
QString m_scriptName;
std::vector<ExtractorFilter> m_filters;
};
......
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#ifndef EXTRACTORCONTEXT_H
#define EXTRACTORCONTEXT_H
#include <QHash>
#include <QJsonObject>
#include <QVector>
class ExtractorEngine;
class ExtractorRule;
class QJsonValue;
/**
 * Context stack used inside ExtractorEngine.
 *
 * A context refers to the engine it belongs to and to an optional parent
 * context, and carries a list of rules, a text offset, named string
 * variables and a JSON object.
 */
class ExtractorContext
{
public:
/** Creates a context for @p engine; @p parent defaults to nullptr (no parent context). */
ExtractorContext(ExtractorEngine *engine, ExtractorContext *parent = nullptr);
~ExtractorContext();
/** Returns the engine this context was created for. */
ExtractorEngine *engine() const;
/** Returns a mutable reference to the rules attached to this context. */
QVector<ExtractorRule *> &rules();
/** Replaces the rules attached to this context. */
void setRules(const QVector<ExtractorRule *> &rules);
/** Returns the current offset; ExtractorRule::match() uses it as start position in the engine's text. */
int offset() const;
void setOffset(int offset);
/**
 * Returns the value of the variable @p name.
 * NOTE(review): whether the lookup falls back to the parent context is not visible in this header - confirm in the implementation.
 */
QString variableValue(const QString &name) const;
/** Sets the variable @p name to @p value. */
void setVariable(const QString &name, const QString &value);
/** Sets the property @p name to @p value; presumably stored in the object returned by object() - TODO confirm. */
void setProperty(const QString &name, const QJsonValue &value);
QJsonObject object() const;
private:
ExtractorEngine *m_engine;
ExtractorContext *m_parent;
QVector<ExtractorRule *> m_rules;
QHash<QString, QString> m_variables;
QJsonObject m_obj;
// starts at 0 for every newly created context
int m_offset = 0;
};
#endif // EXTRACTORCONTEXT_H
......@@ -18,10 +18,41 @@
*/
#include "extractorengine.h"
#include "extractorcontext.h"
#include "extractorrule.h"
#include "semantic_debug.h"
#include <QDateTime>
#include <QFile>
#include <QLocale>
#include <QJSEngine>
/**
 * QObject whose Q_INVOKABLE methods are offered to extractor scripts.
 * ExtractorEngine::executeScript() installs an instance as the global
 * script property "JsonLd".
 */
class JsApi : public QObject {
Q_OBJECT
public:
/** Creates the API object as a QObject child of @p engine and keeps @p engine for creating script values. */
explicit JsApi(QJSEngine *engine)
: QObject(engine)
, m_engine(engine)
{
}
/** Returns a new script object whose "@type" property is set to @p typeName. */
Q_INVOKABLE QJSValue newObject(const QString &typeName) const;
/** Parses @p dt according to @p format using the locale named @p locale. */
Q_INVOKABLE QDateTime toDateTime(const QString &dt, const QString &format, const QString &locale) const;
private:
// script engine used by newObject(); also the QObject parent of this instance
QJSEngine *m_engine;
};
/**
 * Creates an empty script object in the engine this API belongs to and tags
 * it with the given JSON-LD type.
 *
 * @param typeName value stored in the object's "@type" property.
 * @return the newly created script object.
 */
QJSValue JsApi::newObject(const QString &typeName) const
{
    QJSValue typedObject = m_engine->newObject();
    typedObject.setProperty(QStringLiteral("@type"), typeName);
    return typedObject;
}
/**
 * Parses a date/time string with a locale-aware format.
 *
 * @param dt     the text to parse.
 * @param format the QLocale date/time format string describing @p dt.
 * @param locale name of the locale used for parsing.
 * @return the result of QLocale::toDateTime() for these arguments.
 */
QDateTime JsApi::toDateTime(const QString &dt, const QString &format, const QString &locale) const
{
    const QLocale parsingLocale(locale);
    return parsingLocale.toDateTime(dt, format);
}
ExtractorEngine::ExtractorEngine() = default;
ExtractorEngine::~ExtractorEngine() = default;
......@@ -46,62 +77,44 @@ QJsonArray ExtractorEngine::extract()
return {};
}
qCDebug(SEMANTIC_LOG) << m_text << m_text.size();
ExtractorContext context(this);
context.setRules(m_extractor->rules());
executeContext(&context);
executeScript();
return m_result;
}
static bool isEmptyObject(const QJsonObject &obj)
void ExtractorEngine::executeScript()
{
return obj.size() <= 1 && obj.contains(QLatin1String("@type"));
}
Q_ASSERT(m_extractor);
ExtractorEngine::Result ExtractorEngine::executeContext(ExtractorContext *context)
{
while (!context->rules().isEmpty()) {
QVector<ExtractorRule *> repeatingRules;
for (auto it = context->rules().begin(); it != context->rules().end(); ++it) {
if (!(*it)->match(context)) {
continue;
}
qCDebug(SEMANTIC_LOG) << (*it)->ruleType() << (*it)->dataType() << (*it)->name();
ExtractorContext subContext(this, context);
subContext.setRules((*it)->rules());
subContext.setOffset(context->offset());
switch ((*it)->ruleType()) {
case ExtractorRule::Class:
subContext.setProperty(QLatin1String("@type"), (*it)->dataType());
break;
case ExtractorRule::Break:
return Result::Break;
default:
break;
}
const auto subResult = executeContext(&subContext);
if (subResult == Result::Break) {
return (*it)->repeats() ? Result::Return : Result::Break;
}
if ((*it)->ruleType() == ExtractorRule::Class && !isEmptyObject(subContext.object())) {
if ((*it)->name().isEmpty()) {
m_result.push_back(subContext.object());
} else {
context->setProperty((*it)->name(), subContext.object());
}
}
context->setOffset(subContext.offset());
if ((*it)->repeats()) {
repeatingRules.push_back(*it);
}
}
context->setRules(repeatingRules);
QFile f(m_extractor->scriptFileName());
if (!f.open(QFile::ReadOnly)) {
qCWarning(SEMANTIC_LOG) << "Failed to open extractor script" << f.fileName() << f.errorString();
return;
}
QJSEngine engine;
engine.installExtensions(QJSEngine::ConsoleExtension);
auto jsApi = new JsApi(&engine);
engine.globalObject().setProperty(QStringLiteral("JsonLd"), engine.newQObject(jsApi));
auto result = engine.evaluate(QString::fromUtf8(f.readAll()), f.fileName());
if (result.isError()) {
qCWarning(SEMANTIC_LOG) << "Script parsing error in" << result.property(QLatin1String("fileName")).toString()
<< ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString();
return;
}
return Result::Return;
auto mainFunc = engine.globalObject().property(QLatin1String("main"));
if (!mainFunc.isCallable()) {
qCWarning(SEMANTIC_LOG) << "Script has no main() function!";
return;
}
result = mainFunc.call({m_text});
if (result.isError()) {
qCWarning(SEMANTIC_LOG) << "Script execution error in" << result.property(QLatin1String("fileName")).toString()
<< ':' << result.property(QLatin1String("lineNumber")).toInt() << result.toString();
return;
}
m_result = QJsonArray::fromVariantList(result.toVariant().toList());
}
#include "extractorengine.moc"
......@@ -27,8 +27,6 @@
#include <vector>
class ExtractorContext;
/** Code for executing an extractor rule set on a specific email part. */
class ExtractorEngine
{
......@@ -43,11 +41,7 @@ public:
QJsonArray extract();
private:
enum class Result {
Return,
Break
};
Result executeContext(ExtractorContext *context);
void executeScript();
const Extractor *m_extractor = nullptr;
QString m_text;
......
......@@ -19,7 +19,7 @@
#include "extractorfilter.h"
#include <QXmlStreamReader>
#include <QJsonObject>
ExtractorFilter::ExtractorFilter() = default;
ExtractorFilter::~ExtractorFilter() = default;
......@@ -34,10 +34,9 @@ bool ExtractorFilter::matches(const QString &headerData) const
return m_exp.match(headerData).hasMatch();
}
bool ExtractorFilter::load(QXmlStreamReader &reader)
bool ExtractorFilter::load(const QJsonObject& obj)
{
Q_ASSERT(reader.name() == QLatin1String("filter"));
m_headerName = reader.attributes().value(QLatin1String("header")).toString().toUtf8();
m_exp.setPattern(reader.attributes().value(QLatin1String("match")).toString());
m_headerName = obj.value(QLatin1String("header")).toString().toUtf8();
m_exp.setPattern(obj.value(QLatin1String("match")).toString());
return !m_headerName.isEmpty() && m_exp.isValid();
}
......@@ -23,7 +23,7 @@
#include <QRegularExpression>
#include <QByteArray>
class QXmlStreamReader;
class QJsonObject;
/** Determines whether an extractor is applicable to a given email. */
class ExtractorFilter
......@@ -34,7 +34,7 @@ public:
const char *headerName() const;
bool matches(const QString &headerData) const;
bool load(QXmlStreamReader &reader);
bool load(const QJsonObject &obj);
private:
QByteArray m_headerName;
......
......@@ -61,7 +61,7 @@ std::vector<const Extractor *> ExtractorRepository::extractorsForMessage(KMime::
void ExtractorRepository::loadExtractors()
{
QDirIterator it(QStringLiteral(":/org.kde.messageviewer/semantic/rules"), {QStringLiteral("*.xml")}, QDir::Files);
QDirIterator it(QStringLiteral(":/org.kde.messageviewer/semantic/rules"), {QStringLiteral("*.json")}, QDir::Files);
while (it.hasNext()) {
Extractor e;
if (e.load(it.next())) {
......
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#include "extractorrule.h"
#include "extractorcontext.h"
#include "extractorengine.h"
#include "semantic_debug.h"
#include <QDateTime>
#include <QXmlStreamReader>
#include <memory>
/** Creates a rule of the given @p type; all other properties are filled in by load(). */
ExtractorRule::ExtractorRule(ExtractorRule::Type type)
: m_ruleType(type)
{
}
/** Destroys the rule and deletes every sub-rule stored in m_rules. */
ExtractorRule::~ExtractorRule()
{
qDeleteAll(m_rules);
}
/** Returns the rule type this rule was constructed with. */
ExtractorRule::Type ExtractorRule::ruleType() const
{
return m_ruleType;
}
/** Returns @c true if at least one sub-rule has been loaded for this rule. */
bool ExtractorRule::hasSubRules() const
{
    return !m_rules.isEmpty();
}
/**
 * Returns the sub-rules of this rule.
 * The pointers stay owned by this rule, which deletes them in its destructor.
 */
QVector<ExtractorRule *> ExtractorRule::rules() const
{
return m_rules;
}
/** Returns the content of the "name" attribute read by load(). */
QString ExtractorRule::name() const
{
return m_name;
}
/** Returns the content of the "type" attribute read by load(). */
QString ExtractorRule::dataType() const
{
return m_dataType;
}
/** Returns @c true if load() found the "repeat" attribute set to "true". */
bool ExtractorRule::repeats() const
{
return m_repeat;
}
/**
 * Expands the `${...}` placeholders in this rule's value template.
 *
 * A placeholder whose name parses as an integer is replaced by the capture
 * group with that index of @p match; any other name is replaced by the value
 * of the context variable with that name.
 *
 * @param match   regular expression match supplying the numbered captures.
 * @param context context queried for named variables.
 * @return the expanded template with leading/trailing whitespace removed.
 */
QString ExtractorRule::value(const QRegularExpressionMatch &match, ExtractorContext *context) const
{
    auto v = m_value;
    // Position from which the next placeholder is searched. It is advanced past
    // every substitution so that inserted text (which may come straight from the
    // processed email) is never re-interpreted as a placeholder; re-scanning it
    // could otherwise loop forever, e.g. when a capture itself contains "${1}".
    int searchFrom = 0;
    while (true) {
        const auto begin = v.indexOf(QLatin1String("${"), searchFrom);
        if (begin < 0) {
            break;
        }
        const auto end = v.indexOf(QLatin1Char('}'), begin + 3);
        if (end < 0) {
            // Unterminated placeholder: keep the remaining text verbatim instead
            // of computing negative lengths for mid()/replace() below.
            break;
        }
        const auto varName = v.mid(begin + 2, end - begin - 2);
        bool isNum = false;
        const auto captureIdx = varName.toInt(&isNum);
        const QString replacement = isNum ? match.captured(captureIdx) : context->variableValue(varName);
        v.replace(begin, end - begin + 1, replacement);
        searchFrom = begin + replacement.size();
    }
    return v.trimmed();
}
/** Returns the content of the "format" attribute read by load(). */
QString ExtractorRule::format() const
{
return m_format;
}
/**
 * Returns the locale of this rule.
 * load() replaces it with the locale named by the "locale" attribute when that attribute is present.
 */
QLocale ExtractorRule::locale() const
{
return m_locale;
}
/**
 * Initializes this rule from the XML element @p reader is positioned on and
 * recursively loads all nested rule elements.
 *
 * The attributes "name", "type", "value", "format", "repeat", "match" and
 * (optionally) "locale" are read. An invalid "match" pattern is only reported
 * as a warning and does not abort loading.
 *
 * @param reader XML reader positioned on the start element of the rule.
 * @return @c true once the end element is reached; @c false if a nested
 *         element cannot be turned into a rule or the document ends first.
 */
bool ExtractorRule::load(QXmlStreamReader &reader)
{
    const auto attrs = reader.attributes();
    m_name = attrs.value(QLatin1String("name")).toString();
    m_dataType = attrs.value(QLatin1String("type")).toString();
    m_value = attrs.value(QLatin1String("value")).toString();
    m_format = attrs.value(QLatin1String("format")).toString();
    m_repeat = attrs.value(QLatin1String("repeat")) == QLatin1String("true");

    m_regexp.setPattern(attrs.value(QLatin1String("match")).toString());
    if (!m_regexp.isValid()) {
        qCWarning(SEMANTIC_LOG) << m_regexp.errorString() << m_regexp.pattern() << "at offset" << m_regexp.patternErrorOffset();
    }

    if (attrs.hasAttribute(QLatin1String("locale"))) {
        m_locale = QLocale(attrs.value(QLatin1String("locale")).toString());
    }

    // Consume the element's content: every start element is a nested rule,
    // the first end element seen at this level closes this rule.
    while (!reader.atEnd()) {
        switch (reader.readNext()) {
        case QXmlStreamReader::EndElement:
            return true;
        case QXmlStreamReader::StartElement: {
            ExtractorRule *child = fromXml(reader);
            if (!child) {
                return false;
            }
            m_rules.push_back(child);
            break;
        }
        default:
            break;
        }
    }
    return false;
}
/**
 * Creates the rule matching the XML element @p reader is positioned on
 * ("variable", "class", "property" or "break") and loads it.
 *
 * @param reader XML reader positioned on a rule start element.
 * @return a heap-allocated rule the caller takes ownership of, or @c nullptr
 *         if the element name is unknown or loading the rule failed.
 */
ExtractorRule *ExtractorRule::fromXml(QXmlStreamReader &reader)
{
    const QStringRef elementName = reader.name();

    std::unique_ptr<ExtractorRule> parsed;
    if (elementName == QLatin1String("variable")) {
        parsed.reset(new ExtractorVariableRule);
    } else if (elementName == QLatin1String("class")) {
        parsed.reset(new ExtractorClassRule);
    } else if (elementName == QLatin1String("property")) {
        parsed.reset(new ExtractorPropertyRule);
    } else if (elementName == QLatin1String("break")) {
        parsed.reset(new ExtractorBreakRule);
    }

    // unknown element, or the rule's own content could not be loaded
    if (!parsed || !parsed->load(reader)) {
        return nullptr;
    }
    return parsed.release();
}
/**
 * Tries to match this rule's regular expression against the engine text,
 * starting at the current offset of @p context.
 *
 * On success processMatch() is invoked and the context offset is advanced by
 * the end position of the match.
 *
 * @param context context providing the engine text and the current offset.
 * @return @c true if the expression matched, @c false otherwise.
 */
bool ExtractorRule::match(ExtractorContext *context) const
{
    const auto startOffset = context->offset();
    // use QString::midRef(offset) rather than match(text(), offset) as that makes '^' matches work
    const auto res = m_regexp.match(context->engine()->text().midRef(startOffset));
    if (!res.hasMatch()) {
        return false;
    }

    qCDebug(SEMANTIC_LOG) << name() << res.captured() << startOffset << res.capturedEnd() << context->engine()->text().midRef(startOffset, 20);
    processMatch(res, context);
    // capturedEnd() is relative to the midRef() slice, hence the addition
    context->setOffset(res.capturedEnd() + context->offset());
    return true;
}
/**
 * Hook called by match() after the rule's expression matched.
 * This base implementation ignores both arguments and does nothing.
 */
void ExtractorRule::processMatch(const QRegularExpressionMatch &, ExtractorContext *) const
{
}
/** Creates a rule whose type is ExtractorRule::Variable. */
ExtractorVariableRule::ExtractorVariableRule()
: ExtractorRule(ExtractorRule::Variable)
{
}
void ExtractorVariableRule::processMatch(const QRegularExpressionMatch &match, ExtractorContext *context) const
{
context->setVariable(name(), value(match, context));
}