Commit 716464b8 authored by Volker Krause
Browse files

Make the extractor filter match scope explicit

That is, which parts of the document need to match, relative to the part
being considered for extraction. So far this has been entirely implicit,
based on the types of the matching and extracted parts. Specifying it
explicitly will allow us to remove further implicit type-specific logic
from the core engine, while giving us even more flexibility.

This information isn't actually used yet; this is only a small preparation
for a larger upcoming rework of the extractor engine.
parent 333834f5
Pipeline #54575 passed with stages
in 11 minutes and 30 seconds
......@@ -50,19 +50,21 @@ class ExtractorRepositoryPrivate;
* A MIME message header name, a property on a Json-LD object or an iCal calendar or event.
* For @c Text or @c Barcode, this is ignored.
* - \c match: A regular expression that is matched against the specified value (see QRegularExpression).
* - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted.
* One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default).
*
* Example:
* @code
* [
* {
* "type": "Pdf",
* "filter": [ { "field": "From", "match": "@swiss.com", "type": "Email" } ],
* "filter": [ { "field": "From", "match": "@swiss.com", "type": "Email", "scope": "Ancestors" } ],
* "script": "swiss.js",
* "function": "parsePdf"
* },
* {
* "type": "PkPass",
* "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "type": "PkPass" } ],
* "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "type": "PkPass", "scope": "Current" } ],
* "script": "swiss.js",
* "function": "parsePkPass"
* }
......
/*
SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
......@@ -8,6 +8,8 @@
#include "logging.h"
#include <QJsonObject>
#include <QMetaEnum>
#include <QRegularExpression>
using namespace KItinerary;
......@@ -18,6 +20,7 @@ public:
ExtractorInput::Type m_type = ExtractorInput::Unknown;
QString m_fieldName;
QRegularExpression m_exp;
ExtractorFilter::Scope m_scope = ExtractorFilter::Current;
};
}
......@@ -73,6 +76,19 @@ static bool needsFieldName(ExtractorInput::Type type)
}
}
/** Reads a Q_ENUM value of type @p T from a JSON value holding the enumerator name.
 *
 *  @param v JSON value expected to contain the enumerator key as a string.
 *  @param defaultValue Returned when @p v is not a string or does not name
 *         an enumerator of @p T.
 *  @return The enumerator named by @p v, or @p defaultValue.
 */
template <typename T>
static T readEnum(const QJsonValue &v, T defaultValue = {})
{
    if (!v.isString()) {
        return defaultValue;
    }

    // Keep the UTF-8 key alive in a named local for the duration of the lookup.
    const auto key = v.toString().toUtf8();
    bool success = false;
    const int value = QMetaEnum::fromType<T>().keyToValue(key.constData(), &success);

    // Convert to T only after a successful lookup: on failure keyToValue()
    // yields -1, and casting an out-of-range integer to an enumeration
    // without a fixed underlying type is undefined behaviour.
    return success ? static_cast<T>(value) : defaultValue;
}
bool ExtractorFilter::load(const QJsonObject &obj)
{
d->m_type = ExtractorInput::typeFromName(obj.value(QLatin1String("type")).toString());
......@@ -81,17 +97,19 @@ bool ExtractorFilter::load(const QJsonObject &obj)
}
d->m_fieldName = obj.value(QLatin1String("field")).toString();
d->m_exp.setPattern(obj.value(QLatin1String("match")).toString());
d->m_scope = readEnum<ExtractorFilter::Scope>(obj.value(QLatin1String("scope")), ExtractorFilter::Current);
return d->m_type != ExtractorInput::Unknown && (!d->m_fieldName.isEmpty() || !needsFieldName(d->m_type)) && d->m_exp.isValid();
}
/** Serializes this filter to a JSON object.
 *
 *  The object always contains "type", "match" and "scope"; "field" is only
 *  written for input types for which needsFieldName() returns true.
 *  "scope" is written as the enumerator name of the current Scope value.
 */
QJsonObject ExtractorFilter::toJson() const
{
    QJsonObject obj;
    // Each key is inserted exactly once; inserting the same key/value a
    // second time would only overwrite the entry with an identical value.
    obj.insert(QLatin1String("type"), ExtractorInput::typeToString(d->m_type));
    if (needsFieldName(d->m_type)) {
        obj.insert(QLatin1String("field"), d->m_fieldName);
    }
    obj.insert(QLatin1String("match"), pattern());
    // valueToKey() maps the enum value back to its enumerator name.
    obj.insert(QLatin1String("scope"), QLatin1String(QMetaEnum::fromType<ExtractorFilter::Scope>().valueToKey(d->m_scope)));
    return obj;
}
......@@ -105,3 +123,14 @@ void ExtractorFilter::setPattern(const QString &pattern)
d.detach();
d->m_exp.setPattern(pattern);
}
/** Returns the scope currently stored for this filter. */
ExtractorFilter::Scope ExtractorFilter::scope() const
{
    const auto currentScope = d->m_scope;
    return currentScope;
}
/** Sets the scope of this filter to @p scope. */
void ExtractorFilter::setScope(Scope scope)
{
// Detach explicitly before writing, as setPattern() does.
// NOTE(review): presumably this keeps other copies sharing the private
// data unmodified - confirm against the declaration of d.
d.detach();
d->m_scope = scope;
}
/*
SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef EXTRACTORFILTER_H
#define EXTRACTORFILTER_H
#ifndef KITINERARY_EXTRACTORFILTER_H
#define KITINERARY_EXTRACTORFILTER_H
#include "kitinerary_export.h"
#include "extractorinput.h"
#include <QRegularExpression>
#include <QByteArray>
#include <QExplicitlySharedDataPointer>
#include <qobjectdefs.h>
class QJsonObject;
......@@ -23,6 +23,7 @@ class ExtractorFilterPrivate;
/** Determines whether an extractor is applicable to a given email. */
class KITINERARY_EXPORT ExtractorFilter
{
Q_GADGET
public:
ExtractorFilter();
~ExtractorFilter();
......@@ -40,6 +41,18 @@ public:
/** Pattern to match field value against. */
QString pattern() const;
/** Specifies which document nodes should match this filter, relative to the one being extracted.
 *  The enumerator names are the strings read from and written to the
 *  "scope" property of the JSON filter description; @c Current is the default.
 */
enum Scope {
Current, ///< match the node being extracted (default)
Parent, ///< match the direct parent node
Children, ///< match the direct child nodes
Ancestors, ///< match any direct or indirect parent nodes
Descendants, ///< match any direct or indirect child nodes
};
// Registers Scope with the meta-object system so QMetaEnum can convert
// between enumerator names and values.
Q_ENUM(Scope)
/** Evaluation scope of this filter, in relation to the node being extracted. */
Scope scope() const;
///@cond internal
/** Load filter from @p obj. */
bool load(const QJsonObject &obj);
......@@ -49,6 +62,7 @@ public:
void setType(ExtractorInput::Type type);
void setFieldName(const QString &fieldName);
void setPattern(const QString &pattern);
void setScope(Scope scope);
///@endcond
private:
......@@ -57,4 +71,4 @@ private:
}
#endif // EXTRACTORFILTER_H
#endif // KITINERARY_EXTRACTORFILTER_H
......@@ -3,7 +3,8 @@
{
"field": "From",
"match": "@reservation.accor-mail.com",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "parseConfirmation",
......
......@@ -3,7 +3,8 @@
{
"field": "From",
"match": "@acprail.com",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "main",
......
[{
"type": "html",
"filter": [
{ "field": "From", "match": "aerlingus.com", "type": "Email" }
{ "field": "From", "match": "aerlingus.com", "type": "Email", "scope": "Ancestors" }
],
"script": "aerlingus.js"
}]
......@@ -3,7 +3,8 @@
{
"field": "From",
"match": "@agoda.com",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "main",
......
{
"type": "pdf",
"filter": [
{ "field": "From", "match": "@airbaltic.com", "type": "Email" },
{ "field": "From", "match": "@airbaltic.com", "type": "Email", "scope": "Ancestors" },
{ "field": "reservationFor.airline.iataCode", "match": "BT", "type": "JsonLd" }
],
"script": "airbaltic.js"
......
......@@ -3,7 +3,8 @@
{
"field": "From",
"match": "@aircoach.ie",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "main",
......
......@@ -2,16 +2,16 @@
{
"type": "text",
"filter": [
{ "field": "From", "match": "@amadeus.com", "type": "Email" },
{ "field": "Message-ID", "match": "amadeus.com", "type": "Email" }
{ "field": "From", "match": "@amadeus.com", "type": "Email", "scope": "Ancestors" },
{ "field": "Message-ID", "match": "amadeus.com", "type": "Email", "scope": "Ancestors" }
],
"script": "amadeus.js"
},
{
"type": "ical",
"filter": [
{ "field": "From", "match": "@amadeus.com", "type": "Email" },
{ "field": "productId", "match": "//AMADEUS//", "type": "ICal" }
{ "field": "From", "match": "@amadeus.com", "type": "Email", "scope": "Ancestors" },
{ "field": "productId", "match": "//AMADEUS//", "type": "ICal", "scope": "Parent" }
],
"script": "amadeus.js",
"function": "parseEvent"
......
{
"type": "pdf",
"filter": [
{ "field": "From", "match": "americanairlines@aa.com", "type": "Email" },
{ "field": "From", "match": "americanairlines@aa.com", "type": "Email", "scope": "Ancestors" },
{ "field": "reservationFor.airline.iataCode", "match": "AA", "type": "JsonLd" }
],
"script": "americanairlines.js"
......
{
"filter": [
{ "field": "From", "match": "@aohostels.com", "type": "Email" }
{ "field": "From", "match": "@aohostels.com", "type": "Email", "scope": "Ancestors" }
],
"script": "aohostels.js",
"type": "Text"
......
[
{
"type": "html",
"filter": [ { "field": "From", "match": "@availpro.com", "type": "Email" } ],
"filter": [ { "field": "From", "match": "@availpro.com", "type": "Email", "scope": "Ancestors" } ],
"script": "availpro.js",
"function": "parseHtml"
}
......
[
{
"type": "text",
"filter": [ { "field": "From", "match": "@booking.com", "type": "Email" } ],
"filter": [ { "field": "From", "match": "@booking.com", "type": "Email", "scope": "Ancestors" } ],
"script": "booking.js"
},
{
"type": "html",
"filter": [ { "field": "From", "match": "@booking.com", "type": "Email" } ],
"filter": [ { "field": "From", "match": "@booking.com", "type": "Email", "scope": "Ancestors" } ],
"script": "booking.js",
"function": "parseHtml"
}
......
......@@ -2,14 +2,14 @@
{
"type": "html",
"filter": [
{ "field": "From", "match": "@brusselsairlines.com", "type": "Email" }
{ "field": "From", "match": "@brusselsairlines.com", "type": "Email", "scope": "Ancestors" }
],
"script": "brusselsairlines.js"
},
{
"type": "pdf",
"filter": [
{ "field": "From", "match": "brusselsairlines.com", "type": "Email" },
{ "field": "From", "match": "brusselsairlines.com", "type": "Email", "scope": "Ancestors" },
{ "field": "reservationFor.airline.iataCode", "match": "SN", "type": "JsonLd" }
],
"script": "brusselsairlines-receipt.js"
......
{
"filter": [
{ "field": "From", "match": "info@cd.cz", "type": "Email" },
{ "field": "From", "match": "eshop@cd.cz", "type": "Email" }
{ "field": "From", "match": "info@cd.cz", "type": "Email", "scope": "Ancestors" },
{ "field": "From", "match": "eshop@cd.cz", "type": "Email", "scope": "Ancestors" }
],
"script": "czechrailways.js",
"type": "Text"
......
......@@ -4,7 +4,8 @@
{
"field": "From",
"match": "buchungsbestaetigung@bahn.de",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
},
{
"match": "uic:0080",
......@@ -26,7 +27,8 @@
{
"field": "From",
"match": "UNITTEST-buchungsbestaetigung@bahn.de",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "parseText",
......@@ -38,7 +40,8 @@
{
"field": "From",
"match": "buchungsbestaetigung@bahn.de",
"type": "Email"
"type": "Email",
"scope": "Ancestors"
}
],
"function": "parseCancellation",
......
[
{
"type": "ical",
"filter": [ { "field": "productId", "match": "//DinnerBooking//", "type": "ICal" } ],
"filter": [ { "field": "productId", "match": "//DinnerBooking//", "type": "ICal", "scope": "Parent" } ],
"script": "dinnerbooking.js",
"function": "parseEvent"
}
......
[{
"type": "pkpass",
"filter": [ { "field": "passTypeIdentifier", "match": "pass.com.blueend.customer.eap.ticket", "type": "PkPass" } ],
"filter": [ { "field": "passTypeIdentifier", "match": "pass.com.blueend.customer.eap.ticket", "type": "PkPass", "scope": "Current" } ],
"script": "easyairportparking-pkpass.js"
}]
[
{
"type": "html",
"filter": [ { "field": "From", "match": "@easyjet.com", "type": "Email" } ],
"filter": [ { "field": "From", "match": "@easyjet.com", "type": "Email", "scope": "Ancestors" } ],
"script": "easyjet.js",
"function": "parseHtmlBooking"
},
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment