Commit a1dfa9e2 authored by Volker Krause
Browse files

Implement extractor loading and matching for the new system

This is significantly simpler than the previous implementation, as we no
longer need the type-specific matching logic here.
parent a45611fe
......@@ -4,8 +4,13 @@
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include <KItinerary/AbstractExtractor>
#include <KItinerary/Extractor>
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorDocumentNodeFactory>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/ExtractorRepository>
#include <KItinerary/ScriptExtractor>
#include <QDebug>
#include <QObject>
......@@ -41,8 +46,37 @@ private Q_SLOTS:
QCOMPARE(extractors.size(), 1);
QVERIFY(extractors[0].name().startsWith(QLatin1String("irctc")));
}
void testExtractorsForNode()
{
ExtractorEngine engine;
std::vector<const AbstractExtractor*> extractors;
auto root = engine.documentNodeFactory()->createNode(QStringLiteral("PNR:1234567890,TRAIN:12345,DOJ:dd-mm-yyyy,TIME:hh:mm,SL,A TO B,John Doe+2,S7 49 S7 52 S7 55,FARE:140,SC:10+PG CHGS."), u"text/plain");
QVERIFY(!root.isNull());
engine.extractorRepository()->extractorsForNode(root, extractors);
QCOMPARE(extractors.size(), 1);
QVERIFY(extractors[0]->name().startsWith(QLatin1String("irctc")));
extractors.clear();
root = engine.documentNodeFactory()->createNode(QStringLiteral("i0CVxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxX"), u"text/plain");
QVERIFY(!root.isNull());
engine.extractorRepository()->extractorsForNode(root, extractors);
QCOMPARE(extractors.size(), 1);
QVERIFY(extractors[0]->name().startsWith(QLatin1String("sncf")));
QVERIFY(dynamic_cast<const ScriptExtractor*>(extractors[0]));
extractors.clear();
root = engine.documentNodeFactory()->createNode(QStringLiteral("M1DOE/JOHN EXXX007 TXLBRUSN 2592 110"), u"text/plain");
QVERIFY(!root.isNull());
engine.extractorRepository()->extractorsForNode(root, extractors);
QCOMPARE(extractors.size(), 1);
QVERIFY(extractors[0]->name().startsWith(QLatin1String("<IATA BCBP>")));
QVERIFY(!dynamic_cast<const ScriptExtractor*>(extractors[0]));
extractors.clear();
}
};
QTEST_APPLESS_MAIN(ExtractorRepositoryTest)
QTEST_GUILESS_MAIN(ExtractorRepositoryTest)
#include "extractorrepositorytest.moc"
......@@ -26,6 +26,7 @@ set(kitinerary_lib_srcs
engine/extractordocumentnodefactory.cpp
engine/extractordocumentprocessor.cpp
engine/extractorfilter.cpp
engine/extractorrepository.cpp
engine/extractorresult.cpp
engine/extractorscriptengine.cpp
engine/scriptextractor.cpp
......@@ -104,7 +105,6 @@ set(kitinerary_lib_srcs
extractorengine.cpp
extractorinput.cpp
extractorpostprocessor.cpp
extractorrepository.cpp
extractorutil.cpp
extractorvalidator.cpp
file.cpp
......@@ -185,7 +185,6 @@ ecm_generate_headers(KItinerary_FORWARDING_HEADERS
ExtractorEngine
ExtractorInput
ExtractorPostprocessor
ExtractorRepository
ExtractorValidator
File
HtmlDocument
......@@ -235,6 +234,7 @@ ecm_generate_headers(KItinerary_Engine_FORWARDING_HEADERS
ExtractorDocumentNodeFactory
ExtractorDocumentProcessor
ExtractorFilter
ExtractorRepository
ExtractorResult
ScriptExtractor
PREFIX KItinerary
......
/*
SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "config-kitinerary.h"
#include "extractorrepository.h"
#include "extractor.h"
#include <KItinerary/ExtractorFilter>
#include "logging.h"
#include "extractors/iatabcbpextractor.h"
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorDocumentProcessor>
#include <KItinerary/ExtractorFilter>
#include <KItinerary/ScriptExtractor>
#ifdef HAVE_KCAL
#include <KCalendarCore/Calendar>
......@@ -38,12 +44,16 @@ namespace KItinerary {
/** State and loading logic reached through the `d` pointer of ExtractorRepository.
 *  NOTE(review): this view lists loadExtractors() next to loadAll()/loadScriptExtractors();
 *  confirm which of these declarations are still meant to exist.
 */
class ExtractorRepositoryPrivate {
public:
ExtractorRepositoryPrivate();
void loadExtractors();
/** Runs initBuiltInExtractors() followed by loadScriptExtractors(). */
void loadAll();
/** Registers the extractors that are compiled in (an IataBcbpExtractor instance). */
void initBuiltInExtractors();
/** Loads extractors from the search directories, starting with m_extraSearchPaths. */
void loadScriptExtractors();
void addExtractor(Extractor &&e);
/** Inserts @p e into m_extractorsNew, which is kept ordered by name();
 *  an extractor whose name is already present is not inserted.
 */
void addExtractor(std::unique_ptr<AbstractExtractor> &&e);
void extractorForTypeAndContent(ExtractorInput::Type type, const QString &content, std::vector<Extractor> &extractors) const;
static void insertExtractor(const Extractor &ext, std::vector<Extractor> &extractors);
// value-type extractors, emptied by ExtractorRepository::reload()
std::vector<Extractor> m_extractors;
// owning list of AbstractExtractor instances, searched by ExtractorRepository::extractorsForNode()
std::vector<std::unique_ptr<AbstractExtractor>> m_extractorsNew;
// extra search locations, returned by ExtractorRepository::additionalSearchPaths()
QStringList m_extraSearchPaths;
};
}
......@@ -51,7 +61,18 @@ public:
/** Calls initResources() and then populates the repository.
 *  NOTE(review): both loadExtractors() and loadAll() are called here in this view;
 *  confirm whether the loadExtractors() call is still intended.
 */
ExtractorRepositoryPrivate::ExtractorRepositoryPrivate()
{
// presumably makes the bundled extractor resources available before loading - confirm
initResources();
loadExtractors();
loadAll();
}
void ExtractorRepositoryPrivate::loadAll()
{
initBuiltInExtractors();
loadScriptExtractors();
}
void ExtractorRepositoryPrivate::initBuiltInExtractors()
{
addExtractor(std::make_unique<IataBcbpExtractor>());
}
void ExtractorRepositoryPrivate::extractorForTypeAndContent(ExtractorInput::Type type, const QString &content, std::vector<Extractor> &extractors) const
......@@ -91,7 +112,7 @@ ExtractorRepository::ExtractorRepository(KItinerary::ExtractorRepository &&) noe
/**
 * Drops all loaded extractors and loads them again.
 *
 * Both extractor lists are emptied before loading. Pointers previously handed out
 * by extractorsForNode() refer to the dropped instances and must not be used
 * after this call.
 */
void ExtractorRepository::reload()
{
    d->m_extractors.clear();
    // addExtractor() skips an extractor whose name is already in the list, so without
    // emptying this list as well a reload would keep the stale instances forever
    d->m_extractorsNew.clear();
    d->loadExtractors();
    d->loadAll();
}
const std::vector<Extractor>& ExtractorRepository::allExtractors() const
......@@ -237,6 +258,25 @@ void ExtractorRepository::extractorsForEvent(const KCalendarCore::Event *event,
}
#endif
/**
 * Adds every extractor that can handle @p node to @p extractors.
 *
 * @p extractors is searched with a binary search by pointer value, so it is expected
 * to be ordered that way on entry; new entries are inserted at their ordered position
 * and pointers already present are not added a second time.
 * Nothing is added for a null node.
 */
void ExtractorRepository::extractorsForNode(const ExtractorDocumentNode &node, std::vector<const AbstractExtractor*> &extractors) const
{
    if (node.isNull()) {
        return;
    }

    for (const auto &owned : d->m_extractorsNew) {
        if (!owned->canHandle(node)) {
            continue;
        }

        // each extractor is visited once here, but the caller's list may already contain it
        const auto candidate = owned.get();
        const auto pos = std::lower_bound(extractors.begin(), extractors.end(), candidate, [](auto a, auto b) {
            return a < b;
        });
        const bool alreadyListed = pos != extractors.end() && (*pos) == candidate;
        if (!alreadyListed) {
            extractors.insert(pos, candidate);
        }
    }
}
void ExtractorRepository::extractorsForContent(const QString &content, std::vector<Extractor> &extractors) const
{
d->extractorForTypeAndContent(ExtractorInput::Text, content, extractors);
......@@ -253,7 +293,7 @@ Extractor ExtractorRepository::extractor(const QString &name) const
return {};
}
void ExtractorRepositoryPrivate::loadExtractors()
void ExtractorRepositoryPrivate::loadScriptExtractors()
{
auto searchDirs = m_extraSearchPaths;
const auto qsp = QStandardPaths::standardLocations(QStandardPaths::GenericDataLocation);
......@@ -291,6 +331,13 @@ void ExtractorRepositoryPrivate::loadExtractors()
if (e.load(obj, fi.canonicalFilePath())) {
addExtractor(std::move(e));
}
auto ext = std::make_unique<ScriptExtractor>();
if (ext->load(obj, fi.canonicalFilePath())) {
addExtractor(std::move(ext));
} else {
qCWarning(Log) << "failed to load extractor:" << fi.canonicalFilePath();
}
} else if (doc.isArray()) {
const auto extractorArray = doc.array();
int i = 0;
......@@ -299,6 +346,13 @@ void ExtractorRepositoryPrivate::loadExtractors()
if (e.load(v.toObject(), fi.canonicalFilePath(), extractorArray.size() == 1 ? -1 : i)) {
addExtractor(std::move(e));
}
auto ext = std::make_unique<ScriptExtractor>();
if (ext->load(v.toObject(), fi.canonicalFilePath(), extractorArray.size() == 1 ? -1 : i)) {
addExtractor(std::move(ext));
} else {
qCWarning(Log) << "failed to load extractor:" << fi.canonicalFilePath();
}
++i;
}
} else {
......@@ -319,6 +373,16 @@ void ExtractorRepositoryPrivate::addExtractor(Extractor &&e)
}
}
/**
 * Takes ownership of @p e and stores it in m_extractorsNew, keeping that list ordered by name().
 * When an extractor with the same name is already stored, @p e is left untouched and not inserted.
 */
void ExtractorRepositoryPrivate::addExtractor(std::unique_ptr<AbstractExtractor> &&e)
{
    const auto orderedByName = [](const auto &lhs, const auto &rhs) {
        return lhs->name() < rhs->name();
    };

    auto pos = std::lower_bound(m_extractorsNew.begin(), m_extractorsNew.end(), e, orderedByName);
    const bool nameIsNew = pos == m_extractorsNew.end() || (*pos)->name() != e->name();
    if (nameIsNew) {
        m_extractorsNew.insert(pos, std::move(e));
    }
}
QStringList ExtractorRepository::additionalSearchPaths() const
{
return d->m_extraSearchPaths;
......
/*
SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef EXTRACTORREPOSITORY_H
#define EXTRACTORREPOSITORY_H
#ifndef KITINERARY_EXTRACTORREPOSITORY_H
#define KITINERARY_EXTRACTORREPOSITORY_H
#include "kitinerary_export.h"
......@@ -32,12 +32,18 @@ class QString;
namespace KItinerary {
class AbstractExtractor;
class Extractor;
class ExtractorDocumentNode;
class ExtractorRepositoryPrivate;
/** Collection of all unstructured data extractor rule sets.
/**
* Collection of all known data extractors.
* This class is usually not used directly, but as an implementation detail to KItinerary::ExtractorEngine.
*
* See KItinerary::Extractor on where this loads its content from.
* @internal This API is only exported for developer tooling.
* @see KItinerary::ScriptExtractor.
*/
class KITINERARY_EXPORT ExtractorRepository
{
......@@ -55,6 +61,9 @@ public:
/** All known extractors. */
const std::vector<Extractor>& allExtractors() const;
/** Finds matching extractors for the given document node. */
void extractorsForNode(const ExtractorDocumentNode &node, std::vector<const AbstractExtractor*> &extractors) const;
/** Finds matching extractors for the given message part. */
void extractorsForMessage(KMime::Content *part, std::vector<Extractor> &extractors) const;
/** Finds matching extractors for the given pkpass boarding pass. */
......@@ -90,4 +99,4 @@ private:
}
#endif // EXTRACTORREPOSITORY_H
#endif // KITINERARY_EXTRACTORREPOSITORY_H
......@@ -9,7 +9,8 @@
#include "extractorcapabilities.h"
#include "extractor.h"
#include "extractorrepository.h"
#include <KItinerary/ExtractorRepository>
#include <QString>
......
......@@ -8,7 +8,7 @@
#include "barcodedecoder.h"
#include "extractorengine.h"
#include "extractor.h"
#include "extractorrepository.h"
#include "engine/extractorrepository.h"
#include "generic/genericpdfextractor_p.h"
#include "generic/genericicalextractor_p.h"
#include "generic/genericpkpassextractor_p.h"
......@@ -789,6 +789,11 @@ const BarcodeDecoder* ExtractorEngine::barcodeDecoder() const
return &d->m_barcodeDecoder;
}
/** Returns a read-only pointer to the repository member held by this engine. */
const ExtractorRepository* ExtractorEngine::extractorRepository() const
{
    const ExtractorRepository &repo = d->m_repo;
    return &repo;
}
const ExtractorScriptEngine* ExtractorEngine::scriptEngine() const
{
d->m_scriptEngine.setBarcodeDecoder(&d->m_barcodeDecoder);
......
......@@ -40,6 +40,7 @@ class BarcodeDecoder;
class Extractor;
class ExtractorDocumentNodeFactory;
class ExtractorEnginePrivate;
class ExtractorRepository;
class ExtractorScriptEngine;
class HtmlDocument;
class PdfDocument;
......@@ -223,6 +224,8 @@ public:
const BarcodeDecoder* barcodeDecoder() const;
///@cond internal
/** Extractor repository instance used by this engine. */
const ExtractorRepository* extractorRepository() const;
/** JavaScript execution engine for script extractors. */
const ExtractorScriptEngine* scriptEngine() const;
///@endcond
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment