Commit 537968ac authored by Volker Krause's avatar Volker Krause
Browse files

Add document model for the new extractor engine

This is essentially a tree of variants representing nested documents.
Unlike the previous approach, nested documents are no longer an after-
thought but will now also be properly accessible by tooling. Using
variants and MIME types as well as delegating type-specific functionality
also makes this type-independent and more easily extensible.
parent c7711fbe
......@@ -11,6 +11,8 @@ ecm_add_test(mergeutiltest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(locationutiltest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(knowledgedbtest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(airportdbtest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(extractorresulttest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(extractordocumentnodetest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(extractorinputtest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(extractorrepositorytest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
ecm_add_test(berdecodertest.cpp LINK_LIBRARIES Qt5::Test KPim::Itinerary)
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include <KItinerary/ExtractorDocumentNode>
#include <QDebug>
#include <QTest>
using namespace KItinerary;
#define s(x) QStringLiteral(x)
/** Unit test for the basic null-state and parent/child behavior of ExtractorDocumentNode. */
class ExtractorDocumentNodeTest : public QObject
{
Q_OBJECT
private Q_SLOTS:
/** Checks null detection and that a child sees properties of the node it was appended to. */
void testBasics()
{
// a default-constructed node is null, and stays null after assigning an empty node
ExtractorDocumentNode node;
QVERIFY(node.isNull());
node = {};
QVERIFY(node.isNull());
// content, MIME type and context date/time are set, but no processor is
node.setContent(s("a plain text content"));
node.setMimeType(s("text/plain"));
node.setContextDateTime(QDateTime::currentDateTime());
QVERIFY(node.isNull()); // not properly constructed
// appending a child links it to node as its parent
ExtractorDocumentNode child;
node.appendChild(child);
QCOMPARE(child.parent().mimeType(), QLatin1String("text/plain"));
// the child has no context date/time of its own and reports the one of its parent
QVERIFY(child.contextDateTime().isValid());
QCOMPARE(child.contextDateTime(), node.contextDateTime());
// asking for the parent of a parent-less node yields a null node, repeatedly
QVERIFY(node.parent().isNull());
QVERIFY(node.parent().parent().isNull());
}
};
QTEST_GUILESS_MAIN(ExtractorDocumentNodeTest)
#include "extractordocumentnodetest.moc"
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include <KItinerary/ExtractorResult>
#include <KItinerary/Place>
#include <QDebug>
#include <QJsonArray>
#include <QJsonObject>
#include <QTest>
using namespace KItinerary;
#define s(x) QStringLiteral(x)
/** Unit test for appending to an ExtractorResult. */
class ExtractorResultTest : public QObject
{
Q_OBJECT
private Q_SLOTS:
/** Checks that size(), result() and jsonLdResult() agree before and after appending JSON-LD data. */
void testAppend()
{
// start from a result constructed from a single Place object
Place p;
p.setName(s("TEST"));
ExtractorResult res({p});
QCOMPARE(res.size(), 1);
QCOMPARE(res.result().size(), 1);
QCOMPARE(res.jsonLdResult().size(), 1);
// append a second place, this time given as a JSON object inside a QJsonArray
QJsonObject obj;
obj.insert(QLatin1String("@type"), QLatin1String("Place"));
obj.insert(QLatin1String("name"), QLatin1String("test2"));
res.append(QJsonArray({obj}));
// all three views of the result must now report two entries
QCOMPARE(res.size(), 2);
QCOMPARE(res.result().size(), 2);
QCOMPARE(res.jsonLdResult().size(), 2);
}
};
QTEST_GUILESS_MAIN(ExtractorResultTest)
#include "extractorresulttest.moc"
......@@ -21,6 +21,9 @@ set(kitinerary_lib_srcs
datatypes/rentalcar.cpp
datatypes/visit.cpp
engine/extractordocumentnode.cpp
engine/extractordocumentprocessor.cpp
engine/extractorresult.cpp
era/ssbticket.cpp
generic/genericextractor.cpp
......@@ -206,6 +209,14 @@ ecm_generate_headers(KItinerary_Datatypes_FORWARDING_HEADERS
REQUIRED_HEADERS KItinerary_Datatypes_HEADERS
RELATIVE datatypes
)
ecm_generate_headers(KItinerary_Engine_FORWARDING_HEADERS
HEADER_NAMES
ExtractorDocumentNode
ExtractorResult
PREFIX KItinerary
REQUIRED_HEADERS KItinerary_Engine_HEADERS
RELATIVE engine
)
ecm_generate_headers(KItinerary_Pdf_FORWARDING_HEADERS
HEADER_NAMES
PdfDocument
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "extractordocumentnode.h"
#include "extractordocumentprocessor.h"
#include "extractorresult.h"
#include <QJSEngine>
#include <QJSValue>
#include <cassert>
using namespace KItinerary;
namespace KItinerary {
/** Shared state behind one or more ExtractorDocumentNode handles.
 *  ExtractorDocumentNode holds this via std::shared_ptr, so copies of a node
 *  refer to the same instance of this class.
 */
class ExtractorDocumentNodePrivate
{
public:
    /** State of the parent node. Held weakly: the parent owns its children
     *  through childNodes, the child only observes its parent.
     */
    std::weak_ptr<ExtractorDocumentNodePrivate> parent;
    /** Child nodes appended via ExtractorDocumentNode::appendChild(). */
    std::vector<ExtractorDocumentNode> childNodes;
    QString mimeType;
    QVariant content;
    /** Context date/time set on this node; an invalid value makes the node fall back to its parent. */
    QDateTime contextDateTime;
    /** Processor managing this node.
     *  Explicitly initialized to nullptr: isNull(), setProcessor() and the
     *  node destructor all read this member, so it must never be left
     *  indeterminate regardless of how this object gets constructed.
     */
    const ExtractorDocumentProcessor *processor = nullptr;
    ExtractorResult result;
    /** Location set on this node; a null value makes the node fall back to its parent. */
    QVariant location;

    /** Script engine stored on this node, see ExtractorDocumentNode::setScriptEngine(). */
    QJSEngine *m_jsEngine = nullptr;
    /** Returns the script engine of this node or, if unset, that of the closest ancestor having one. */
    QJSEngine *jsEngine() const;
};
}
/** Looks up the script engine for this node.
 *  @returns the engine stored on this node if there is one, otherwise the
 *  result of the same lookup on the parent, or @c nullptr if the parent is gone.
 */
QJSEngine* ExtractorDocumentNodePrivate::jsEngine() const
{
    if (m_jsEngine != nullptr) {
        return m_jsEngine;
    }

    // nothing stored here: delegate to the parent as long as it is still alive
    if (const auto ancestor = parent.lock()) {
        return ancestor->jsEngine();
    }
    return nullptr;
}
/** Creates a node with freshly allocated, empty private state.
 *  Such a node reports isNull() as no content, MIME type or processor is set.
 */
ExtractorDocumentNode::ExtractorDocumentNode()
: d(std::make_shared<ExtractorDocumentNodePrivate>())
{
}
/** Creates a node handle sharing the existing private state @p dd.
 *  If @p dd is empty, new empty private state is allocated instead, so the
 *  resulting node always has a valid d-pointer.
 */
ExtractorDocumentNode::ExtractorDocumentNode(const std::shared_ptr<ExtractorDocumentNodePrivate> &dd)
    : d(dd)
{
    if (!d) {
        d = std::make_shared<ExtractorDocumentNodePrivate>();
    }
}
/** Copy constructor: the new node shares the private state of @p other. */
ExtractorDocumentNode::ExtractorDocumentNode(const ExtractorDocumentNode &other) = default;
/** Move constructor: takes over the private state of @p other, whose d-pointer is left empty. */
ExtractorDocumentNode::ExtractorDocumentNode(ExtractorDocumentNode &&other) = default;
/** Destroys this handle.
 *  When this is the only remaining handle on its private state and a
 *  processor is set, the processor's destroyNode() is called first.
 */
ExtractorDocumentNode::~ExtractorDocumentNode()
{
    // d can be empty on a moved-from node
    const bool isLastHandle = d && d.use_count() == 1;
    if (isLastHandle && d->processor) {
        d->processor->destroyNode(*this);
    }
}
/** Copy assignment: makes this handle share the private state of @p other.
 *  If this handle was the only one on its previous state and a processor is
 *  set on it, the processor's destroyNode() is called before the state is released.
 */
ExtractorDocumentNode& ExtractorDocumentNode::operator=(const ExtractorDocumentNode &other)
{
    // Self-assignment must be a no-op. Without this guard a node that is the
    // sole owner of its state would get destroyNode() called on it although
    // it keeps being used afterwards.
    if (this == &other) {
        return *this;
    }

    if (d && d.use_count() == 1 && d->processor) {
        d->processor->destroyNode(*this);
    }
    d = other.d;
    return *this;
}
/** Move assignment: takes over the private state of @p other, leaving the
 *  d-pointer of @p other empty.
 *  If this handle was the only one on its previous state and a processor is
 *  set on it, the processor's destroyNode() is called before the state is released.
 */
ExtractorDocumentNode& ExtractorDocumentNode::operator=(ExtractorDocumentNode &&other)
{
    // Self-move must be a no-op. Without this guard a node that is the sole
    // owner of its state would get destroyNode() called on it although it
    // keeps being used afterwards.
    if (this == &other) {
        return *this;
    }

    if (d && d.use_count() == 1 && d->processor) {
        d->processor->destroyNode(*this);
    }
    d = std::move(other.d);
    return *this;
}
/** Returns @c true if this node is not usable.
 *  That is the case when it has no private state (moved-from node), no
 *  content, no processor or no MIME type.
 */
bool ExtractorDocumentNode::isNull() const
{
    // d is empty after this node has been moved from; treat that as null
    // rather than dereferencing it, as the destructor and contentJsValue() do.
    return !d || d->content.isNull() || !d->processor || d->mimeType.isEmpty();
}
/** Returns a handle to the parent node.
 *  If there is no parent, or it no longer exists, the returned node has fresh
 *  empty state and thus is a null node.
 */
ExtractorDocumentNode ExtractorDocumentNode::parent() const
{
    const auto parentState = d->parent.lock();
    return ExtractorDocumentNode(parentState);
}
/** Records @p parent as the parent of this node.
 *  Only a weak reference is stored, this does not keep @p parent alive.
 */
void ExtractorDocumentNode::setParent(const ExtractorDocumentNode &parent)
{
    const auto &parentState = parent.d;
    d->parent = parentState;
}
/** Returns the MIME type stored on this node. */
QString ExtractorDocumentNode::mimeType() const
{
return d->mimeType;
}
/** Stores @p mimeType as the MIME type of this node. */
void ExtractorDocumentNode::setMimeType(const QString &mimeType)
{
d->mimeType = mimeType;
}
/** Returns the content variant stored on this node. */
QVariant ExtractorDocumentNode::content() const
{
return d->content;
}
/** Stores @p content as the content of this node, replacing any previous content. */
void ExtractorDocumentNode::setContent(const QVariant &content)
{
d->content = content;
}
/** Returns the processor set on this node, if any. */
const ExtractorDocumentProcessor* ExtractorDocumentNode::processor() const
{
return d->processor;
}
/** Sets the processor of this node.
 *  Asserts that no processor has been set on this node before.
 */
void ExtractorDocumentNode::setProcessor(const ExtractorDocumentProcessor *processor)
{
assert(!d->processor);
d->processor = processor;
}
/** Returns a reference to the child nodes appended to this node so far. */
const std::vector<ExtractorDocumentNode>& ExtractorDocumentNode::childNodes() const
{
return d->childNodes;
}
/** Appends @p child to the child nodes of this node.
 *  @p child gets this node set as its parent; a copy of the handle (sharing
 *  the same state as @p child) is stored in the child list.
 */
void ExtractorDocumentNode::appendChild(ExtractorDocumentNode &child)
{
    // link the child to us first, so the stored handle already sees its parent
    child.setParent(*this);

    auto &children = d->childNodes;
    children.push_back(child);
}
/** Returns a copy of the result stored on this node. */
ExtractorResult ExtractorDocumentNode::result() const
{
return d->result;
}
/** Appends @p result to the result already stored on this node. */
void ExtractorDocumentNode::addResult(ExtractorResult &&result)
{
d->result.append(std::move(result));
}
/** Replaces the result stored on this node by @p result. */
void ExtractorDocumentNode::setResult(ExtractorResult &&result)
{
d->result = std::move(result);
}
/** Returns the context date/time of this node.
 *  A valid value set on this node wins; otherwise the value of the parent is
 *  returned as long as the parent still exists.
 */
QDateTime ExtractorDocumentNode::contextDateTime() const
{
    const bool hasOwnValue = d->contextDateTime.isValid();
    if (hasOwnValue || d->parent.expired()) {
        return d->contextDateTime;
    }
    return parent().contextDateTime();
}
/** Stores @p contextDateTime as the context date/time of this node. */
void ExtractorDocumentNode::setContextDateTime(const QDateTime &contextDateTime)
{
d->contextDateTime = contextDateTime;
}
/** Returns the location information of this node.
 *  A non-null value set on this node wins; otherwise the value of the parent
 *  is returned as long as the parent still exists.
 */
QVariant ExtractorDocumentNode::location() const
{
    const bool hasOwnValue = !d->location.isNull();
    if (hasOwnValue || d->parent.expired()) {
        return d->location;
    }
    return parent().location();
}
/** Stores @p location as the location information of this node. */
void ExtractorDocumentNode::setLocation(const QVariant &location)
{
d->location = location;
}
/** Returns the result stored on this node as a JSON array, backing the @c result property. */
QJsonArray ExtractorDocumentNode::jsonLdResult() const
{
return d->result.jsonLdResult();
}
/** Returns the child nodes wrapped into QVariants, backing the @c childNodes property. */
QVariantList ExtractorDocumentNode::childNodesVariant() const
{
    const auto &children = d->childNodes;

    QVariantList variants;
    variants.reserve(children.size());
    for (const auto &childNode : children) {
        variants.push_back(QVariant::fromValue(childNode));
    }
    return variants;
}
/** Returns the content of this node as a script value, backing the @c content property.
 *  The conversion is done by the node's processor. A default-constructed
 *  QJSValue is returned when there is no private state, no processor, or no
 *  script engine available on this node or its ancestors.
 */
QJSValue ExtractorDocumentNode::contentJsValue() const
{
    if (d && d->processor) {
        auto *engine = d->jsEngine();
        if (engine) {
            return d->processor->contentToScriptValue(*this, engine);
        }
    }
    return {};
}
/** Sets the script engine used for converting node content to script values.
 *  The engine is stored on the top-most node that is still reachable through
 *  the parent chain, not necessarily on this node.
 */
void ExtractorDocumentNode::setScriptEngine(QJSEngine* jsEngine) const
{
    if (d->parent.expired()) {
        // no (living) parent: this is where the engine gets stored
        d->m_jsEngine = jsEngine;
        return;
    }
    parent().setScriptEngine(jsEngine);
}
#include "moc_extractordocumentnode.cpp"
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_EXTRACTORDOCUMENTNODE_H
#define KITINERARY_EXTRACTORDOCUMENTNODE_H
#include "kitinerary_export.h"
#include <QDateTime>
#include <QJsonArray>
#include <QMetaType>
#include <QVariant>
#include <memory>
#include <type_traits>
class QJSEngine;
class QJSValue;
namespace KItinerary {
///@cond internal
namespace Internal {
/** Thin wrapper around a raw pointer of type @p T.
 *  It is implicitly constructible from and convertible to @c T*, and performs
 *  no deletion itself. ExtractorDocumentNode::content<T>() unwraps it when a
 *  pointer type is requested.
 *  NOTE(review): the name suggests the pointee is owned by whoever put it into
 *  the node; confirm against the code creating such content.
 */
template <typename T>
struct OwnedPtr {
inline OwnedPtr() = default;
inline OwnedPtr(T* _ptr) : ptr(_ptr) {}
inline operator T*() const { return ptr; }
T *ptr = nullptr;
};
}
///@endcond
class ExtractorDocumentNodePrivate;
class ExtractorDocumentProcessor;
class ExtractorResult;
class ExtractorScriptEngine;
/** A node in the extracted document object tree.
* Essentially this models a tree of variants representing the input document,
* each node being associated with and managed by the KItinerary::ExtractorDocumentProcessor
* for its corresponding type.
* Each node also carries the result of data extraction on itself and/or its children.
* This is meant for consumption in both C++ and JS code.
*/
class KITINERARY_EXPORT ExtractorDocumentNode
{
Q_GADGET
/** @c true if this is a null instance. */
Q_PROPERTY(bool isNull READ isNull)
/** The parent node, or a null node if this is the root node. */
Q_PROPERTY(KItinerary::ExtractorDocumentNode parent READ parent)
/** Child nodes, for QJSEngine access. */
Q_PROPERTY(QVariantList childNodes READ childNodesVariant)
/** The MIME type of this node. */
Q_PROPERTY(QString mimeType READ mimeType)
/** The decoded content of this node.
* The exact type in here depends on the MIME type, adapted for QJSEngine consumption.
*/
Q_PROPERTY(QJSValue content READ contentJsValue)
/** The best known context date/time at this point in the document tree.
* If not set on this node, the context date/time of the parent node is returned.
*/
Q_PROPERTY(QDateTime contextDateTime READ contextDateTime)
/** Result access for QJSEngine. */
Q_PROPERTY(QJsonArray result READ jsonLdResult)
/** Information about the location of this node in relation to one of its
* ancestors.
* The exact meaning of this depends on the type of the node, one example
* would be a page number an image is found on in a PDF document.
*/
Q_PROPERTY(QVariant location READ location)
public:
/** Creates a null node.
* @see KItinerary::ExtractorDocumentNodeFactory on how to create proper instances.
*/
ExtractorDocumentNode();
ExtractorDocumentNode(const ExtractorDocumentNode &other);
ExtractorDocumentNode(ExtractorDocumentNode &&other);
~ExtractorDocumentNode();
ExtractorDocumentNode& operator=(const ExtractorDocumentNode &other);
ExtractorDocumentNode& operator=(ExtractorDocumentNode &&other);
/** Returns @c true if this is a null instance. */
bool isNull() const;
/** The parent node, or a null node if this is the root node. */
ExtractorDocumentNode parent() const;
///@cond internal
void setParent(const ExtractorDocumentNode &parent);
///@endcond
/** The MIME type of this node. */
QString mimeType() const;
///@cond internal
void setMimeType(const QString &mimeType);
///@endcond
/** Returns the decoded content of this node.
* The content of the QVariant depends on the MIME type.
*/
QVariant content() const;
/** Set decoded content.
* Only to be used from KItinerary::ExtractorDocumentProcessor::createNodeFromData.
*/
void setContent(const QVariant &content);
/** Checks if the content of this node is of type @p T. */
template <typename T>
inline bool isA() const
{
return content().userType() == qMetaTypeId<T>();
}
/** Returns the content of this node converted to type @p T. */
template <typename T>
inline typename std::enable_if<!std::is_pointer<T>::value || !QMetaTypeId2<Internal::OwnedPtr<typename std::remove_pointer<T>::type>>::Defined, T>::type
content() const
{
return content().value<T>();
}
/** Returns the content of this node as pointer type @p T.
* This overload is selected for pointer types whose pointee has a declared
* Internal::OwnedPtr meta type. The content is returned directly if it holds
* a @p T, otherwise it is read as Internal::OwnedPtr and converted to @p T.
*/
template <typename T>
inline typename std::enable_if<std::is_pointer<T>::value && QMetaTypeId2<Internal::OwnedPtr<typename std::remove_pointer<T>::type>>::Defined, T>::type
content() const
{
if (isA<T>()) {
return content().value<T>();
}
return content().value<Internal::OwnedPtr<typename std::remove_pointer<T>::type>>();
}
/** Set decoded content from a value of type @p T, which is wrapped into a QVariant. */
template <typename T>
inline void setContent(const T& value)
{
setContent(QVariant::fromValue(value));
}
/** The best known context date/time at this point in the document tree. */
QDateTime contextDateTime() const;
/** Set the context date/time.
* Only use this from KItinerary::ExtractorDocumentProcessor.
*/
void setContextDateTime(const QDateTime &contextDateTime);
/** Information about the location of this node in relation to one of its ancestors. */
QVariant location() const;
/** Set the location information.
* Only use this from KItinerary::ExtractorDocumentProcessor.
*/
void setLocation(const QVariant &location);
///@cond internal
const ExtractorDocumentProcessor* processor() const;
void setProcessor(const ExtractorDocumentProcessor *processor);
///@endcond
/** The child nodes of this node. */
const std::vector<ExtractorDocumentNode>& childNodes() const;
/** Add another child node.
* Do not use this outside of KItinerary::ExtractorDocumentProcessor::expandNode().
*/
void appendChild(ExtractorDocumentNode &child);
/** Returns the results that have accumulated so far from this node or its children. */
ExtractorResult result() const;
/** Add additional results from an extraction step. */
void addResult(ExtractorResult &&result);
/** Replace the existing results by @p result. */
void setResult(ExtractorResult &&result);
private:
// creates a handle on existing private state, used by parent()
explicit ExtractorDocumentNode(const std::shared_ptr<ExtractorDocumentNodePrivate> &dd);
// READ accessors backing the result, childNodes and content properties
QJsonArray jsonLdResult() const;
QVariantList childNodesVariant() const;
QJSValue contentJsValue() const;
// private state, shared between all copies of a node
std::shared_ptr<ExtractorDocumentNodePrivate> d;
friend class ExtractorScriptEngine;
void setScriptEngine(QJSEngine *jsEngine) const;
};
}
Q_DECLARE_METATYPE(KItinerary::ExtractorDocumentNode)
#endif // KITINERARY_EXTRACTORDOCUMENTNODE_H
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "extractordocumentprocessor.h"
#include "extractorresult.h"
#include <QJSEngine>
#include <QJSValue>
using namespace KItinerary;
/** Defaulted destructor, defined out of line in this file. */
ExtractorDocumentProcessor::~ExtractorDocumentProcessor() = default;
/** Default implementation: ignores both arguments and returns @c false. */
bool ExtractorDocumentProcessor::canHandleData([[maybe_unused]] const QByteArray &encodedData, [[maybe_unused]] QStringView fileName) const
{
return false;
}
/** Default implementation: ignores @p encodedData and returns a default-constructed (null) node. */
ExtractorDocumentNode ExtractorDocumentProcessor::createNodeFromData([[maybe_unused]] const QByteArray &encodedData) const
{
return {};
}
/** Default implementation: returns a new node holding @p decodedData as its content.
 *  Neither MIME type nor processor are set on the returned node here.
 */
ExtractorDocumentNode ExtractorDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const
{
    ExtractorDocumentNode contentNode;
    contentNode.setContent(decodedData);
    return contentNode;
}
/** Default implementation: does nothing. */
void ExtractorDocumentProcessor::expandNode([[maybe_unused]] ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
{
}
/** Default implementation: adds the result of every child of @p node to the result of @p node. */
void ExtractorDocumentProcessor::reduceNode(ExtractorDocumentNode &node) const
{
    const auto &children = node.childNodes();
    for (auto it = children.begin(); it != children.end(); ++it) {
        node.addResult(it->result());
    }
}
/** Default implementation: does nothing. */
void ExtractorDocumentProcessor::preExtract([[maybe_unused]] ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
{
}
/** Default implementation: ignores both arguments and returns @c false. */
bool ExtractorDocumentProcessor::matches([[maybe_unused]] const ExtractorFilter &filter, [[maybe_unused]] const ExtractorDocumentNode &node) const
{
return false;
}
/** Default implementation: does nothing. */
void ExtractorDocumentProcessor::postExtract([[maybe_unused]] ExtractorDocumentNode &node) const
{
}
/** Default implementation: converts the content variant of @p node to a script value using @p engine.
 *  @p engine is dereferenced without a check and therefore must not be null.
 */
QJSValue ExtractorDocumentProcessor::contentToScriptValue([[maybe_unused]] const ExtractorDocumentNode &node, QJSEngine *engine) const
{
    const QVariant nodeContent = node.content();
    return engine->toScriptValue(nodeContent);
}
/** Default implementation: does nothing.
 *  ExtractorDocumentNode calls this when the last handle on a node's state goes away.
 */
void ExtractorDocumentProcessor::destroyNode([[maybe_unused]] ExtractorDocumentNode &node) const
{
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_EXTRACTORDOCUMENTPROCESSOR_H