Commit 42aeb48a authored by Volker Krause
Browse files

Add document type support for everything we supported so far

This is largely existing code that previously was either in the generic
extractors or somewhere in ExtractorEngine, refactored to have everything
related to a specific document type in a single place.
parent bc7865df
......@@ -5,8 +5,13 @@
*/
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorDocumentNodeFactory>
#include <KItinerary/ExtractorDocumentProcessor>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/PdfDocument>
#include <QDebug>
#include <QFile>
#include <QTest>
using namespace KItinerary;
......@@ -38,6 +43,64 @@ private Q_SLOTS:
QVERIFY(node.parent().isNull());
QVERIFY(node.parent().parent().isNull());
}
void testPdfFromData()
{
ExtractorEngine engine;
QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
QVERIFY(f.open(QFile::ReadOnly));
auto root = engine.documentNodeFactory()->createNode(f.readAll());
QVERIFY(!root.isNull());
QCOMPARE(root.mimeType(), QLatin1String("application/pdf"));
QCOMPARE(root.childNodes().size(), 0);
QVERIFY(root.location().isNull());
root.processor()->expandNode(root, &engine);
QCOMPARE(root.childNodes().size(), 2);
auto c1 = root.childNodes()[0];
QVERIFY(!c1.isNull());
QCOMPARE(c1.mimeType(), QLatin1String("internal/qimage"));
QVERIFY(!c1.parent().isNull());
QCOMPARE(c1.content().userType(), qMetaTypeId<QImage>());
QCOMPARE(c1.childNodes().size(), 0);
QCOMPARE(c1.location().toInt(), 0);
c1.processor()->expandNode(c1, &engine);
QCOMPARE(c1.childNodes().size(), 1);
auto c11 = c1.childNodes()[0];
QVERIFY(!c11.isNull());
QCOMPARE(c11.mimeType(), QLatin1String("text/plain"));
QCOMPARE(c11.content<QString>(), QLatin1String("PDF417 is a stacked linear barcode symbol format used in a variety of applications, primarily transport, identification cards, and inventory management."));
QCOMPARE(c11.location().toInt(), 0);
}
// Creates a node from an already decoded PdfDocument (rather than from raw bytes) and
// checks the second image child and the barcode text decoded from it.
void testPdfFromContent()
{
    ExtractorEngine engine;
    QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
    QVERIFY(f.open(QFile::ReadOnly));

    // pdf stays owned by the unique_ptr in this scope, only a raw pointer is handed to the factory
    std::unique_ptr<PdfDocument> pdf(PdfDocument::fromData(f.readAll()));
    // fromData() can return nullptr (unparsable input, or a build without PDF support):
    // fail here with a clear message instead of passing a null document further down
    QVERIFY(pdf);

    auto root = engine.documentNodeFactory()->createNode(QVariant::fromValue(pdf.get()), u"application/pdf");
    QVERIFY(!root.isNull());
    QCOMPARE(root.mimeType(), QLatin1String("application/pdf"));

    root.processor()->expandNode(root, &engine);
    QCOMPARE(root.childNodes().size(), 2);

    // second child: image node at location 1
    auto c2 = root.childNodes()[1];
    QVERIFY(!c2.isNull());
    QCOMPARE(c2.mimeType(), QLatin1String("internal/qimage"));
    QCOMPARE(c2.location().toInt(), 1);

    c2.processor()->expandNode(c2, &engine);
    QCOMPARE(c2.childNodes().size(), 1);

    // the decoded barcode text carries the same location as the image it came from
    auto c21 = c2.childNodes()[0];
    QVERIFY(!c21.isNull());
    QCOMPARE(c21.mimeType(), QLatin1String("text/plain"));
    QCOMPARE(c21.content<QString>(), QLatin1String("This is an example Aztec symbol for Wikipedia."));
    QCOMPARE(c21.location().toInt(), 1);
}
};
QTEST_GUILESS_MAIN(ExtractorDocumentNodeTest)
......
......@@ -29,6 +29,8 @@ private Q_SLOTS:
QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
QVERIFY(f.open(QFile::ReadOnly));
#ifdef HAVE_POPPLER
QVERIFY(PdfDocument::maybePdf(f.readAll()));
f.seek(0);
std::unique_ptr<PdfDocument> doc(PdfDocument::fromData(f.readAll()));
QVERIFY(doc);
QCOMPARE(doc->text(), QStringLiteral("This is the first page.\nIt contains a PDF 417 barcode.\nThis is the second page.\nIt contains an Aztec code.\n"));
......@@ -72,7 +74,9 @@ private Q_SLOTS:
void testInvalidPdfDocument()
{
QVERIFY(!PdfDocument::maybePdf(QByteArray()));
QVERIFY(!PdfDocument::fromData(QByteArray()));
QVERIFY(!PdfDocument::maybePdf(QByteArray("HELLO")));
QVERIFY(!PdfDocument::fromData(QByteArray("HELLO")));
QFile f(QStringLiteral(SOURCE_DIR "/misc/test.pdf"));
......
......@@ -23,6 +23,7 @@ set(kitinerary_lib_srcs
engine/abstractextractor.cpp
engine/extractordocumentnode.cpp
engine/extractordocumentnodefactory.cpp
engine/extractordocumentprocessor.cpp
engine/extractorfilter.cpp
engine/extractorresult.cpp
......@@ -59,6 +60,18 @@ set(kitinerary_lib_srcs
pdf/popplerglobalparams.cpp
pdf/popplerutils.cpp
processors/binarydocumentprocessor.cpp
processors/htmldocumentprocessor.cpp
processors/icaldocumentprocessor.cpp
processors/imagedocumentprocessor.cpp
processors/jsonlddocumentprocessor.cpp
processors/mimedocumentprocessor.cpp
processors/pdfdocumentprocessor.cpp
processors/pkpassdocumentprocessor.cpp
processors/textdocumentprocessor.cpp
processors/uic9183documentprocessor.cpp
processors/vdvdocumentprocessor.cpp
scripts/extractors.qrc
tlv/berelement.cpp
......@@ -216,6 +229,8 @@ ecm_generate_headers(KItinerary_Engine_FORWARDING_HEADERS
HEADER_NAMES
AbstractExtractor
ExtractorDocumentNode
ExtractorDocumentNodeFactory
ExtractorDocumentProcessor
ExtractorFilter
ExtractorResult
PREFIX KItinerary
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "extractordocumentnodefactory.h"
#include "extractordocumentnode.h"
#include "extractordocumentprocessor.h"
#include "logging.h"
#include "processors/binarydocumentprocessor.h"
// #include "processors/externalprocessor.h"
#include "processors/htmldocumentprocessor.h"
#include "processors/icaldocumentprocessor.h"
#include "processors/imagedocumentprocessor.h"
#include "processors/jsonlddocumentprocessor.h"
#include "processors/mimedocumentprocessor.h"
#include "processors/pdfdocumentprocessor.h"
#include "processors/pkpassdocumentprocessor.h"
#include "processors/textdocumentprocessor.h"
#include "processors/uic9183documentprocessor.h"
#include "processors/vdvdocumentprocessor.h"
#include <QHash>
#include <QMimeDatabase>
using namespace KItinerary;
// Size window (in bytes) for raw input handed to ExtractorDocumentNodeFactory::createNode():
// data of MinDocumentSize bytes or less, or of more than MaxDocumentSize bytes, is rejected.
enum {
    MaxDocumentSize = 4000000,
    MinDocumentSize = 4,
};
namespace KItinerary {
/** Registry of all known document processors.
 *  A single instance of this is shared by all ExtractorDocumentNodeFactory objects.
 */
class ExtractorDocumentNodeFactoryStatic {
public:
    /** A MIME type together with the (non-owned) processor handling it. */
    struct ProcessorData {
        QString mimeType;
        const ExtractorDocumentProcessor* processor;
    };

    /** Sets up the registry with all built-in processors. */
    ExtractorDocumentNodeFactoryStatic();

    /** Takes ownership of @p processor and registers it.
     *  @param canonicalMimeType MIME type the processor is probed and looked up with, can be empty.
     *  @param aliasMimeTypes alternative names resolving to the canonical type
     *  (or to @p fallbackMimeType if the canonical type is empty).
     *  @param fallbackMimeType MIME type for processors that are only probed after
     *  all canonical ones, can be empty.
     */
    void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
                           std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {});

    /** Convenience overload creating a default-constructed processor of type @tparam T. */
    template <typename T>
    void registerProcessor(QStringView canonicalMimeType, std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {})
    {
        registerProcessor(std::make_unique<T>(), canonicalMimeType, aliasMimeTypes, fallbackMimeType);
    }

    /** Registers the processors shipped with the library. */
    void registerBuiltIn();

    /** Returns the MIME type registered for the alias @p mimeType, or @p mimeType itself if it is no alias. */
    QStringView resolveAlias(QStringView mimeType) const;

    /** Adds an entry for @p proc to @p procMap at its sorted position, unless @p mimeType is empty or already present. */
    static void insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap);

    // processors asked first whether they can handle data of unknown type
    std::vector<ProcessorData> m_probeProcessors;
    // processors asked afterwards, for the very generic types
    std::vector<ProcessorData> m_fallbackProbeProcessors;
    // all processors, sorted by MIME type for binary search lookup
    std::vector<ProcessorData> m_mimetypeProcessorMap;
    // alias MIME type -> MIME type a processor is registered for
    QHash<QString, QString> m_aliasMap;

    // owns the processors the raw pointers above refer to, nothing else
    std::vector<std::unique_ptr<ExtractorDocumentProcessor>> processorPool;
};
/** Private data of ExtractorDocumentNodeFactory. */
class ExtractorDocumentNodeFactoryPrivate {
public:
    /** Shared processor registry, not owned.
     *  Assigned by the ExtractorDocumentNodeFactory constructor; initialized to
     *  nullptr so that it never holds an indeterminate value.
     */
    ExtractorDocumentNodeFactoryStatic *s = nullptr;
    /** Optional processor that gets the first chance to handle raw input data in createNode(). */
    std::unique_ptr<ExtractorDocumentProcessor> interceptProcessor;
};
}
/** Creates the registry, populated with all built-in document processors. */
ExtractorDocumentNodeFactoryStatic::ExtractorDocumentNodeFactoryStatic()
{
    this->registerBuiltIn();
}
/** Adds an entry mapping @p mimeType to @p proc into @p procMap.
 *  @p procMap is expected to be sorted by MIME type, the new entry is placed so that it stays sorted.
 *  Nothing happens for an empty @p mimeType. If @p mimeType is already present the
 *  existing entry is left untouched and a warning is logged.
 */
void ExtractorDocumentNodeFactoryStatic::insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap)
{
    if (mimeType.empty()) {
        return;
    }

    const auto orderedBefore = [](const auto &entry, auto mt) {
        return entry.mimeType < mt;
    };
    const auto pos = std::lower_bound(procMap.begin(), procMap.end(), mimeType, orderedBefore);

    const bool alreadyRegistered = pos != procMap.end() && pos->mimeType == mimeType;
    if (alreadyRegistered) {
        qCWarning(Log) << "Document processor already registered for mimetype:" << mimeType;
        return;
    }

    procMap.insert(pos, ProcessorData{ mimeType.toString(), proc });
}
void ExtractorDocumentNodeFactoryStatic::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
std::initializer_list<QStringView> aliasMimeTypes, QStringView fallbackMimeType)
{
insertProcessor(processor.get(), canonicalMimeType, m_probeProcessors);
insertProcessor(processor.get(), canonicalMimeType, m_mimetypeProcessorMap);
for (const auto mt : aliasMimeTypes) {
m_aliasMap.insert(mt.toString(), canonicalMimeType.isEmpty() ? fallbackMimeType.toString() : canonicalMimeType.toString());
}
insertProcessor(processor.get(), fallbackMimeType, m_fallbackProbeProcessors);
insertProcessor(processor.get(), fallbackMimeType, m_mimetypeProcessorMap);
processorPool.push_back(std::move(processor));
}
/** Registers all document processors that are part of the library. */
void ExtractorDocumentNodeFactoryStatic::registerBuiltIn()
{
    // processors for specific document types, registered under their canonical MIME type
    registerProcessor<JsonLdDocumentProcessor>(u"application/ld+json", {u"application/json"});
    registerProcessor<PdfDocumentProcessor>(u"application/pdf");
    registerProcessor<PkPassDocumentProcessor>(u"application/vnd.apple.pkpass");
    registerProcessor<IcalEventProcessor>(u"internal/event");
    registerProcessor<ImageDocumentProcessor>(u"internal/qimage");
    registerProcessor<Uic9183DocumentProcessor>(u"internal/uic9183");
    registerProcessor<VdvDocumentProcessor>(u"internal/vdv");
    registerProcessor<IcalCalendarProcessor>(u"text/calendar");

    // Fallback processors accepting a very broad set of inputs. These have no canonical
    // MIME type, and are listed from the most specific to the broadest one, as the
    // broadest ones are meant to be tried last.
    registerProcessor<MimeDocumentProcessor>({}, {u"application/mbox"}, u"message/rfc822");
    registerProcessor<HtmlDocumentProcessor>({}, {u"application/xhtml+xml"}, u"text/html");
    registerProcessor<TextDocumentProcessor>({}, {}, u"text/plain");
    registerProcessor<BinaryDocumentProcessor>({}, {}, u"application/octet-stream");
}
/** Translates an alias MIME type to the MIME type a processor is registered for.
 *  @returns the registered target of @p mimeType if it is a known alias, @p mimeType itself otherwise.
 *  For a known alias the returned view refers to the string stored in the alias map.
 */
QStringView ExtractorDocumentNodeFactoryStatic::resolveAlias(QStringView mimeType) const
{
    const auto aliasIt = m_aliasMap.find(mimeType.toString());
    if (aliasIt == m_aliasMap.end()) {
        // not an alias, pass through unchanged
        return mimeType;
    }
    return aliasIt.value();
}
/** Creates a factory using the processor registry shared by all factory instances. */
ExtractorDocumentNodeFactory::ExtractorDocumentNodeFactory()
    : d(std::make_unique<ExtractorDocumentNodeFactoryPrivate>())
{
    // function-local static: one registry for all factories, created when the first factory is constructed
    static ExtractorDocumentNodeFactoryStatic s_sharedRegistry;
    d->s = &s_sharedRegistry;
}
// Defaulted here rather than in the header: destroying the std::unique_ptr d-pointer needs
// ExtractorDocumentNodeFactoryPrivate to be a complete type, which it only is in this file.
ExtractorDocumentNodeFactory::~ExtractorDocumentNodeFactory() = default;
/** Creates a document node from raw, not yet decoded data.
 *
 *  The processor is determined as follows:
 *  - the intercept processor, if there is one and it accepts @p data
 *  - for an empty @p mimeType: the first probe processor, then the first fallback probe
 *    processor, that accepts @p data and produces non-null content
 *  - otherwise a lookup by MIME type, using @p mimeType if given and the result of
 *    QMimeDatabase detection if not, with aliases resolved
 *
 *  @returns a default-constructed node if @p data has MinDocumentSize bytes or less,
 *  more than MaxDocumentSize bytes, or if no processor is registered for its MIME type.
 */
ExtractorDocumentNode ExtractorDocumentNodeFactory::createNode(const QByteArray &data, QStringView fileName, QStringView mimeType) const
{
    const auto dataSize = data.size();
    if (dataSize > MaxDocumentSize || dataSize <= MinDocumentSize) {
        return {};
    }

    // the intercept processor takes precedence over all regular processors
    const auto interceptor = d->interceptProcessor.get();
    if (interceptor && interceptor->canHandleData(data, fileName)) {
        auto node = interceptor->createNodeFromData(data);
        node.setMimeType(QStringLiteral("internal/external-process"));
        node.setProcessor(interceptor);
        return node;
    }

    // storage for an auto-detected MIME type, has to outlive the mimeType view referring to it
    QString detectedMimeType;
    if (mimeType.isEmpty()) {
        // Let the processors check themselves whether they support this data: first the
        // specific ones, then the basic types that ultimately will accept anything.
        for (const auto *candidates : { &d->s->m_probeProcessors, &d->s->m_fallbackProbeProcessors }) {
            for (const auto &entry : *candidates) {
                if (!entry.processor->canHandleData(data, fileName)) {
                    continue;
                }
                auto node = entry.processor->createNodeFromData(data);
                if (node.content().isNull()) {
                    // claimed to be able to handle this, but failed to decode it: try the next one
                    continue;
                }
                node.setMimeType(entry.mimeType);
                node.setProcessor(entry.processor);
                return node;
            }
        }

        // nobody felt responsible, fall back to the generic MIME type detection
        QMimeDatabase db;
        const auto detected = fileName.isEmpty() ? db.mimeTypeForData(data)
                                                 : db.mimeTypeForFileNameAndData(fileName.toString(), data);
        detectedMimeType = detected.name();
        mimeType = detectedMimeType;
    }

    // find the processor registered for the (alias-resolved) MIME type
    mimeType = d->s->resolveAlias(mimeType);
    const auto &processors = d->s->m_mimetypeProcessorMap;
    const auto match = std::lower_bound(processors.begin(), processors.end(), mimeType, [](const auto &entry, auto mt) {
        return entry.mimeType < mt;
    });
    const bool found = match != processors.end() && match->mimeType == mimeType;
    if (!found) {
        qCDebug(Log) << "No document processor found for mimetype" << mimeType;
        return {};
    }

    auto node = match->processor->createNodeFromData(data);
    node.setMimeType(match->mimeType);
    node.setProcessor(match->processor);
    return node;
}
/** Creates a document node for already decoded content.
 *  The processor is looked up by @p mimeType, after resolving aliases.
 *  @returns a default-constructed node if no processor is registered for @p mimeType.
 */
ExtractorDocumentNode ExtractorDocumentNodeFactory::createNode(const QVariant &decodedData, QStringView mimeType) const
{
    const auto resolvedType = d->s->resolveAlias(mimeType);

    // the lookup table is sorted by MIME type, so a binary search finds the candidate entry
    const auto &processors = d->s->m_mimetypeProcessorMap;
    const auto match = std::lower_bound(processors.begin(), processors.end(), resolvedType, [](const auto &entry, auto mt) {
        return entry.mimeType < mt;
    });
    const bool found = match != processors.end() && match->mimeType == resolvedType;
    if (!found) {
        qCDebug(Log) << "No document processor found for mimetype" << resolvedType;
        return {};
    }

    auto node = match->processor->createNodeFromContent(decodedData);
    node.setMimeType(match->mimeType);
    node.setProcessor(match->processor);
    return node;
}
/** Registers an additional document processor for @p mimeType and its aliases.
 *  This forwards to the processor registry shared by all factory instances, which
 *  takes ownership of @p processor.
 */
void ExtractorDocumentNodeFactory::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView mimeType,
                                                     std::initializer_list<QStringView> aliasMimeTypes)
{
    auto *registry = d->s;
    registry->registerProcessor(std::move(processor), mimeType, aliasMimeTypes);
}
/** Enables or disables the intercept processor used for out-of-process extraction.
 *  @param separateProcess @c false removes an existing intercept processor, @c true
 *  keeps an existing one. Creating the external processor is currently disabled
 *  (commented out below), so enabling has no effect yet.
 *
 *  Enabling is idempotent: calling this with @c true while an intercept processor
 *  already exists leaves that processor in place rather than resetting it.
 */
void ExtractorDocumentNodeFactory::setUseSeparateProcess(bool separateProcess)
{
    if (!separateProcess) {
        d->interceptProcessor.reset();
        return;
    }

    if (!d->interceptProcessor) {
        // d->interceptProcessor = std::make_unique<ExternalProcessor>();
    }
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_EXTRACTORDOCUMENTNODEFACTORY_H
#define KITINERARY_EXTRACTORDOCUMENTNODEFACTORY_H
#include <kitinerary_export.h>
#include <QString>
#include <QStringView>
#include <memory>
class QByteArray;
class QVariant;
namespace KItinerary {
class ExtractorDocumentNode;
class ExtractorDocumentNodeFactoryPrivate;
class ExtractorDocumentProcessor;
/** Creates KItinerary::ExtractorDocumentNode instances, using the document processor
 *  responsible for the corresponding document type.
 */
class KITINERARY_EXPORT ExtractorDocumentNodeFactory
{
public:
    explicit ExtractorDocumentNodeFactory();
    ~ExtractorDocumentNodeFactory();

    /** Creates a new document node from raw @p data.
     *  @param data The encoded document content.
     *  @param fileName File name of the document, optional. Used as a hint for MIME type auto-detection.
     *  @param mimeType MIME type of @p data if known. If empty, the type is auto-detected.
     *  @returns A default-constructed node if @p data is outside of the accepted size
     *  range or if no processor is available for its type.
     */
    ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName = {}, QStringView mimeType = {}) const;

    /** Creates a new document node for an already decoded content object.
     *  @param decodedData The decoded content.
     *  @param mimeType MIME type selecting the processor for @p decodedData.
     *  @returns A default-constructed node if no processor is registered for @p mimeType.
     */
    ExtractorDocumentNode createNode(const QVariant &decodedData, QStringView mimeType) const;

    /** Registers a new document processor.
     *  @param processor The processor to add, ownership is transferred.
     *  @param canonicalMimeType MIME type @p processor is responsible for.
     *  @param aliasMimeTypes Further MIME type names to be treated like @p canonicalMimeType.
     *  @note Processors are registered in a registry shared by all factory instances.
     */
    void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
                           std::initializer_list<QStringView> aliasMimeTypes = {});

    /** Extract "risky" content such as PDF files in a separate process.
     *  Doing so isolates the using application from crashes or hangs caused by corrupt files,
     *  at the price of being slower. This is not available on all platforms.
     *  Disabled by default.
     */
    void setUseSeparateProcess(bool separateProcess);

private:
    std::unique_ptr<ExtractorDocumentNodeFactoryPrivate> d;
};
}
#endif // KITINERARY_EXTRACTORDOCUMENTNODEFACTORY_H
/*
SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
......@@ -23,6 +23,7 @@
#include "uic9183/uic9183parser.h"
#include "vdv/vdvticketparser.h"
#include "engine/extractordocumentnodefactory.h"
#include "jsapi/barcode.h"
#include "jsapi/context.h"
#include "jsapi/jsonld.h"
......@@ -100,6 +101,7 @@ public:
std::vector<GenericExtractor::Result> m_genericResults;
QJsonArray m_result;
QJSEngine m_engine;
ExtractorDocumentNodeFactory m_nodeFactory;
ExtractorRepository m_repo;
BarcodeDecoder m_barcodeDecoder;
QString m_externalExtractor;
......@@ -774,3 +776,14 @@ QString ExtractorEngine::usedCustomExtractor() const
{
return d->m_usedExtractor;
}
/** Gives access to the document node factory of this engine.
 *  The factory is owned by the engine, the returned pointer is never null.
 */
const ExtractorDocumentNodeFactory* ExtractorEngine::documentNodeFactory() const
{
    const ExtractorDocumentNodeFactory *factory = &d->m_nodeFactory;
    return factory;
}
/** Gives access to the barcode decoder of this engine.
 *  The decoder is owned by the engine, the returned pointer is never null.
 */
const BarcodeDecoder* ExtractorEngine::barcodeDecoder() const
{
    const BarcodeDecoder *decoder = &d->m_barcodeDecoder;
    return decoder;
}
......@@ -36,7 +36,9 @@ class QVariant;
namespace KItinerary {
class BarcodeDecoder;
class Extractor;
class ExtractorDocumentNodeFactory;
class ExtractorEnginePrivate;
class HtmlDocument;
class PdfDocument;
......@@ -209,6 +211,16 @@ public:
*/
QString usedCustomExtractor() const;
/** Factory for creating new document nodes.
* This is only for use by KItinerary::ExtractorDocumentProcessor instances.
*/
const ExtractorDocumentNodeFactory* documentNodeFactory() const;
/** Barcode decoder for use by KItinerary::ExtractorDocumentProcessor.
* Use this rather than your own instance as it caches repeated attempts to
* decode the same image.
*/
const BarcodeDecoder* barcodeDecoder() const;
private:
std::unique_ptr<ExtractorEnginePrivate> d;
};
......
......@@ -305,3 +305,8 @@ PdfDocument* PdfDocument::fromData(const QByteArray &data, QObject *parent)
return nullptr;
#endif
}
/** Cheap pre-check for PDF content.
 *  Only tests whether @p data begins with the "%PDF" marker, nothing is parsed.
 *  A @c true result therefore does not guarantee that @p data can actually be loaded.
 */
bool PdfDocument::maybePdf(const QByteArray &data)
{
    const bool startsWithPdfMarker = data.startsWith("%PDF");
    return startsWithPdfMarker;
}
......@@ -105,6 +105,9 @@ public:
*/
static PdfDocument* fromData(const QByteArray &data, QObject *parent = nullptr);
/** Fast check whether @p data might be a PDF document. */
static bool maybePdf(const QByteArray &data);
private:
QVariantList pagesVariant() const;
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "binarydocumentprocessor.h"
#include <KItinerary/ExtractorFilter>
using namespace KItinerary;
/** Wraps @p encodedData into a document node as-is, without any decoding. */
ExtractorDocumentNode BinaryDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const
{
    ExtractorDocumentNode binaryNode;
    binaryNode.setContent(encodedData);
    return binaryNode;
}
/** Checks @p filter against the binary content of @p node.
 *  For matching, the content is interpreted as Latin-1 encoded text.
 */
bool BinaryDocumentProcessor::matches(const ExtractorFilter &filter, const ExtractorDocumentNode &node) const
{
    const auto rawContent = node.content<QByteArray>();
    const auto filterInput = QString::fromLatin1(rawContent);
    return filter.matches(filterInput);
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_BINARYDOCUMENTPROCESSOR_H
#define KITINERARY_BINARYDOCUMENTPROCESSOR_H
#include <KItinerary/ExtractorDocumentProcessor>
namespace KItinerary {
/** Document processor for generic binary content.
 *  Nodes created by this hold the unmodified input as a QByteArray.
 */
class BinaryDocumentProcessor : public ExtractorDocumentProcessor
{
public:
    /** Creates a node holding @p encodedData as-is. */
    ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override;

    /** Applies @p filter to the binary content of @p node, read as Latin-1 text. */
    bool matches(const ExtractorFilter &filter, const ExtractorDocumentNode &node) const override;
};
}
#endif // KITINERARY_BINARYDOCUMENTPROCESSOR_H
/*
SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "htmldocumentprocessor.h"
#include "logging.h"
#include <KItinerary/ExtractorResult>
#include <KItinerary/HtmlDocument>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QJSEngine>
#include <QJSValue>
#include <QString>
#include <QUrl>
using namespace KItinerary;
Q_DECLARE_METATYPE(Internal::OwnedPtr<HtmlDocument>)
static bool contentStartsWith(const QByteArray &data, char s)