Commit c7711fbe authored by Volker Krause
Browse files

Deprecate the type-specific ExtractorEngine input interface

Going forward, there will only be two input methods, one taking raw data
and one taking a variant with already decoded data. This is also continuing
the work of phasing out ExtractorInput enums in favor of full MIME types.

Nothing really changed on the inside yet, this is mostly transitional
parent f7d6efa3
Pipeline #54967 passed with stages
in 11 minutes and 22 seconds
......@@ -62,7 +62,7 @@ private Q_SLOTS:
ExtractorEngine engine;
engine.setContextDate(QDateTime(QDate(2017, 12, 29), QTime(18, 46, 2)));
engine.setContent(QVariant::fromValue(pass), u"application/");
auto result = JsonLdDocument::fromJson(engine.extract());
ExtractorPostprocessor postproc;
......@@ -78,7 +78,7 @@ static void printExtractors()
ExtractorRepository repo;
for (const auto &ext : repo.allExtractors()) {
std::cout << qPrintable( << " (" << qPrintable(ExtractorInput::typeToString(ext.type()));
std::cout << qPrintable( << " (" << qPrintable(ext.mimeType());
if (!ext.scriptFileName().isEmpty()) {
std::cout << ", " << qPrintable(ext.scriptFileName()) << ":" << qPrintable(ext.scriptFunction());
......@@ -245,14 +245,19 @@ void ExtractorEngine::setCalendar(const QSharedPointer<KCalendarCore::Calendar>
void ExtractorEngine::setData(const QByteArray &data, const QString &fileName)
void ExtractorEngine::setData(const QByteArray &data, QStringView fileName, QStringView mimeType)
// let's not even try to parse anything with implausible size
if (data.size() <= 4 || data.size() > 4000000) {
const auto nameType = ExtractorInput::typeFromFileName(fileName);
const auto mtType = ExtractorInput::typeFromMimeType(mimeType.toString());
if (mtType != ExtractorInput::Unknown) {
setData(data, mtType);
const auto nameType = ExtractorInput::typeFromFileName(fileName.toString());
const auto contentType = ExtractorInput::typeFromContent(data);
setData(data, nameType == ExtractorInput::Unknown ? contentType : nameType);
......@@ -268,6 +273,35 @@ void ExtractorEngine::setData(const QByteArray &data, ExtractorInput::Type type)
d->m_inputType = type;
void ExtractorEngine::setContent(const QVariant &data, QStringView mimeType)
// ### temporary scaffolding until we have the new extractor engine
switch (ExtractorInput::typeFromMimeType(mimeType.toString())) {
case ExtractorInput::Text:
case ExtractorInput::Html:
case ExtractorInput::Pdf:
case ExtractorInput::PkPass:
case ExtractorInput::ICal:
#ifdef HAVE_KCAL
case ExtractorInput::Email:
void ExtractorEnginePrivate::openDocument()
if (m_data.isEmpty()) {
SPDX-FileCopyrightText: 2017 Volker Krause <>
SPDX-FileCopyrightText: 2017-2021 Volker Krause <>
SPDX-License-Identifier: LGPL-2.0-or-later
#include "kitinerary_export.h"
#include "extractorinput.h"
......@@ -32,6 +32,7 @@ class Content;
class QByteArray;
class QDateTime;
class QJsonArray;
class QVariant;
namespace KItinerary {
......@@ -41,9 +42,9 @@ class HtmlDocument;
class PdfDocument;
* Unstructured data extraction engine.
* Semantic data extraction engine.
* This will apply the given Extractor instance to the given input data
* This will attempt to find travel itinerary data in the given input data
* (plain text, HTML text, PDF documents, etc), and return the extracted
* JSON-LD data.
......@@ -117,9 +118,9 @@ class KITINERARY_EXPORT ExtractorEngine
ExtractorEngine(ExtractorEngine &&) noexcept;
ExtractorEngine(const ExtractorEngine &) = delete;
/** Resets the internal state, call before processing new input data. */
void clear();
......@@ -127,41 +128,46 @@ public:
/** The text to extract data from.
* Only considered for text extractors.
void setText(const QString &text);
[[deprecated("use setContent")]] void setText(const QString &text);
/** A HTML document to extract data from.
* Only considered for HTML and text extractors.
void setHtmlDocument(HtmlDocument *htmlDoc);
[[deprecated("use setContent")]] void setHtmlDocument(HtmlDocument *htmlDoc);
/** A PDF document to extract data from.
* Only considered for PDF or text extractors.
void setPdfDocument(PdfDocument *pdfDoc);
[[deprecated("use setContent")]] void setPdfDocument(PdfDocument *pdfDoc);
/** The pkpass boarding pass to extract data from.
* Only considered for pkpass extractors.
void setPass(KPkPass::Pass *pass);
[[deprecated("use setContent")]] void setPass(KPkPass::Pass *pass);
/** The iCalendar to extract data from.
* Only considered for ical extractors.
void setCalendar(const QSharedPointer<KCalendarCore::Calendar> &calendar);
[[deprecated("use setContent")]] void setCalendar(const QSharedPointer<KCalendarCore::Calendar> &calendar);
/** A MIME part to extract from.
* This is assumed to contain one of the supported mime types.
* @p content is also set as extraction context (see setContext).
void setContent(KMime::Content *content);
/** Any kind of data to extract from.
* ExtractorEngine tries to auto-detect what type of data this is
* and pick one of the above methods accordingly.
* Avoid using this if you know exactly what data you have.
* @param fileName Used as a hint to determine the type, optional.
[[deprecated("use setContent")]] void setContent(KMime::Content *content);
/** Set raw data to extract from.
* @param data Raw data to extract from.
* @param fileName Optional file name, used as a hint for MIME type auto-detection if @p mimeType is empty or unknown.
* @param mimeType MIME type of @p data, auto-detected if empty.
void setData(const QByteArray &data, const QString &fileName = {});
void setData(const QByteArray &data, QStringView fileName = {}, QStringView mimeType = {});
/** Raw data to extract, but with a known type.
* No content type detection is performed here, you should be sure about @p type.
void setData(const QByteArray &data, ExtractorInput::Type type);
[[deprecated("use setData")]] void setData(const QByteArray &data, ExtractorInput::Type type);
/** Already decoded data to extract from.
* @param data Has to contain an object of a supported data type matching @p mimeType.
void setContent(const QVariant &data, QStringView mimeType);
/** Sets the MIME part that the document to extract from originates from.
* Use this for documents received by email, to provide additional
......@@ -209,4 +215,4 @@ private:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment