Members of the KDE Community are recommended to subscribe to the kde-community mailing list at https://mail.kde.org/mailman/listinfo/kde-community to allow them to participate in important discussions and receive other important announcements.

Commit 13abbb58 authored by Volker Krause

Add support for extracting untyped byte arrays to the extractor engine

We have that code duplicated in the unit tests, the command line extractor
and the app with various levels of precision, so better unify this here.

This will also enable the command line extractor to auto-detect the content
type on stdin (so far only implemented for PDF and PkPass), and the app to
import more content from the clipboard or content: URLs.
parent 3ac7b1d9
......@@ -108,41 +108,8 @@ private Q_SLOTS:
QFile inFile(inputFile);
QVERIFY(inFile.open(QFile::ReadOnly));
std::unique_ptr<KPkPass::Pass> pass;
std::unique_ptr<HtmlDocument> htmlDoc;
std::unique_ptr<PdfDocument> pdfDoc;
KCalCore::Calendar::Ptr calendar;
std::unique_ptr<KMime::Message> mimeMsg;
QJsonArray jsonResult;
if (inputFile.endsWith(QLatin1String(".pkpass"))) {
pass.reset(KPkPass::Pass::fromData(inFile.readAll()));
m_engine.setPass(pass.get());
} else if (inputFile.endsWith(QLatin1String(".pdf"))) {
pdfDoc.reset(PdfDocument::fromData(inFile.readAll()));
QVERIFY(pdfDoc);
m_engine.setPdfDocument(pdfDoc.get());
} else if (inputFile.endsWith(QLatin1String(".html"))) {
htmlDoc.reset(HtmlDocument::fromData(inFile.readAll()));
QVERIFY(htmlDoc);
m_engine.setHtmlDocument(htmlDoc.get());
} else if (inputFile.endsWith(QLatin1String(".txt"))) {
m_engine.setText(QString::fromUtf8(inFile.readAll()));
} else if (inputFile.endsWith(QLatin1String(".ics"))) {
calendar.reset(new KCalCore::MemoryCalendar(QTimeZone()));
KCalCore::ICalFormat format;
QVERIFY(format.fromRawString(calendar, inFile.readAll()));
calendar->setProductId(format.loadedProductId());
m_engine.setCalendar(calendar);
} else if (inputFile.endsWith(QLatin1String(".eml")) || inputFile.endsWith(QLatin1String(".mbox"))) {
mimeMsg.reset(new KMime::Message);
mimeMsg->setContent(inFile.readAll());
mimeMsg->parse();
m_engine.setContent(mimeMsg.get());
}
jsonResult = m_engine.extract();
m_engine.setData(inFile.readAll(), inputFile);
auto jsonResult = m_engine.extract();
const auto expectedSkip = QFile::exists(inputFile + QLatin1String(".skip"));
if (jsonResult.isEmpty() && expectedSkip) {
......
......@@ -134,18 +134,16 @@ int main(int argc, char** argv)
std::unique_ptr<KMime::Message> mimeMsg;
QJsonArray jsonResult;
if (f.fileName().endsWith(QLatin1String(".pkpass")) || parser.value(typeOpt) == QLatin1String("pkpass")) {
if (parser.value(typeOpt) == QLatin1String("pkpass")) {
pass.reset(KPkPass::Pass::fromData(f.readAll()));
engine.setPass(pass.get());
} else if (f.fileName().endsWith(QLatin1String(".pdf")) || parser.value(typeOpt) == QLatin1String("pdf")) {
} else if (parser.value(typeOpt) == QLatin1String("pdf")) {
pdfDoc.reset(PdfDocument::fromData(f.readAll()));
engine.setPdfDocument(pdfDoc.get());
} else if (f.fileName().endsWith(QLatin1String(".html")) || parser.value(typeOpt) == QLatin1String("html")) {
} else if (parser.value(typeOpt) == QLatin1String("html")) {
htmlDoc.reset(HtmlDocument::fromData(f.readAll()));
engine.setHtmlDocument(htmlDoc.get());
} else if (f.fileName().endsWith(QLatin1String(".txt"))) {
engine.setText(QString::fromUtf8(f.readAll()));
} else if (f.fileName().endsWith(QLatin1String(".ics")) || parser.value(typeOpt) == QLatin1String("ical")) {
} else if (parser.value(typeOpt) == QLatin1String("ical")) {
calendar.reset(new KCalCore::MemoryCalendar(QTimeZone()));
KCalCore::ICalFormat format;
if (!format.fromRawString(calendar, f.readAll())) {
......@@ -154,11 +152,13 @@ int main(int argc, char** argv)
}
calendar->setProductId(format.loadedProductId());
engine.setCalendar(calendar);
} else if (f.fileName().endsWith(QLatin1String(".eml")) || f.fileName().endsWith(QLatin1String(".mbox")) || parser.value(typeOpt) == QLatin1String("mime")) {
} else if (parser.value(typeOpt) == QLatin1String("mime")) {
mimeMsg.reset(new KMime::Message);
mimeMsg->setContent(f.readAll());
mimeMsg->parse();
engine.setContent(mimeMsg.get());
} else {
engine.setData(f.readAll(), f.fileName());
}
jsonResult = engine.extract();
......
......@@ -42,6 +42,7 @@
#include <KPkPass/Pass>
#include <KMime/Content>
#include <KMime/Message>
#include <QDateTime>
#include <QFile>
......@@ -52,6 +53,8 @@
#include <QJSEngine>
#include <QJSValueIterator>
#include <cstring>
using namespace KItinerary;
namespace KItinerary {
......@@ -85,8 +88,9 @@ public:
#ifdef HAVE_KCAL
KCalCore::Calendar::Ptr m_calendar;
#endif
KMime::Content *m_mimeContent = nullptr;
KMime::Content *m_mimeContent;
KMime::Content *m_mimeContext = nullptr;
std::unique_ptr<KMime::Content> m_ownedMimeContent;
GenericPdfExtractor m_genericPdfExtractor;
QJsonArray m_result;
QJSEngine m_engine;
......@@ -134,6 +138,7 @@ void ExtractorEngine::clear()
d->m_result = {};
d->m_mimeContext = nullptr;
d->m_context->m_senderDate = {};
d->m_ownedMimeContent.reset();
}
void ExtractorEnginePrivate::resetContent()
......@@ -217,6 +222,43 @@ void ExtractorEnginePrivate::setContent(KMime::Content *content)
m_mimeContent = (ct && ct->isMultipart()) ? content : nullptr;
}
/** Feeds raw, untyped data into the engine and picks the content type automatically.
 *
 *  The type is chosen from the extension of @p fileName (compared
 *  case-insensitively) and, for PkPass and PDF only, from the first four
 *  bytes of @p data. Data of 4 bytes or less, or of more than 4000000 bytes,
 *  is ignored without any further processing.
 *
 *  @param data The content to extract from.
 *  @param fileName Optional hint used for type detection.
 */
void ExtractorEngine::setData(const QByteArray &data, const QString &fileName)
{
    // let's not even try to parse anything with implausible size
    // (this also guarantees that the 4 byte magic checks below stay in bounds)
    if (data.size() <= 4 || data.size() > 4000000) {
        return;
    }

    if (fileName.endsWith(QLatin1String(".pkpass"), Qt::CaseInsensitive) || strncmp(data.constData(), "PK\x03\x04", 4) == 0) {
        d->m_pass = make_owning_ptr(KPkPass::Pass::fromData(data));
    } else if (fileName.endsWith(QLatin1String(".pdf"), Qt::CaseInsensitive) || strncmp(data.constData(), "%PDF", 4) == 0) {
        d->m_pdfDoc = make_owning_ptr(PdfDocument::fromData(data));
    } else if (fileName.endsWith(QLatin1String(".html"), Qt::CaseInsensitive)) { // TODO content check
        d->m_htmlDoc = make_owning_ptr(HtmlDocument::fromData(data));
    } else if (fileName.endsWith(QLatin1String(".ics"), Qt::CaseInsensitive)) { // TODO content check
#ifdef HAVE_KCAL
        d->m_calendar.reset(new KCalCore::MemoryCalendar(QTimeZone()));
        KCalCore::ICalFormat format;
        if (format.fromRawString(d->m_calendar, data)) {
            d->m_calendar->setProductId(format.loadedProductId());
        } else {
            // Parsing failed: discard the calendar and do not touch it afterwards,
            // the pointer is null from here on and must not be dereferenced.
            qCDebug(Log) << "Failed to parse iCal content.";
            d->m_calendar.reset();
        }
#else
        qCDebug(Log) << "Trying to exctract ical file, but ical support is not enabled.";
#endif
    } else if (fileName.endsWith(QLatin1String(".eml"), Qt::CaseInsensitive) || fileName.endsWith(QLatin1String(".mbox"), Qt::CaseInsensitive)) { // TODO how can we check content for being MIME?
        // The engine keeps the parsed message alive itself, setContent() only takes a raw pointer.
        d->m_ownedMimeContent.reset(new KMime::Message);
        d->m_ownedMimeContent->setContent(data);
        d->m_ownedMimeContent->parse();
        setContent(d->m_ownedMimeContent.get());
    } else if (fileName.endsWith(QLatin1String(".txt"), Qt::CaseInsensitive)) {
        d->m_text = QString::fromUtf8(data);
    } else {
        qCDebug(Log) << "Failed to detect data type!";
    }
}
void ExtractorEnginePrivate::setContext(KMime::Content *context)
{
m_mimeContext = context;
......
......@@ -22,8 +22,11 @@
#include "kitinerary_export.h"
#include <QString>
#include <memory>
#include <vector>
template <typename T> class QSharedPointer;
namespace KCalCore {
......@@ -38,9 +41,9 @@ namespace KMime {
class Content;
}
class QByteArray;
class QDateTime;
class QJsonArray;
class QString;
namespace KItinerary {
......@@ -158,6 +161,13 @@ public:
* @p content is also set as extraction context (see setContext).
*/
void setContent(KMime::Content *content);
/** Any kind of data to extract from.
* ExtractorEngine tries to auto-detect what type of data this is
* and pick one of the above methods accordingly.
* Avoid using this if you know exactly what data you have.
* @param fileName Used as a hint to determine the type, optional.
*/
void setData(const QByteArray &data, const QString &fileName = {});
/** Sets the MIME part the document we try to extract comes from.
* Use this for documents received by email, to provide additional
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment