Commit 6ffe2cb3 authored by Volker Krause
Browse files

Allow optionally performing extraction on full-page raster images in PDFs

This is off by default, as it requires quite heavy image processing, but
if an application is certain the input PDF contains something that can
be extracted, this can be enabled.

Makes importing Danish COVID certificates work.
parent a8a16331
Pipeline #135787 passed with stages
in 1 minute and 36 seconds
......@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
set(PIM_VERSION "5.19.40")
set(PIM_VERSION "5.19.41")
project(KItinerary VERSION ${PIM_VERSION})
set(KF5_MIN_VERSION "5.90.0")
......
......@@ -48,6 +48,7 @@ public:
BarcodeDecoder m_barcodeDecoder;
QString m_usedExtractor;
ExtractorScriptEngine m_scriptEngine;
ExtractorEngine::Hints m_hints = ExtractorEngine::NoHint;
};
}
......@@ -137,6 +138,16 @@ void ExtractorEngine::setContextDate(const QDateTime &dt)
d->m_contextNode.setContextDateTime(dt);
}
/** Returns the extraction hints currently stored in this engine's private data. */
ExtractorEngine::Hints ExtractorEngine::hints() const
{
// plain read of the stored flag set; no other state is touched
return d->m_hints;
}
/** Replaces the stored extraction hints with @p hints.
 *  The previous value is overwritten as a whole, it is not merged with the new one.
 */
void ExtractorEngine::setHints(ExtractorEngine::Hints hints)
{
d->m_hints = hints;
}
QJsonArray ExtractorEngine::extract()
{
d->m_rootNode.setParent(d->m_contextNode);
......
......@@ -153,6 +153,20 @@ public:
*/
void setAdditionalExtractors(std::vector<const AbstractExtractor*> &&extractors);
/** Application-side knowledge about the document to extract, passed on
 *  to help the extractor.
 *  Each enumerator other than NoHint occupies its own bit.
 */
enum Hint {
    NoHint = 0, ///< no hint is set
    ExtractFullPageRasterImages = 1 << 0, ///< perform expensive image processing on (PDF) documents containing full page raster images
};
Q_DECLARE_FLAGS(Hints, Hint)
/** The currently set extraction hints. */
Hints hints() const;
/** Set extraction hints. */
void setHints(Hints hints);
/** Perform the actual extraction, and return the JSON-LD data
* that has been found.
*/
......@@ -190,5 +204,7 @@ private:
std::unique_ptr<ExtractorEnginePrivate> d;
};
Q_DECLARE_OPERATORS_FOR_FLAGS(ExtractorEngine::Hints)
}
......@@ -28,11 +28,18 @@ ExtractorDocumentNode ImageDocumentProcessor::createNodeFromData(const QByteArra
void ImageDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const
{
// check whether we possibly have a full PDF page raster image here
const auto img = node.content<QImage>();
BarcodeDecoder::BarcodeTypes barcodeHints = BarcodeDecoder::Any2D;
if (engine->hints() & ExtractorEngine::ExtractFullPageRasterImages && !BarcodeDecoder::maybeBarcode(img.width(), img.height(), barcodeHints)) {
barcodeHints |= BarcodeDecoder::IgnoreAspectRatio;
}
// in case the barcode raw data (string or bytearray) gets detected as a type we handle,
// we nevertheless inject a raw data node in between. This is useful in cases where the
// content is parsable but that is actually not desired (e.g. JSON content in ticket barcodes).
const auto b = engine->barcodeDecoder()->decodeBinary(node.content<QImage>());
const auto b = engine->barcodeDecoder()->decodeBinary(img, barcodeHints);
if (!b.isEmpty()) {
auto c = engine->documentNodeFactory()->createNode(b);
if (c.isA<QByteArray>() || c.isA<QString>()) {
......@@ -45,7 +52,7 @@ void ImageDocumentProcessor::expandNode(ExtractorDocumentNode &node, const Extra
return;
}
const auto s = engine->barcodeDecoder()->decodeString(node.content<QImage>());
const auto s = engine->barcodeDecoder()->decodeString(img, barcodeHints);
if (!s.isEmpty()) {
auto c = engine->documentNodeFactory()->createNode(s.toUtf8());
if (c.isA<QByteArray>() || c.isA<QString>()) {
......
......@@ -106,6 +106,28 @@ void PdfDocumentProcessor::expandNode(ExtractorDocumentNode &node, const Extract
m_imageIds.insert(img.objectId());
}
}
// handle full page raster images
// Only runs when the ExtractFullPageRasterImages hint is set and the page consists of
// exactly one image and no text at all.
// NOTE(review): `continue` and `i` below refer to an enclosing loop that is not visible
// in this excerpt - presumably the per-page loop; confirm against the full function.
if ((engine->hints() & ExtractorEngine::ExtractFullPageRasterImages) && page.imageCount() == 1 && page.text().isEmpty()) {
// NOTE(review): uncategorized qDebug() output in library code looks like leftover
// debugging; consider a categorized logging call or removing it.
qDebug() << "full page raster image";
auto img = page.image(0);
// skip images whose object id has already been recorded in m_imageIds
if (img.hasObjectId() && m_imageIds.find(img.objectId()) != m_imageIds.end()) { // already handled
continue;
}
img.setLoadingHints(PdfImage::NoHint); // don't abort on color
const auto imgData = img.image();
// nothing to attach if the image could not be loaded
if (imgData.isNull()) {
continue;
}
// attach the image as a child node of type "internal/qimage", located at index i
auto childNode = engine->documentNodeFactory()->createNode(imgData, u"internal/qimage");
childNode.setLocation(i);
node.appendChild(childNode);
// remember the object id so the same image is not added again
if (img.hasObjectId()) {
m_imageIds.insert(img.objectId());
}
}
}
// fallback node for implicit conversion to plain text
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment