Members of the KDE Community are encouraged to subscribe to the kde-community mailing list at https://mail.kde.org/mailman/listinfo/kde-community so that they can participate in important discussions and receive other important announcements.

Commit 43fc9f15 authored by Volker Krause

Add support for extracting PDF booking confirmations

Similar to HTML ones, we just extract the raw text from the document and
feed that into the unstructured data extractor.
parent e23b39e3
......@@ -138,6 +138,8 @@ find_package(Gpgmepp 1.8.0 CONFIG)
set_package_properties(Gpgmepp PROPERTIES DESCRIPTION "GpgME library" URL "http://www.gnupg.org" TYPE REQUIRED)
set( SharedMimeInfo_MINIMUM_VERSION "1.0" )
find_package(SharedMimeInfo ${SharedMimeInfo_MINIMUM_VERSION} REQUIRED)
find_package(Poppler COMPONENTS Qt5)
set_package_properties("Poppler" PROPERTIES TYPE OPTIONAL PURPOSE "Support for PDF booking confirmations in the semantic extraction plugin.")
if (KDEPIMADDONS_BUILD_EXAMPLES)
add_subdirectory(examples)
......
if(TARGET Poppler::Qt5)
set(HAVE_POPPLER ON)
endif()
configure_file(config-semantic.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config-semantic.h)
# static lib for use by unit test
set(semantic_lib_srcs
datatypes.cpp
......@@ -16,6 +21,9 @@ ecm_qt_declare_logging_category(semantic_lib_srcs HEADER semantic_debug.h IDENTI
add_library(semantic_extractor STATIC ${semantic_lib_srcs})
set_target_properties(semantic_extractor PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(semantic_extractor PUBLIC Qt5::Core KF5::Mime)
if (HAVE_POPPLER)
target_link_libraries(semantic_extractor PRIVATE Poppler::Qt5)
endif()
set(semantic_plugin_srcs
......
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#ifndef CONFIG_SEMANTIC_H
#define CONFIG_SEMANTIC_H
#cmakedefine HAVE_POPPLER
#endif
......@@ -17,11 +17,19 @@
02110-1301, USA.
*/
#include "config-semantic.h"
#include "extractorpreprocessor.h"
#include "semantic_debug.h"
#ifdef HAVE_POPPLER
#include <poppler-qt5.h>
#endif
#include <QDebug>
#include <memory>
void ExtractorPreprocessor::preprocessPlainText(const QString& input)
{
m_buffer = input;
......@@ -56,12 +64,28 @@ void ExtractorPreprocessor::preprocessHtml(const QString& input)
//qCDebug(SEMANTIC_LOG) << "Preprocessed HTML content: " << m_buffer;
}
/**
 * Extracts the text of every page of a PDF document and appends it to the
 * internal text buffer.
 *
 * Nothing is appended when the document cannot be loaded or is locked.
 * When built without Poppler support (HAVE_POPPLER undefined) this is a no-op.
 *
 * @param input the raw bytes of the PDF document.
 */
void ExtractorPreprocessor::preprocessPdf(const QByteArray &input)
{
#ifdef HAVE_POPPLER
    const std::unique_ptr<Poppler::Document> doc(Poppler::Document::loadFromData(input));
    if (!doc || doc->isLocked()) {
        return;
    }

    const int pageCount = doc->numPages();
    for (int i = 0; i < pageCount; ++i) {
        const std::unique_ptr<Poppler::Page> page(doc->page(i));
        if (!page) {
            // Poppler hands back a null page when it cannot load it (e.g. a
            // damaged document); skip it rather than dereferencing null.
            continue;
        }
        m_buffer += page->text({}, Poppler::Page::PhysicalLayout);
    }
#else
    Q_UNUSED(input);
#endif
}
// Returns the text currently held in the internal buffer, i.e. whatever the
// preprocess*() methods have stored or appended so far.
QString ExtractorPreprocessor::text() const
{
return m_buffer;
}
void ExtractorPreprocessor::replaceEntityAndAppend(const QStringRef& source)
void ExtractorPreprocessor::replaceEntityAndAppend(const QStringRef &source)
{
int begin = 0;
int end = source.indexOf(QLatin1Char('&'), begin);
......
......@@ -28,6 +28,7 @@ class ExtractorPreprocessor
public:
void preprocessPlainText(const QString &input);
void preprocessHtml(const QString &input);
void preprocessPdf(const QByteArray &input);
QString text() const;
......
......@@ -42,7 +42,7 @@ public:
const MimeTreeParser::Interface::BodyPartFormatter *bodyPartFormatter(int idx) const override
{
if (idx < 2) {
if (idx < 3) {
return new SemanticProcessor();
}
return nullptr;
......
{
"formatter": [
{ "mimetype": "text/html" },
{ "mimetype": "text/plain" }
{ "mimetype": "text/plain" },
{ "mimetype": "application/pdf" }
],
"renderer": [
{ "type": "MimeTreeParser::MessagePartList" }
......
......@@ -75,14 +75,16 @@ MimeTreeParser::MessagePart::Ptr SemanticProcessor::process(MimeTreeParser::Inte
return {};
qCDebug(SEMANTIC_LOG) << "Found unstructured extractor rules for message" << extractors.size();
// preprocessor to remove HTML tags and to extract the text of PDFs
ExtractorPreprocessor preproc;
if (part.content()->contentType()->isPlainText())
if (part.content()->contentType()->isPlainText()) {
preproc.preprocessPlainText(part.content()->decodedText());
else if (part.content()->contentType()->isHTMLText())
} else if (part.content()->contentType()->isHTMLText()) {
preproc.preprocessHtml(part.content()->decodedText());
else
} else if (part.content()->contentType()->mimeType() == "application/pdf") {
preproc.preprocessPdf(part.content()->decodedContent());
} else {
return {};
}
ExtractorEngine engine;
engine.setExtractor(extractors.at(0));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment