Commit f4817081 authored by Volker Krause

Add preprocessor for HTML content

This allows us to feed HTML content into the unstructured data extractor
too.
parent 90ac625b
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
<TITLE>Your reservation - A00000000</TITLE>
<STYLE>
BODY {
FONT-SIZE: 8pt
}
/* lots of style sheets omitted */
</STYLE>
<META content="MSHTML 6.00.2900.3059" name=GENERATOR>
</HEAD>
<body rightmargin="0" bottommargin="0">
<table width="663" cellspacing="0" cellpadding="0" align="center">
<tr>
<td>
<DIV class=content>
<TABLE cellSpacing="0" cellPadding="0" width="100%" border="0">
<TR>
<td valign="middle"><img src="https://tdp.brusselsairlines.com/BEL/pictures/email/email-header-icon.png" width="41" height="41" alt="Header"></td>
<td width="100%" valign="middle" class="title">Booking confirmation</td>
<td width="100%" valign="middle"><img src="https://tdp.brusselsairlines.com/BEL/pictures/email/email-header-logo.png" width="201" height="30" alt="Brussels Airlines Logo"></td>
</TR>
</TABLE>
<br>
<H1 class=underlineBig>Your booking details</H1>
<DIV class=spacer><SPAN>&nbsp;</SPAN></DIV>
<TABLE cellSpacing=0 cellPadding=0 width="100%" border=0>
<TR vAlign=top>
<TD width="50%">
<TABLE cellSpacing=2 cellPadding=0 border=0>
<TR valign="bottom">
<TD>Booking reference:</TD>
<TD class="confirmation"><b>XXX007</b></TD>
</TR>
</TABLE>
</TD>
<TD width="50%">
<TABLE cellSpacing=2 cellPadding=0 border=0>
<TR vAlign=top>
<TD>Main contact:</TD>
<TD>Mr John Doe</TD>
</TR>
<TR vAlign=top>
<TD>E-mail:</TD>
<TD>my@email.eu</TD>
</TR>
<TR vAlign=top>
<TD>Mobile:</TD>
<TD>49-0987654321</TD>
</TR>
</TABLE>
</TD>
</TR>
</TABLE>
<div class="spacer"><span>&nbsp;</span></div>
<div id="loopBannerDiv">
<a href="https://www.brusselsairlines.com/en-be/loop/registration/default.aspx"><img src="https://tdp.brusselsairlines.com/BEL/en/pages/tdp/banners/loop/confirmation/bair_tdp_loop_banner.gif" /></a>
</div>
<div class="spacer"><span>&nbsp;</span></div>
<H2 class=underlineBig2>Passengers</H2>
<DIV class=spacer><SPAN>&nbsp;</SPAN></DIV>
<TABLE cellSpacing=0 cellPadding=2 width="100%" border=0>
<TR vAlign=top>
<TD><STRONG>Mr John Doe</STRONG><B></B></TD>
<TD><STRONG>Flight</STRONG></TD>
<TD><b>TXL-BRU, BRU-TXL</b></TD>
<TD></TD>
</TR>
<TR>
<TD></TD>
<TD></TD>
<TD>Frequent flyer</TD>
<TD>Miles & More 12345678123490</TD>
</TR>
<TR>
<TD></TD>
<TD></TD>
<TD>Ticket number</TD>
<TD>082-1234567890</TD>
</TR>
<TR>
<TD></TD>
<TD></TD>
<TD>Seats</TD>
<TD>*,*</TD>
</TR>
</TABLE>
<DIV class=spacer><SPAN>&nbsp;</SPAN></DIV>
<H2 class=underlineBig>Flight details</H2>
<DIV class=spacer><SPAN>&nbsp;</SPAN></DIV>
<DIV class=underlineDotted>Departure</DIV>
<TABLE cellSpacing=0 cellPadding=2 width="100%">
<TR vAlign=top>
<TD width="30%"><STRONG>Berlin, Tegel Airport, DE</STRONG><BR>Fri, 03 Feb 2017,&nbsp;<B>18:25<BR></TD>
<TD width="30%"><STRONG>Brussels Airport, BE</STRONG><BR>Fri, 03 Feb 2017,&nbsp;<B>19:45<BR></TD>
<TD width="20%" rowSpan=2>SN 2588 <BR>Brussels Airlines<BR></TD>
<TD class=small width="20%" rowSpan=2>
Check&Go
(Q) <BR>Non stop
</TD>
</TR>
<TR vAlign=top>
<TD class=small colSpan=2>Airbus A320-100/200 </TD>
<TD class=small colSpan=2></TD>
</TR>
</TABLE>
<DIV class=spacer><SPAN>&nbsp;</SPAN></DIV>
<DIV class=underlineDotted>Return</DIV>
<TABLE cellSpacing=0 cellPadding=2 width="100%">
<TR vAlign=top>
<TD width="30%"><STRONG>Brussels Airport, BE</STRONG><BR>Sun, 05 Feb 2017,&nbsp;<B>20:40<BR></TD>
<TD width="30%"><STRONG>Berlin, Tegel Airport, DE</STRONG><BR>Sun, 05 Feb 2017,&nbsp;<B>22:00<BR></TD>
<TD width="20%" rowSpan=2>SN 2591 <BR>Brussels Airlines<BR></TD>
<TD class=small width="20%" rowSpan=2>
Check&Go
(E) <BR>Non stop
</TD>
</TR>
<TR vAlign=top>
<TD class=small colSpan=2>Airbus A319 </TD>
<TD class=small colSpan=2></TD>
</TR>
</TABLE>
<!-- a gazillion lines of advertisements omitted -->
</BODY>
</HTML>
[
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "SN",
"name": "Brussels Airlines"
},
"arrivalAirport": {
"@type": "Airport",
"name": "Brussels Airport, BE"
},
"arrivalTime": "2017-02-03T19:45:00",
"departureAirport": {
"@type": "Airport",
"name": "Berlin, Tegel Airport, DE"
},
"departureTime": "2017-02-03T18:25:00",
"flightNumber": "2588"
},
"reservationNumber": "XXX007"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "SN",
"name": "Brussels Airlines"
},
"arrivalAirport": {
"@type": "Airport",
"name": "Berlin, Tegel Airport, DE"
},
"arrivalTime": "2017-02-05T22:00:00",
"departureAirport": {
"@type": "Airport",
"name": "Brussels Airport, BE"
},
"departureTime": "2017-02-05T20:40:00",
"flightNumber": "2591"
},
"reservationNumber": "XXX007"
}
]
......@@ -19,6 +19,7 @@
#include "extractor.h"
#include "extractorengine.h"
#include "extractorpreprocessor.h"
#include <QDebug>
#include <QDir>
......@@ -36,7 +37,7 @@ private Q_SLOTS:
Q_INIT_RESOURCE(rules);
}
void testExtract_data()
void testExtractText_data()
{
QTest::addColumn<QString>("inputFile");
QTest::addColumn<QString>("extractorName");
......@@ -55,7 +56,7 @@ private Q_SLOTS:
}
}
void testExtract()
void testExtractText()
{
QFETCH(QString, inputFile);
QFETCH(QString, extractorName);
......@@ -69,7 +70,56 @@ private Q_SLOTS:
ExtractorEngine engine;
engine.setText(QString::fromUtf8(f.readAll()));
engine.setExtractor({&extractor});
engine.setExtractor(&extractor);
const auto data = engine.extract();
QFile ref(jsonFile);
QVERIFY(ref.open(QFile::ReadOnly));
const auto doc = QJsonDocument::fromJson(ref.readAll());
QVERIFY(doc.isArray());
if (data != doc.array())
qDebug().noquote() << QJsonDocument(data).toJson();
QCOMPARE(data, doc.array());
}
void testExtractHtml_data()
{
QTest::addColumn<QString>("inputFile");
QTest::addColumn<QString>("extractorName");
QTest::addColumn<QString>("jsonFile");
QDir dir(QStringLiteral(SOURCE_DIR "/unstructureddata"));
const auto lst = dir.entryList(QStringList(QStringLiteral("*.html")), QDir::Files | QDir::Readable | QDir::NoSymLinks);
for (const auto &file : lst) {
const auto refFile = dir.path() + QLatin1Char('/') + file.left(file.size() - 5) + QStringLiteral(".json");
if (!QFile::exists(refFile)) {
qDebug() << "reference file" << refFile << "does not exist, skipping test file" << file;
continue;
}
const auto idx = file.indexOf(QLatin1Char('_'));
QTest::newRow(file.toLatin1()) << QString(dir.path() + QLatin1Char('/') + file) << file.left(idx) << refFile;
}
}
void testExtractHtml()
{
QFETCH(QString, inputFile);
QFETCH(QString, extractorName);
QFETCH(QString, jsonFile);
QFile f(inputFile);
QVERIFY(f.open(QFile::ReadOnly));
Extractor extractor;
QVERIFY(extractor.load(QLatin1String(":/org.kde.messageviewer/semantic/rules/") + extractorName + QLatin1String(".xml")));
ExtractorPreprocessor preproc;
preproc.preprocessHtml(QString::fromUtf8(f.readAll()));
ExtractorEngine engine;
engine.setText(preproc.text());
engine.setExtractor(&extractor);
const auto data = engine.extract();
QFile ref(jsonFile);
......
......@@ -5,6 +5,7 @@ set(semantic_lib_srcs
extractorcontext.cpp
extractorengine.cpp
extractorfilter.cpp
extractorpreprocessor.cpp
extractorrepository.cpp
extractorrule.cpp
jsonlddocument.cpp
......
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#include "extractorpreprocessor.h"
#include "semantic_debug.h"
#include <QDebug>
/**
 * Accepts plain text content, which needs no conversion.
 *
 * The internal buffer is replaced by @p input; anything buffered before
 * this call is discarded.
 */
void ExtractorPreprocessor::preprocessPlainText(const QString &input)
{
    m_buffer = input;
}
/**
 * Strips HTML markup from @p input and appends the remaining text to the
 * internal buffer.
 *
 * The input is scanned for '<' ... '>' pairs only. Each such tag is replaced
 * by a single separator character so that neighbouring fields stay apart:
 * a line break if the tag content starts with "br" (case-insensitive), a
 * space otherwise. Text between tags is passed through
 * replaceEntityAndAppend().
 *
 * A '<' that is not followed by any '>' ends processing; the input from that
 * '<' onwards is not added to the buffer.
 *
 * @note Output is appended: content already in the buffer is kept.
 */
void ExtractorPreprocessor::preprocessHtml(const QString &input)
{
    // we append, so size the reservation relative to what is already buffered
    m_buffer.reserve(m_buffer.size() + input.size());

    int begin = 0; // start of the text run following the previous tag
    int end = input.indexOf(QLatin1Char('<'), begin); // start of the next tag
    while (begin < input.size() && end < input.size() && end >= 0 && begin >= 0) {
        if (end > begin) {
            replaceEntityAndAppend(input.midRef(begin, end - begin));
        }

        // from here on 'begin' points at the closing '>' of the current tag
        begin = input.indexOf(QLatin1Char('>'), end);
        if (begin < 0) {
            break;
        }

        // replace elements with something suitable for field separation
        // (a reference is enough here, no need to copy the tag content)
        const auto elementName = input.midRef(end + 1, begin - end - 1);
        if (elementName.startsWith(QLatin1String("br"), Qt::CaseInsensitive)) {
            m_buffer.append(QLatin1Char('\n'));
        } else {
            m_buffer.append(QLatin1Char(' '));
        }

        ++begin;
        end = input.indexOf(QLatin1Char('<'), begin);
    }

    // trailing text after the last tag (or the whole input if there was no tag)
    if (begin >= 0 && end < 0) {
        replaceEntityAndAppend(input.midRef(begin));
    }
    //qCDebug(SEMANTIC_LOG) << "Preprocessed HTML content: " << m_buffer;
}
/**
 * Returns the current content of the internal buffer, i.e. what
 * preprocessPlainText() stored or preprocessHtml() appended so far.
 */
QString ExtractorPreprocessor::text() const
{
return m_buffer;
}
/**
 * Appends @p source to the internal buffer, resolving character entities.
 *
 * Only "&nbsp;" is translated (to a regular space). Any other "&...;"
 * sequence is copied unchanged. A '&' that is not followed by a ';' anywhere
 * in @p source (e.g. "Miles & More", "Check&Go") is not an entity; it and the
 * rest of @p source are copied verbatim instead of being dropped.
 *
 * @param source a run of text that contains no HTML tags.
 */
void ExtractorPreprocessor::replaceEntityAndAppend(const QStringRef &source)
{
    int begin = 0; // start of the text not yet copied
    int end = source.indexOf(QLatin1Char('&'), begin); // next entity candidate
    while (begin < source.size() && end < source.size() && end >= 0 && begin >= 0) {
        // plain text in front of the '&'
        if (end > begin) {
            m_buffer.append(source.mid(begin, end - begin));
        }

        const int terminator = source.indexOf(QLatin1Char(';'), end);
        if (terminator < 0) {
            // bare ampersand without terminating ';': nothing left to resolve,
            // keep the remaining text as-is rather than losing it
            m_buffer.append(source.mid(end));
            return;
        }

        const auto entityName = source.mid(end + 1, terminator - end - 1);
        if (entityName == QLatin1String("nbsp")) {
            m_buffer.append(QLatin1Char(' '));
        } else {
            // keep unknown entities
            m_buffer.append(source.mid(end, terminator - end + 1));
        }

        begin = terminator + 1;
        end = source.indexOf(QLatin1Char('&'), begin);
    }

    // remaining text after the last entity (or all of it if there was none)
    if (begin >= 0 && end < 0) {
        m_buffer.append(source.mid(begin));
    }
}
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#ifndef EXTRACTORPREPROCESSOR_H
#define EXTRACTORPREPROCESSOR_H
#include <QString>
/**
 * Turns message content into the text handed to the extractor engine.
 *
 * Entry points exist for plain text and HTML input. PDF attachments are
 * named as a goal, but no PDF entry point is declared here yet.
 * The result of preprocessing is retrieved with text().
 */
class ExtractorPreprocessor
{
public:
/** Stores @p input unchanged, replacing any previously buffered text. */
void preprocessPlainText(const QString &input);
/**
 * Appends the text content of the HTML document @p input to the buffer.
 * Tags are replaced by a space, or by a line break for "br" elements.
 */
void preprocessHtml(const QString &input);
/** Returns the buffered, preprocessed text. */
QString text() const;
private:
/**
 * Appends @p source to the buffer; "&nbsp;" becomes a space, other
 * terminated entities are copied unchanged.
 */
void replaceEntityAndAppend(const QStringRef &source);
// output of the preprocess*() methods, returned by text()
QString m_buffer;
};
#endif // EXTRACTORPREPROCESSOR_H
<?xml version="1.0" encoding="UTF-8"?>
<extractor>
<filter header="From" match="@brusselsairlines.com"/>
<variable match="Booking reference:\s+([A-Z0-9]{6})" name="bookingRef" value="${1}"/>
<class type="FlightReservation" match="Departure|Return" repeat="true">
<property name="reservationNumber" value="${bookingRef}"/>
<class type="Flight" name="reservationFor">
<class type="Airport" name="departureAirport">
<property name="name" match="([A-Z][\S ]*)\n" value="${1}"/>
</class>
<property name="departureTime" match="([0-9]{2} [A-Za-z]{3} [0-9]{4}),\s*([0-9]{2}:[0-9]{2})" value="${1} ${2}" type="dateTime" format="dd MMM yyyy hh:mm"/>
<class type="Airport" name="arrivalAirport">
<property name="name" match="([A-Z][\S ]*)\n" value="${1}"/>
</class>
<property name="arrivalTime" match="([0-9]{2} [A-Za-z]{3} [0-9]{4}),\s*([0-9]{2}:[0-9]{2})" value="${1} ${2}" type="dateTime" format="dd MMM yyyy hh:mm"/>
<variable name="airlineCode" match="[A-Z0-9]{2}" value="${0}"/>
<property name="flightNumber" match="[0-9]{3,4}" value="${0}"/>
<class type="Airline" name="airline">
<property name="iataCode" value="${airlineCode}"/>
<property name="name" match="([A-Z][A-Za-z0-9 ]*)\n" value="${1}"/>
</class>
</class>
</class>
</extractor>
<RCC>
<qresource prefix="/org.kde.messageviewer/semantic/rules">
<file>amadeus.xml</file>
<file>brusselsairlines.xml</file>
<file>eurowings.xml</file>
<file>fcmtravel.xml</file>
</qresource>
......
......@@ -19,6 +19,7 @@
#include "semanticprocessor.h"
#include "extractorengine.h"
#include "extractorpreprocessor.h"
#include "jsonlddocument.h"
#include "structureddataextractor.h"
#include "semanticmemento.h"
......@@ -73,13 +74,18 @@ MimeTreeParser::MessagePart::Ptr SemanticProcessor::process(MimeTreeParser::Inte
return {};
qCDebug(SEMANTIC_LOG) << "Found unstructured extractor rules for message" << extractors.size();
// TODO preprocessor to remove HTML tags and to extract PDFs
if (!part.content()->contentType()->isPlainText())
// preprocessor to remove HTML tags and to extract PDFs (TODO)
ExtractorPreprocessor preproc;
if (part.content()->contentType()->isPlainText())
preproc.preprocessPlainText(part.content()->decodedText());
else if (part.content()->contentType()->isHTMLText())
preproc.preprocessHtml(part.content()->decodedText());
else
return {};
ExtractorEngine engine;
engine.setExtractor(extractors.at(0));
engine.setText(part.content()->decodedText());
engine.setText(preproc.text());
const auto data = engine.extract();
qCDebug(SEMANTIC_LOG).noquote() << QJsonDocument(data).toJson();
const auto decodedData = JsonLdDocument::fromJson(data);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment