Members of the KDE Community are encouraged to subscribe to the kde-community mailing list at https://mail.kde.org/mailman/listinfo/kde-community so that they can take part in important discussions and receive important announcements.

Commit 90ac625b authored by Volker Krause

Add booking data fallback extractor

If we don't find structured JSON-LD data, we now try to apply sender-
specific rule sets to extract the relevant information. The rules are
essentially recursively applied regular expressions that emit the
corresponding JSON-LD properties.

So far this only consumes plain text parts, but after suitable pre-
processing it should work on HTML and PDF content in the same way.
parent f0f6081f
......@@ -43,12 +43,14 @@ endmacro ()
add_diff_bodyformatter_class_unittest(diffhighlightertest.cpp "../highlighter/highlighter.cpp")
add_definitions(-DSOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}" )
include_directories(${CMAKE_CURRENT_BINARY_DIR}/../semantic/)
add_definitions(-DSOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
ecm_add_test(
structureddataextractortest.cpp
${CMAKE_CURRENT_BINARY_DIR}/../semantic/semantic_debug.cpp
TEST_NAME structureddataextractortest
NAME_PREFIX "messageviewerplugins"
LINK_LIBRARIES Qt5::Test
NAME_PREFIX "messageviewerplugins-"
LINK_LIBRARIES Qt5::Test semantic_extractor
)
ecm_add_test(
unstructureddataextractortest.cpp
NAME_PREFIX "messageviewerplugins-"
LINK_LIBRARIES Qt5::Test semantic_extractor
)
......@@ -17,8 +17,9 @@
02110-1301, USA.
*/
#include "../semantic/structureddataextractor.cpp"
#include "structureddataextractor.h"
#include <QDebug>
#include <QDir>
#include <QFile>
#include <QJsonDocument>
......
[
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "DL",
"name": "DELTA AIR LINES"
},
"arrivalAirport": {
"@type": "Airport",
"name": "AMSTERDAM, NL (SCHIPHOL AIRPORT)"
},
"arrivalTime": "2016-06-07T13:30:00",
"departureAirport": {
"@type": "Airport",
"name": "BERLIN, DE (TEGEL)"
},
"departureTime": "2016-06-07T12:10:00",
"flightNumber": "9520"
},
"reservationNumber": "123456"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "DL",
"name": "DELTA AIR LINES"
},
"arrivalAirport": {
"@type": "Airport",
"name": "DETROIT, MI (METROPOLITAN WAYNE CO), TERMINAL EM"
},
"arrivalTime": "2016-06-07T17:40:00",
"departureAirport": {
"@type": "Airport",
"name": "AMSTERDAM, NL (SCHIPHOL AIRPORT)"
},
"departureTime": "2016-06-07T15:00:00",
"flightNumber": "139"
},
"reservationNumber": "123456"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "DL",
"name": "DELTA AIR LINES"
},
"arrivalAirport": {
"@type": "Airport",
"name": "PARIS, FR (CHARLES DE GAULLE), TERMINAL 2E"
},
"arrivalTime": "2016-06-10T11:30:00",
"departureAirport": {
"@type": "Airport",
"name": "DETROIT, MI (METROPOLITAN WAYNE CO), TERMINAL EM"
},
"departureTime": "2016-06-09T21:40:00",
"flightNumber": "8573"
},
"reservationNumber": "123456"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "DL",
"name": "DELTA AIR LINES"
},
"arrivalAirport": {
"@type": "Airport",
"name": "BERLIN, DE (TEGEL)"
},
"arrivalTime": "2016-06-10T14:40:00",
"departureAirport": {
"@type": "Airport",
"name": "PARIS, FR (CHARLES DE GAULLE), TERMINAL 2F"
},
"departureTime": "2016-06-10T13:00:00",
"flightNumber": "8680"
},
"reservationNumber": "123456"
}
]
/TIT IMPORTANT MESSAGE PLEASE READ INFO AT THE BOTTOM
TRAVEL AGENCY BOOKING REF: 123456
SOME PLACE, SOMEWHERE. 9 DATE: 27 MAY 2016
S-171 54 SOME CITY
COUNTRY DOE/JOHN MR
FLIGHT DL 9520 - DELTA AIR LINES TUE 07 JUNE 2016
-----------------------------------------------------------------------------
DEPARTURE: BERLIN, DE (TEGEL) 07 JUN 12:10
ARRIVAL: AMSTERDAM, NL (SCHIPHOL AIRPORT) 07 JUN 13:30
FLIGHT BOOKING REF: DL/123456
RESERVATION CONFIRMED, ECONOMY (M) DURATION: 01:20
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BAGGAGE ALLOWANCE: 1PC
MEAL: SNACK
NON STOP BERLIN TO AMSTERDAM
OPERATED BY: KLM ROYAL DUTCH AIRLINES, KL 1824
EQUIPMENT: BOEING 737-900
FLIGHT DL 139 - DELTA AIR LINES TUE 07 JUNE 2016
-----------------------------------------------------------------------------
DEPARTURE: AMSTERDAM, NL (SCHIPHOL AIRPORT) 07 JUN 15:00
ARRIVAL: DETROIT, MI (METROPOLITAN WAYNE CO), TERMINAL EM - 07 JUN 17:40
E.H. MCNAMARA TERMINAL
FLIGHT BOOKING REF: DL/123456
RESERVATION CONFIRMED, ECONOMY (M) DURATION: 08:40
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BAGGAGE ALLOWANCE: 1PC
MEAL: DINNER
NON STOP AMSTERDAM TO DETROIT, MI
OPERATED BY: DELTA AIR LINES, DL
EQUIPMENT: AIRBUS INDUSTRIE A330-200
FLIGHT DL 8573 - DELTA AIR LINES THU 09 JUNE 2016
-----------------------------------------------------------------------------
DEPARTURE: DETROIT, MI (METROPOLITAN WAYNE CO), TERMINAL EM - 09 JUN 21:40
E.H. MCNAMARA TERMINAL
ARRIVAL: PARIS, FR (CHARLES DE GAULLE), TERMINAL 2E - 10 JUN 11:30
AEROGARE 2 TERMINAL E
FLIGHT BOOKING REF: DL/123456
RESERVATION CONFIRMED, ECONOMY (M) DURATION: 07:50
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BAGGAGE ALLOWANCE: 1PC
MEAL: BREAKFAST
NON STOP DETROIT, MI TO PARIS
OPERATED BY: AIR FRANCE, AF 377
EQUIPMENT: AIRBUS INDUSTRIE A340-300
FLIGHT DL 8680 - DELTA AIR LINES FRI 10 JUNE 2016
-----------------------------------------------------------------------------
DEPARTURE: PARIS, FR (CHARLES DE GAULLE), TERMINAL 2F - 10 JUN 13:00
AEROGARE 2 TERMINAL F
ARRIVAL: BERLIN, DE (TEGEL) 10 JUN 14:40
FLIGHT BOOKING REF: DL/123456
RESERVATION CONFIRMED, ECONOMY (M) DURATION: 01:40
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BAGGAGE ALLOWANCE: 1PC
NON STOP PARIS TO BERLIN
OPERATED BY: AIR FRANCE, AF 1534
EQUIPMENT: AIRBUS INDUSTRIE A321
FLIGHT(S) CALCULATED AVERAGE CO2 EMISSIONS IS 978.44 KG/PERSON
SOURCE: ICAO CARBON EMISSIONS CALCULATOR
http://www.icao.int/environmental-protection/CarbonOffset/Pages/default.aspx
[
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "4U",
"name": "Germanwings"
},
"arrivalAirport": {
"@type": "Airport",
"name": "London Heathrow"
},
"arrivalTime": "2017-06-15T13:50:00",
"departureAirport": {
"@type": "Airport",
"name": "Berlin-Tegel"
},
"departureTime": "2017-06-15T12:55:00",
"flightNumber": "8462"
},
"reservationNumber": "ABC123"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "4U",
"name": "Germanwings"
},
"arrivalAirport": {
"@type": "Airport",
"name": "Berlin-Tegel"
},
"arrivalTime": "2017-06-18T22:00:00",
"departureAirport": {
"@type": "Airport",
"name": "London Heathrow"
},
"departureTime": "2017-06-18T19:10:00",
"flightNumber": "8465"
},
"reservationNumber": "ABC123"
}
]
Eurowings
PASSENGER RECEIPT
Confirmation of Booking
MR John Doe 23.04.2017 | 11:10:45 Uhr
Street 5
12345 City
Country
Dear MR John Doe,
many thanks for booking your flight with us.
We wish you a pleasant flight.
======================================================================
YOUR BOOKING
======================================================================
Individual reservation code: ** ABC123 **
(please state at check-in)
Date of booking: 23.04.2017 11:09
Date of change: 23.04.2017 11:09
----------------------------------------------------------------------
** FLIGHT DATA (TIMES ARE LOCAL TIMES) **
----------------------------------------------------------------------
Flight: 15.06.2017 | 4U 8462 (X)
* operated by Germanwings
Departure: 12:55 Berlin-Tegel
Arrival: 13:50 London Heathrow
Flight: 18.06.2017 | 4U 8465 (K)
* operated by Germanwings
Departure: 19:10 London Heathrow
Arrival: 22:00 Berlin-Tegel
----------------------------------------------------------------------
** PASSENGER **
----------------------------------------------------------------------
1. Passenger: MR John Doe
Frequent Flyer Number: 123401234567890
======================================================================
FREE INFORMATION SERVICE
======================================================================
Stay up to date on your flight status. Sign up with your mobile[...]
[
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "BT",
"name": "Air Baltic"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "RIX",
"name": "Riga"
},
"arrivalTime": "2017-11-05T11:35:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "TXL",
"name": "Tegel"
},
"departureTime": "2017-11-05T08:55:00",
"flightNumber": "212"
},
"reservationNumber": "ABCDEF"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "BT",
"name": "Air Baltic"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "TXL",
"name": "Tegel"
},
"arrivalTime": "2017-11-10T08:20:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "RIX",
"name": "Riga"
},
"departureTime": "2017-11-10T07:35:00",
"flightNumber": "211"
},
"reservationNumber": "ABCDEF"
}
]
=====================================================================================
Thank you for booking with Travellink Corporate
=====================================================================================
Booking reference: ABCDEF
Reserved by: Travel Agent
Company: My Company
Address: Street 30
12345 City
Traveller: John Doe
=====================================================================================
FLIGHT Airline booking reference: ABCDEF - BT
=====================================================================================
Status: Confirmed
OUTBOUND Berlin - Riga
Flight: BT212, Air Baltic
Operated by: Air Baltic
Date: Sun 5 Nov 2017
Departure: 08:55 Tegel(TXL), Berlin, Germany
Date: Sun 5 Nov 2017
Arrival: 11:35 Riga(RIX), Riga, Latvia
Cabin class: Economy
Duration:
Status: Confirmed
HOMEBOUND Riga - Berlin
Flight: BT211, Air Baltic
Operated by: Air Baltic
Date: Fri 10 Nov 2017
Departure: 07:35 Riga(RIX), Riga, Latvia
Date: Fri 10 Nov 2017
Arrival: 08:20 Tegel(TXL), Berlin, Germany
Cabin class: Economy
Duration:
Departure and arrival times displayed are always in local time.
Please note! Verify your flight departure terminal before you leave for the airport.
-------------------------------------------------------------------------------------
Booking information
-------------------------------------------------------------------------------------
Check-in: Use Booking reference and your surname (Passport/National ID) for check-in at the airport counter or at the airlines website.
[
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "AB",
"name": "Air Berlin"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "DUS",
"name": "International Airport"
},
"arrivalTime": "2016-10-17T12:00:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "TXL",
"name": "Tegel"
},
"departureTime": "2016-10-17T10:50:00",
"flightNumber": "6439"
},
"reservationNumber": "XXX007"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "AB",
"name": "Air Berlin"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "SFO",
"name": "San Francisco International"
},
"arrivalTime": "2016-10-17T15:55:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "DUS",
"name": "International Airport"
},
"departureTime": "2016-10-17T13:20:00",
"flightNumber": "7392"
},
"reservationNumber": "XXX007"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "AA",
"name": "American Airlines"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "ORD",
"name": "O Hare International"
},
"arrivalTime": "2016-10-22T14:00:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "PDX",
"name": "Portland International"
},
"departureTime": "2016-10-22T08:00:00",
"flightNumber": "086"
},
"reservationNumber": "XXX007"
},
{
"@type": "FlightReservation",
"reservationFor": {
"@type": "Flight",
"airline": {
"@type": "Airline",
"iataCode": "AB",
"name": "Air Berlin"
},
"arrivalAirport": {
"@type": "Airport",
"iataCode": "TXL",
"name": "Tegel"
},
"arrivalTime": "2016-10-23T07:00:00",
"departureAirport": {
"@type": "Airport",
"iataCode": "ORD",
"name": "O Hare International"
},
"departureTime": "2016-10-22T15:25:00",
"flightNumber": "7421"
},
"reservationNumber": "XXX007"
}
]
=====================================================================================
Thank you for booking with Travellink Corporate
=====================================================================================
Travellink booking reference: XXX007
Reserved by: Booking Agent
Company: My Company
Address: Street 30
12345 City
Traveller: John Doe
Traveller: Jane Doe
=====================================================================================
=====================================================================================
Status: Confirmed
OUTBOUND Berlin - San Francisco
Flight: AB6439, Air Berlin
Date: Mon 17 Oct 2016
Departure: 10:50 Tegel(TXL), Berlin, Germany
Date: Mon 17 Oct 2016
Arrival: 12:00 International Airport(DUS), Duesseldorf, Germany
Cabin class: Economy
Status: Confirmed
Flight: AB7392, Air Berlin
Date: Mon 17 Oct 2016
Departure: 13:20 International Airport(DUS), Duesseldorf, Germany
Date: Mon 17 Oct 2016
Arrival: 15:55 San Francisco International(SFO), San Francisco, United States Of America
Terminal I
Cabin class: Economy
Duration: 14h 05min
Status: Confirmed
HOMEBOUND Portland - Berlin
Flight: AA086, American Airlines
Date: Sat 22 Oct 2016
Departure: 08:00 Portland International(PDX), Portland, United States Of America
Date: Sat 22 Oct 2016
Arrival: 14:00 O Hare International(ORD), Chicago, United States Of America
Terminal 3
Cabin class: Economy
Status: Confirmed
Flight: AB7421, Air Berlin
Date: Sat 22 Oct 2016
Departure: 15:25 O Hare International(ORD), Chicago, United States Of America
Terminal 3
Date: Sun 23 Oct 2016
Arrival: 07:00 Tegel(TXL), Berlin, Germany
Cabin class: Economy
Duration: 14h 00min
Departure and arrival times displayed are always in local time.
Please note! Verify your flight departure terminal before you leave for the airport.
-------------------------------------------------------------------------------------
Booking information
-------------------------------------------------------------------------------------
Check-in: Use Booking reference and your surname (Passport/National ID) for check-in at the airport counter or at the airlines website.
/*
Copyright (c) 2017 Volker Krause <vkrause@kde.org>
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Library General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at your
option) any later version.
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
*/
#include "extractor.h"
#include "extractorengine.h"
#include <QDebug>
#include <QDir>
#include <QFile>
#include <QJsonDocument>
#include <QObject>
#include <QTest>
class UnstructuredDataExtractorTest : public QObject
{
Q_OBJECT
private Q_SLOTS:
// Per-test setup hook: Qt Test calls a private slot named init() before
// each test function of the class, so this runs ahead of every test.
void init()
{
// Register the resources built from the "rules" resource collection with the
// Qt resource system so that ":/..." paths into it can be resolved.
// NOTE(review): presumably these are the sender-specific extractor rule sets
// shipped with the semantic_extractor library — confirm against its .qrc file.
Q_INIT_RESOURCE(rules);
}
void testExtract_data()
{
QTest::addColumn<QString>("inputFile");
QTest::addColumn<QString>("extractorName");
QTest::addColumn<QString>("jsonFile");
QDir dir(QStringLiteral(SOURCE_DIR "/unstructureddata"));
const auto lst = dir.entryList(QStringList(QStringLiteral("*.txt")), QDir::Files | QDir::Readable | QDir::NoSymLinks);
for (const auto &file : lst) {
const auto refFile = dir.path() + QLatin1Char('/') + file.left(file.size() - 4) + QStringLite