Commit 9092eff8 authored by Volker Krause
Browse files

Fully switch to the new extractor system

This involves a few changes to extractor scripts and test data:
* Extractors that relied on the unintentional iteration over unrecognized
barcodes in PDF documents have been ported to the new explicit API.
* Extractor filters using the old "Barcode" type have been changed to use
other plain text or binary data mimetypes.
* Extractor types or filter types referring to iCal calendar data now
distinguish between the whole calendar and individual events.
* Modified times in test results are updated in cases where we now have
better context times available (mainly top-level pkpass files).
parent 640560cb
Pipeline #56288 passed with stages
in 12 minutes and 26 seconds
......@@ -3,7 +3,7 @@
"@context": "http://schema.org",
"@type": "FlightReservation",
"airplaneSeat": "14E",
"modifiedTime": "2017-12-29T18:46:02",
"modifiedTime": "2017-11-04T07:25:00Z",
"passengerSequenceNumber": "63",
"reservationFor": {
"@type": "Flight",
......
......@@ -3,7 +3,7 @@
"@context": "http://schema.org",
"@type": "FlightReservation",
"airplaneSeat": "19F",
"modifiedTime": "2017-12-29T18:46:02",
"modifiedTime": "2017-10-23T15:55:00+02:00",
"passengerSequenceNumber": "60",
"reservationFor": {
"@type": "Flight",
......
......@@ -3,7 +3,7 @@
"@context": "http://schema.org",
"@type": "FlightReservation",
"airplaneSeat": "17C",
"modifiedTime": "2017-12-29T18:46:02",
"modifiedTime": "2017-06-17T18:40:00+01:00",
"passengerSequenceNumber": "40",
"reservationFor": {
"@type": "Flight",
......
......@@ -3,7 +3,7 @@
"@context": "http://schema.org",
"@type": "FlightReservation",
"airplaneSeat": "1A",
"modifiedTime": "2017-12-29T18:46:02",
"modifiedTime": "2019-05-21T00:20:00+02:00",
"passengerSequenceNumber": "1",
"reservationFor": {
"@type": "Flight",
......
......@@ -3,7 +3,7 @@
"@context": "http://schema.org",
"@type": "FlightReservation",
"airplaneSeat": "5A",
"modifiedTime": "2017-12-29T18:46:02",
"modifiedTime": "2018-03-05T17:05:00+01:00",
"passengerSequenceNumber": "17",
"reservationFor": {
"@type": "Flight",
......
......@@ -8,10 +8,8 @@
#include <kitinerary_version.h>
#include <KItinerary/CalendarHandler>
#include <KItinerary/Extractor>
#include <KItinerary/ExtractorCapabilities>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/ExtractorInput>
#include <KItinerary/ExtractorPostprocessor>
#include <KItinerary/ExtractorRepository>
#include <KItinerary/JsonLdDocument>
......@@ -169,10 +167,10 @@ int main(int argc, char** argv)
if (!parser.value(extOpt).isEmpty()) {
const auto extNames = parser.value(extOpt).split(QLatin1Char(';'),
Qt::SkipEmptyParts);
std::vector<Extractor> exts;
std::vector<const AbstractExtractor*> exts;
exts.reserve(extNames.size());
for (const auto &name : extNames) {
const auto ext = repo.extractor(name);
const auto ext = repo.extractorByName(name);
exts.push_back(ext);
}
engine.setAdditionalExtractors(std::move(exts));
......@@ -184,7 +182,7 @@ int main(int argc, char** argv)
}
if (ExtractorInput::typeFromName(parser.value(formatOpt)) == ExtractorInput::ICal) {
if (parser.value(formatOpt).compare(QLatin1String("ical"), Qt::CaseInsensitive) == 0) {
const auto batches = batchReservations(postproc.result());
KCalendarCore::Calendar::Ptr cal(new KCalendarCore::MemoryCalendar(QTimeZone::systemTimeZone()));
for (const auto &batch : batches) {
......
......@@ -522,12 +522,7 @@ QJsonArray ExtractorEngine::extract()
{
d->m_rootNode.setParent(d->m_contextNode);
d->processNode(d->m_rootNode);
#if 0
return d->m_rootNode.result().jsonLdResult();
#else
d->extractDocument();
return d->m_result;
#endif
}
void ExtractorEnginePrivate::extractRecursive(KMime::Content *content)
......
......@@ -8,7 +8,7 @@
"script": "amadeus.js"
},
{
"mimeType": "text/calendar",
"mimeType": "internal/event",
"filter": [
{ "field": "From", "match": "@amadeus.com", "mimeType": "message/rfc822", "scope": "Ancestors" },
{ "field": "productId", "match": "//AMADEUS//", "mimeType": "text/calendar", "scope": "Parent" }
......
......@@ -4,9 +4,19 @@
SPDX-License-Identifier: LGPL-2.0-or-later
*/
function main(pdf) {
var res = Context.data[0];
var page = pdf.pages[Context.pdfPageNumber];
/**
 * Extractor entry point: produces one result per located barcode node.
 *
 * Queries all descendant "text/plain" nodes of `node` whose content matches
 * "M.*" and, for each one that carries a `location`, runs parsePage() on the
 * PDF page at that index.
 * NOTE(review): `location` is presumably the page index the barcode was found
 * on - confirm against the extractor node API.
 *
 * @param pdf  PDF document object exposing a `pages` array.
 * @param node extractor document node the script was triggered on.
 * @returns array with the parsePage() result for every located barcode.
 */
function main(pdf, node) {
    var res = [];
    var barcodes = node.findChildNodes({ scope: "Descendants", mimeType: "text/plain", match: "M.*" });
    // Declare the loop variable explicitly so it does not leak as an implicit global.
    for (var barcode of barcodes) {
        // Nodes without a location cannot be mapped to a page, skip them.
        if (barcode.location == undefined)
            continue;
        res.push(parsePage(pdf.pages[barcode.location], barcode));
    }
    return res;
}
function parsePage(page, node)
{
var res = node.result[0];
var time = page.text.match(/Departing at\s+(\d{1,2}:\d{2}[AP]M)/);
if (time)
res.reservationFor.departureTime = JsonLd.toDateTime(time[1], "h:mmA", "en")
......
[
{
"mimeType": "text/calendar",
"mimeType": "internal/event",
"filter": [ { "field": "productId", "match": "//DinnerBooking//", "mimeType": "text/calendar", "scope": "Parent" } ],
"script": "dinnerbooking.js",
"function": "parseEvent"
......
......@@ -2,7 +2,7 @@
{
"mimeType": "application/pdf",
"filter": [
{ "type": "Barcode", "match": "docType.:.INTERNATIONAL_ID", "scope": "Descendants" }
{ "match": "docType.:.INTERNATIONAL_ID", "scope": "Descendants", "mimeType": "text/plain" }
],
"script": "koleje-malopolskie.js",
"function": "parsePdf"
......
[
{
"mimeType": "text/calendar",
"mimeType": "internal/event",
"filter": [ { "field": "productId", "match": "NP4GmbH//PCSOffice", "mimeType": "text/calendar", "scope": "Parent" } ],
"script": "np4.js",
"function": "parseEvent"
......
......@@ -9,8 +9,8 @@
"function": "main"
},
{
"mimeType": "text/calendar",
"filter": [ { "field": "uid", "match": "@regiojet.cz", "type": "ICal", "scope": "Current" } ],
"mimeType": "internal/event",
"filter": [ { "field": "uid", "match": "@regiojet.cz", "mimeType": "internal/event", "scope": "Current" } ],
"script": "regiojet.js",
"function": "parseEvent"
}
......
......@@ -2,7 +2,7 @@
"filter": [
{
"match": "^\\d{13}.*\\S{6}(\\.\\.\\S{5}| +)?$",
"type": "Barcode",
"mimeType": "text/plain",
"scope": "Descendants"
}
],
......
......@@ -361,7 +361,7 @@ function parseOuigoConfirmation(html)
return reservations;
}
function parseOuigoTicket(pdf) {
function parseOuigoTicket(pdf, node) {
var text = pdf.pages[0].textInRect(0, 0, 0.5, 1);
var res = JsonLd.newTrainReservation();
......@@ -377,8 +377,13 @@ function parseOuigoTicket(pdf) {
var seat = text.match(/Voiture\s*(\S+)\s*Place\s*(\S+)/);
res.reservedTicket.ticketedSeat.seatSection = seat[1];
res.reservedTicket.ticketedSeat.seatNumber = seat[2];
if (Context.barcode) {
res.reservedTicket.ticketToken = "azteccode:" + Context.barcode;
// Use the first descendant text node that has a location as the ticket token.
var barcodes = node.findChildNodes({ scope: "Descendants", mimeType: "text/plain", match: ".*" });
for (var barcode of barcodes) {
    if (barcode.location != undefined) {
        // Take the content of the node that passed the check, not blindly the
        // first search result, which may be a node without a location.
        res.reservedTicket.ticketToken = "azteccode:" + barcode.content;
        break;
    }
}
return res;
}
......@@ -9,7 +9,7 @@
},
{
"match": "^i0CV",
"type": "Barcode",
"mimeType": "text/plain",
"scope": "Descendants"
}
],
......
......@@ -4,10 +4,17 @@
SPDX-License-Identifier: LGPL-2.0-or-later
*/
function main(input) {
var page = input.pages[Context.pdfPageNumber];
/**
 * Extractor entry point: produces one result per located barcode node.
 *
 * Collects all descendant "text/plain" nodes of `node` and, for each one that
 * carries a `location`, runs parsePage() on the PDF page at that index,
 * passing along the node's content.
 * NOTE(review): `location` is presumably the page index the barcode was found
 * on - confirm against the extractor node API.
 *
 * @param pdf  PDF document object exposing a `pages` array.
 * @param node extractor document node the script was triggered on.
 * @returns array with the parsePage() result for every located barcode.
 */
function main(pdf, node) {
    var barcodes = node.findChildNodes({ mimeType: "text/plain", match: ".*", scope: "Descendants" });
    var results = [];
    // Declare the loop variable explicitly so it does not leak as an implicit global.
    for (const barcode of barcodes) {
        if (barcode.location != undefined)
            results.push(parsePage(pdf.pages[barcode.location], barcode.content));
    }
    return results;
}
function parsePage(page, barcode) {
const lines = page.text.split("\n")
const res = JsonLd.newEventReservation()
......@@ -17,7 +24,7 @@ function main(input) {
let startTime = ""
let endTime = ""
res.reservedTicket.ticketToken = "qrCode:" + Context.barcode
res.reservedTicket.ticketToken = "qrCode:" + barcode
const address = JsonLd.newObject("PostalAddress")
address.addressCountry = "DE"
......@@ -56,5 +63,5 @@ function main(input) {
res.reservationFor.startDate = JsonLd.toDateTime(date + " " + startTime, "dd MMM yyyy h:mm ap", "en")
res.reservationFor.endDate = JsonLd.toDateTime(date + " " + endTime, "dd MMM yyyy h:mm ap", "en")
return [res]
return res;
}
......@@ -2,7 +2,7 @@
"filter": [
{
"match": "^\\d{13}.{37}[A-Z]{8}VIA",
"type": "Barcode",
"mimeType": "text/plain",
"scope": "Descendants"
}
],
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment