Commit 0c4cc76b authored by Volker Krause's avatar Volker Krause
Browse files

Add generic PDF boarding pass extractor

Given an IATA BCBP barcode, this scans the text content for full airport
names by looking up all plausible text tokens in our airport name
fragment database. This works for many airlines, and allows replacing
manual and often less reliable per-airline extractor script code.
parent c19c1052
......@@ -33,7 +33,8 @@
"latitude": 45.630001068115234,
"longitude": 8.72305965423584
},
"iataCode": "MXP"
"iataCode": "MXP",
"name": "Milano Malpensa"
},
"departureAirport": {
"@type": "Airport",
......@@ -46,7 +47,8 @@
"latitude": 48.120201110839844,
"longitude": 16.564300537109375
},
"iataCode": "VIE"
"iataCode": "VIE",
"name": "Vienna"
},
"departureDay": "2019-09-06",
"flightNumber": "1996"
......
......@@ -43,6 +43,8 @@ target_sources(KPimItinerary PRIVATE
era/ssbv2ticket.cpp
era/ssbv3ticket.cpp
extractors/genericboardingpassextractor.cpp
iata/iatabcbp.cpp
iata/iatabcbpparser.cpp
iata/iatabcbpsections.cpp
......
......@@ -8,6 +8,7 @@
#include "extractorrepository.h"
#include "logging.h"
#include "extractors/genericboardingpassextractor.h"
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorDocumentProcessor>
......@@ -57,6 +58,7 @@ void ExtractorRepositoryPrivate::loadAll()
/** Registers the extractors that are compiled into the library.
 *  Currently this adds a single GenericBoardingPassExtractor instance via addExtractor().
 */
void ExtractorRepositoryPrivate::initBuiltInExtractors()
{
addExtractor(std::make_unique<GenericBoardingPassExtractor>());
}
ExtractorRepository::ExtractorRepository()
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "genericboardingpassextractor.h"
#include "logging.h"
#include "stringutil.h"
#include <knowledgedb/airportdb.h>
#include <knowledgedb/airportnametokenizer_p.h>
#include <pdf/pdfdocument.h>
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorResult>
#include <KItinerary/Flight>
#include <KItinerary/Reservation>
#include <QDebug>
#include <unordered_map>
using namespace KItinerary;
/** Sets up the node filter used by canHandle() and extract():
 *  it matches nodes of MIME type "internal/iata-bcbp" with the
 *  ExtractorFilter::Descendants scope.
 */
GenericBoardingPassExtractor::GenericBoardingPassExtractor()
{
m_filter.setMimeType(QStringLiteral("internal/iata-bcbp"));
// scope is Descendants; presumably this makes the filter look below the node it is applied to — confirm against ExtractorFilter
m_filter.setScope(ExtractorFilter::Descendants);
}
/** Defaulted destructor, defined out-of-line in this translation unit. */
GenericBoardingPassExtractor::~GenericBoardingPassExtractor() = default;
/** Returns the fixed identifier string of this built-in extractor. */
QString GenericBoardingPassExtractor::name() const
{
return QStringLiteral("<Generic PDF Boarding Pass>");
}
/** Checks whether this extractor applies to @p node.
 *  @param node The document node to test.
 *  @return @c true if @p node holds a PdfDocument and the BCBP filter matches it.
 */
bool GenericBoardingPassExtractor::canHandle(const ExtractorDocumentNode &node) const
{
    // without PDF content there is no page text to scan, so skip the filter entirely
    if (!node.content<PdfDocument*>()) {
        return false;
    }
    return m_filter.matches(node);
}
/** Adds @p s to @p l unless an entry equal to it (ignoring case) already exists.
 *  In the latter case the first such entry is replaced by whatever
 *  StringUtil::betterString() picks between the existing entry and @p s.
 *  @param l List that is modified in place.
 *  @param s Candidate string to merge or append.
 */
static void mergeOrAppend(QStringList &l, QStringView s)
{
    for (auto entry = l.begin(); entry != l.end(); ++entry) {
        if ((*entry).compare(s, Qt::CaseInsensitive) != 0) {
            continue;
        }
        // case-insensitive duplicate: keep only the preferred variant
        *entry = StringUtil::betterString(*entry, s).toString();
        return;
    }

    // nothing comparable found yet
    l.push_back(s.toString());
}
/** Augments BCBP-derived flight reservations with airport names found in the PDF text.
 *
 *  All nodes matched by the BCBP filter are collected, reduced to those that carry
 *  an integer location (used as the PDF page number) and a non-empty result, and
 *  processed in page order. For each such node the text of its page is tokenized
 *  and every token is looked up in the airport name database; tokens resolving to
 *  one of the airports referenced by the barcodes on that page are collected and
 *  set as the name of the corresponding departure/arrival airport.
 *
 *  @param node The PDF document node to process.
 *  @param engine Unused.
 *  @return The flight reservations of all considered BCBP nodes, with the airport
 *  names set to the space-joined matching tokens (empty if none were found).
 */
ExtractorResult GenericBoardingPassExtractor::extract(const ExtractorDocumentNode &node, [[maybe_unused]] const ExtractorEngine *engine) const
{
    QVector<QVariant> result;

    const auto pdf = node.content<PdfDocument*>();
    if (!pdf) {
        // canHandle() already checks this, but do not dereference a null document if called directly
        return result;
    }

    std::vector<ExtractorDocumentNode> bcbpNodes;
    m_filter.allMatches(node, bcbpNodes);
    // std::remove_if only moves the kept elements to the front, the tail has to be erased explicitly,
    // otherwise rejected (and moved-from) nodes would still be processed below
    bcbpNodes.erase(std::remove_if(bcbpNodes.begin(), bcbpNodes.end(), [](const auto &node) {
        return node.location().type() != QVariant::Int || node.result().isEmpty();
    }), bcbpNodes.end());
    std::sort(bcbpNodes.begin(), bcbpNodes.end(), [](const auto &lhs, const auto &rhs) { return lhs.location().toInt() < rhs.location().toInt(); });

    for (auto it = bcbpNodes.begin(); it != bcbpNodes.end(); ++it) {
        // 1 determine which airports we need to look for on the same page
        const auto pageNum = (*it).location().toInt();
        std::unordered_map<KnowledgeDb::IataCode, QStringList> airportNames;
        for (auto it2 = it; it2 != bcbpNodes.end() && (*it2).location().toInt() == pageNum; ++it2) {
            // look at the node the inner iterator points to, not just the first one of the page
            const auto flightReservations = (*it2).result().result();
            for (const auto &flightRes : flightReservations) {
                const auto flight = flightRes.value<FlightReservation>().reservationFor().value<Flight>();
                if (!flight.departureAirport().iataCode().isEmpty()) {
                    airportNames[KnowledgeDb::IataCode{flight.departureAirport().iataCode()}] = QStringList();
                }
                if (!flight.arrivalAirport().iataCode().isEmpty()) {
                    airportNames[KnowledgeDb::IataCode{flight.arrivalAirport().iataCode()}] = QStringList();
                }
            }
        }

        // 2 tokenize the page and scan for airport names
        const auto page = pdf->page(pageNum);
        qCDebug(Log) << "scanning page" << pageNum << "for airport names";
        const auto pageText = page.text();
        AirportNameTokenizer tokenizer(pageText);
        while (tokenizer.hasNext()) {
            const auto s = tokenizer.next();
            // skip the word "international" and the bare IATA codes of the airports we are looking for,
            // neither should end up as part of the airport name
            if (s.compare(QLatin1String("international"), Qt::CaseInsensitive) == 0 ||
                (s.size() == 3 && airportNames.find(KnowledgeDb::IataCode{s}) != airportNames.end()))
            {
                qCDebug(Log) << "  ignoring" << s;
                continue;
            }

            const auto iataCodes = KnowledgeDb::iataCodesFromName(s);
            for (const auto code : iataCodes) {
                auto it2 = airportNames.find(code);
                if (it2 != airportNames.end()) {
                    qCDebug(Log) << "  found candidate:" << s << iataCodes;
                    mergeOrAppend((*it2).second, s);
                }
            }
        }

        // 3 augment the results with what we found
        const auto flightReservations = (*it).result().result();
        for (const auto &res : flightReservations) {
            auto flightRes = res.value<FlightReservation>();
            auto flight = flightRes.reservationFor().value<Flight>();

            auto airport = flight.departureAirport();
            airport.setName(airportNames[KnowledgeDb::IataCode{airport.iataCode()}].join(QLatin1Char(' ')));
            flight.setDepartureAirport(airport);

            airport = flight.arrivalAirport();
            airport.setName(airportNames[KnowledgeDb::IataCode{airport.iataCode()}].join(QLatin1Char(' ')));
            flight.setArrivalAirport(airport);

            flightRes.setReservationFor(flight);
            result.push_back(std::move(flightRes));
        }
    }

    return result;
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_GENERICBOARDINGPASSEXTRACTOR_H
#define KITINERARY_GENERICBOARDINGPASSEXTRACTOR_H
#include <KItinerary/AbstractExtractor>
#include <KItinerary/ExtractorFilter>
namespace KItinerary {
/** Generic PDF boarding pass extractor.
 *  Built-in AbstractExtractor implementation that uses an ExtractorFilter
 *  to decide which document nodes it applies to.
 */
class GenericBoardingPassExtractor : public AbstractExtractor
{
public:
GenericBoardingPassExtractor();
~GenericBoardingPassExtractor();
/** Identifier of this extractor. */
QString name() const override;
/** Returns whether this extractor can be applied to @p node. */
bool canHandle(const KItinerary::ExtractorDocumentNode & node) const override;
/** Runs the extraction on @p node and returns the resulting elements. */
ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
private:
// filter selecting the document nodes this extractor works on
ExtractorFilter m_filter;
};
}
#endif // KITINERARY_GENERICBOARDINGPASSEXTRACTOR_H
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment