Commit 8e85a316 authored by Volker Krause's avatar Volker Krause
Browse files

Use KContact's new address format metadata for postal code parsing

This replaces our previous overly simplistic and Europe-centric approach
with something that knows country-specific positioning and postal code
formats.

Longer term this should ideally grow beyond just splitting out postal codes
and also be able to handle other common address part separation
issues.
parent e626ae11
Pipeline #151003 passed with stages
in 4 minutes and 6 seconds
......@@ -31,6 +31,7 @@ ecm_add_test(htmldocumenttest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPi
ecm_add_test(barcodedecodertest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary Qt${QT_MAJOR_VERSION}::Gui)
ecm_add_test(pkpassextractortest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary KPim::PkPass)
ecm_add_test(extractorutiltest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
ecm_add_test(addressparsertest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary KF5::Contacts)
ecm_add_test(timefindertest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
ecm_add_test(postprocessortest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
ecm_add_test(extractorvalidatortest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
......
/*
SPDX-FileCopyrightText: 2019 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "text/addressparser.cpp"
#include <KItinerary/Place>
#include <QTest>
using namespace KItinerary;
#define s(x) QStringLiteral(x)
/** Data-driven unit test for the postal code splitting done by AddressParser. */
class AddressParserTest : public QObject
{
Q_OBJECT
private Q_SLOTS:
// Test data: "input" is placed in the locality field, "country" in the country field;
// "city" and "postalCode" are the expected locality and postal code after parsing.
void testPostalCodeExtraction_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("city");
QTest::addColumn<QString>("postalCode");
QTest::addColumn<QString>("country");
// rows without a country: the test function configures "GB" as fallback country for these
QTest::newRow("empty") << QString() << QString() << QString() << QString();
QTest::newRow("no code") << s("PARIS") << s("PARIS") << QString() << QString();
// postal code in front of the city name
QTest::newRow("BE-valid") << s("1060 Brussels") << s("Brussels") << s("1060") << s("BE");
// a six digit prefix is expected to be left untouched for BE
QTest::newRow("BE-invalid") << s("171060 Brussels") << s("171060 Brussels") << QString() << s("BE");
QTest::newRow("FR-valid") << s("75012 Paris Some Suffix") << s("Paris Some Suffix") << s("75012") << s("FR");
// postal code following the city name
QTest::newRow("NZ-valid") << s("Palmerston North 4414") << s("Palmerston North") << s("4414") << s("NZ");
QTest::newRow("PT-valid") << s("1000-205 Lisboa") << s("Lisboa") << s("1000-205") << s("PT");
// same input as PT-valid, but expected to stay unsplit when the country is DE
QTest::newRow("PT-wrong-country") << s("1000-205 Lisboa") << s("1000-205 Lisboa") << QString() << s("DE");
// alphanumeric postal codes, in a short and a long variant
QTest::newRow("AR-short") << s("C1420 Buenos Aires") << s("Buenos Aires") << s("C1420") << s("AR");
QTest::newRow("AR-full") << s("C1420ABC Buenos Aires") << s("Buenos Aires") << s("C1420ABC") << s("AR");
}
// Parses each data row and checks the resulting locality and postal code.
void testPostalCodeExtraction()
{
QFETCH(QString, input);
QFETCH(QString, city);
QFETCH(QString, postalCode);
QFETCH(QString, country);
PostalAddress a;
a.setAddressLocality(input);
a.setAddressCountry(country);
AddressParser p;
// only relevant for rows with an empty country
p.setFallbackCountry(s("GB"));
p.parse(a);
const auto out = p.result();
QCOMPARE(out.addressLocality(), city);
QCOMPARE(out.postalCode(), postalCode);
// parsing an already parsed address again must not change it any further
p.parse(out);
const auto out2 = p.result();
QCOMPARE(out, out2);
}
};
QTEST_GUILESS_MAIN(AddressParserTest)
#include "addressparsertest.moc"
......@@ -61,36 +61,6 @@ private Q_SLOTS:
const auto out2 = ExtractorUtil::extractTerminals(out);
QCOMPARE(out, out2);
}
// Test data for ExtractorUtil::extractPostalCode(): "input" is placed in the locality field,
// "city" and "postalCode" are the expected locality and postal code of the result.
void testPostalCodeExtraction_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QString>("city");
QTest::addColumn<QString>("postalCode");
QTest::newRow("empty") << QString() << QString() << QString();
QTest::newRow("no code") << s("PARIS") << s("PARIS") << QString();
// numeric prefixes of varying length, all expected to be split off as postal code
QTest::newRow("prefix 1") << s("1060 Brussels") << s("Brussels") << s("1060");
QTest::newRow("prefix 2") << s("171060 Brussels") << s("Brussels") << s("171060");
QTest::newRow("prefix 3") << s("75012 Paris Some Suffix") << s("Paris Some Suffix") << s("75012");
}
// Runs ExtractorUtil::extractPostalCode() on each data row and checks locality and postal code.
void testPostalCodeExtraction()
{
QFETCH(QString, input);
QFETCH(QString, city);
QFETCH(QString, postalCode);
PostalAddress a;
a.setAddressLocality(input);
const auto out = ExtractorUtil::extractPostalCode(a);
QCOMPARE(out.addressLocality(), city);
QCOMPARE(out.postalCode(), postalCode);
// applying the extraction to its own output must not change it any further
const auto out2 = ExtractorUtil::extractPostalCode(out);
QCOMPARE(out, out2);
}
};
QTEST_GUILESS_MAIN(ExtractorUtilTest)
......
......@@ -92,6 +92,7 @@ target_sources(KPimItinerary PRIVATE
scripts/extractors.qrc
text/addressparser.cpp
text/timefinder.cpp
tlv/berelement.cpp
......
......@@ -10,12 +10,12 @@
#include "extractorvalidator.h"
#include "flightpostprocessor_p.h"
#include "extractorutil.h"
#include "iata/iatabcbpparser.h"
#include "jsonlddocument.h"
#include "logging.h"
#include "mergeutil.h"
#include "sortutil.h"
#include "text/addressparser_p.h"
#include "knowledgedb/airportdb.h"
#include "knowledgedb/timezonedb.h"
......@@ -504,7 +504,10 @@ PostalAddress ExtractorPostprocessorPrivate::processAddress(PostalAddress addr,
addr.setAddressCountry(KCountry::fromLocation(geo.latitude(), geo.longitude()).alpha2());
}
addr = ExtractorUtil::extractPostalCode(addr);
AddressParser addrParser;
addrParser.setFallbackCountry(KCountry::fromQLocale(QLocale().country()).alpha2());
addrParser.parse(addr);
addr = addrParser.result();
return addr;
}
......
......@@ -82,28 +82,3 @@ Flight ExtractorUtil::extractTerminals(Flight flight)
return flight;
}
/** Split a leading numeric postal code off the locality field.
 *  @param addr The address to process.
 *  @return @p addr unchanged if it already has a postal code, has no locality,
 *  or no pattern matches; otherwise a copy with postal code and locality separated.
 */
PostalAddress ExtractorUtil::extractPostalCode(PostalAddress addr)
{
    // nothing to do if the postal code is already known or there is no text to search in
    const bool alreadySplit = !addr.postalCode().isEmpty();
    if (alreadySplit || addr.addressLocality().isEmpty()) {
        return addr;
    }

    // ### so far only the typical European numerical prefix case is covered; alphanumeric
    // and suffix cases would need additional patterns, which could also be made
    // conditional on addr.addressCountry() if necessary
    static QRegularExpression patterns[] = {
        QRegularExpression(QStringLiteral("^(\\d{4,8}) (.*)$"), QRegularExpression::CaseInsensitiveOption),
    };

    // the first matching pattern wins
    for (const auto &pattern : patterns) {
        const auto result = pattern.match(addr.addressLocality());
        if (!result.hasMatch()) {
            continue;
        }
        addr.setAddressLocality(result.captured(2));
        addr.setPostalCode(result.captured(1));
        return addr;
    }

    return addr;
}
......@@ -20,11 +20,6 @@ namespace ExtractorUtil
* @internal Only exported for unit tests.
*/
KITINERARY_EXPORT Flight extractTerminals(Flight flight);
/** Try to extract postal codes included in the city name field.
 * @param addr The address whose locality field is searched for a postal code.
 * @return The address with postal code and locality separated if a code was found,
 * @p addr unchanged otherwise.
 * @internal Only exported for unit tests.
 */
KITINERARY_EXPORT PostalAddress extractPostalCode(PostalAddress addr);
}
}
......
......@@ -13,6 +13,7 @@ function parseReservation(html) {
var addr = h4s[2].nextSibling.content.match(/(.*)\n+(.*)/);
res.reservationFor.location.address.streetAddress = addr[1];
res.reservationFor.location.address.addressLocality = addr[2];
res.reservationFor.location.address.addressCountry = 'DE';
var links = html.eval('//a');
res.modifyReservationUrl = links[links.length - 1].attribute('href');
......
/*
SPDX-FileCopyrightText: 2022 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "addressparser_p.h"
#include <KContacts/AddressFormat>
#include <QDebug>
#include <QRegularExpression>
using namespace KItinerary;
// Constructor and destructor are defaulted here, out of line, rather than in the header.
AddressParser::AddressParser() = default;
AddressParser::~AddressParser() = default;
/** Set the country code used to select the address format when the parsed address has no country set. */
void AddressParser::setFallbackCountry(const QString &countryCode)
{
m_fallbackCountry = countryCode;
}
/** Take over @p addr and try to separate its parts further.
 *  The outcome is retrieved with result().
 */
void AddressParser::parse(PostalAddress addr)
{
    m_address = addr;

    // splitting only makes sense if the postal code is still missing
    // and there is locality text it could be hidden in
    if (!m_address.postalCode().isEmpty() || m_address.addressLocality().isEmpty()) {
        return;
    }
    splitPostalCode();
}
/** Returns the address stored by the last parse() call, including any changes parsing made to it. */
PostalAddress AddressParser::result() const
{
return m_address;
}
/** Look up the address format to parse with.
 *  The country of the address is used if set, the fallback country otherwise.
 */
KContacts::AddressFormat AddressParser::addressFormat() const
{
    const QString addressCountry = m_address.addressCountry();
    const QString country = addressCountry.isEmpty() ? m_fallbackCountry : addressCountry;

    // TODO detect script
    return KContacts::AddressFormatRepository::formatForCountry(country, KContacts::AddressFormatScriptPreference::Local);
}
/** Name of the regular expression capture group used for @p field.
 *  Only postal code and locality have a name, an empty string is returned for anything else.
 */
static QString captureName(KContacts::AddressFormatField field)
{
    if (field == KContacts::AddressFormatField::PostalCode) {
        return QStringLiteral("postalCode");
    }
    if (field == KContacts::AddressFormatField::Locality) {
        return QStringLiteral("locality");
    }
    return {};
}
/** The "?<name>" prefix that turns a regular expression group into a named capture for @p field. */
static QString captureExpression(KContacts::AddressFormatField field)
{
    QString expression(QLatin1String("?<"));
    expression += captureName(field);
    expression += QLatin1Char('>');
    return expression;
}
void AddressParser::splitPostalCode()
{
const auto format = addressFormat();
if (format.elements().empty() || format.postalCodeRegularExpression().isEmpty()) {
return;
}
// find the format line containing the postal code and locality
using namespace KContacts;
auto startIt = format.elements().begin();
auto endIt = startIt;
enum {
None = 0,
HasLocality = 1,
HasPostalCode = 2,
HasBoth = 3,
};
int inRelevantLine = None;
for (auto it = format.elements().begin(); it != format.elements().end(); ++it) {
if ((*it).isSeparator() && inRelevantLine != HasBoth) {
startIt = endIt = it;
inRelevantLine = None;
}
if ((*it).isSeparator() && inRelevantLine == HasBoth) {
endIt = it;
inRelevantLine = None;
break;
}
if ((*it).isField() && (*it).field() == AddressFormatField::Locality) {
inRelevantLine |= HasLocality;
}
if ((*it).isField() && (*it).field() == AddressFormatField::PostalCode) {
inRelevantLine |= HasPostalCode;
}
}
if (inRelevantLine == HasBoth) {
endIt = format.elements().end();
}
std::vector<AddressFormatElement> line(startIt, endIt);
// TODO also handle the case the region is part of the same line
if (line.empty() || std::count_if(line.begin(), line.end(), std::mem_fn(&AddressFormatElement::isField)) > 2) {
return;
}
// build regex for that format line
QString regex;
regex.push_back(QLatin1Char('^'));
for (auto it = line.begin(); it != line.end(); ++it) {
if ((*it).isField()) {
regex += QLatin1Char('(') + captureExpression((*it).field())
+ ((*it).field() == AddressFormatField::PostalCode ? format.postalCodeRegularExpression() : QLatin1String("\\S.*"))
+ QLatin1Char(')');
}
if ((*it).isLiteral()) {
regex += (*it).literal();
}
}
QRegularExpression re(regex);
if (!re.isValid()) {
qWarning() << "generated invalid address parsing pattern:" << regex;
return;
}
// match against the input
const auto match = re.match(m_address.addressLocality());
if (!match.hasMatch()) {
return;
}
const auto postalCode = match.captured(captureName(AddressFormatField::PostalCode));
const auto locality = match.captured(captureName(AddressFormatField::Locality));
if (!locality.isEmpty() && !postalCode.isEmpty()) {
m_address.setPostalCode(postalCode);
m_address.setAddressLocality(locality);
}
}
/*
SPDX-FileCopyrightText: 2022 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_ADDRESSPARSER_H
#define KITINERARY_ADDRESSPARSER_H
#include <KItinerary/Place>
namespace KContacts {
class AddressFormat;
}
namespace KItinerary {
/** Country-specific address parsing utilities. */
/** Country-specific address parsing utilities.
 *
 *  Usage: optionally set a fallback country, pass an address to parse()
 *  and retrieve the outcome with result().
 */
class AddressParser
{
public:
explicit AddressParser();
~AddressParser();
/** The assumed country when no other country information is known.
 *  @param countryCode Country code used when the parsed address has no country set.
 */
void setFallbackCountry(const QString &countryCode);
/** Parse an already partially split address further.
 *  Currently this attempts to split the postal code out of the locality field,
 *  for addresses with an empty postal code and a non-empty locality.
 */
void parse(PostalAddress addr);
/** The address passed to the last parse() call, with any changes parsing made to it. */
PostalAddress result() const;
private:
// Separates postal code and locality in m_address based on the country's address format.
void splitPostalCode();
// Address format for the country of m_address, or for the fallback country if that is empty.
KContacts::AddressFormat addressFormat() const;
// The address being parsed, modified in place.
PostalAddress m_address;
// Country code to use when m_address has no country.
QString m_fallbackCountry;
};
}
#endif // KITINERARY_ADDRESSPARSER_H
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment