Commit 93638b4f authored by Volker Krause's avatar Volker Krause

Make the airport lookup a bit more clever

The previous method of only looking for unique fragments of the name worked
reasonably well in Europe, but not in the US. In the US there are many
(regional) airports with overlapping name fragments so that even major
international airports could not be looked up properly. We now also
consider non-unique fragments if they result in a unique subset. This
doesn't impact the database size much, but it significantly improves
the detection of US airports.

This also re-generates the database from latest Wikidata data, reducing
the IATA code conflicts from ~250 to 140 due to upstream fixes.
parent 3019b6b6
......@@ -119,6 +119,7 @@ private Q_SLOTS:
void iataLookupTest()
{
// via unique fragment lookup
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("Flughafen Berlin-Tegel")), AirportDb::IataCode{"TXL"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("TEGEL")), AirportDb::IataCode{"TXL"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("Paris Charles de Gaulle")), AirportDb::IataCode{"CDG"});
......@@ -126,13 +127,18 @@ private Q_SLOTS:
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("AMSTERDAM, NL (SCHIPHOL AIRPORT)")), AirportDb::IataCode{"AMS"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("London Heathrow")), AirportDb::IataCode{"LHR"});
// via non-unique fragment lookup
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("John F. Kennedy International Airport")), AirportDb::IataCode{"JFK"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("San Francisco International")), AirportDb::IataCode{"SFO"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("Düsseldorf International")), AirportDb::IataCode{"DUS"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("London City")), AirportDb::IataCode{"LCY"});
QCOMPARE(AirportDb::iataCodeFromName(QStringLiteral("DETROIT, MI (METROPOLITAN WAYNE CO)")), AirportDb::IataCode{"DTW"});
// not unique
QVERIFY(!AirportDb::iataCodeFromName(QStringLiteral("Flughafen Berlin")).isValid());
QVERIFY(!AirportDb::iataCodeFromName(QStringLiteral("Charles de Gaulle Orly")).isValid());
QVERIFY(!AirportDb::iataCodeFromName(QStringLiteral("Brussels Airport, BE")).isValid());
// would be nice if those would work, but needs a more complex index
QVERIFY(!AirportDb::iataCodeFromName(QStringLiteral("DETROIT, MI (METROPOLITAN WAYNE CO), TERMINAL EM")).isValid());
QVERIFY(!AirportDb::iataCodeFromName(QStringLiteral("Frankfurt")).isValid());
}
};
......
......@@ -134,42 +134,98 @@ QTimeZone timezoneForAirport(IataCode iataCode)
return QTimeZone(timezone_names + timezone_table[iataIdx]);
}
static const auto name_string_index_size = sizeof(name_string_index) / sizeof(NameIndex);
// Number of entries in name1_string_index, the index of name fragments that
// map to exactly one airport.
static const auto name1_string_index_size = sizeof(name1_string_index) / sizeof(Name1Index);
static const NameIndex *nameIndexBegin()
// Returns a pointer to the first entry of the unique name fragment index.
static const Name1Index *name1IndexBegin()
{
return name_string_index;
return name1_string_index;
}
static const NameIndex *nameIndexEnd()
// Returns a pointer one past the last entry of the unique name fragment index,
// i.e. the end of the range searched with std::lower_bound.
static const Name1Index *name1IndexEnd()
{
return name_string_index + name_string_index_size;
return name1_string_index + name1_string_index_size;
}
IataCode iataCodeFromName(const QString &name)
// Number of entries in the generated non-unique name fragment index.
static const auto nameN_string_index_size = sizeof(nameN_string_index) / sizeof(nameN_string_index[0]);

// Returns a pointer to the first entry of the non-unique name fragment index.
static const NameNIndex *nameNIndexBegin()
{
    return &nameN_string_index[0];
}

// Returns a pointer one past the last entry of the non-unique name fragment index.
static const NameNIndex *nameNIndexEnd()
{
    return nameNIndexBegin() + nameN_string_index_size;
}
// Looks up every fragment in the unique name fragment index (name1_string_index)
// and returns the IATA code that all matching fragments agree on.
//
// Fragments without an exact match in the index are ignored. An empty
// (default-constructed) IataCode is returned if no fragment matches at all, or if
// two matching fragments refer to different airports; callers test the result
// with isValid().
static IataCode iataCodeForUniqueFragment(const QStringList &fragments)
{
    int iataIdx = -1; // -1 means "no fragment matched so far"
    for (const auto &s : fragments) {
        // convert once and reuse for both the binary search and the exact-match check
        const QByteArray key = s.toUtf8();
        const auto it = std::lower_bound(name1IndexBegin(), name1IndexEnd(), key, [](const Name1Index &lhs, const QByteArray &rhs) {
            const auto cmp = strncmp(name1_string_table + lhs.offset(), rhs.constData(), std::min<int>(lhs.length, rhs.size()));
            if (cmp == 0) {
                // equal common prefix: the shorter string sorts first
                return lhs.length < rhs.size();
            }
            return cmp < 0;
        });
        // lower_bound only yields an insertion point, so verify it is an exact match
        if (it == name1IndexEnd() || it->length != key.size() || strncmp(name1_string_table + it->offset(), key.constData(), it->length) != 0) {
            continue;
        }
        if (iataIdx >= 0 && iataIdx != it->iataIndex) {
            return {}; // not unique
        }
        iataIdx = it->iataIndex;
    }

    // 0 is a valid table index (the first airport); only -1 signals "no match"
    if (iataIdx >= 0) {
        return iata_table[iataIdx];
    }
    return {};
}
// Determines the IATA code of the airport described by a human-readable name.
//
// The name is case-folded and split into fragments at spaces, digits and
// punctuation. The unique fragment index is consulted first. If that does not
// yield a valid code, the candidate airports of all fragments found in the
// non-unique index are intersected.
//
// Returns the matching code if exactly one candidate remains, and an empty
// (default-constructed) IataCode otherwise.
IataCode iataCodeFromName(const QString &name)
{
    const auto fragments = name.toCaseFolded().split(QRegularExpression(QStringLiteral("[ 0-9/'\"\\(\\)&\\,.–„-]")), QString::SkipEmptyParts);
    const IataCode code = iataCodeForUniqueFragment(fragments);
    if (code.isValid()) {
        return code;
    }

    // if we didn't find a unique name fragment, try the non-unique index
    QSet<uint16_t> iataIdxs;
    for (const auto &s : fragments) {
        // convert once and reuse for both the binary search and the exact-match check
        const QByteArray key = s.toUtf8();
        const auto it = std::lower_bound(nameNIndexBegin(), nameNIndexEnd(), key, [](const NameNIndex &lhs, const QByteArray &rhs) {
            const auto cmp = strncmp(nameN_string_table + lhs.strOffset, rhs.constData(), std::min<int>(lhs.strLength, rhs.size()));
            if (cmp == 0) {
                // equal common prefix: the shorter string sorts first
                return lhs.strLength < rhs.size();
            }
            return cmp < 0;
        });
        // lower_bound only yields an insertion point, so verify it is an exact match
        if (it == nameNIndexEnd() || it->strLength != key.size() || strncmp(nameN_string_table + it->strOffset, key.constData(), it->strLength) != 0) {
            continue;
        }

        // collect all airports this fragment can refer to
        QSet<uint16_t> candidates;
        candidates.reserve(it->iataCount);
        for (auto i = 0; i < it->iataCount; ++i) {
            candidates.insert(nameN_iata_table[it->iataOffset + i]);
        }

        if (iataIdxs.isEmpty()) { // first round
            iataIdxs = candidates;
            continue;
        }

        // narrow down to the airports shared with all previously matched fragments
        iataIdxs &= candidates;
        if (iataIdxs.isEmpty()) {
            break; // fragments contradict each other, no result possible anymore
        }
    }

    if (iataIdxs.size() == 1) {
        return iata_table[*iataIdxs.constBegin()];
    }
    return {};
}
}
......@@ -24,9 +24,9 @@
namespace AirportDb {
// pack 24 bit offset, 8 bit length and 16 bit IATA index into 48bit with 16bit alignment
struct NameIndex
struct Name1Index
{
inline explicit constexpr NameIndex(uint32_t offset, uint8_t len, uint16_t idx)
inline explicit constexpr Name1Index(uint32_t offset, uint8_t len, uint16_t idx)
: offset1((offset & 0x00ffff00) >> 8)
, offset2(offset & 0x000000ff)
, length(len)
......@@ -45,8 +45,23 @@ struct NameIndex
uint16_t iataIndex;
};
static_assert(sizeof(NameIndex) == 6, "NameIndex size changed!");
static_assert(alignof(NameIndex) <= 2, "NameIndex alignment changed!");
// Name1Index entries are emitted into a generated static table; guard the packed
// 6 byte layout. The messages are shown when a check FAILS, so they must describe
// the failure and name the current type.
static_assert(sizeof(Name1Index) == 6, "Name1Index size changed!");
static_assert(sizeof(Name1Index) % alignof(Name1Index) == 0, "Name1Index is not properly aligned!");
// Index structure for non-unique name fragments, packed into 64 bits with 2 byte alignment.
// String length (5 bit) and IATA count (11 bit) would fit into a joint 16 bit field, but we
// first need a working "constexpr assert" to make sure we don't overrun that space...
//
// Each entry describes one name fragment and the list of airports it can refer to.
struct NameNIndex
{
// offset of the fragment's first byte in nameN_string_table
uint16_t strOffset;
// length of the fragment in bytes (the string table has no per-entry terminators)
uint16_t strLength;
// index of the first candidate for this fragment in nameN_iata_table
uint16_t iataOffset;
// number of consecutive nameN_iata_table entries belonging to this fragment
uint16_t iataCount;
};
// Entries are stored in a generated static array, so guard the expected layout.
static_assert(sizeof(NameNIndex) == 8, "NameNIndex size changed!");
static_assert(sizeof(NameNIndex) % alignof(NameNIndex) == 0, "NameNIndex is not properly aligned!");
}
#endif
......@@ -117,6 +117,22 @@ static bool soundsMilitaryish(const QString &s)
;
}
// Removes, in place, every list entry that is exactly one of the generic
// "airport"-like terms below, so that they do not end up as name index keys.
static void stripAirportAllLanguages(QStringList &s)
{
    // only languages used in the English (sic!) wikidata labels and description matter here
    static const char * const genericTerms[] = {
        "aeroport",
        "aeroporto",
        "aeropuerto",
        "air",
        "airfield",
        "airpark",
        "airport",
        "airstrip",
        "flughafen",
        "lufthavn",
        "terminal",
    };

    for (const char *term : genericTerms) {
        s.removeAll(QLatin1String(term));
    }
}
int coordinateConflicts = 0;
static void merge(Airport &lhs, const Airport &rhs)
......@@ -273,7 +289,7 @@ int main(int argc, char **argv)
}
// step 3 index the names for reverse lookup
QMap<QString, QString> labelMap;
QMap<QString, QVector<QString>> labelMap;
for (auto it = airportMap.begin(); it != airportMap.end(); ++it) {
auto l = (it.value().label + QLatin1Char(' ') + it.value().alias)
.split(QRegularExpression(QStringLiteral("[ 0-9/'\"\\(\\)&\\,.–„-]")), QString::SkipEmptyParts);
......@@ -282,20 +298,18 @@ int main(int argc, char **argv)
});
l.removeAll(it.value().iataCode.toCaseFolded());
l.removeAll(it.value().icaoCode.toCaseFolded());
stripAirportAllLanguages(l);
l.removeDuplicates();
for (const auto &s : l) {
if (s.size() <= 2) {
continue;
}
if (!labelMap.contains(s)) {
labelMap.insert(s, it.value().iataCode);
} else {
// if (!labelMap.value(s).isEmpty())
// qDebug() << "clash on" << s;
labelMap[s] = QString();
}
labelMap[s].push_back(it.value().iataCode);
}
}
for (auto it = labelMap.begin(); it != labelMap.end(); ++it) {
std::sort(it.value().begin(), it.value().end());
}
// step 4 generate code
QFile f(parser.value(outputOpt));
......@@ -386,32 +400,32 @@ static const uint16_t timezone_table[] = {
}
f.write(R"(};
// reverse name lookup string table
static const char name_string_table[] =
// reverse name lookup string table for unique strings
static const char name1_string_table[] =
)");
// TODO prefix compression
std::vector<NameIndex> string_offsets;
std::vector<Name1Index> string_offsets;
string_offsets.reserve(labelMap.size());
uint32_t label_offset = 0;
for (auto it = labelMap.begin(); it != labelMap.end(); ++it) {
if (it.value().isEmpty()) {
if (it.value().size() > 1) {
continue;
}
f.write(" \"");
f.write(it.key().toUtf8());
f.write("\" // ");
f.write(it.value().toUtf8());
f.write(it.value().at(0).toUtf8());
f.write("\n");
string_offsets.push_back(NameIndex{label_offset, (uint8_t)it.key().toUtf8().size(), (uint16_t)std::distance(iataMap.begin(), iataMap.find(it.value()))});
string_offsets.push_back(Name1Index{label_offset, (uint8_t)it.key().toUtf8().size(), (uint16_t)std::distance(iataMap.begin(), iataMap.find(it.value().at(0)))});
label_offset += it.key().toUtf8().size();
}
f.write(R"(;
// string table indices into name_string_table
static const NameIndex name_string_index[] = {
static const Name1Index name1_string_index[] = {
)");
for (const auto &offset : string_offsets) {
f.write(" NameIndex{");
f.write(" Name1Index{");
f.write(QByteArray::number(offset.offset()));
f.write(", ");
f.write(QByteArray::number(offset.length));
......@@ -421,10 +435,68 @@ static const NameIndex name_string_index[] = {
}
f.write(R"(};
// reverse name lookup string table for non-unique strings
static const char nameN_string_table[] =
)");
// TODO prefix compression?
// Bookkeeping for one non-unique name fragment while generating the nameN_* tables.
// Instances are created by positional aggregate initialization, so the member
// order must not change.
struct stringN_index_t {
// UTF-8 encoded name fragment
QByteArray str;
// offset of the fragment in the generated nameN_string_table
// NOTE(review): accumulated in a uint16_t without a range check; this would wrap
// silently if the string table grows beyond 65535 bytes - confirm this cannot happen
uint16_t strOffset;
// index of the fragment's first entry in the generated nameN_iata_table
// NOTE(review): same unchecked 16 bit accumulation as strOffset
uint16_t iataMapOffset;
// IATA codes of all airports sharing this fragment
QVector<QString> iataList;
};
std::vector<stringN_index_t> stringN_offsets;
stringN_offsets.reserve(labelMap.size() - string_offsets.size());
uint16_t string_offset = 0;
uint16_t iata_map_offset = 0;
for (auto it = labelMap.begin(); it != labelMap.end(); ++it) {
if (it.value().size() == 1) {
continue;
}
f.write(" \"");
f.write(it.key().toUtf8());
f.write("\"\n");
stringN_offsets.emplace_back(stringN_index_t{it.key().toUtf8(), string_offset, iata_map_offset, it.value()});
string_offset += it.key().toUtf8().size();
iata_map_offset += it.value().size();
}
f.write(R"(;
// string table index to iata code mapping
static const uint16_t nameN_iata_table[] = {
)");
for (const auto &offset : stringN_offsets) {
f.write(" ");
for (const auto &iataCode : offset.iataList) {
f.write(QByteArray::number(std::distance(iataMap.begin(), iataMap.find(iataCode))));
f.write(", ");
}
f.write(" // ");
f.write(offset.str);
f.write("\n");
}
f.write(R"(};
// index into the above string and iata index tables
static const NameNIndex nameN_string_index[] = {
)");
for (const auto &offset : stringN_offsets) {
f.write(" NameNIndex{");
f.write(QByteArray::number(offset.strOffset));
f.write(", ");
f.write(QByteArray::number(offset.str.length()));
f.write(", ");
f.write(QByteArray::number(offset.iataMapOffset));
f.write(", ");
f.write(QByteArray::number(offset.iataList.size()));
f.write("},\n");
}
f.write(R"(};
}
)");
qDebug() << "Generated database containing" << iataMap.size() << "airports and" << labelMap.size() << "name keys.";
qDebug() << "Generated database containing" << iataMap.size() << "airports";
qDebug() << "Name fragment index:" << string_offsets.size() << "unique keys," << labelMap.size() - string_offsets.size() << "non-unique keys";
qDebug() << "IATA code collisions:" << iataCollisions;
qDebug() << "Coordinate conflicts:" << coordinateConflicts;
qDebug() << "Failed timezone lookups:" << timezoneLoopupFails;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment