Commit 4e5c7076 authored by Volker Krause
Browse files

Add Benerail station ids to the train station database

This is possible now that Wikidata has that information, and it enables,
for example, Thalys barcodes to be matched to the correct stations.

This also includes a bit of refactoring of the station identifier db
generation to get rid of some of the code duplication there.
parent b2ef5ea5
......@@ -86,6 +86,22 @@ private Q_SLOTS:
qDebug() << id1;
}
void testStationIdentifiers()
{
    // A well-formed five letter SNCF id must be valid and survive a round-trip through toString().
    auto sncfValid = KnowledgeDb::SncfStationId(QStringLiteral("FRPNO"));
    QVERIFY(sncfValid.isValid());
    QCOMPARE(sncfValid.toString(), QLatin1String("FRPNO"));

    // Input that does not match the five upper-case letter format must yield an invalid id.
    auto sncfMalformed = KnowledgeDb::SncfStationId(QStringLiteral("Abc"));
    QVERIFY(!sncfMalformed.isValid());

    // A second well-formed id, to make sure the round-trip is not specific to one value.
    auto sncfOther = KnowledgeDb::SncfStationId(QStringLiteral("CHZID"));
    QVERIFY(sncfOther.isValid());
    QCOMPARE(sncfOther.toString(), QLatin1String("CHZID"));

    // VR codes are shorter than the maximum length here, toString() must not pad them.
    auto vr = KnowledgeDb::VRStationCode(QStringLiteral("HSL"));
    QVERIFY(vr.isValid());
    QCOMPARE(vr.toString(), QLatin1String("HSL"));
}
void testIBNRLookup()
{
auto station = KnowledgeDb::stationForIbnr(IBNR{1234567});
......@@ -160,6 +176,22 @@ private Q_SLOTS:
QCOMPARE(station.country, CountryId{"NL"});
}
void testBenerailStationIdLookup()
{
    // Default-constructed (empty) id: nothing must be found.
    auto emptyIdStation = KnowledgeDb::stationForBenerailId({});
    QVERIFY(!emptyIdStation.coordinate.isValid());
    QCOMPARE(toQTimeZone(emptyIdStation.timezone()), QTimeZone());

    // Well-formed id that is expected to be absent from the database: same empty result.
    auto unknownStation = KnowledgeDb::stationForBenerailId(BenerailStationId{"XXXXX"});
    QVERIFY(!unknownStation.coordinate.isValid());
    QCOMPARE(toQTimeZone(unknownStation.timezone()), QTimeZone());

    // Known id: coordinate, timezone and country must all be resolved.
    auto knownStation = KnowledgeDb::stationForBenerailId(BenerailStationId{"NLASC"});
    QVERIFY(knownStation.coordinate.isValid());
    QCOMPARE(toQTimeZone(knownStation.timezone()), QTimeZone("Europe/Amsterdam"));
    QCOMPARE(knownStation.country, CountryId{"NL"});
}
void testCountryDb()
{
auto country = KnowledgeDb::countryForId(CountryId{});
......
......@@ -41,6 +41,7 @@ set(kitinerary_lib_srcs
knowledgedb/countrydb.cpp
knowledgedb/iatacode.cpp
knowledgedb/knowledgedb.cpp
knowledgedb/stationidentifier.cpp
knowledgedb/timezonedb.cpp
knowledgedb/trainstationdb.cpp
......
......@@ -249,6 +249,8 @@ TrainStation ExtractorPostprocessorPrivate::processTrainStation(TrainStation sta
const auto record = KnowledgeDb::stationForIndianRailwaysStationCode(id.mid(3));
applyStationData(record, station);
} else if (id.startsWith(QLatin1String("benerail:")) && id.size() == 14) {
const auto record = KnowledgeDb::stationForBenerailId(KnowledgeDb::BenerailStationId(id.mid(9)));
applyStationData(record, station);
applyStationCountry(id.mid(9, 2).toUpper(), station);
} else if (id.startsWith(QLatin1String("vrfi:")) && id.size() >= 7 && id.size() <= 9) {
const auto record = KnowledgeDb::stationForVRStationCode(KnowledgeDb::VRStationCode(id.mid(5)));
......
......@@ -10,6 +10,7 @@ add_executable(generate-knowledgedb
trainstationdbgenerator.cpp
util.cpp
../stringutil.cpp
../knowledgedb/stationidentifier.cpp
)
target_compile_definitions(generate-knowledgedb PRIVATE "KITINERARY_STATIC_DEFINE")
target_include_directories(generate-knowledgedb PRIVATE
......
......@@ -35,7 +35,13 @@ static bool operator<(const TrainStationDbGenerator::Station &lhs, const QUrl &r
bool TrainStationDbGenerator::generate(QIODevice *out)
{
// retrieve content from Wikidata
if (!fetchIBNR() || !fetchUIC() || !fetchSncf() || !fetchIndianRailwaysStationCode() || !fetchFinishStationCodes()) {
if (!fetch("P954", "ibnr", m_ibnrMap)
|| !fetch("P722", "uic", m_uicMap)
|| !fetch("P8181", "sncf", m_sncfIdMap)
|| !fetch("P8448", "benerail", m_benerailIdMap)
|| !fetchIndianRailwaysStationCode()
|| !fetchFinishStationCodes()
) {
return false;
}
if (!fetchCountryInformation()) {
......@@ -55,9 +61,10 @@ namespace KItinerary {
namespace KnowledgeDb {
)");
writeStationData(out);
writeIBNRMap(out);
writeUICMap(out);
writeSncfMap(out);
writeIdMap(out, m_ibnrMap, "ibnr", "IBNR");
writeIdMap(out, m_uicMap, "uic", "UICStation");
writeIdMap(out, m_sncfIdMap, "sncfStationId", "SncfStationId");
writeIdMap(out, m_benerailIdMap, "benerail", "BenerailStationId");
writeIndianRailwaysMap(out);
writeVRMap(out);
out->write(R"(
......@@ -69,16 +76,17 @@ namespace KnowledgeDb {
return true;
}
bool TrainStationDbGenerator::fetchIBNR()
template<typename Id>
bool TrainStationDbGenerator::fetch(const char *prop, const char *name, std::map<Id, QUrl> &idMap)
{
const auto stationArray = WikiData::query(R"(
SELECT DISTINCT ?station ?stationLabel ?ibnr ?coord ?replacedBy WHERE {
const auto stationArray = WikiData::query(QLatin1String(R"(
SELECT DISTINCT ?station ?stationLabel ?id ?coord ?replacedBy WHERE {
?station (wdt:P31/wdt:P279*) wd:Q55488.
?station wdt:P954 ?ibnr.
?station wdt:)") + QString::fromUtf8(prop) + QLatin1String(R"( ?id.
OPTIONAL { ?station wdt:P625 ?coord. }
OPTIONAL { ?station wdt:P1366 ?replacedBy. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} ORDER BY (?station))", "wikidata_trainstation_ibnr.json");
} ORDER BY (?station))"), QLatin1String("wikidata_trainstation_") + QString::fromUtf8(name) + QLatin1String(".json"));
if (stationArray.isEmpty()) {
qWarning() << "Empty query result!";
return false;
......@@ -91,97 +99,20 @@ bool TrainStationDbGenerator::fetchIBNR()
}
const auto uri = insertOrMerge(stationObj);
const auto id = stationObj.value(QLatin1String("ibnr")).toObject().value(QLatin1String("value")).toString().toUInt();
if (id < 1000000 || id > 9999999) {
++m_idFormatViolations;
qWarning() << "IBNR format violation" << id << uri;
continue;
}
const auto it = m_ibnrMap.find(id);
if (it != m_ibnrMap.end() && (*it).second != uri) {
++m_idConflicts;
qWarning() << "Conflict on IBNR" << id << uri << m_ibnrMap[id];
} else {
m_ibnrMap[id] = uri;
}
}
return true;
}
bool TrainStationDbGenerator::fetchUIC()
{
const auto stationArray = WikiData::query(R"(
SELECT DISTINCT ?station ?stationLabel ?uic ?coord WHERE {
?station (wdt:P31/wdt:P279*) wd:Q55488.
?station wdt:P722 ?uic.
OPTIONAL { ?station wdt:P625 ?coord. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} ORDER BY (?station))", "wikidata_trainstation_ibnr.json");
if (stationArray.isEmpty()) {
qWarning() << "Empty query result!";
return false;
}
for (const auto &stationData : stationArray) {
const auto stationObj = stationData.toObject();
const auto uri = insertOrMerge(stationObj);
auto id = stationObj.value(QLatin1String("uic")).toObject().value(QLatin1String("value")).toString().toUInt();
if (id > 9999999) {
id /= 10; // strip off check digit if present
}
if (id < 1000000 || id > 9999999) {
++m_idFormatViolations;
qWarning() << "UIC format violation" << id << uri;
continue;
}
const auto it = m_uicMap.find(id);
if (it != m_uicMap.end() && (*it).second != uri) {
++m_idConflicts;
qWarning() << "Conflict on UIC" << id << uri << m_uicMap[id];
} else {
m_uicMap[id] = uri;
}
}
return true;
}
bool TrainStationDbGenerator::fetchSncf()
{
const auto stationArray = WikiData::query(R"(
SELECT DISTINCT ?station ?stationLabel ?sncfId ?coord WHERE {
?station (wdt:P31/wdt:P279*) wd:Q55488.
?station wdt:P8181 ?sncfId.
OPTIONAL { ?station wdt:P625 ?coord. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} ORDER BY (?station))", "wikidata_trainstation_gare_connexion.json");
if (stationArray.isEmpty()) {
qWarning() << "Empty query result!";
return false;
}
for (const auto &stationData : stationArray) {
const auto stationObj = stationData.toObject();
const auto uri = insertOrMerge(stationObj);
const auto id = stationObj.value(QLatin1String("sncfId")).toObject().value(QLatin1String("value")).toString().toUpper();
if (id.size() != 5 || !Util::containsOnlyLetters(id)) {
const auto idStr = stationObj.value(QLatin1String("id")).toObject().value(QLatin1String("value")).toString();
const auto id = Id(idStr);
if (!id.isValid()) {
++m_idFormatViolations;
qWarning() << "SNCF ID format violation" << id << uri;
qWarning() << name << "format violation" << idStr << uri;
continue;
}
const auto it = m_sncfIdMap.find(id);
if (it != m_sncfIdMap.end() && (*it).second != uri) {
const auto it = idMap.find(id);
if (it != idMap.end() && (*it).second != uri) {
++m_idConflicts;
qWarning() << "Conflict on SNCF ID" << id << uri << m_sncfIdMap[id];
qWarning() << "Conflict on" << name << idStr << uri << idMap[id];
} else {
m_sncfIdMap[id] = uri;
idMap[id] = uri;
}
}
......@@ -245,17 +176,18 @@ bool TrainStationDbGenerator::fetchFinishStationCodes()
const auto uri = insertOrMerge(stationObj);
// TODO this filters 'Ä' and 'Ö' too, which seem to occur in a few cases?
const auto id = stationObj.value(QLatin1String("code")).toObject().value(QLatin1String("value")).toString().toUpper();
if (id.size() < 2 || id.size() > 4 || !Util::containsOnlyLetters(id)) {
const auto idStr = stationObj.value(QLatin1String("code")).toObject().value(QLatin1String("value")).toString().toUpper();
const auto id = KnowledgeDb::VRStationCode(idStr);
if (!id.isValid()) {
++m_idFormatViolations;
qWarning() << "VR (Finland) station id format violation" << id << uri;
qWarning() << "VR (Finland) station id format violation" << idStr << uri;
continue;
}
const auto it = m_vrfiMap.find(id);
if (it != m_vrfiMap.end() && (*it).second != uri) {
++m_idConflicts;
qWarning() << "Conflict on VR (Finland) station code" << id << uri << m_vrfiMap[id];
qWarning() << "Conflict on VR (Finland) station code" << idStr << uri << m_vrfiMap[id];
} else {
m_vrfiMap[id] = uri;
}
......@@ -360,35 +292,23 @@ void TrainStationDbGenerator::writeStationData(QIODevice *out)
out->write("};\n\n");
}
void TrainStationDbGenerator::writeIBNRMap(QIODevice *out)
template<typename Id>
void TrainStationDbGenerator::writeIdMap(QIODevice *out, const std::map<Id, QUrl> &idMap, const char *tabName, const char *typeName) const
{
out->write("static constexpr const TrainStationIdIndex<IBNR> ibnr_table[] = {\n");
for (const auto &it : m_ibnrMap) {
out->write("static constexpr const TrainStationIdIndex<");
out->write(typeName);
out->write("> ");
out->write(tabName);
out->write("_table[] = {\n");
for (const auto &it : idMap) {
const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
if (station == m_stations.end() || (*station).uri != it.second) {
continue;
}
out->write(" { IBNR{");
out->write(QByteArray::number(it.first));
out->write("}, TrainStationIndex{");
out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
out->write("} }, // ");
out->write((*station).name.toUtf8());
out->write("\n");
}
out->write("};\n\n");
}
void TrainStationDbGenerator::writeUICMap(QIODevice* out)
{
out->write("static constexpr const TrainStationIdIndex<UICStation> uic_table[] = {\n");
for (const auto &it : m_uicMap) {
const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
if (station == m_stations.end() || (*station).uri != it.second) {
continue;
}
out->write(" { UICStation{");
out->write(QByteArray::number(it.first));
out->write(" { ");
out->write(typeName);
out->write("{");
out->write(encodeId(it.first));
out->write("}, TrainStationIndex{");
out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
out->write("} }, // ");
......@@ -398,30 +318,11 @@ void TrainStationDbGenerator::writeUICMap(QIODevice* out)
out->write("};\n\n");
}
void TrainStationDbGenerator::writeSncfMap(QIODevice *out)
{
out->write("static constexpr const TrainStationIdIndex<SncfStationId> sncfStationId_table[] = {\n");
for (const auto &it : m_sncfIdMap) {
const auto station = std::lower_bound(m_stations.begin(), m_stations.end(), it.second);
if (station == m_stations.end() || (*station).uri != it.second) {
continue;
}
out->write(" { SncfStationId{\"");
out->write(it.first.toUtf8());
out->write("\"}, TrainStationIndex{");
out->write(QByteArray::number((int)std::distance(m_stations.begin(), station)));
out->write("} }, // ");
out->write((*station).name.toUtf8());
out->write("\n");
}
out->write("};\n\n");
}
void TrainStationDbGenerator::writeIndianRailwaysMap(QIODevice *out)
{
// variable length identifiers, so we need a string table
std::vector<uint16_t> offsets;
offsets.reserve(m_sncfIdMap.size());
offsets.reserve(m_indianRailwaysMap.size());
uint16_t offset = 0;
out->write("static constexpr const char indianRailwaysSationCode_stringtable[] =\n");
......@@ -475,8 +376,8 @@ void TrainStationDbGenerator::writeVRMap(QIODevice *out)
continue;
}
out->write(" { VRStationCode{\"");
out->write(it.first.toUtf8());
for (int i = 0; i < 4 - it.first.size(); ++i) {
out->write(it.first.toString().toUtf8());
for (int i = 0; i < 4 - it.first.toString().toUtf8().size(); ++i) {
out->write("\\0");
}
out->write("\"}, TrainStationIndex{");
......@@ -494,6 +395,7 @@ void TrainStationDbGenerator::printSummary()
qDebug() << "IBNR index:" << m_ibnrMap.size() << "elements";
qDebug() << "UIC index:" << m_uicMap.size() << "elements";
qDebug() << "SNCF station code index:" << m_sncfIdMap.size() << "elements";
qDebug() << "Benerail station code index:" << m_benerailIdMap.size() << "elements";
qDebug() << "Indian Railwaiys station code index:" << m_indianRailwaysMap.size() << "elements";
qDebug() << "VR (Finland) station code index:" << m_vrfiMap.size() << "elements";
qDebug() << "Identifier collisions:" << m_idConflicts;
......
......@@ -8,6 +8,7 @@
#define KITINERARY_GENERATOR_TRAINSTATIONDBGENERATOR_H
#include <knowledgedb.h>
#include <stationidentifier.h>
#include <QByteArray>
#include <QString>
......@@ -37,28 +38,32 @@ public:
};
private:
bool fetchIBNR();
bool fetchUIC();
bool fetchSncf();
template <typename Id>
bool fetch(const char *prop, const char *name, std::map<Id, QUrl> &idMap);
bool fetchIndianRailwaysStationCode();
bool fetchFinishStationCodes();
bool fetchCountryInformation();
QUrl insertOrMerge(const QJsonObject &obj, bool mergeOnly = false);
void processStations();
void writeStationData(QIODevice *out);
void writeIBNRMap(QIODevice *out);
void writeUICMap(QIODevice *out);
void writeSncfMap(QIODevice *out);
template <typename Id>
void writeIdMap(QIODevice *out, const std::map<Id, QUrl> &idMap, const char *tabName, const char *typeName) const;
void writeIndianRailwaysMap(QIODevice *out);
void writeVRMap(QIODevice *out);
void printSummary();
/** Encodes a string-like identifier as a double-quoted literal for the generated code.
 *  Generic fallback for every identifier type providing toString().
 */
template <typename Id>
QByteArray encodeId(Id id) const
{
    QByteArray literal = id.toString().toUtf8();
    literal.prepend('"');
    literal.append('"');
    return literal;
}

/** Encodes an IBNR as a plain decimal number for the generated code. */
QByteArray encodeId(KnowledgeDb::IBNR id) const
{
    return QByteArray::number(id.value());
}

/** Encodes an UIC station id as a plain decimal number for the generated code. */
QByteArray encodeId(KnowledgeDb::UICStation id) const
{
    return QByteArray::number(id.value());
}
std::vector<Station> m_stations;
std::map<uint32_t, QUrl> m_ibnrMap;
std::map<uint32_t, QUrl> m_uicMap;
std::map<QString, QUrl> m_sncfIdMap;
std::map<KnowledgeDb::IBNR, QUrl> m_ibnrMap;
std::map<KnowledgeDb::UICStation, QUrl> m_uicMap;
std::map<KnowledgeDb::SncfStationId, QUrl> m_sncfIdMap;
std::map<KnowledgeDb::BenerailStationId, QUrl> m_benerailIdMap;
std::map<QString, QUrl> m_indianRailwaysMap;
std::map<QString, QUrl> m_vrfiMap;
std::map<KnowledgeDb::VRStationCode, QUrl> m_vrfiMap;
int m_idConflicts = 0;
int m_idFormatViolations = 0;
......
......@@ -37,9 +37,14 @@ KnowledgeDb::Coordinate WikiData::parseCoordinate(const QString& value)
}
/** Convenience overload for UTF-8 encoded C strings.
 *  Converts both arguments to QString and forwards to the QString based overload.
 */
QJsonArray WikiData::query(const char *sparqlQuery, const char *cacheFileName)
{
    const QString queryString = QString::fromUtf8(sparqlQuery);
    const QString cacheName = QString::fromUtf8(cacheFileName);
    return query(queryString, cacheName);
}
QJsonArray WikiData::query(const QString &sparqlQuery, const QString &cacheFileName)
{
QDir().mkdir(QStringLiteral("data"));
QFile cacheFile(QLatin1String("data/") + QString::fromUtf8(cacheFileName));
QFile cacheFile(QLatin1String("data/") + cacheFileName);
QByteArray data;
if (cacheFile.exists() && qEnvironmentVariableIsSet("KITINERARY_USE_WIKIDATA_CACHE")) {
cacheFile.open(QFile::ReadOnly);
......@@ -50,7 +55,7 @@ QJsonArray WikiData::query(const char *sparqlQuery, const char *cacheFileName)
if (data.isEmpty()) {
QUrl url(QStringLiteral("https://query.wikidata.org/sparql"));
QUrlQuery query;
query.addQueryItem(QStringLiteral("query"), QString::fromUtf8(sparqlQuery).trimmed().simplified());
query.addQueryItem(QStringLiteral("query"), sparqlQuery.trimmed().simplified());
query.addQueryItem(QStringLiteral("format"), QStringLiteral("json"));
url.setQuery(query);
......
......@@ -30,6 +30,7 @@ namespace WikiData
* CLI tool!
*/
QJsonArray query(const char *sparqlQuery, const char *cacheFileName);
QJsonArray query(const QString &sparqlQuery, const QString &cacheFileName);
}
}
......
/*
SPDX-FileCopyrightText: 2018-2020 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "stationidentifier.h"
#include <QString>
#include <cstring>
using namespace KItinerary::KnowledgeDb;
/** Parses a UIC/IBNR station number from its decimal string representation.
 *  Numbers with more than seven digits have their last digit dropped, as that
 *  is treated as a check digit.
 *  NOTE(review): input that cannot be parsed presumably ends up as 0 (see
 *  QString::toUInt()), which isValid() rejects - confirm.
 */
UICIdentiferBase::UICIdentiferBase(const QString &id)
{
    auto number = id.toUInt();
    if (number > 9999999) {
        number /= 10; // strip off check digit
    }
    setValue(number);
}
static bool containsOnlyLetters(const QString &s)
{
for (const auto c : s) {
if (c < QLatin1Char('A') || c > QLatin1Char('Z')) {
return false;
}
}
return true;
}
/** Parses a five letter station identifier, ignoring case.
 *  The identifier is left in its default-constructed (invalid) state if @p id
 *  does not consist of exactly five ASCII letters.
 */
FiveAlphaId::FiveAlphaId(const QString &id)
{
    // Normalize before validating: containsOnlyLetters() only accepts 'A'-'Z', so
    // validating the raw input would reject lower-case ids and make the upper-casing
    // below a no-op.
    const auto upperId = id.toUpper();

    // Check the length before and after normalization, as upper-casing may change
    // the length of some non-ASCII input.
    if (id.size() != 5 || upperId.size() != 5 || !containsOnlyLetters(upperId)) {
        return;
    }

    setValue(fromChars(upperId.toUtf8().constData()));
}
/** Decodes the stored number back into its five letter representation.
 *  The value is read as five base-27 digits, each mapped to a character
 *  by adding it to '@' (so digit 1 becomes 'A').
 *  @returns an empty string if the stored value is 0.
 */
QString FiveAlphaId::toString() const
{
    auto remaining = value();
    if (remaining == 0) {
        return {};
    }

    QString result;
    result.resize(5);
    // the least significant base-27 digit is the last character, so fill back to front
    for (int pos = 4; pos >= 0; --pos) {
        result[pos] = QLatin1Char('@' + (remaining % 27));
        remaining /= 27;
    }
    return result;
}
/** Parses a VR (Finland) station code, ignoring case.
 *  The code is left in its default-constructed (invalid) state if @p id
 *  does not consist of two to four ASCII letters.
 */
VRStationCode::VRStationCode(const QString &id)
{
    // Normalize before validating: containsOnlyLetters() only accepts 'A'-'Z', so
    // validating the raw input would reject lower-case codes and make the upper-casing
    // a no-op.
    const auto upperId = id.toUpper();

    // Reject input whose length changes when upper-cased, so that the length check
    // and the copy below operate on the same number of characters.
    if (id.size() < 2 || id.size() > 4 || upperId.size() != id.size() || !containsOnlyLetters(upperId)) {
        return;
    }

    // Only 'A'-'Z' is left at this point, so the UTF-8 form has one byte per character
    // and fits the buffer. Unused trailing bytes stay 0.
    const auto utf8Id = upperId.toUtf8();
    char buffer[4] = {};
    std::memcpy(buffer, utf8Id.constData(), utf8Id.size());
    setValue(fromChars(buffer));
}
/** Decodes the stored number back into its two to four letter representation.
 *  The value is read as four slots of six bits each, most significant slot first.
 *  Only the low five bits of a slot are evaluated and mapped to a character by
 *  adding them to '@'. Decoding stops at the first slot that yields 0.
 *  @returns an empty string if the stored value is 0.
 */
QString VRStationCode::toString() const
{
    const auto encoded = value();
    if (encoded == 0) {
        return {};
    }

    QString result;
    for (int shift = 18; shift >= 0; shift -= 6) {
        const auto letter = (encoded >> shift) % 32;
        if (letter == 0) {
            break; // unused trailing slot, the code is shorter than four characters
        }
        result += QLatin1Char('@' + letter);
    }
    return result;
}
/*
SPDX-FileCopyrightText: 2018-2020 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_KNOWLEDGEDB_STATIONIDENTIFIER_H
#define KITINERARY_KNOWLEDGEDB_STATIONIDENTIFIER_H
#include "kitinerary_export.h"
#include "knowledgedb.h"
class QString;
namespace KItinerary {
namespace KnowledgeDb {
/** Base class for UIC/IBNR station identifiers.
 *  The numeric identifier is stored in an UnalignedNumber<3>. Identifiers with
 *  more than seven digits are reduced to seven digits on construction by dropping
 *  the last digit, which is treated as a check digit.
 */
class UICIdentiferBase : public UnalignedNumber<3> {
public:
/** Creates an identifier without a value. */
inline explicit constexpr UICIdentiferBase() = default;
/** Creates an identifier from its numeric value @p id. */
inline explicit constexpr UICIdentiferBase(uint32_t id) :
UnalignedNumber<3>(id > 9999999 ? id / 10 : id) // strip off check digit if present
{}
/** Creates an identifier from the decimal string representation @p id.
 *  NOTE(review): unlike the other constructors this one is not explicit, so
 *  QString converts implicitly to an identifier - confirm this is intended.
 */
KITINERARY_EXPORT UICIdentiferBase(const QString &id);
/** Returns @c true if the stored value is a seven digit number, i.e. within [1000000, 9999999]. */
inline constexpr bool isValid() const
{
return value() >= 1000000 && value() <= 9999999;
}
};
/** IBNR station id.
 * 2 digits UIC country code, 5 digits station id.
 * Same format as UICStation, but nevertheless different values.
 * This is a distinct type so that IBNR and UIC station ids cannot be mixed up,
 * all constructors are inherited from UICIdentiferBase.
 */
class IBNR : public UICIdentiferBase {
using UICIdentiferBase::UICIdentiferBase;
};
/** UIC station id.
 * 2 digits UIC country code, 5 digits station id.
 * Same format as IBNR, but nevertheless different values.
 * This is a distinct type so that UIC station ids and IBNR cannot be mixed up,
 * all constructors are inherited from UICIdentiferBase.
 */
class UICStation : public UICIdentiferBase {
using UICIdentiferBase::UICIdentiferBase;
};
/** Base class for SNCF/Benerail station identifiers. */
class FiveAlphaId : public UnalignedNumber<3> {
public:
inline explicit constexpr FiveAlphaId() = default;
inline explicit constexpr FiveAlphaId(const char s[5])
: UnalignedNumber<3>(fromChars(s))
{
}
KITINERARY_EXPORT explicit FiveAlphaId(const QString &id);
inline constexpr bool isValid() const