Commit 6202739e authored by Volker Krause
Browse files

Add airport name tokenizer

This will replace the existing regular expressions for this, which don't
perform well enough when processing full PDF content rather than short
strings suspected to be an airport name.
parent 8283ad88
......@@ -10,6 +10,7 @@ ecm_add_test(jsonlddocumenttest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(mergeutiltest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(locationutiltest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(knowledgedbtest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
# Unlike the other tests, this one compiles the tokenizer source directly into the test binary.
# NOTE(review): presumably because AirportNameTokenizer lives in a private (_p.h) header and
# is not exported from the library - confirm.
ecm_add_test(airportnametokenizertest.cpp ../src/lib/knowledgedb/airportnametokenizer.cpp TEST_NAME airportnametokenizertest LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(airportdbtest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(extractorresulttest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(extractordocumentnodetest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include <knowledgedb/airportnametokenizer_p.h>
#include <QDebug>
#include <QObject>
#include <QTest>
using namespace KItinerary;
#define s(x) QStringLiteral(x)
/** Data-driven unit test for AirportNameTokenizer. */
class AirportNameTokenizerTest : public QObject
{
    Q_OBJECT
private Q_SLOTS:
    /** Provides (input text, expected token list) pairs for testTokenize(). */
    void testTokenize_data()
    {
        QTest::addColumn<QString>("text");
        QTest::addColumn<QStringList>("tokens");

        // inputs that yield no token at all
        QTest::newRow("empty") << QString() << QStringList();
        QTest::newRow("no token") << s("1a0") << QStringList();
        QTest::newRow("only separators") << s(" \n-,.()") << QStringList();

        // the various separator kinds
        QTest::newRow("single token") << s("ABC") << QStringList({s("ABC")});
        QTest::newRow("space") << s("abc def") << QStringList({s("abc"), s("def")});
        QTest::newRow("linebreak") << s("abc\ndef") << QStringList({s("abc"), s("def")});
        QTest::newRow("leading space") << s(" abc def ") << QStringList({s("abc"), s("def")});
        QTest::newRow("quotes") << s("„abc\" \'def\'") << QStringList({s("abc"), s("def")});
        QTest::newRow("dashes") << s("abc-def–ghi") << QStringList({s("abc"), s("def"), s("ghi")});
        QTest::newRow("parenthesis") << s("abc(def)") << QStringList({s("abc"), s("def")});
        QTest::newRow("numbers") << s("01 abc 02 def") << QStringList({s("abc"), s("def")});
        QTest::newRow("ampersand") << s("abc&def") << QStringList({s("abc"), s("def")});
        QTest::newRow("comma") << s("abc, def.") << QStringList({s("abc"), s("def")});

        // tokens with fewer than three characters are dropped
        QTest::newRow("short") << s("ab def gh") << QStringList({s("def")});
        // several too-short tokens in a row, before and after a valid one
        QTest::newRow("consecutive short") << s("a bc d ef ghi jk") << QStringList({s("ghi")});

        // letters outside of ASCII are part of a token
        QTest::newRow("non-ascii letters") << s("Zürich Flughafen") << QStringList({s("Zürich"), s("Flughafen")});

        QTest::newRow("SFO") << s("SFO/SAN FRANCISCO INTERNATIONAL") << QStringList({s("SFO"), s("SAN"), s("FRANCISCO"), s("INTERNATIONAL")});
    }

    /** Tokenizes each input row and compares the result against the expected token list. */
    void testTokenize()
    {
        QFETCH(QString, text);
        QFETCH(QStringList, tokens);

        AirportNameTokenizer tokenizer(text);
        const auto out = tokenizer.toStringList();
        QCOMPARE(out, tokens);
    }
};
QTEST_APPLESS_MAIN(AirportNameTokenizerTest)
#include "airportnametokenizertest.moc"
......@@ -54,6 +54,7 @@ target_sources(KPimItinerary PRIVATE
knowledgedb/alphaid.cpp
knowledgedb/airportdb.cpp
knowledgedb/airportnametokenizer.cpp
knowledgedb/countrydb.cpp
knowledgedb/iatacode.cpp
knowledgedb/knowledgedb.cpp
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "airportnametokenizer_p.h"
#include <QStringList>
using namespace KItinerary;
AirportNameTokenizer::AirportNameTokenizer(QStringView text)
: m_text(text)
{
advance();
}
// Nothing to release: the tokenizer only holds a QStringView and two indices.
AirportNameTokenizer::~AirportNameTokenizer() = default;
/** Returns @c true while [m_begin, m_end) describes a non-empty range inside the text. */
bool AirportNameTokenizer::hasNext()
{
    // an empty, inverted or negative range means there is no pending token
    if (m_begin < 0 || m_end <= m_begin) {
        return false;
    }
    // m_end is known to be positive here, so only the upper bound is left to verify
    return m_end <= m_text.size();
}
/** Returns the current token as a view into the input text, then moves on to the following one.
 *  Must only be called while hasNext() returns @c true (asserted).
 */
QStringView AirportNameTokenizer::next()
{
    Q_ASSERT(hasNext());

    // take the view on the current token before advance() overwrites the indices
    const auto length = m_end - m_begin;
    const QStringView token = m_text.mid(m_begin, length);

    advance();
    return token;
}
/** Moves [m_begin, m_end) to the next token of at least MIN_LENGTH characters.
 *
 *  Starting from the end of the previous token, this skips separators, takes the
 *  following run of non-separator characters as a candidate and rejects it if it is
 *  too short. This repeats until a candidate is accepted or the text is exhausted,
 *  in which case the range is left empty so that hasNext() returns @c false.
 *
 *  This is deliberately a loop rather than a recursive retry: each rejected candidate
 *  would otherwise add a stack frame, and the number of consecutive short tokens is
 *  entirely controlled by the input.
 */
void AirportNameTokenizer::advance()
{
    const auto size = m_text.size();

    do {
        // skip everything separating the previous token from the next candidate
        m_begin = m_end;
        while (m_begin < size && isSeparator(m_text.at(m_begin))) {
            ++m_begin;
        }

        // nothing left: leave an empty range without moving the indices past the text
        if (m_begin >= size) {
            m_end = m_begin;
            return;
        }

        // m_begin is on a non-separator, extend the candidate up to the next separator
        m_end = m_begin + 1;
        while (m_end < size && !isSeparator(m_text.at(m_end))) {
            ++m_end;
        }

        // reject candidates that are too short by collapsing the range
        if ((m_end - m_begin) < MIN_LENGTH) {
            m_begin = m_end;
        }
    } while (!hasNext() && m_end < size);
}
/** Returns @c true if @p c cannot be part of a token, that is if it is
 *  whitespace, a number, punctuation or not printable.
 */
bool AirportNameTokenizer::isSeparator(QChar c) const
{
    if (c.isSpace() || c.isNumber()) {
        return true;
    }
    if (c.isPunct()) {
        return true;
    }
    return !c.isPrint();
}
/** Collects all tokens from the current position onwards into a list.
 *  This consumes the tokenizer, hasNext() returns @c false afterwards.
 */
QStringList AirportNameTokenizer::toStringList()
{
    QStringList tokens;
    for (; hasNext();) {
        const QStringView token = next();
        tokens.append(token.toString());
    }
    return tokens;
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_AIRPORTNAMETOKENIZER_H
#define KITINERARY_AIRPORTNAMETOKENIZER_H
#include <QStringView>
class QStringList;
namespace KItinerary {
/** Split airport names into the tokens used by the airport database.
 *
 *  A token is a run of consecutive non-separator characters of at least
 *  MIN_LENGTH characters. Whitespace, numbers, punctuation and non-printable
 *  characters are separators; shorter runs are skipped.
 *
 *  The tokenizer only stores a QStringView of the input, so the underlying
 *  string has to outlive the tokenizer and any token returned by next().
 */
class AirportNameTokenizer
{
public:
/** Creates a tokenizer for @p text, positioned on its first token (if any). */
explicit AirportNameTokenizer(QStringView text);
~AirportNameTokenizer();
/** Returns @c true if next() can be called one more time. */
bool hasNext();
/** Returns the next token and advances the tokenizer.
 *  Only valid to call if hasNext() returns @c true.
 */
QStringView next();
/** Returns a list containing all remaining tokens, consuming the tokenizer. */
QStringList toStringList();
private:
/** Moves [m_begin, m_end) to the next token that is long enough, or to an empty range if there is none. */
void advance();
/** Returns @c true if @p c cannot be part of a token. */
bool isSeparator(QChar c) const;
/** Minimum number of characters of a token, shorter ones are skipped. */
static constexpr const int MIN_LENGTH = 3;
/** The text being tokenized (not owned). */
QStringView m_text;
/** Index of the first character of the current token in m_text. */
int m_begin = 0;
/** Index one past the last character of the current token in m_text. */
int m_end = 0;
};
}
#endif // KITINERARY_AIRPORTNAMETOKENIZER_H
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment