Commit 455c499c authored by Volker Krause
Browse files

A multi-language time finder for the generic boarding pass extractor

parent 987b502b
......@@ -31,6 +31,7 @@ ecm_add_test(htmldocumenttest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(barcodedecodertest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary Qt::Gui)
ecm_add_test(pkpassextractortest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary KPim::PkPass)
ecm_add_test(extractorutiltest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(timefindertest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(postprocessortest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
ecm_add_test(extractorvalidatortest.cpp LINK_LIBRARIES Qt::Test KPim::Itinerary)
if (TARGET KF5::CalendarCore)
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include <text/timefinder.cpp>
#include <QDebug>
#include <QTest>
using namespace KItinerary;
#define s(x) QStringLiteral(x)
class TimeFinderTest : public QObject
{
Q_OBJECT
private Q_SLOTS:
void testTimeFinderSingular_data()
{
QTest::addColumn<QString>("input");
QTest::addColumn<QTime>("time");
QTest::newRow("ISO") << s("bla 23:42 blub") << QTime(23, 42);
QTest::newRow("short") << s("1:23") << QTime(1, 23);
QTest::newRow("min") << s("abc 0:00") << QTime(0, 0);
QTest::newRow("max") << s("23:59 abc") << QTime(23, 59);
QTest::newRow("French") << s("bla 16h45 blub") << QTime(16, 45);
QTest::newRow("dot") << s("bla 18.30 blub") << QTime(18, 30);
QTest::newRow("US") << s("1:30 pm") << QTime(13, 30);
QTest::newRow("US midnight") << s("12:00 am") << QTime(0, 0);
QTest::newRow("US noon") << s("12:00 p.m.") << QTime(12, 0);
QTest::newRow("extra pm") << s("14:20PM") << QTime(14, 20);
QTest::newRow("short pm") << s("6:40p") << QTime(18, 40);
QTest::newRow("Japanese") << s("16時04分") << QTime(16, 4);
QTest::newRow("dot with ap") << s("12.30 am") << QTime(0, 30);
QTest::newRow("Korean pm") << s("오후 1시 30분") << QTime(13, 30);
QTest::newRow("Korean 24h") << s("14시 5분") << QTime(14, 5);
QTest::newRow("Chinese 24h") << s("19時45分") << QTime(19, 45);
QTest::newRow("Chinese colon/pm") << s("下午7:45") << QTime(19, 45);
QTest::newRow("Chinese full/pm") << s("下午7點45分") << QTime(19, 45);
QTest::newRow("Greek pm") << s("10:40 μ.μ.") << QTime(22, 40);
QTest::newRow("Arabic pm") << s("11:30م ") << QTime(23, 30);
// TODO tests for RLM/LRM control chars, and Arabic times with indic numbers
}
void testTimeFinderSingular()
{
QFETCH(QString, input);
QFETCH(QTime, time);
TimeFinder finder;
finder.find(input);
QCOMPARE(finder.times().size(), 1);
QCOMPARE(finder.times()[0], time);
}
void testTimeFinderNone_data()
{
QTest::addColumn<QString>("input");
QTest::newRow("empty") << QString();
QTest::newRow("text") << s("some text without mentioning a time");
QTest::newRow("colon") << s("item 3: 24");
QTest::newRow("DE date") << s("am 12.10. ");
QTest::newRow("ISO hour out of range") << s("24:59");
QTest::newRow("ISO min out of range") << s("23:60");
QTest::newRow("with seconds") << s("12:23:54");
QTest::newRow("in number") << s("12.234");
QTest::newRow("price") << s("1.30$");
QTest::newRow("pm no separator") << s("6:40px");
QTest::newRow("no leading separation") << s("x12:30 bla");
QTest::newRow("no trailing separation") << s("\n12:30bla");
QTest::newRow("ap without separation") << s("07:13ama");
}
void testTimeFinderNone()
{
QFETCH(QString, input);
TimeFinder finder;
finder.find(input);
QCOMPARE(finder.times().size(), 0);
}
void testTimeFinderPlural()
{
TimeFinder finder;
finder.find(u"from 09:00 to 17:00");
QCOMPARE(finder.times().size(), 2);
QCOMPARE(finder.times()[0], QTime(9, 0));
QCOMPARE(finder.times()[1], QTime(17, 0));
}
};
QTEST_GUILESS_MAIN(TimeFinderTest)
#include "timefindertest.moc"
......@@ -92,6 +92,8 @@ target_sources(KPimItinerary PRIVATE
scripts/extractors.qrc
text/timefinder.cpp
tlv/berelement.cpp
uic9183/rct2ticket.cpp
......
......@@ -11,6 +11,7 @@
#include <knowledgedb/airportdb.h>
#include <knowledgedb/airportnametokenizer_p.h>
#include <pdf/pdfdocument.h>
#include <text/timefinder_p.h>
#include <KItinerary/ExtractorDocumentNode>
#include <KItinerary/ExtractorResult>
......@@ -119,6 +120,7 @@ ExtractorResult GenericBoardingPassExtractor::extract(const ExtractorDocumentNod
flightRes.setReservationFor(flight);
result.push_back(std::move(flightRes));
}
}
return result;
......
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "timefinder_p.h"
#include <QDebug>
#include <QRegularExpression>
#include <QStringView>
#include <QTime>
using namespace KItinerary;
/** Returns @c true for any character that is not a digit, a letter or a colon. */
static bool isSeparator(QChar c)
{
    const bool continuesToken = c.isDigit() || c.isLetter() || c == QLatin1Char(':');
    return !continuesToken;
}
/**
 * Searches @p text for time-of-day values and stores them for retrieval via times().
 *
 * Results of a previous call are discarded. A candidate is only accepted if
 * - it is followed by the end of the text or a separator character
 *   (after an optional am/pm suffix),
 * - its hour is within 0-23 and its minute within 0-59, and
 * - it does not carry both an am and a pm marker.
 * Each distinct time is stored once, in order of first appearance.
 */
void TimeFinder::find(QStringView text)
{
    m_times.clear();

    // Time formats, tried in this order. Patterns with a locale-specific am/pm
    // prefix expose it via the optional named groups "am" and "pm".
    static const QRegularExpression rxTimes[] = {
        QRegularExpression(QStringLiteral("(?<hour>\\d?\\d)時(?<min>\\d\\d)分")),
        // the space after the am/pm marker is optional for both markers
        QRegularExpression(QStringLiteral("(?:(?:(?<am>오전)|(?<pm>오후)) ?)?(?<hour>\\d?\\d)시 ?(?<min>\\d?\\d)분")),
        QRegularExpression(QStringLiteral("(?:(?<am>上午)|(?<pm>下午))?(?<hour>\\d?\\d)點(?<min>\\d?\\d)分")),
        QRegularExpression(QStringLiteral("(?:(?<am>上午)|(?<pm>下午))(?<hour>\\d?\\d):(?<min>\\d?\\d)")),
        QRegularExpression(QStringLiteral("\\b(?<hour>\\d?\\d)[:h](?<min>\\d\\d)")),
        QRegularExpression(QStringLiteral("\\b(?<hour>\\d\\d)\\.(?<min>\\d\\d)(?=$|[^.])")),
    };

    // am/pm suffixes that may directly follow a time; the full forms are tried
    // before the single-letter abbreviations.
    static const QRegularExpression rxApSuffixes[] = {
        QRegularExpression(QStringLiteral("(?<pm> ?(?:pm|PM|p\\.m\\.|م|μ\\.μ\\.))")),
        QRegularExpression(QStringLiteral("(?<am> ?(?:am|AM|a\\.m\\.|ص|π\\.μ\\.))")),
        QRegularExpression(QStringLiteral("(?<pm>p)")),
        QRegularExpression(QStringLiteral("(?<am>a)")),
    };

    // Index into rxTimes of the pattern in use, -1 until the first search has been done.
    int rxTimesPattern = -1;
    for (auto i = 0; i < text.size(); ++i) {
        QRegularExpressionMatch rxTimeMatch;
        if (rxTimesPattern < 0) {
            // First search: pick the first pattern that matches anywhere in the text.
            rxTimesPattern = 0;
            for (const auto &rx : rxTimes) {
                rxTimeMatch = rx.match(text, i);
                if (rxTimeMatch.hasMatch()) {
                    break;
                }
                ++rxTimesPattern;
            }
        } else {
            // Subsequent searches stick to the pattern selected by the first search.
            rxTimeMatch = rxTimes[rxTimesPattern].match(text, i);
        }
        if (!rxTimeMatch.hasMatch()) {
            // Nothing (more) to find; this is also the exit when no pattern matched at all.
            return;
        }
        i = rxTimeMatch.capturedEnd();

        // Look for an am/pm suffix starting exactly where the time ended.
        QRegularExpressionMatch rxApMatch;
        for (const auto &rx : rxApSuffixes) {
            rxApMatch = rx.match(text, i, QRegularExpression::NormalMatch, QRegularExpression::AnchoredMatchOption);
            if (rxApMatch.hasMatch()) {
                break;
            }
        }
        if (rxApMatch.hasMatch()) {
            i = rxApMatch.capturedEnd();
        }

        // Reject candidates running straight into further digits, letters or a colon,
        // e.g. "12:23:54", "12.234" or "6:40px".
        if (i < text.size() && !isSeparator(text[i])) {
            continue;
        }

        auto hour = rxTimeMatch.captured(u"hour").toInt();
        const auto min = rxTimeMatch.captured(u"min").toInt();
        if (hour < 0 || hour > 23 || min < 0 || min > 59) {
            continue;
        }

        // The am/pm marker can come from the suffix or from a prefix inside the time pattern.
        const bool isPm = !rxApMatch.captured(u"pm").isEmpty() || !rxTimeMatch.captured(u"pm").isEmpty();
        const bool isAm = !rxApMatch.captured(u"am").isEmpty() || !rxTimeMatch.captured(u"am").isEmpty();
        if (isPm && isAm) {
            // contradictory markers
            continue;
        } else if (isPm && hour < 12) {
            hour += 12;
        } else if (isAm && hour == 12) {
            // 12 am is midnight
            hour = 0;
        }

        // Store each distinct time only once.
        if (std::find(m_times.begin(), m_times.end(), QTime(hour, min)) == m_times.end()) {
            m_times.push_back(QTime(hour, min));
        }
    }
}
const std::vector<QTime>& TimeFinder::times() const
{
return m_times;
}
/*
SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#ifndef KITINERARY_TIMEFINDER_H
#define KITINERARY_TIMEFINDER_H
class QStringView;
class QTime;
#include <vector>
namespace KItinerary {
/**
 * Attempts to find time values in all locales mentioned in the given text.
 *
 * Run find() on a text, then read the outcome from times().
 */
class TimeFinder {
public:
    /** Searches @p text for time values, replacing the results of any previous call. */
    void find(QStringView text);

    /** The time values collected by the last find() call. */
    const std::vector<QTime> &times() const;

private:
    std::vector<QTime> m_times;
};
}
#endif // KITINERARY_TIMEFINDER_H
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment