Commit 8bd81172 authored by Volker Krause
Browse files

Resolve HTML entities in names

Those are sometimes left over from broken encoding in e.g. schema.org
annotations.

This uses KF5::Codecs, which we already depend on indirectly anyway.
parent 1b2f70f1
Pipeline #180433 passed with stage
in 3 minutes and 50 seconds
......@@ -8,6 +8,7 @@ Dependencies:
'frameworks/ki18n' : '@latest'
'frameworks/kcontacts' : '@latest'
'frameworks/kcalendarcore' : '@latest'
'frameworks/kcodecs' : '@latest'
'pim/kmime' : '@same'
'pim/kpkpass' : '@same'
......
......@@ -4,7 +4,7 @@
find_package(Qt${QT_MAJOR_VERSION}Test ${QT_REQUIRED_VERSION} CONFIG REQUIRED)
add_definitions(-DSOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
ecm_add_test(stringutiltest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
ecm_add_test(stringutiltest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary KF5::Codecs)
ecm_add_test(datatypestest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test Qt${QT_MAJOR_VERSION}::Qml KPim::Itinerary)
ecm_add_test(jsonlddocumenttest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
ecm_add_test(mergeutiltest.cpp LINK_LIBRARIES Qt${QT_MAJOR_VERSION}::Test KPim::Itinerary)
......
......@@ -4,7 +4,7 @@
SPDX-License-Identifier: LGPL-2.0-or-later
*/
#include "stringutil.h"
#include "../src/lib/stringutil.cpp"
#include <QObject>
#include <QTest>
......@@ -46,6 +46,15 @@ private Q_SLOTS:
QCOMPARE(StringUtil::prefixSimilarity(u"ab", u"aa"), 0.5f);
QCOMPARE(StringUtil::prefixSimilarity(u"ac", u"abbb"), 0.25f);
}
/** Checks StringUtil::clean() against an empty string and strings containing HTML entities. */
void testClean()
{
// a default-constructed string yields a default-constructed string
QCOMPARE(StringUtil::clean(QString()), QString());
// numeric character references are replaced by the characters they encode
QCOMPARE(StringUtil::clean(QStringLiteral("Lech Wa&#322;&#281;sa Airport")), QStringLiteral("Lech Wałęsa Airport"));
// only one level of escaping is resolved: "&amp;amp;" is expected to become "&amp;", not "&"
QCOMPARE(StringUtil::clean(QStringLiteral("On-demand services: MobilityData&#39;s GOFS project &amp;amp; going further")),
QLatin1String("On-demand services: MobilityData's GOFS project &amp; going further"));
}
};
QTEST_APPLESS_MAIN(StringUtilTest)
......
......@@ -22,7 +22,7 @@ target_include_directories(generate-knowledgedb PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../lib/knowledgedb
${CMAKE_CURRENT_BINARY_DIR}/../lib
)
target_link_libraries(generate-knowledgedb PRIVATE Qt${QT_MAJOR_VERSION}::Network Qt${QT_MAJOR_VERSION}::Gui KOSM)
target_link_libraries(generate-knowledgedb PRIVATE Qt${QT_MAJOR_VERSION}::Network Qt${QT_MAJOR_VERSION}::Gui KOSM KF5::Codecs)
# extract all elements we are interested in (airports, terminals, stations at airports)
osm_filter(OUTPUT airports.o5m FILTER --keep=\"iata=* or aeroway=terminal or public_transport=station or railway=station or railway=halt or railway=tram_stop\")
......
......@@ -178,6 +178,7 @@ target_link_libraries(KPimItinerary
Qt${QT_MAJOR_VERSION}::QmlPrivate
KF5::Archive
KF5::I18n
KF5::Codecs
KF5::Contacts
KPim::PkPass
OpenSSL::Crypto
......
......@@ -9,6 +9,7 @@
#include "extractorpostprocessor_p.h"
#include "extractorvalidator.h"
#include "flightpostprocessor_p.h"
#include "stringutil.h"
#include "iata/iatabcbpparser.h"
#include "jsonlddocument.h"
......@@ -399,7 +400,7 @@ EventReservation ExtractorPostprocessorPrivate::processEventReservation(EventRes
KItinerary::Event ExtractorPostprocessorPrivate::processEvent(KItinerary::Event event) const
{
event.setName(event.name().trimmed());
event.setName(StringUtil::clean(event.name()));
// normalize location to be a Place
if (JsonLd::isA<PostalAddress>(event.location())) {
......
......@@ -5,6 +5,7 @@
*/
#include "extractorvalidator.h"
#include "stringutil.h"
#include <QDateTime>
#include <QVariant>
......@@ -78,7 +79,7 @@ public:
template<typename T> inline T ExtractorPostprocessorPrivate::processPlace(T place)
{
place.setName(place.name().simplified());
place.setName(StringUtil::clean(place.name()));
auto addr = processAddress(place.address(), place.telephone(), place.geo());
place.setAddress(addr);
place.setTelephone(processPhoneNumber(place.telephone(), place.address()));
......
......@@ -6,6 +6,8 @@
#include "stringutil.h"
#include <KCharsets>
#include <QDebug>
#include <QString>
......@@ -111,3 +113,8 @@ float StringUtil::prefixSimilarity(QStringView s1, QStringView s2)
return (float)s1.size() / (float)s2.size();
}
/** Resolves the HTML entities found in @p s and simplifies the whitespace of the result. */
QString StringUtil::clean(const QString &s)
{
    // entities are decoded first, whitespace is simplified on the decoded text
    const QString resolved = KCharsets::resolveEntities(s);
    return resolved.simplified();
}
......@@ -6,8 +6,6 @@
#pragma once
#include "kitinerary_export.h"
class QChar;
class QString;
class QStringView;
......@@ -17,10 +15,8 @@ namespace KItinerary {
/** String normalization and comparison utilities. */
namespace StringUtil
{
/** Strips out diacritics and converts to case-folded form.
* @internal only exported for unit tests
*/
KITINERARY_EXPORT QString normalize(QStringView str);
/** Strips out diacritics and converts to case-folded form. */
QString normalize(QStringView str);
/** Assuming both sides are describing the same thing, this tries to find the "better" string.
* That is, prefer the one that didn't lose casing/unicode/etc in previous processing.
......@@ -29,9 +25,11 @@ namespace StringUtil
/** Returns how much of the prefix of two given strings are equal, in
* relation to the longer of the two input strings.
* @internal only exported for unit tests
*/
KITINERARY_EXPORT float prefixSimilarity(QStringView s1, QStringView s2);
float prefixSimilarity(QStringView s1, QStringView s2);
/** Returns @p s with HTML entities resolved to the characters they represent
 *  and with its whitespace simplified.
 */
QString clean(const QString &s);
}
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment