Commit 972da43b authored by Volker Krause's avatar Volker Krause
Browse files

Normalize Unicode compatibility characters as well

This expands a number of ligatures.
parent e6af8c8b
Pipeline #144792 passed with stages
in 2 minutes and 7 seconds
......@@ -26,6 +26,8 @@ private Q_SLOTS:
QTest::newRow("normalized") << _("normal") << _("normal");
QTest::newRow("case-folding") << _("NORMAL") << _("normal");
QTest::newRow("umlaut") << _("NöRMÄl") << _("normal");
QTest::newRow("ligature1") << _("finish") << _("finish");
QTest::newRow("ligature2") << _("off") << _("off");
}
void testNormalize()
......
......@@ -373,13 +373,16 @@ static bool isSameTouristAttraction(const TouristAttraction &lhs, const TouristA
}
// compute the "difference" between @p lhs and @p rhs
static QString diffString(const QString &lhs, const QString &rhs)
static QString diffString(const QString &rawLhs, const QString &rawRhs)
{
const auto lhs = StringUtil::normalize(rawLhs);
const auto rhs = StringUtil::normalize(rawRhs);
QString diff;
// this is just a basic linear-time heuristic; a proper solution would need something more like
// the Levenshtein distance algorithm
for (int i = 0, j = 0; i < lhs.size() || j < rhs.size();) {
if (i < lhs.size() && j < rhs.size() && StringUtil::normalize(lhs[i]) == StringUtil::normalize(rhs[j])) {
if (i < lhs.size() && j < rhs.size() && lhs[i] == rhs[j]) {
++i;
++j;
continue;
......
......@@ -11,28 +11,29 @@
using namespace KItinerary;
QChar StringUtil::normalize(QChar c)
{
    // Fold case first so that upper- and lower-case input end up identical.
    const QChar folded = c.toCaseFolded();

    // Characters without a canonical decomposition are returned as folded.
    if (folded.decompositionTag() != QChar::Canonical) {
        return folded;
    }

    // Keep only the first character of the canonical decomposition and drop
    // the rest (the combining diacritic markers following it).
    // see https://en.wikipedia.org/wiki/Unicode_equivalence
    // see https://en.wikipedia.org/wiki/Combining_character
    const QString parts = folded.decomposition();
    return parts.at(0);
}
/**
 * Returns a normalized copy of @p str.
 *
 * Every character is case-folded first, then:
 *  - a character with a canonical decomposition is replaced by the first
 *    character of that decomposition,
 *  - a Latin letter with a compatibility decomposition (such as a ligature)
 *    is replaced by its whole decomposition, so the result can be longer
 *    than the input,
 *  - any other character is appended in its case-folded form.
 */
QString StringUtil::normalize(QStringView str)
{
    QString out;
    // The input length is only a lower bound: expanded ligatures add characters.
    out.reserve(str.size());
    for (const auto c : str) {
        // case folding
        const auto n = c.toCaseFolded();
        const auto tag = n.decompositionTag();

        // if the character has a canonical decomposition use that and skip the
        // combining diacritic markers following it
        // see https://en.wikipedia.org/wiki/Unicode_equivalence
        // see https://en.wikipedia.org/wiki/Combining_character
        if (tag == QChar::Canonical) {
            out.push_back(n.decomposition().at(0));
        }
        // handle compatibility compositions such as ligatures
        // see https://en.wikipedia.org/wiki/Unicode_compatibility_characters
        else if (tag == QChar::Compat && n.isLetter() && n.script() == QChar::Script_Latin) {
            out.append(n.decomposition());
        }
        // everything else is appended exactly once, in case-folded form
        else {
            out.push_back(n);
        }
    }
    return out;
}
......
......@@ -17,9 +17,6 @@ namespace KItinerary {
/** String normalization and comparison utilities. */
namespace StringUtil
{
/** Convert @p c to case-folded form and remove diacritic marks. */
QChar normalize(QChar c);
/** Strips out diacritics and converts to case-folded form.
* @internal only exported for unit tests
*/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment