Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
PIM
KItinerary
Commits
972da43b
Commit
972da43b
authored
Mar 03, 2022
by
Volker Krause
Browse files
Normalize Unicode compatibility characters as well
This expands a number of ligatures.
parent
e6af8c8b
Pipeline
#144792
passed with stages
in 2 minutes and 7 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
autotests/stringutiltest.cpp
View file @
972da43b
...
...
@@ -26,6 +26,8 @@ private Q_SLOTS:
QTest
::
newRow
(
"normalized"
)
<<
_
(
"normal"
)
<<
_
(
"normal"
);
QTest
::
newRow
(
"case-folding"
)
<<
_
(
"NORMAL"
)
<<
_
(
"normal"
);
QTest
::
newRow
(
"umlaut"
)
<<
_
(
"NöRMÄl"
)
<<
_
(
"normal"
);
QTest
::
newRow
(
"ligature1"
)
<<
_
(
"finish"
)
<<
_
(
"finish"
);
QTest
::
newRow
(
"ligature2"
)
<<
_
(
"off"
)
<<
_
(
"off"
);
}
void
testNormalize
()
...
...
src/lib/mergeutil.cpp
View file @
972da43b
...
...
@@ -373,13 +373,16 @@ static bool isSameTouristAttraction(const TouristAttraction &lhs, const TouristA
}
// compute the "difference" between @p lhs and @p rhs
static
QString
diffString
(
const
QString
&
l
hs
,
const
QString
&
rhs
)
static
QString
diffString
(
const
QString
&
rawL
hs
,
const
QString
&
r
awR
hs
)
{
const
auto
lhs
=
StringUtil
::
normalize
(
rawLhs
);
const
auto
rhs
=
StringUtil
::
normalize
(
rawRhs
);
QString
diff
;
// this is just a basic linear-time heuristic; ideally this would be something more like
// the Levenshtein distance algorithm
for
(
int
i
=
0
,
j
=
0
;
i
<
lhs
.
size
()
||
j
<
rhs
.
size
();)
{
if
(
i
<
lhs
.
size
()
&&
j
<
rhs
.
size
()
&&
StringUtil
::
normalize
(
lhs
[
i
]
)
==
StringUtil
::
normalize
(
rhs
[
j
])
)
{
if
(
i
<
lhs
.
size
()
&&
j
<
rhs
.
size
()
&&
lhs
[
i
]
==
rhs
[
j
])
{
++
i
;
++
j
;
continue
;
...
...
src/lib/stringutil.cpp
View file @
972da43b
...
...
@@ -11,28 +11,29 @@
using
namespace
KItinerary
;
QChar
StringUtil
::
normalize
(
QChar
c
)
{
// case folding
const
auto
n
=
c
.
toCaseFolded
();
// if the character has a canonical decomposition use that and skip the
// combining diacritic markers following it
// see https://en.wikipedia.org/wiki/Unicode_equivalence
// see https://en.wikipedia.org/wiki/Combining_character
if
(
n
.
decompositionTag
()
==
QChar
::
Canonical
)
{
return
n
.
decomposition
().
at
(
0
);
}
return
n
;
}
QString
StringUtil
::
normalize
(
QStringView
str
)
{
QString
out
;
out
.
reserve
(
str
.
size
());
for
(
const
auto
c
:
str
)
{
out
.
push_back
(
normalize
(
c
));
// case folding
const
auto
n
=
c
.
toCaseFolded
();
// if the character has a canonical decomposition use that and skip the
// combining diacritic markers following it
// see https://en.wikipedia.org/wiki/Unicode_equivalence
// see https://en.wikipedia.org/wiki/Combining_character
if
(
n
.
decompositionTag
()
==
QChar
::
Canonical
)
{
out
.
push_back
(
n
.
decomposition
().
at
(
0
));
}
// handle compatibility compositions such as ligatures
// see https://en.wikipedia.org/wiki/Unicode_compatibility_characters
else
if
(
n
.
decompositionTag
()
==
QChar
::
Compat
&&
n
.
isLetter
()
&&
n
.
script
()
==
QChar
::
Script_Latin
)
{
out
.
append
(
n
.
decomposition
());
}
else
{
out
.
push_back
(
n
);
}
}
return
out
;
}
...
...
src/lib/stringutil.h
View file @
972da43b
...
...
@@ -17,9 +17,6 @@ namespace KItinerary {
/** String normalization and comparison utilities. */
namespace
StringUtil
{
/** Convert @p c to case-folded form and remove diacritic marks. */
QChar
normalize
(
QChar
c
);
/** Strips out diacritics and converts to case-folded form.
* @internal only exported for unit tests
*/
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment