Commit e925acc1 authored by Luis Javier Merino, committed by Ahmad Samir
Browse files

Only recognize URIs with balanced parentheses

To prevent URIs inside parentheses from getting extended to the closing
parenthesis, only recognize URIs with balanced parentheses in regname,
path, query and/or fragment.  We still allow unbalanced parentheses in
userInfo, since the postfix @ should prevent most ambiguous situations,
and parentheses can be part of a password.

CCBUG: 455166
parent 6e7cf530
......@@ -37,6 +37,10 @@ void HotSpotFilterTest::testUrlFilterRegex_data()
<< "https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)"
<< "https://foobar" << true;
QTest::newRow("markup_parens") << "[unix-history-repo](https://github.com/dspinellis/unix-history-repo)"
<< "https://github.com/dspinellis/unix-history-repo" << true;
QTest::newRow("markup_with_parens_inside_parens") << "[*Das verrückte Labyrinth*](https://en.wikipedia.org/wiki/Labyrinth_(board_game))"
<< "https://en.wikipedia.org/wiki/Labyrinth_(board_game)" << true;
QTest::newRow("bracket_before") << "[198]http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
......@@ -77,6 +81,15 @@ void HotSpotFilterTest::testUrlFilterRegex_data()
QTest::newRow("query_with_question_marks") << "ldap://[2001:db8::7]/c=GB?objectClass?one"
<< "ldap://[2001:db8::7]/c=GB?objectClass?one" << true;
QTest::newRow("path_with_parens") << "https://en.wikipedia.org/wiki/C_(programming_language)"
<< "https://en.wikipedia.org/wiki/C_(programming_language)" << true;
QTest::newRow("query_with_parens") << "http://en.wikipedia.org/w/index.php?title=Thresholding_(image_processing)&oldid=132306976"
<< "http://en.wikipedia.org/w/index.php?title=Thresholding_(image_processing)&oldid=132306976" << true;
QTest::newRow("fragment_with_parens") << "https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_(Control_Sequence_Introducer)_sequences"
<< "https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_(Control_Sequence_Introducer)_sequences" << true;
QTest::newRow("url_with_lots_of_parens") << "(https://foo(bar(baz(qux)quux)quuux))))"
<< "https://foo(bar(baz(qux)quux)quuux)" << true;
}
void HotSpotFilterTest::testUrlFilterRegex()
......
......@@ -31,8 +31,12 @@ using namespace Konsole;
// - "port" (':1234'), if present, is assumed to be non-empty
// - We don't check the validity of percent-encoded characters
// (e.g. "www.example.com/foo%XXbar")
// - We don't recognize URIs with unbalanced parens in regname, path, query or fragment.
// We do this to prevent URIs inside parentheses from getting extended to the closing
// parenthesis. We still recognize unbalanced parens in userInfo, but the
// postfix @ should prevent most ambiguity.
// All () groups are non-capturing (by using "(?:)" notation)
// All non-recursive () groups are non-capturing (by using "(?:)" notation)
// less bookkeeping on the PCRE engine side
// scheme://
......@@ -42,18 +46,19 @@ static const char scheme_or_www[] = "(?<=^|[\\s\\[\\]()'\"`])(?:www\\.|[a-z][a-z
static const char scheme_or_www_end[] = ")";
// unreserved / pct-encoded / sub-delims
#define COMMON_1 "a-z0-9\\-._~%!$&'()*+,;="
#define COMMON_1 "a-z0-9\\-._~%!$&'*+,;="
// BALANCED_PARENS(CHARS) expands to a pattern fragment that matches one or
// more characters from CHARS, optionally followed by a single parenthesised
// group whose parentheses are balanced. Inside that group, runs of CHARS and
// further nested groups may alternate (an empty "()" is accepted too).
// Nesting is handled by "(?-1)", a relative recursion into the nearest
// preceding capturing group; that is why this one group is capturing instead
// of using the "(?:)" notation. Users of the macro append their own
// quantifier to repeat the whole unit.
#define BALANCED_PARENS(CHARS) "(?:[" CHARS "]++(\\((?:[" CHARS "]++|(?-1))*+\\))?+)"
/* clang-format off */
static const char userInfo[] = "(?:[" COMMON_1 ":" "]++@)?+"; // user:password@
static const char userInfo[] = "(?:[" COMMON_1 ":()" "]++@)?+"; // user:password@
#define IPv6_literal "\\[[0-9a-fA-F:.]++\\]"
static const char host[] = "(?:[" COMMON_1 "]++|" IPv6_literal ")?+"; // www.foo.bar
static const char host[] = "(?:" BALANCED_PARENS(COMMON_1) "++|" IPv6_literal ")?+"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?+"; // :1234
#define COMMON_2 "a-z0-9\\-._~%!$&'()*+,;=:@/"
static const char path[] = "(?:/[" COMMON_2 "]*+)?+"; // /path/to/some/place
static const char query[] = "(?:\\?[" COMMON_2 "?]*+)?+"; // "?somequery=bar"
static const char fragment[] = "(?:#[" COMMON_2 "?]*+)?+"; // "#fragment"
#define COMMON_2 "a-z0-9\\-._~%!$&'*+,;=:@/"
static const char path[] = "(?:/" BALANCED_PARENS(COMMON_2) "*+)?+"; // /path/to/some/place
static const char query[] = "(?:\\?" BALANCED_PARENS(COMMON_2 "?") "*+)?+"; // "?somequery=bar"
static const char fragment[] = "(?:#" BALANCED_PARENS(COMMON_2 "?") "*+)?+"; // "#fragment"
using LS1 = QLatin1String;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment