Commit 65d5d45c authored by Ahmad Samir's avatar Ahmad Samir
Browse files

Use possessive quantifiers to optimise regex matching performance

See 'ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS' section at:
https://pcre.org/pcre.txt

Thanks to Giuseppe D'Angelo for pointing that out.
parent c8d8abd1
......@@ -12,35 +12,71 @@ QTEST_GUILESS_MAIN(HotSpotFilterTest)
void HotSpotFilterTest::testUrlFilterRegex_data()
{
QTest::addColumn<QString>("url");
QTest::addColumn<QString>("expectedUrl");
QTest::addColumn<bool>("matchResult");
// A space, \n, or \t before the url to match what happens at runtime,
// i.e. to match "http" but not "foohttp"
QTest::newRow("url_simple") << " https://api.kde.org" << true;
QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098" << true;
QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere" << true;
QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo" << true;
QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere" << true;
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment" << true;
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)" << true;
QTest::newRow("bad_url_no_scheme") << QStringLiteral(" www.kde.org") << false;
QTest::newRow("url_simple") << " https://api.kde.org"
<< "https://api.kde.org" << true;
QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098"
<< "https://api.kde.org:2098" << true;
QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere"
<< "https://api.kde.org/path/to/somewhere" << true;
QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo"
<< "https://user:pass@api.kde.org?somequery=foo" << true;
QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere"
<< "https://api.kde.org:2098/path/to/somewhere" << true;
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org"
<< "https://user:blah@api.kde.org" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment"
<< "https://user:blah@api.kde.org:2098#fragment" << true;
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment"
<< "https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md"
<< "https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)"
<< "https://foobar" << true;
QTest::newRow("bracket_before") << "[198]http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("quote_before") << "\"http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("file_scheme") << "file:///some/file"
<< "file:///some/file" << true;
QTest::newRow("uppercase_host") << "https://EXAMPLE.com"
<< "https://EXAMPLE.com" << true;
QTest::newRow("uppercase_query") << "https://example.com?fooOpt=barVal"
<< "https://example.com?fooOpt=barVal" << true;
QTest::newRow("uppercase_fragment") << "https://example.com?fooOpt=barVal#FRAG"
<< "https://example.com?fooOpt=barVal#FRAG" << true;
QTest::newRow("www") << " www.kde.org"
<< "www.kde.org" << true;
QTest::newRow("with_comma_in_path") << "https://example.com/foo,bar"
<< "https://example.com/foo,bar" << true;
QTest::newRow("empty_query") << "http://example.com/?"
<< "http://example.com" << true;
QTest::newRow("empty_fragment") << "http://example.com/#"
<< "http://example.com" << true;
}
void HotSpotFilterTest::testUrlFilterRegex()
{
QFETCH(QString, url);
QFETCH(QString, expectedUrl);
QFETCH(bool, matchResult);
const QRegularExpression &regex = Konsole::UrlFilter::FullUrlRegExp;
const QRegularExpressionMatch match = regex.match(url);
// qDebug() << match;
QCOMPARE(match.hasMatch(), matchResult);
if (strcmp(QTest::currentDataTag(), "markup") == 0) {
QCOMPARE(match.capturedView(0), u"https://foobar");
} else if (matchResult) {
QCOMPARE(match.capturedView(0), url.trimmed());
if (matchResult) {
QCOMPARE(match.capturedView(0), expectedUrl);
}
}
......@@ -19,11 +19,17 @@ using namespace Konsole;
// FullUrlRegExp is implemented based on:
// https://datatracker.ietf.org/doc/html/rfc3986
// See above URL for what "unreserved", "pct-encoded" ...etc mean, also
// for the regex used for each part of the url being matched against
// unreserved / pct-encoded / sub-delims
// [a-z0-9\\-._~%!$&'()*+,;=]
// The above string is used in various char[] below
// for the regex used for each part of the url being matched against.
//
// It deviates from rfc3986:
// - We only recognize URIs with authority (even if it is an empty authority)
// - We match URIs starting with 'www.'
// - "userinfo" is assumed to have a single ':' character
// - We _don't_ match IPv6 addresses (e.g. http://[2010:836B:4179::836B:4179])
// or IPvFuture
// - "port" (':1234'), if present, is assumed to be non-empty
// - We don't check the validity of percent-encoded characters
// (e.g. "www.example.com/foo%XXbar")
// All () groups are non-capturing (by using "(?:)" notation)
// less bookkeeping on the PCRE engine side
......@@ -31,32 +37,37 @@ using namespace Konsole;
// scheme://
// - Must start with an ASCII letter, preceeded by any non-word character,
// so "http" but not "mhttp"
static const char scheme[] = "(?<=^|\\s|\\W)(?:[a-z][a-z0-9+\\-.]*://)";
static const char scheme_or_www[] = "(?<=^|[\\s\\[\\]()'\"])(?:www\\.|[a-z][a-z0-9+\\-.]*+://)";
// unreserved / pct-encoded / sub-delims
#define COMMON_1 "a-z0-9\\-._~%!$&'()*+,;="
/* clang-format off */
// user:password@
static const char userInfo[] =
"(?:"
"[a-z0-9\\-._~%!$&'()*+,;=]+?:?"
"[a-z0-9\\-._~%!$&'()*+,;=]+@"
")?";
static const char host[] = "(?:[a-z0-9\\-._~%!$&'()*+,;=]+)"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?"; // :1234
static const char path[] = "(?:[a-zA-Z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // /path/to/some/place
static const char query[] = "(?:\\?[a-z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // "?somequery=bar"
static const char fragment[] = "(?:#[a-z0-9/?]+)?";
"[" COMMON_1 "]+?:?"
"[" COMMON_1 "]++@"
")?+";
static const char host[] = "(?:[" COMMON_1 "]*+)"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?+"; // :1234
#define COMMON_2 "a-z0-9\\-._~%!$&'()*+,;=:@/"
static const char path[] = "(?:/[" COMMON_2 "]+)?+"; // /path/to/some/place
static const char query[] = "(?:\\?[" COMMON_2 "]+)?+"; // "?somequery=bar"
static const char fragment[] = "(?:#[" COMMON_2 "]+)?+"; // "#fragment"
using LS1 = QLatin1String;
/* clang-format off */
const QRegularExpression UrlFilter::FullUrlRegExp(
LS1(scheme)
LS1(scheme_or_www)
+ LS1(userInfo)
+ LS1(host)
+ LS1(port)
+ LS1(path)
+ LS1(query)
+ LS1(fragment)
);
, QRegularExpression::CaseInsensitiveOption);
/* clang-format on */
/////////////////////////////////////////////
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment