Commit 40b1fcf4 authored by Ahmad Samir's avatar Ahmad Samir Committed by Kurt Hindenburg
Browse files

Use possessive quantifiers to optimise regex matching performance

See 'ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS' section at:
https://pcre.org/pcre.txt

Thanks to Giuseppe D'Angelo for pointing that out.
parent 3b7e73f5
......@@ -12,35 +12,71 @@ QTEST_GUILESS_MAIN(HotSpotFilterTest)
void HotSpotFilterTest::testUrlFilterRegex_data()
{
QTest::addColumn<QString>("url");
QTest::addColumn<QString>("expectedUrl");
QTest::addColumn<bool>("matchResult");
// A space, \n, or \t before the url to match what happens at runtime,
// i.e. to match "http" but not "foohttp"
QTest::newRow("url_simple") << " https://api.kde.org" << true;
QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098" << true;
QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere" << true;
QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo" << true;
QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere" << true;
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment" << true;
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)" << true;
QTest::newRow("bad_url_no_scheme") << QStringLiteral(" www.kde.org") << false;
QTest::newRow("url_simple") << " https://api.kde.org"
<< "https://api.kde.org" << true;
QTest::newRow("url_with_port") << "\nhttps://api.kde.org:2098"
<< "https://api.kde.org:2098" << true;
QTest::newRow("url_with_path") << "https://api.kde.org/path/to/somewhere"
<< "https://api.kde.org/path/to/somewhere" << true;
QTest::newRow("url_with_query") << "https://user:pass@api.kde.org?somequery=foo"
<< "https://user:pass@api.kde.org?somequery=foo" << true;
QTest::newRow("url_with_port_path") << " https://api.kde.org:2098/path/to/somewhere"
<< "https://api.kde.org:2098/path/to/somewhere" << true;
QTest::newRow("url_with_user_password") << "\thttps://user:blah@api.kde.org"
<< "https://user:blah@api.kde.org" << true;
QTest::newRow("url_with_user_password_port_fragment") << " https://user:blah@api.kde.org:2098#fragment"
<< "https://user:blah@api.kde.org:2098#fragment" << true;
QTest::newRow("url_all_bells") << " https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment"
<< "https://user:pass@api.kde.org:2098/path/to/somewhere?somequery=foo#fragment" << true;
QTest::newRow("uppercase") << " https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md"
<< "https://invent.kde.org/frameworks/ktexteditor/-/blob/master/README.md" << true;
QTest::newRow("markup") << " [https://foobar](https://foobar)"
<< "https://foobar" << true;
QTest::newRow("bracket_before") << "[198]http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("quote_before") << "\"http://www.ietf.org/rfc/rfc2396.txt"
<< "http://www.ietf.org/rfc/rfc2396.txt" << true;
QTest::newRow("file_scheme") << "file:///some/file"
<< "file:///some/file" << true;
QTest::newRow("uppercase_host") << "https://EXAMPLE.com"
<< "https://EXAMPLE.com" << true;
QTest::newRow("uppercase_query") << "https://example.com?fooOpt=barVal"
<< "https://example.com?fooOpt=barVal" << true;
QTest::newRow("uppercase_fragment") << "https://example.com?fooOpt=barVal#FRAG"
<< "https://example.com?fooOpt=barVal#FRAG" << true;
QTest::newRow("www") << " www.kde.org"
<< "www.kde.org" << true;
QTest::newRow("with_comma_in_path") << "https://example.com/foo,bar"
<< "https://example.com/foo,bar" << true;
QTest::newRow("empty_query") << "http://example.com/?"
<< "http://example.com" << true;
QTest::newRow("empty_fragment") << "http://example.com/#"
<< "http://example.com" << true;
}
void HotSpotFilterTest::testUrlFilterRegex()
{
QFETCH(QString, url);
QFETCH(QString, expectedUrl);
QFETCH(bool, matchResult);
const QRegularExpression &regex = Konsole::UrlFilter::FullUrlRegExp;
const QRegularExpressionMatch match = regex.match(url);
// qDebug() << match;
QCOMPARE(match.hasMatch(), matchResult);
if (strcmp(QTest::currentDataTag(), "markup") == 0) {
QCOMPARE(match.capturedView(0), u"https://foobar");
} else if (matchResult) {
QCOMPARE(match.capturedView(0), url.trimmed());
if (matchResult) {
QCOMPARE(match.capturedView(0), expectedUrl);
}
}
......@@ -19,11 +19,17 @@ using namespace Konsole;
// FullUrlRegExp is implemented based on:
// https://datatracker.ietf.org/doc/html/rfc3986
// See above URL for what "unreserved", "pct-encoded" ...etc mean, also
// for the regex used for each part of the url being matched against
// unreserved / pct-encoded / sub-delims
// [a-z0-9\\-._~%!$&'()*+,;=]
// The above string is used in various char[] below
// for the regex used for each part of the url being matched against.
//
// It deviates from rfc3986:
// - We only recognize URIs with authority (even if it is an empty authority)
// - We match URIs starting with 'www.'
// - "userinfo" is assumed to have a single ':' character
// - We _don't_ match IPv6 addresses (e.g. http://[2010:836B:4179::836B:4179])
// or IPvFuture
// - "port" (':1234'), if present, is assumed to be non-empty
// - We don't check the validity of percent-encoded characters
// (e.g. "www.example.com/foo%XXbar")
// All () groups are non-capturing (by using "(?:)" notation)
// less bookkeeping on the PCRE engine side
......@@ -31,32 +37,37 @@ using namespace Konsole;
// scheme://
// - Must start with an ASCII letter, preceeded by any non-word character,
// so "http" but not "mhttp"
static const char scheme[] = "(?<=^|\\s|\\W)(?:[a-z][a-z0-9+\\-.]*://)";
static const char scheme_or_www[] = "(?<=^|[\\s\\[\\]()'\"])(?:www\\.|[a-z][a-z0-9+\\-.]*+://)";
// unreserved / pct-encoded / sub-delims
#define COMMON_1 "a-z0-9\\-._~%!$&'()*+,;="
/* clang-format off */
// user:password@
static const char userInfo[] =
"(?:"
"[a-z0-9\\-._~%!$&'()*+,;=]+?:?"
"[a-z0-9\\-._~%!$&'()*+,;=]+@"
")?";
static const char host[] = "(?:[a-z0-9\\-._~%!$&'()*+,;=]+)"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?"; // :1234
static const char path[] = "(?:[a-zA-Z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // /path/to/some/place
static const char query[] = "(?:\\?[a-z0-9\\-._~%!$&'()*+,;=:@/]+)?"; // "?somequery=bar"
static const char fragment[] = "(?:#[a-z0-9/?]+)?";
"[" COMMON_1 "]+?:?"
"[" COMMON_1 "]++@"
")?+";
static const char host[] = "(?:[" COMMON_1 "]*+)"; // www.foo.bar
static const char port[] = "(?::[0-9]+)?+"; // :1234
#define COMMON_2 "a-z0-9\\-._~%!$&'()*+,;=:@/"
static const char path[] = "(?:/[" COMMON_2 "]+)?+"; // /path/to/some/place
static const char query[] = "(?:\\?[" COMMON_2 "]+)?+"; // "?somequery=bar"
static const char fragment[] = "(?:#[" COMMON_2 "]+)?+"; // "#fragment"
using LS1 = QLatin1String;
/* clang-format off */
const QRegularExpression UrlFilter::FullUrlRegExp(
LS1(scheme)
LS1(scheme_or_www)
+ LS1(userInfo)
+ LS1(host)
+ LS1(port)
+ LS1(path)
+ LS1(query)
+ LS1(fragment)
);
, QRegularExpression::CaseInsensitiveOption);
/* clang-format on */
/////////////////////////////////////////////
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment