Commit 99779f09 authored by Laurent Montel 😁
Browse files

Minor optimization

parent 960dc821
......@@ -73,13 +73,15 @@ void MessageViewerUtilsTest::shouldExtractHtml()
{
QFETCH(QString, input);
QFETCH(QString, output);
const MessageViewer::Util::HtmlMessageInfo processHtml = MessageViewer::Util::processHtml(input);
bool equal = processHtml.htmlSource == output;
if (!equal) {
qDebug() << " processed " << processHtml.htmlSource;
qDebug() << " ref " << output;
QBENCHMARK {
const MessageViewer::Util::HtmlMessageInfo processHtml = MessageViewer::Util::processHtml(input);
bool equal = processHtml.htmlSource == output;
if (!equal) {
qDebug() << " processed " << processHtml.htmlSource;
qDebug() << " ref " << output;
}
QVERIFY(equal);
}
QVERIFY(equal);
}
void MessageViewerUtilsTest::shouldExtractHtml_data()
......@@ -91,16 +93,50 @@ void MessageViewerUtilsTest::shouldExtractHtml_data()
QString output = QStringLiteral("foo");
QTest::newRow("test1") << input << output;
input = QStringLiteral("<html><head></head><body>foo</body></html></div>");
output = QStringLiteral("foo");
QTest::newRow("test2") << input << output;
input = QStringLiteral(
"That's interesting. I don't see new commits or anything relevant to it on the author's releases. I don't actually know why the author uses the other library as they do seem to have similar data... Maybe some other functions that are easier to use.<br><br><br>All the best,<br><br>C<br><br><br>-------- Original Message --------<br>On Mar 3, 2020, 09:56, foo wrote:<blockquote class=\"protonmail_quote\"><br><!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\r\n<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\r\np, li { white-space: pre-wrap; }\r\n</style></head><body>\r\n<p>Hey bla,</p>\r\n<p>&nbsp;</p>\r\n<p>how are things going? Done your PhD?</p>\r\n<p>&nbsp;</p>\r\n<p>On a recent installation I had an issue with the Orthanc-Module, during initialization of the database:</p>\r\n<p><span style=\" font-family:'monospace';\"><br /> from .datetime import DateTime </span><span style=\" font-family:'monospace','Noto Sans';\"><br />'Something' as changed in the setup of timezone data (between December and now so to say).</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">To make the story short, this module pytzdata comes from the pypi-package pytzdata and contains basically the same stuff as pytz.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Except that pendulum and pytzdata is from the same author.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Do you have an idea why he not uses pytz, as everybody else?</span></p>\r\n<p>&nbsp;</p>\r\n<p>Thanks</p>\r\n<p>&nbsp;</p>\r\n<p>-- </p>\r\n<p>T: @coogor</p>\r\n<p>Matrix: @docb:matrix.org</p>\r\n<p>PGP Fingerprint: 2E7F 3A19 A4A4 844A 3D09 7656 822D EB64 A3BA 290D</p>\r\n<p>&nbsp;</p>\r\n<p>http://gnuhealth.ghf2020.org</p></body></html></div>");
"That's interesting. I don't see new commits or anything relevant to it on the author's releases. I don't actually know why the author uses the other library as they do seem to have similar data... Maybe some other functions that are easier to use.<br><br><br>All the best,<br><br>C<br><br><br>-------- Original Message --------<br>On Mar 3, 2020, 09:56, foo wrote:<blockquote class=\"protonmail_quote\"><br><!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\r\n<html><head><meta name=\"qrichtext\" content=\"1\" /><style type=\"text/css\">\r\np, li { white-space: pre-wrap; }\r\n</style></head><body>\r\n<p>Hey bla,</p>\r\n<p>&nbsp;</p>\r\n<p>how are things going? Done your PhD?</p>\r\n<p>&nbsp;</p>\r\n<p>On a recent installation I had an issue with the Orthanc-Module, during initialization of the database:</p>\r\n<p><span style=\" font-family:'monospace';\"><br /> from .datetime import DateTime </span><span style=\" font-family:'monospace','Noto Sans';\"><br />'Something' as changed in the setup of timezone data (between December and now so to say).</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">To make the story short, this module pytzdata comes from the pypi-package pytzdata and contains basically the same stuff as pytz.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Except that pendulum and pytzdata is from the same author.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Do you have an idea why he not uses pytz, as everybody else?</span></p>\r\n<p>&nbsp;</p>\r\n<p>Thanks</p>\r\n<p>&nbsp;</p>\r\n<p>-- </p>\r\n<p>T: @coogor</p>\r\n<p>Matrix: @docb:matrix.org</p>\r\n<p>PGP Fingerprint: 2E7F 3A19 A4A4 844A 3D09 7656 822D EB64 A3BA 290D</p>\r\n<p>&nbsp;</p>\r\n<p>http://gnuhealth.ghf2020.org</p></body></html></div>");
output = QStringLiteral(
"That's interesting. I don't see new commits or anything relevant to it on the author's releases. I don't actually know why the author uses the other library as they do seem to have similar data... Maybe some other functions that are easier to use.<br><br><br>All the best,<br><br>C<br><br><br>-------- Original Message --------<br>On Mar 3, 2020, 09:56, foo wrote:<blockquote class=\"protonmail_quote\"><br><p>Hey bla,</p>\r\n<p>&nbsp;</p>\r\n<p>how are things going? Done your PhD?</p>\r\n<p>&nbsp;</p>\r\n<p>On a recent installation I had an issue with the Orthanc-Module, during initialization of the database:</p>\r\n<p><span style=\" font-family:'monospace';\"><br /> from .datetime import DateTime </span><span style=\" font-family:'monospace','Noto Sans';\"><br />'Something' as changed in the setup of timezone data (between December and now so to say).</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">To make the story short, this module pytzdata comes from the pypi-package pytzdata and contains basically the same stuff as pytz.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Except that pendulum and pytzdata is from the same author.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Do you have an idea why he not uses pytz, as everybody else?</span></p>\r\n<p>&nbsp;</p>\r\n<p>Thanks</p>\r\n<p>&nbsp;</p>\r\n<p>-- </p>\r\n<p>T: @coogor</p>\r\n<p>Matrix: @docb:matrix.org</p>\r\n<p>PGP Fingerprint: 2E7F 3A19 A4A4 844A 3D09 7656 822D EB64 A3BA 290D</p>\r\n<p>&nbsp;</p>\r\n<p>http://gnuhealth.ghf2020.org</p>");
"That's interesting. I don't see new commits or anything relevant to it on the author's releases. I don't actually know why the author uses the other library as they do seem to have similar data... Maybe some other functions that are easier to use.<br><br><br>All the best,<br><br>C<br><br><br>-------- Original Message --------<br>On Mar 3, 2020, 09:56, foo wrote:<blockquote class=\"protonmail_quote\"><br><p>Hey bla,</p>\r\n<p>&nbsp;</p>\r\n<p>how are things going? Done your PhD?</p>\r\n<p>&nbsp;</p>\r\n<p>On a recent installation I had an issue with the Orthanc-Module, during initialization of the database:</p>\r\n<p><span style=\" font-family:'monospace';\"><br /> from .datetime import DateTime </span><span style=\" font-family:'monospace','Noto Sans';\"><br />'Something' as changed in the setup of timezone data (between December and now so to say).</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">To make the story short, this module pytzdata comes from the pypi-package pytzdata and contains basically the same stuff as pytz.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Except that pendulum and pytzdata is from the same author.</span></p>\r\n<p><span style=\" font-family:'monospace','Noto Sans';\">Do you have an idea why he not uses pytz, as everybody else?</span></p>\r\n<p>&nbsp;</p>\r\n<p>Thanks</p>\r\n<p>&nbsp;</p>\r\n<p>-- </p>\r\n<p>T: @coogor</p>\r\n<p>Matrix: @docb:matrix.org</p>\r\n<p>PGP Fingerprint: 2E7F 3A19 A4A4 844A 3D09 7656 822D EB64 A3BA 290D</p>\r\n<p>&nbsp;</p>\r\n<p>http://gnuhealth.ghf2020.org</p>");
QTest::newRow("bug418482") << input << output;
input = QStringLiteral(
"HTML REPLY<br>\nSECOND LINE<br>\n-- <br>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/plain; charset=utf-8\" /></head><body style=\"overflow-wrap:break-word; word-break: break-word;white-space:pre-wrap;\"><div>You wrote:<blockquote style=\"margin: 0.8ex 0pt 0pt 0.8ex; border-left: 1px solid rgb(204, 204, 204); padding-left: 1ex;\">HTML QUOTE\n\nSECOND LINE\n</blockquote></div></body></html>");
"HTML REPLY<br>\nSECOND LINE<br>\n-- <br>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/plain; charset=utf-8\" /></head><body style=\"overflow-wrap:break-word; word-break: break-word;white-space:pre-wrap;\"><div>You wrote:<blockquote style=\"margin: 0.8ex 0pt 0pt 0.8ex; border-left: 1px solid rgb(204, 204, 204); padding-left: 1ex;\">HTML QUOTE\n\nSECOND LINE\n</blockquote></div></body></html>");
output = QStringLiteral("HTML REPLY<br>\nSECOND LINE<br>\n-- <br>\n<div>You wrote:<blockquote style=\"margin: 0.8ex 0pt 0pt 0.8ex; border-left: 1px solid rgb(204, 204, 204); padding-left: 1ex;\">HTML QUOTE\n\nSECOND LINE\n</blockquote></div>");
QTest::newRow("bug419949") << input << output;
//Before
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"empty":
// 0.099 msecs per iteration (total: 51, iterations: 512)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(test1)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"test1":
// 0.10 msecs per iteration (total: 54, iterations: 512)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(bug418482)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"bug418482":
// 0.11 msecs per iteration (total: 58, iterations: 512)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(bug419949)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"bug419949":
// 0.10 msecs per iteration (total: 54, iterations: 512)
//After
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"empty":
// 0.0014 msecs per iteration (total: 95, iterations: 65536)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(test1)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"test1":
// 0.0031 msecs per iteration (total: 51, iterations: 16384)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(test2)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"test2":
// 0.0031 msecs per iteration (total: 51, iterations: 16384)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(bug418482)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"bug418482":
// 0.0095 msecs per iteration (total: 78, iterations: 8192)
//PASS : MessageViewerUtilsTest::shouldExtractHtml(bug419949)
//RESULT : MessageViewerUtilsTest::shouldExtractHtml():"bug419949":
// 0.0046 msecs per iteration (total: 76, iterations: 16384)
}
void MessageViewerUtilsTest::shouldUseCorrectCodec()
......
......@@ -703,16 +703,23 @@ Util::HtmlMessageInfo Util::processHtml(const QString &htmlSource)
{
Util::HtmlMessageInfo messageInfo;
QString s = htmlSource.trimmed();
const int indexDoctype = s.indexOf(QRegularExpression(QStringLiteral("<!DOCTYPE[^>]*>"), QRegularExpression::CaseInsensitiveOption));
static QRegularExpression body = QRegularExpression(QStringLiteral("<body[^>]*>"), QRegularExpression::CaseInsensitiveOption);
static QRegularExpression docTypeRegularExpression = QRegularExpression(QStringLiteral("<!DOCTYPE[^>]*>"), QRegularExpression::CaseInsensitiveOption);
QRegularExpressionMatch match;
const int indexDoctype = s.indexOf(docTypeRegularExpression, 0, &match);
QString textBeforeDoctype;
if (indexDoctype != -1) {
if (indexDoctype > 0) {
textBeforeDoctype = s.left(indexDoctype);
s.remove(textBeforeDoctype);
}
s = s.remove(QRegularExpression(QStringLiteral("^<!DOCTYPE[^>]*>"), QRegularExpression::CaseInsensitiveOption)).trimmed();
s = s.remove(QRegularExpression(QStringLiteral("<html[^>]*>"), QRegularExpression::CaseInsensitiveOption)).trimmed();
if (!match.captured().isEmpty()) {
s = s.remove(match.captured()).trimmed();
}
static QRegularExpression htmlRegularExpression = QRegularExpression(QStringLiteral("<html[^>]*>"), QRegularExpression::CaseInsensitiveOption);
s = s.remove(htmlRegularExpression).trimmed();
// head
s = s.remove(QRegularExpression(QStringLiteral("^<head/>"), QRegularExpression::CaseInsensitiveOption)).trimmed();
static QRegularExpression headEndRegularExpression = QRegularExpression(QStringLiteral("^<head/>"), QRegularExpression::CaseInsensitiveOption);
s = s.remove(headEndRegularExpression).trimmed();
const int startIndex = s.indexOf(QLatin1String("<head>"), Qt::CaseInsensitive);
if (startIndex >= 0) {
const auto endIndex = s.indexOf(QLatin1String("</head>"), Qt::CaseInsensitive);
......@@ -721,7 +728,8 @@ Util::HtmlMessageInfo Util::processHtml(const QString &htmlSource)
messageInfo.htmlSource = htmlSource;
return messageInfo;
}
messageInfo.extraHead = s.mid(startIndex + 6, endIndex - startIndex - 6);
const int index = startIndex + 6;
messageInfo.extraHead = s.mid(index, endIndex - index);
#if QTWEBENGINEWIDGETS_VERSION < QT_VERSION_CHECK(5, 13, 0)
//Remove this hack once https://codereview.qt-project.org/#/c/256100/2 is merged
//Don't allow the content to be refreshed.
......@@ -732,11 +740,13 @@ Util::HtmlMessageInfo Util::processHtml(const QString &htmlSource)
s = s.remove(startIndex, endIndex - startIndex + 7).trimmed();
}
// body
s = s.remove(QRegularExpression(QStringLiteral("<body[^>]*>"), QRegularExpression::CaseInsensitiveOption)).trimmed();
s = s.remove(body).trimmed();
//Some mails have a trailing </div> after </html>
s = s.remove(QRegularExpression(QStringLiteral("</html></div>$"), QRegularExpression::CaseInsensitiveOption)).trimmed();
s = s.remove(QRegularExpression(QStringLiteral("</html>$"), QRegularExpression::CaseInsensitiveOption)).trimmed();
s = s.remove(QRegularExpression(QStringLiteral("</body>$"), QRegularExpression::CaseInsensitiveOption)).trimmed();
static QRegularExpression htmlDivRegularExpression = QRegularExpression(QStringLiteral("(</html></div>|</html>)$"), QRegularExpression::CaseInsensitiveOption);
s = s.remove(htmlDivRegularExpression).trimmed();
//s = s.remove(QRegularExpression(QStringLiteral("</html>$"), QRegularExpression::CaseInsensitiveOption)).trimmed();
static QRegularExpression bodyEndRegularExpression = QRegularExpression(QStringLiteral("</body>$"), QRegularExpression::CaseInsensitiveOption);
s = s.remove(bodyEndRegularExpression).trimmed();
s = textBeforeDoctype + s;
messageInfo.htmlSource = s;
return messageInfo;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment