Commit 5ec3d6a4 authored by Volker Krause
Browse files

Correctly decode HTML MIME nodes which specify codecs in HTML and MIME

We must not apply the same codec twice there: once the MIME layer has decoded
the text, the input handed to the HTML parser is UTF-8, not whatever charset
might be specified in the HTML content-type header.

This is a problem for e.g. Japanese and Korean codecs in particular, where
doing this wrongly completely destroys the content.
parent 962dea47
Pipeline #82618 passed with stage
in 12 minutes and 21 seconds
......@@ -366,4 +366,23 @@ HtmlDocument* HtmlDocument::fromData(const QByteArray &data, QObject *parent)
#endif
}
// Builds an HtmlDocument from an already decoded string.
// The string is re-encoded as UTF-8 and passed to htmlReadMemory() together
// with an explicit "utf-8" encoding argument.
// Returns nullptr when parsing yields no tree or when libxml2 support is not
// compiled in.
HtmlDocument* HtmlDocument::fromString(const QString &data, QObject *parent)
{
#ifndef HAVE_LIBXML2
    Q_UNUSED(data)
    Q_UNUSED(parent)
    return nullptr;
#else
    // Parser options, identical to the set used inline before.
    const int parseOptions = HTML_PARSE_RECOVER
                           | HTML_PARSE_NOERROR
                           | HTML_PARSE_NOWARNING
                           | HTML_PARSE_NOBLANKS
                           | HTML_PARSE_NONET
                           | HTML_PARSE_COMPACT;

    const auto encoded = data.toUtf8();
    auto parsedTree = htmlReadMemory(encoded.constData(), encoded.size(), nullptr, "utf-8", parseOptions);
    if (parsedTree) {
        auto result = new HtmlDocument(parent);
        result->d->m_doc = parsedTree;
        return result;
    }
    return nullptr;
#endif
}
#include "moc_htmldocument.cpp"
......@@ -86,6 +86,10 @@ public:
* @returns @c nullptr if loading fails or libxml was not found.
*/
static HtmlDocument* fromData(const QByteArray &data, QObject *parent = nullptr);
/** Creates a HtmlDocument from a given (unicode) string.
 * The string is converted to UTF-8 and parsed with that encoding given explicitly.
 * @param data the HTML content as a string.
 * @param parent passed to the constructor of the created HtmlDocument.
 * @returns @c nullptr if loading fails or libxml was not found.
 */
static HtmlDocument* fromString(const QString &data, QObject *parent = nullptr);
/** Returns the root element of the document. */
HtmlElement root() const;
......
......@@ -42,9 +42,8 @@ bool HtmlDocumentProcessor::canHandleData(const QByteArray &encodedData, QString
|| fileName.endsWith(QLatin1String(".htm"), Qt::CaseInsensitive);
}
ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const
static ExtractorDocumentNode nodeFromHtml(HtmlDocument *html)
{
auto html = HtmlDocument::fromData(encodedData);
if (!html || html->root().firstChild().isNull()) {
return {};
}
......@@ -54,6 +53,19 @@ ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray
return node;
}
// Parses the encoded bytes with HtmlDocument::fromData() and hands the
// resulting document to nodeFromHtml() to build the node.
ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray &encodedData) const
{
    const auto parsedHtml = HtmlDocument::fromData(encodedData);
    return nodeFromHtml(parsedHtml);
}
// Builds a node from already decoded content.
// String content is parsed via HtmlDocument::fromString(); every other
// variant type is forwarded to the base class implementation.
ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromContent(const QVariant &decodedData) const
{
    if (decodedData.type() != QVariant::String) {
        return ExtractorDocumentProcessor::createNodeFromContent(decodedData);
    }

    const auto htmlText = decodedData.toString();
    return nodeFromHtml(HtmlDocument::fromString(htmlText));
}
void HtmlDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const
{
const auto html = node.content<HtmlDocument*>();
......
......@@ -18,6 +18,7 @@ class HtmlDocumentProcessor : public ExtractorDocumentProcessor
public:
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override;
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override;
ExtractorDocumentNode createNodeFromContent(const QVariant& decodedData) const override;
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
void preExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override;
......
......@@ -120,7 +120,7 @@ static void expandContentNode(ExtractorDocumentNode &node, KMime::Content *conte
if ((ct && ct->isPlainText() && fileName.isEmpty()) || (!ct && content->isTopLevel())) {
child = engine->documentNodeFactory()->createNode(content->decodedText(), u"text/plain");
} else if (ct && ct->isHTMLText()) {
child = engine->documentNodeFactory()->createNode(content->decodedContent(), fileName, u"text/html");
child = engine->documentNodeFactory()->createNode(content->decodedText(), u"text/html");
} else {
child = engine->documentNodeFactory()->createNode(content->decodedContent(), fileName);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment