Commit 1595b341 authored by Kai Uwe Broulik's avatar Kai Uwe Broulik 🍇
Browse files

Search for JSON-LD inside of microdata tree

I've noticed quite often that websites annotate their <body> as WebPage and then have a <script type="application/ld+json"> inside.
We would not find such a tag, since once we start processing a microdata item we only look for more microdata inside it.

Differential Revision: https://phabricator.kde.org/D28921
parent bee118a5
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<div itemscope itemtype="http://schema.org/WebPage">
<h1>Konqi Resort and Spa</h1>
<p>
Discover a 4 star hotel in the heart of Paris.
</p>
<a href="https://www.kde.org">Come visit us</a>
<script type="application/ld+json">
{
"@context": "http://schema.org",
"@type": "Hotel",
"description": "Discover a 4 star hotel in the heart of Paris.",
"image": {
"@type": "ImageObject",
"url": "https://community.kde.org/images.community/thumb/4/40/Mascot_konqi.png/144px-Mascot_konqi.png"
},
"name": "Konqi Resort and Spa",
"url": "https://www.kde.org"
}
</script>
</div>
</body>
</html>
[
{
"@context": "http://schema.org",
"@type": "Hotel",
"description": "Discover a 4 star hotel in the heart of Paris.",
"image": {
"@type": "ImageObject",
"url": "https://community.kde.org/images.community/thumb/4/40/Mascot_konqi.png/144px-Mascot_konqi.png"
},
"name": "Konqi Resort and Spa",
"url": "https://www.kde.org"
}
]
......@@ -29,6 +29,11 @@
using namespace KItinerary;
/**
 * Tells whether @p elem is a JSON-LD script tag, i.e. an element named
 * "script" whose "type" attribute equals "application/ld+json".
 */
static bool isJsonLdTag(const HtmlElement &elem)
{
    // Only inspect the type attribute for actual <script> elements.
    if (elem.name() != QLatin1String("script")) {
        return false;
    }

    const auto scriptType = elem.attribute(QStringLiteral("type"));
    return scriptType == QLatin1String("application/ld+json");
}
static QByteArray fixupJson(const QByteArray &data)
{
auto output(data);
......@@ -112,7 +117,10 @@ static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray
}
} else if (!prop.isEmpty()) {
obj.insert(prop, valueForItemProperty(child));
} else {
// Maybe there is more JSON-LD inside this microdata tree
} else if (isJsonLdTag(child)) {
parseJson(child.content().toUtf8(), result);
} else {
// skip intermediate nodes without Microdata annotations
parseMicroData(child, obj, result);
}
......@@ -123,7 +131,7 @@ static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray
static void extractRecursive(const HtmlElement &elem, QJsonArray &result)
{
// JSON-LD
if (elem.name() == QLatin1String("script") && elem.attribute(QStringLiteral("type")) == QLatin1String("application/ld+json")) {
if (isJsonLdTag(elem)) {
parseJson(elem.content().toUtf8(), result);
return;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment