structureddataextractor.cpp 5.87 KB
Newer Older
/*
   SPDX-FileCopyrightText: 2017 Volker Krause <vkrause@kde.org>

   SPDX-License-Identifier: LGPL-2.0-or-later
*/

7
#include "structureddataextractor_p.h"
#include "htmldocument.h"
#include "logging.h"

#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QString>
#include <QUrl>

#include <cctype>

17 18
using namespace KItinerary;

19 20 21 22 23
// Returns true if @p elem is a <script> element whose "type" attribute is
// exactly "application/ld+json".
static bool isJsonLdTag(const HtmlElement &elem)
{
    if (elem.name() != QLatin1String("script")) {
        return false;
    }
    return elem.attribute(QStringLiteral("type")) == QLatin1String("application/ld+json");
}

24
// Best-effort repair of a few known kinds of malformed JSON.
// Returns a patched copy of @p data, or an empty array if @p data is empty.
// The result is not guaranteed to be valid JSON, the caller still has to parse it.
static QByteArray fixupJson(const QByteArray &data)
{
    if (data.isEmpty()) {
        return {};
    }
    auto output(data);

    // Eurowings doesn't put a comma between objects in top-level arrays...
    output.replace("}{", "},{");

    // Volotea doesn't put square brackets in top level arrays...
    // Decide based on the first/last non-whitespace byte, so that an array that is
    // merely surrounded by whitespace doesn't get wrapped into a second array.
    const auto trimmed = output.trimmed();
    if (!trimmed.startsWith('[') && !trimmed.endsWith(']')) {
        output.prepend("[");
        output.append("]");
    }

    // Eventbrite adds commas where there shouldn't be one...
    // For every '",\n' sequence, skip the following whitespace and blank out the comma
    // if the next significant character closes the object.
    for (int idx = output.indexOf("\",\n"); idx > 0 && idx + 3 < output.size(); idx = output.indexOf("\",\n", idx)) {
        const auto comma = idx + 1;
        idx += 3;
        // std::isspace() needs a value representable as unsigned char, plain char can be
        // negative for non-ASCII (e.g. UTF-8) bytes
        while (idx < output.size() && std::isspace(static_cast<unsigned char>(output.at(idx)))) {
            ++idx;
        }
        if (idx < output.size() && output.at(idx) == '}') {
            output[comma] = ' ';
        }
    }

    return output;
}
54

55
static void parseJson(const QByteArray &data, QJsonArray &result)
56 57
{
    QJsonParseError error;
58
    auto jsonDoc = QJsonDocument::fromJson(data, &error);
59
    if (jsonDoc.isNull()) {
60 61 62 63 64
        if (error.error != QJsonParseError::NoError) {
            // try to fix up common JSON encoding errors
            jsonDoc = QJsonDocument::fromJson(fixupJson(data));
        }
        if (jsonDoc.isNull()) {
Volker Krause's avatar
Volker Krause committed
65 66
            qCDebug(Log).noquote() << data;
            qCDebug(Log) << error.errorString() << "at offset" << error.offset;
67 68
            return;
        }
69 70
    }
    if (jsonDoc.isArray()) {
71 72
        const auto jsonArray = jsonDoc.array();
        std::copy(jsonArray.begin(), jsonArray.end(), std::back_inserter(result));
73
    } else if (jsonDoc.isObject()) {
74
        result.push_back(jsonDoc.object());
75 76
    }
}
77

78
// Determines the value of the Microdata property carried by @p elem.
// Depending on the element name this is taken from an attribute
// (meta: content, time: datetime, link/a/img: first of href, content, src)
// and otherwise from the recursive content of the element.
static QString valueForItemProperty(const HtmlElement &elem)
{
    // TODO see https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/itemprop#Values
    const auto tag = elem.name();

    if (tag == QLatin1String("meta")) {
        return elem.attribute(QStringLiteral("content"));
    }
    if (tag == QLatin1String("time")) {
        return elem.attribute(QStringLiteral("datetime"));
    }

    const bool isLinkLike = tag == QLatin1String("link") || tag == QLatin1Char('a') || tag == QLatin1String("img");
    if (isLinkLike) {
        if (elem.hasAttribute(QStringLiteral("href"))) {
            return elem.attribute(QStringLiteral("href"));
        }
        if (elem.hasAttribute(QStringLiteral("content"))) {
            return elem.attribute(QStringLiteral("content"));
        }
        if (elem.hasAttribute(QStringLiteral("src"))) {
            return elem.attribute(QStringLiteral("src"));
        }
    }

    // everything else, as well as link-like elements without a usable attribute
    return elem.recursiveContent();
}

104
static void parseMicroData(const HtmlElement &elem, QJsonObject &obj, QJsonArray &result)
105 106 107
{
    auto child = elem.firstChild();
    while (!child.isNull()) {
Laurent Montel's avatar
Laurent Montel committed
108 109
        const auto prop = child.attribute(QStringLiteral("itemprop"));
        const auto type = child.attribute(QStringLiteral("itemtype"));
110
        if (type.startsWith(QLatin1String("http://schema.org/"))) {
111
            QJsonObject subObj;
112
            parseMicroData(child, subObj, result);
113 114
            const QUrl typeUrl(type);
            subObj.insert(QStringLiteral("@type"), typeUrl.fileName());
115 116 117 118 119
            if (prop.isEmpty()) {
                result.push_back(subObj); // stand-alone object that just happens to be nested
            } else {
                obj.insert(prop, subObj);
            }
120 121
        } else if (!prop.isEmpty()) {
            obj.insert(prop, valueForItemProperty(child));
122 123 124 125
        // Maybe there is more JSON-LD inside this microdata tree
        } else if (isJsonLdTag(child)) {
            parseJson(child.content().toUtf8(), result);
        } else {
126
            // skip intermediate nodes without Microdata annotations
127
            parseMicroData(child, obj, result);
128 129 130 131 132
        }
        child = child.nextSibling();
    }
}

133
static void extractRecursive(const HtmlElement &elem, QJsonArray &result)
134 135
{
    // JSON-LD
136
    if (isJsonLdTag(elem)) {
137 138 139 140 141
        parseJson(elem.content().toUtf8(), result);
        return;
    }

    // Microdata
Laurent Montel's avatar
Laurent Montel committed
142
    const auto itemType = elem.attribute(QStringLiteral("itemtype"));
143
    if (itemType.startsWith(QLatin1String("http://schema.org/"))) {
144
        QJsonObject obj;
145
        parseMicroData(elem, obj, result);
146 147 148 149 150 151 152
        if (obj.isEmpty()) {
            return;
        }

        const QUrl typeUrl(itemType);
        obj.insert(QStringLiteral("@type"), typeUrl.fileName());

Laurent Montel's avatar
Laurent Montel committed
153
        const auto itemProp = elem.attribute(QStringLiteral("itemprop"));
154
        if (!itemProp.isEmpty() && !result.isEmpty()) {
Yuri Chornoivan's avatar
Yuri Chornoivan committed
155
            // this is likely a child of our preceding sibling, but broken XML put it here
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
            auto parent = result.last().toObject();
            parent.insert(itemProp, obj);
            result[result.size() - 1] = parent;
        } else {
            obj.insert(QStringLiteral("@context"), QStringLiteral("http://schema.org"));
            result.push_back(obj);
        }
        return;
    }

    // recurse otherwise
    auto child = elem.firstChild();
    while (!child.isNull()) {
        extractRecursive(child, result);
        child = child.nextSibling();
    }
}

// Extracts all structured data from @p doc, which must not be null.
// Returns an empty array if the document has no root element.
QJsonArray StructuredDataExtractor::extract(HtmlDocument *doc)
{
    Q_ASSERT(doc);

    QJsonArray result;
    const auto &root = doc->root();
    if (!root.isNull()) {
        extractRecursive(root, result);
    }
    return result;
}