Commit dea29a32 authored by Volker Krause
Browse files

Create plain text nodes for HTML and PDF content

This makes extractors work that relied on the implicit type conversion
that the old system had special-cased for a few types.

Counter-intuitively, this has practically no performance impact despite
doing the conversion unconditionally: if the parent type is extracted
from, the text conversion comes almost for free (i.e. the full PDF
or HTML parsing has already been done), and if the parent doesn't produce
output, content-based matching for plain text extractors will always
trigger the type conversion anyway.
parent 66dc0997
......@@ -62,7 +62,13 @@ private Q_SLOTS:
QVERIFY(root.location().isNull());
root.processor()->expandNode(root, &engine);
QCOMPARE(root.childNodes().size(), 2);
QCOMPARE(root.childNodes().size(), 3);
auto c3 = root.childNodes()[2];
QVERIFY(!c3.isNull());
QCOMPARE(c3.mimeType(), QLatin1String("text/plain"));
QCOMPARE(c3.content<QString>(), QLatin1String("This is the first page.\nIt contains a PDF 417 barcode.\nThis is the second page.\nIt contains an Aztec code.\n"));
auto c1 = root.childNodes()[0];
QVERIFY(!c1.isNull());
QCOMPARE(c1.mimeType(), QLatin1String("internal/qimage"));
......@@ -95,7 +101,7 @@ private Q_SLOTS:
QCOMPARE(root.mimeType(), QLatin1String("application/pdf"));
root.processor()->expandNode(root, &engine);
QCOMPARE(root.childNodes().size(), 2);
QCOMPARE(root.childNodes().size(), 3);
auto c2 = root.childNodes()[1];
QVERIFY(!c2.isNull());
QCOMPARE(c2.mimeType(), QLatin1String("internal/qimage"));
......
......@@ -12,6 +12,16 @@
},
{
"childNodes": [
{
"childNodes": [
],
"content": "{\n \"@context\": \"http://schema.org\",\n \"@type\": \"FlightReservation\",\n \"reservationNumber\": \"RXJ34P\",\n \"reservationStatus\": \"http://schema.org/Confirmed\",\n \"underName\": {\n \"@type\": \"Person\",\n \"name\": \"Eva Green\"\n },\n \"reservationFor\": {\n \"@type\": \"Flight\",\n \"flightNumber\": \"110\",\n \"airline\": {\n \"@type\": \"Airline\",\n \"name\": \"United\",\n \"iataCode\": \"UA\"\n },\n \"departureAirport\": {\n \"@type\": \"Airport\",\n \"name\": \"San Francisco Airport\",\n \"iataCode\": \"SFO\"\n },\n \"departureTime\": \"2027-03-04T20:15:00-08:00\",\n \"arrivalAirport\": {\n \"@type\": \"Airport\",\n \"name\": \"John F. Kennedy International Airport\",\n \"iataCode\": \"JFK\"\n },\n \"arrivalTime\": \"2027-03-05T06:30:00-05:00\"\n }\n}",
"contextDateTime": "Invalid Date",
"isNull": false,
"mimeType": "text/plain",
"result": [
]
}
],
"content": {
"objectName": "",
......
......@@ -35,6 +35,16 @@
"mimeType": "internal/qimage",
"result": [
]
},
{
"childNodes": [
],
"content": " Akademy Airways\n Boarding Pass\nFrom: Vienna International, Terminal 2\nTo: Milano Malpensa, Terminal 1\nFlight: AK 1996\nGate: A36\nBoarding Time: 15:20\nDeparture Time: 15:45\nArrival Time: 17:20\nPassenger: Dragon, Dr. Konqi\n",
"contextDateTime": "Mon Aug 19 20:23:28 2019 GMT+0200",
"isNull": false,
"mimeType": "text/plain",
"result": [
]
}
],
"content": {
......
......@@ -46,6 +46,16 @@
"mimeType": "internal/qimage",
"result": [
]
},
{
"childNodes": [
],
"content": " Akademy Airways\n Boarding Pass\nFrom: Vienna International, Terminal 2\nTo: Milano Malpensa, Terminal 1\nFlight: AK 1996\nGate: A36\nBoarding Time: 15:20\nDeparture Time: 15:45\nArrival Time: 17:20\nPassenger: Dragon, Dr. Konqi\n",
"contextDateTime": "Mon Aug 19 20:23:28 2019 GMT+0200",
"isNull": false,
"mimeType": "text/plain",
"result": [
]
}
],
"content": {
......
......@@ -7,6 +7,8 @@
#include "htmldocumentprocessor.h"
#include "logging.h"
#include <KItinerary/ExtractorDocumentNodeFactory>
#include <KItinerary/ExtractorEngine>
#include <KItinerary/ExtractorResult>
#include <KItinerary/HtmlDocument>
......@@ -52,6 +54,14 @@ ExtractorDocumentNode HtmlDocumentProcessor::createNodeFromData(const QByteArray
return node;
}
// Expands an HTML document node by attaching a plain text fallback child.
//
// The text is taken from the recursive content of the document's root
// element and wrapped in a new "text/plain" node created through the
// engine's document node factory. That node is then appended to @p node.
void HtmlDocumentProcessor::expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const
{
    // Collect the textual content of the whole document, starting at its root.
    auto *const document = node.content<HtmlDocument*>();
    const auto plainText = document->root().recursiveContent();

    // Wrap the text in a text/plain node and attach it as a child.
    auto textNode = engine->documentNodeFactory()->createNode(plainText, u"text/plain");
    node.appendChild(textNode);
}
static bool isJsonLdTag(const HtmlElement &elem)
{
return elem.name() == QLatin1String("script") && elem.attribute(QStringLiteral("type")) == QLatin1String("application/ld+json");
......
......@@ -17,6 +17,7 @@ class HtmlDocumentProcessor : public ExtractorDocumentProcessor
public:
bool canHandleData(const QByteArray &encodedData, QStringView fileName) const override;
ExtractorDocumentNode createNodeFromData(const QByteArray &encodedData) const override;
void expandNode(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
void preExtract(ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
QJSValue contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const override;
void destroyNode(ExtractorDocumentNode &node) const override;
......
......@@ -129,6 +129,10 @@ void PdfDocumentProcessor::expandNode(ExtractorDocumentNode &node, const Extract
}
}
}
// fallback node for implicit conversion to plain text
auto fallback = engine->documentNodeFactory()->createNode(doc->text(), u"text/plain");
node.appendChild(fallback);
}
QJSValue PdfDocumentProcessor::contentToScriptValue(const ExtractorDocumentNode &node, QJSEngine *engine) const
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment