webarchivecreator.cpp 13 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*
   Copyright (C) 2001 Malte Starostik <malte@kde.org>
   Copyright (C) 2020 Jonathan Marten <jjm@keelhaul.me.uk>

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, see
   <http://www.gnu.org/licenses/>.
*/

#include "webarchivecreator.h"

#include <qdebug.h>
#include <qpixmap.h>
#include <qimage.h>
#include <qapplication.h>
#include <qurl.h>
#include <qtimer.h>
#include <qmimetype.h>
#include <qmimedatabase.h>
#include <qtemporarydir.h>

#ifdef THUMBNAIL_USE_WEBKIT
#include <qwebview.h>
#include <qwebpage.h>
#include <qwebsettings.h>
#include <qnetworkcookie.h>
#else // THUMBNAIL_USE_WEBKIT
#include <qwebengineview.h>
#include <qwebenginepage.h>
#include <qwebengineprofile.h>
#include <qwebenginesettings.h>
#include <qwebenginecookiestore.h>
#endif // THUMBNAIL_USE_WEBKIT

#include <ktar.h>
#include <karchivedirectory.h>

#include "webarchiverdebug.h"


#undef SHOW_RENDER_WINDOW


// Overall time limit, in milliseconds, for the whole thumbnail generation
// process (page loading plus rendering).  If it expires, something is
// assumed to have gone wrong and no thumbnail is produced.
static const int c_completionTimeout = 5000;

// Once the page has loaded, rendering carries on in the background and
// there is no notification of when it has finished.  This delay, in
// milliseconds, allows a reasonable time for that to happen;  when it
// expires the thumbnail image is generated.
static const int c_renderTimeout = 500;

// The size of the pixmap that the rendered page is drawn onto, and the
// zoom factor applied to the web page.  Neither is related to the size
// requested in the call to create();  they are chosen to give a sensible
// rendering of the page (which should work at an effective width of
// 800 pixels).  Qt does not allow a zoom factor smaller than 0.25.
static const QSize c_pixmapSize(400, 600);
static const double c_renderScale = 0.5;


// Plugin entry point with C linkage:  returns a newly allocated
// thumbnail creator for web archives and HTML files.
extern "C"
{
    Q_DECL_EXPORT ThumbCreator *new_creator()
    {
        ThumbCreator *creator = new WebArchiveCreator();
        return creator;
    }
}


// Construct the creator.  No temporary directory exists until create()
// needs to extract a web archive.
WebArchiveCreator::WebArchiveCreator()
    : ThumbCreator(),
      m_tempDir(nullptr)
{
}


// Destroy the creator, releasing the temporary directory object (if one
// was allocated by create()).
WebArchiveCreator::~WebArchiveCreator()
{
    delete m_tempDir;                                   // safe if still nullptr
}


#ifndef THUMBNAIL_USE_WEBKIT
// Cookie filter callback installed on the QWebEngine cookie store.
// It always returns false, regardless of the request.
static bool disallowWebEngineCookies(const QWebEngineCookieStore::FilterRequest &req)
{
    Q_UNUSED(req);
    return false;
}
#endif // THUMBNAIL_USE_WEBKIT


bool WebArchiveCreator::create(const QString &path, int width, int height, QImage &img)
{
    QMimeDatabase db;
    // Only use the file path to look up its MIME type.  Web archives are
    // gzip-compressed tar files, so if the content detection has to be
    // used it may report that.  So a web archive file must have the correct
    // file extension.
    QMimeType mimeType = db.mimeTypeForFile(path, QMimeDatabase::MatchExtension);

    qCDebug(WEBARCHIVERPLUGIN_LOG) << "path" << path;
    qCDebug(WEBARCHIVERPLUGIN_LOG) << "wh" << width << height << "mime" << mimeType.name();

    // We are using QWebEngine here directly, not the WebEnginePart KPart.
    // This means that it will only be able to use the network access methods
    // that it supports internally, effectively 'file' and 'http(s)'.  In particular
    // it does not support any other KIO protocols, including 'tar' which would
    // be needed to look into web archives.  The WebEnginePart interfaces QWebEngine
    // to KIO.
    //
    // One option would be to do the same, i.e. to implement a network access handler
    // or a URL scheme handler that forwards requests to KIO.  However, the random
    // and possible repeated access to the page elements required would mean lots
    // of seeking around in the compressed web archive file.  Therefore, the web
    // archive is first extracted into a temporary directory and then QWebEngine
    // is told to render that.

    QString indexFile = path;				// the main page to render

    if (mimeType.inherits("application/x-webarchive"))	// archive needs to be extracted?
    {
        KTar tar(path);					// auto-detects compression type
        tar.open(QIODevice::ReadOnly);
        const KArchiveDirectory *archiveDir = tar.directory();

        m_tempDir = new QTemporaryDir;
        const QString tempPath = m_tempDir->path();
        if (path.isEmpty())
        {
            qCWarning(WEBARCHIVERPLUGIN_LOG) << "Cannot create temporary directory";
            return (false);
        }

147
        qCDebug(WEBARCHIVERPLUGIN_LOG) << "extracting to tempPath" << tempPath;
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
        archiveDir->copyTo(tempPath, true);		// recursive extract from archive
        tar.close();					// finished with archive file

        const QDir tempDir(tempPath);
        const QStringList entries = tempDir.entryList(QDir::Files|QDir::QDir::NoDotAndDotDot);
        qCDebug(WEBARCHIVERPLUGIN_LOG) << "found" << entries.count() << "entries";

        QString indexHtml;
        for (const QString &name : entries)
        {
            // Look though the extracted archive files to try to identify the
            // HTML page is to be rendered.  If "index.html" or "index.htm" is
            // found, that file is used;  otherwise, the first HTML file that
            // was found is used.
            const QMimeType mime = db.mimeTypeForFile(tempDir.absoluteFilePath(name), QMimeDatabase::MatchExtension);
            if (mime.inherits("text/html"))
            {
                if (name.startsWith("index.", Qt::CaseInsensitive))
                {					// the index HTML file
                    indexHtml = name;
                    break;				// no need to look further
                }
                else if (indexHtml.isEmpty())		// any other HTML file
                {
                    indexHtml = name;
                }
            }
        }

        if (indexHtml.isEmpty())
        {
            qCWarning(WEBARCHIVERPLUGIN_LOG) << "No HTML file found in archive";
            return (false);
        }

        qCDebug(WEBARCHIVERPLUGIN_LOG) << "identified index file" << indexHtml;
        indexFile = tempPath+'/'+indexHtml;
    }

    const QUrl indexUrl = QUrl::fromLocalFile(indexFile);
    qCDebug(WEBARCHIVERPLUGIN_LOG) << "indexUrl" << indexUrl;

#ifdef THUMBNAIL_USE_WEBKIT
    QWebView view;
    connect(&view, &QWebView::loadFinished, this, &WebArchiveCreator::slotLoadFinished);

    QWebSettings *settings = view.settings();
    settings->setThirdPartyCookiePolicy(QWebSettings::AlwaysBlockThirdPartyCookies);
    settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false);
    settings->setAttribute(QWebSettings::LocalContentCanAccessFileUrls, true);
    settings->setAttribute(QWebSettings::ZoomTextOnly, false);
    settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true);
    settings->setAttribute(QWebSettings::NotificationsEnabled, false);
    settings->setAttribute(QWebSettings::JavascriptEnabled, false);
    settings->setAttribute(QWebSettings::JavaEnabled, false);
    settings->setAttribute(QWebSettings::LocalStorageEnabled, false);
    settings->setAttribute(QWebSettings::LocalContentCanAccessRemoteUrls, false);
    settings->setAttribute(QWebSettings::PluginsEnabled, false);
    settings->setAttribute(QWebSettings::AllowRunningInsecureContent, false);
    settings->setAttribute(QWebSettings::PrintElementBackgrounds, true);
    settings->setAttribute(QWebSettings::PrivateBrowsingEnabled, true);

    QWebPage *page = view.page();
    auto *cookieJar = new WebArchiveCreatorCookieJar;
    page->networkAccessManager()->setCookieJar(cookieJar);
#else // THUMBNAIL_USE_WEBKIT
    QWebEngineView view;
    connect(&view, &QWebEngineView::loadFinished, this, &WebArchiveCreator::slotLoadFinished);

    QWebEngineSettings *settings = view.settings();
    settings->setUnknownUrlSchemePolicy(QWebEngineSettings::DisallowUnknownUrlSchemes);
    settings->setAttribute(QWebEngineSettings::JavascriptEnabled, false);
    settings->setAttribute(QWebEngineSettings::LocalStorageEnabled, false);
    settings->setAttribute(QWebEngineSettings::LocalContentCanAccessRemoteUrls, false);
    settings->setAttribute(QWebEngineSettings::PluginsEnabled, false);
    settings->setAttribute(QWebEngineSettings::AutoLoadIconsForPage, false);
    settings->setAttribute(QWebEngineSettings::AllowRunningInsecureContent, false);
    settings->setAttribute(QWebEngineSettings::ShowScrollBars, false);
    settings->setAttribute(QWebEngineSettings::PdfViewerEnabled, false);
    settings->setAttribute(QWebEngineSettings::PrintElementBackgrounds, true);

    QWebEnginePage *page = view.page();
    QWebEngineProfile *profile = page->profile();
    profile->setPersistentCookiesPolicy(QWebEngineProfile::NoPersistentCookies);
    profile->setSpellCheckEnabled(false);
    profile->cookieStore()->setCookieFilter(&disallowWebEngineCookies);
#endif // THUMBNAIL_USE_WEBKIT

    view.resize(c_pixmapSize);
    view.setZoomFactor(c_renderScale);				// 0.25 is the minimum allowed

    m_error = false;
    m_rendered = false;

    view.load(indexUrl);
#ifndef SHOW_RENDER_WINDOW
    view.setAttribute(Qt::WA_ShowWithoutActivating);
    view.setAttribute(Qt::WA_OutsideWSRange);
    view.setWindowFlags(view.windowFlags()|Qt::BypassWindowManagerHint|Qt::FramelessWindowHint);
    view.move(5000, 5000);
#endif
    view.show();

    QTimer::singleShot(c_completionTimeout, this, &WebArchiveCreator::slotProcessingTimeout);
    while (!m_error && !m_rendered) qApp->processEvents(QEventLoop::WaitForMoreEvents);
    qCDebug(WEBARCHIVERPLUGIN_LOG) << "finished loop error?" << m_error;
    if (m_error) return (false);			// load error or timeout

    // Render the HTML page on a bigger pixmap and leave the scaling to the
    // caller.  Looks better than directly scaling with the QPainter (malte).
    QSize pixSize = c_pixmapSize;
    if (pixSize.width()<width || pixSize.height()<height)
    {							// default size is too small
        if ((height*3)>(width*4)) pixSize = QSize(width, (width*4)/3);
        else pixSize = QSize((height*3)/4, height);
    }

    QPixmap pix(pixSize);
    // First fill the pixmap with a light grey background, in case the
    // rendered page does not completely cover it.  If there was an error
    // then we will already have given up above.
    pix.fill(QColor(245, 245, 245));

    view.render(&pix);					// render the view into the pixmap
    view.hide();					// finished with the view and page
#ifdef THUMBNAIL_USE_WEBKIT
    page->setVisibilityState(QWebPage::VisibilityStateHidden);
#else // THUMBNAIL_USE_WEBKIT
276
277

#if QT_VERSION >= QT_VERSION_CHECK(5, 14, 0)
278
    page->setLifecycleState(QWebEnginePage::LifecycleState::Discarded);
279
#endif // QT_VERSION
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#endif // THUMBNAIL_USE_WEBKIT

    img = pix.toImage();				// return the rendered thumbnail
    return (true);
}


// Slot connected to the web view's loadFinished() signal.
//
//   ok   the load status reported by the view
void WebArchiveCreator::slotLoadFinished(bool ok)
{
    qCDebug(WEBARCHIVERPLUGIN_LOG) << "ok?" << ok;

#ifdef THUMBNAIL_USE_WEBKIT
    // With WebKit, 'ok' may be false even though the page load succeeded
    // but the page could only be partially rendered (for example, because
    // of a broken image source link).  So the status is ignored and the
    // page is rendered regardless.
    //
    // WebKit has finished rendering by the time that the loadFinished()
    // signal is delivered, so the bitmap can be rendered immediately.
    slotRenderTimer();
#else // THUMBNAIL_USE_WEBKIT
    if (!ok)
    {
        m_error = true;                                 // report a load failure
        return;
    }

    // WebEngine carries on rendering asynchronously after the loadFinished()
    // signal is delivered, and there is no way to tell when it has finished.
    // So wait for a fixed time and assume that the page is ready after that.
    QTimer::singleShot(c_renderTimeout, this, &WebArchiveCreator::slotRenderTimer);
#endif // THUMBNAIL_USE_WEBKIT
}


// Slot called when the overall completion timer expires.  Setting the
// error flag ends the wait loop in create(), which then fails.
void WebArchiveCreator::slotProcessingTimeout()
{
    m_error = true;
}


// Slot called when the page is considered ready to be drawn.  Setting the
// rendered flag ends the wait loop in create(), which then renders the page.
void WebArchiveCreator::slotRenderTimer()
{
    m_rendered = true;
}


// Report the thumbnail creator flags:  only DrawFrame is returned.
ThumbCreator::Flags WebArchiveCreator::flags() const
{
    return ThumbCreator::DrawFrame;
}


#ifdef THUMBNAIL_USE_WEBKIT

// WebArchiveCreatorCookieJar
//
// A cookie jar that ignores any cookies sent to it and never
// delivers any.

// Construct the cookie jar, passing 'parent' on to the base class.
WebArchiveCreatorCookieJar::WebArchiveCreatorCookieJar(QObject *parent)
    : QNetworkCookieJar(parent)
{
    // nothing else to set up
}

// Never deliver any cookies:  an empty list is returned for every URL.
QList<QNetworkCookie> WebArchiveCreatorCookieJar::cookiesForUrl(const QUrl &url) const
{
    Q_UNUSED(url);
    const QList<QNetworkCookie> noCookies;
    return noCookies;
}

bool WebArchiveCreatorCookieJar::insertCookie(const QNetworkCookie &cookie)
{
    return (false);
}


bool WebArchiveCreatorCookieJar::setCookiesFromUrl(const QList<QNetworkCookie> &cookieList, const QUrl &url)
{
    return (false);
}

#endif // THUMBNAIL_USE_WEBKIT