Commit 6e876efb authored by Volker Krause's avatar Volker Krause
Browse files

Remove the web archiving plugin

Summary: It's only implemented for KHTML and depends heavily on DOM access API.

Reviewers: dfaure, stefanocrocco

Reviewed By: dfaure

Subscribers: kde-doc-english

Tags: #documentation

Maniphest Tasks: T11543

Differential Revision:
parent 642cbc9d
......@@ -1710,14 +1710,6 @@ e-mail to a friend telling her about this wonderful new site.</para>
<guimenuitem>Copy Image</guimenuitem> copies the &URL; of the picture
to the clipboard.</para>
<para>To save a complete web page, including images, select <guimenuitem>
Archive Web Page...</guimenuitem> from the <guimenu>Tools</guimenu> menu.
Note that this feature is provided by a <link linkend="konq-plugin">plugin
</link> and may not have been installed on your system. The web page will be
saved as a single file with a <literal role="extension">.war</literal>
extension and can be opened by <mousebutton>left</mousebutton> clicking
on the filename in &konqueror; running in File Manager mode.</para>
<para>Printing a copy of the page you are viewing is easily done with the
Menubar <menuchoice><guimenu>File</guimenu>
<guimenuitem>Print...</guimenuitem></menuchoice> or <guimenuitem>Print
......@@ -2251,7 +2243,7 @@ are listed in the dialog.</para>
<title>Browser Mode</title>
FIXME 16.12
KHTML: Auto Refresh, HTML Settings, Minitools-Bookmarklets, Translate (disabled), Document Relations, Web Archiver, Search Bar
KHTML: Auto Refresh, HTML Settings, Minitools-Bookmarklets, Translate (disabled), Document Relations, Search Bar
kwebkitpart for kf5 in extragear/base in branch frameworks
WebKit only HTML Settings and Translate (disabled)
WebEngine -> no extensions
......@@ -2382,15 +2374,6 @@ The following menu items show a list of popular browsers such as <guimenuitem>Fi
<term>Archive Web Page (KHTML view only)</term>
<listitem><para>Invoked with <menuchoice><guimenu>Tools</guimenu><guisubmenu>
Archive Web Page</guisubmenu></menuchoice>, this tool creates an archive
(<literal role="extension">.war</literal> ) file containing the web page being
viewed including the images. <mousebutton>Left</mousebutton> click on the
archive file name to view the saved page.
<term>Auto Refresh (KHTML view only)</term>
org.kde.webenginepart webenginepart IDENTIFIER [WEBENGINEPART_LOG]
org.kde.konqueror konqueror IDENTIFIER [KONQUEROR_LOG]
org.kde.konqueror.minitools minitools (konqueror plugin) IDENTIFIER [MINITOOLSPLUGIN_LOG]
org.kde.konqueror.webarchiver webarchiver (konqueror plugin) IDENTIFIER [WEBARCHIVERPLUGIN_LOG]
......@@ -8,7 +8,6 @@ add_subdirectory( kimgalleryplugin )
add_subdirectory( dirfilter )
# TODO add_subdirectory( uachanger )
add_subdirectory( babelfish )
add_subdirectory( webarchiver )
if (Qt5TextToSpeech_FOUND)
find_package(KF5 REQUIRED COMPONENTS KIO Archive KHtml)
########### next target ###############
set(webarchiverplugin_PART_SRCS plugin_webarchiver.cpp archivedialog.cpp )
ecm_qt_declare_logging_category(webarchiverplugin_PART_SRCS HEADER webarchiverdebug.h IDENTIFIER WEBARCHIVERPLUGIN_LOG CATEGORY_NAME org.kde.konqueror.webarchiver)
ki18n_wrap_ui(webarchiverplugin_PART_SRCS archiveviewbase.ui )
add_library(webarchiverplugin MODULE ${webarchiverplugin_PART_SRCS})
target_link_libraries(webarchiverplugin KF5::KHtml KF5::KDELibs4Support KF5::KIOCore KF5::Archive)
########### next target ###############
set(webarchivethumbnail_PART_SRCS webarchivecreator.cpp )
add_library(webarchivethumbnail MODULE ${webarchivethumbnail_PART_SRCS})
target_link_libraries(webarchivethumbnail KF5::KHtml KF5::KIOCore)
install(TARGETS webarchivethumbnail DESTINATION ${KDE_INSTALL_PLUGINDIR} )
########### install files ###############
install( FILES plugin_webarchiver.rc plugin_webarchiver.desktop DESTINATION ${KDE_INSTALL_DATADIR}/khtml/kpartplugins )
install( FILES webarchivethumbnail.desktop DESTINATION ${KDE_INSTALL_KSERVICES5DIR} )
THEME hicolor
#! /bin/sh
$EXTRACTRC *.rc *.ui >> rc.cpp
$XGETTEXT *.cpp -o $podir/webarchiver.pot
This diff is collapsed.
Copyright (C) 2003 Antonio Larrosa <>
Copyright (C) 2008 Matthias Grimrath <>
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
#include <kdialog.h>
#include <kio/job_base.h>
#include <qlinkedlist.h>
#include <dom/dom_core.h>
#include <dom/html_document.h>
#include "ui_archiveviewbase.h"
class QWidget;
class KHTMLPart;
class ArchiveViewBase;
class QUrl;
class KTar;
class QTextStream;
class ArchiveViewBase : public QWidget, public Ui::ArchiveViewBase
ArchiveViewBase(QWidget *parent) : QWidget(parent)
/// Does all the hard work of downloading, manipulating and storing of
/// HTML files and inlined images, stylesheets ...
class ArchiveDialog : public KDialog
ArchiveDialog(QWidget *parent, const QString &targetFilename, KHTMLPart *part);
~ArchiveDialog() override;
void archive();
/// Holds attributes that are not #CDATA
class NonCDataAttr : public QSet<QString>
static NonCDataAttr non_cdata_attr;
KIO::Job *startDownload(const QUrl &url, KHTMLPart *part);
// Frame handling
typedef QHash<QString, KHTMLPart *> Name2Part;
typedef QHash<QUrl, KHTMLPart *> URL2Part;
struct PartFrameData {
Name2Part framesWithName;
URL2Part framesWithURLOnly;
typedef QHash< KHTMLPart *, PartFrameData > FramesInPart;
typedef QHash< QString, KHTMLPart * > TarName2Part;
typedef QHash< KHTMLPart *, QString > Part2TarName;
// Stylesheets
typedef QHash< QUrl, DOM::CSSStyleSheet > CSSURLSet;
typedef QHash< QString, QUrl > RawHRef2FullURL;
typedef QHash< DOM::CSSStyleSheet, RawHRef2FullURL > URLsInStyleSheet;
typedef QHash< DOM::Element, RawHRef2FullURL > URLsInStyleElement;
typedef QHash< DOM::Node, DOM::CSSStyleSheet > Node2StyleSheet;
// Recursive parsing and processing
/// Databag to hold information that is gathered during recursive traversal of the DOM tree
struct RecurseData {
KHTMLPart *const part;
QTextStream *const textStream;
PartFrameData *const partFrameData;
DOM::HTMLDocument document;
bool baseSeen;
RecurseData(KHTMLPart *_part, QTextStream *_textStream, PartFrameData *pfd);
struct DownloadInfo {
QString tarName;
KHTMLPart *part;
DownloadInfo(const QString &_tarName = QString::null, KHTMLPart *_part = nullptr)
: tarName(_tarName), part(_part) { }
typedef QMap< QUrl, DownloadInfo > UrlTarMap;
typedef QList< UrlTarMap::Iterator > DownloadList;
struct AttrElem {
QString name;
QString value;
AttrElem() { }
AttrElem(const QString &_n, const QString &_v) : name(_n), value(_v) { }
typedef QLinkedList< AttrElem > AttrList;
* Looks for URLs contained in attributes.
struct ExtractURLs {
ExtractURLs(const QString &nodeName, const DOM::Element &element);
AttrList attrList; /// copy of the attribute of @p element
AttrList::iterator absURL; /// for links ala &lt;a href= ... &gt;
AttrList::iterator transURL; /// for embedded objects like &lt;img src=...&gt;, favicons, background-images...
AttrList::iterator frameURL; /// if @p element contains a frameURL
AttrList::iterator frameName; /// if it is frame tag with a name element
AttrList::iterator cssURL; /// for URLs that specify CSS
void downloadObjects();
void downloadStyleSheets();
void saveWebpages();
void finishedArchiving(bool tarerror);
void endProgressInfo(bool error);
void obtainURLs();
void obtainURLsLower(KHTMLPart *part, int level);
void obtainPartURLsLower(const DOM::Node &pNode, int level, RecurseData &data);
void obtainStyleSheetURLsLower(DOM::CSSStyleSheet styleSheet, RecurseData &data);
bool insertTranslateURL(const QUrl &fullURL, RecurseData &data);
bool insertHRefFromStyleSheet(const QString &hrefRaw, RawHRef2FullURL &raw2full,
const QUrl &fullURL, RecurseData &data);
void parseStyleDeclaration(const QUrl &baseURL, DOM::CSSStyleDeclaration decl,
RawHRef2FullURL &urls, RecurseData &data /*, bool verbose = false*/);
bool saveTopFrame();
bool saveFrame(KHTMLPart *part, int level);
void saveHTMLPart(RecurseData &data);
void saveHTMLPartLower(const DOM::Node &pNode, int indent, RecurseData &data);
QString extractCSSURL(const QString &text);
QString &changeCSSURLs(QString &text, const RawHRef2FullURL &raw2full);
static bool hasAttrWithValue(const DOM::Element &elem, const QString &attrName, const QString &attrValue);
static bool hasChildNode(const DOM::Node &pNode, const QString &nodeName);
static AttrList::Iterator getAttribute(AttrList &attrList, const QString &attr);
static bool hasSubUrl(const QUrl &url);
* Completes a potentially partial URL in an HTML document (like &lt;img src="...")
* to a fully qualified one.
* It uses the URL of the document or the URL given in the &lt;base ...&gt;
* element, depending on if and where a &lt;base ...&gt; appears in the document.
* Always use this method to get full URLs from href's or similar.
* Suppose the URL of the webpage is http://host.nowhere/. The head looks like this
* <pre>
* &lt;head&gt;
* &lt;link rel="stylesheet" href="style1.css" type="text/css" /&gt;
* &lt;base href="" /&gt;
* &lt;link rel="stylesheet" href="style2.css" type="text/css" /&gt;
* &lt;/head&gt;
* </pre>
* The full URL of "style1.css" is http://host.nowhere/style1.css, whereas
* "style2.css" will become
* @return fully qualified URL of @p partURL relative to the HTML document in @c data.part
static QUrl absoluteURL(const QString &partURL, RecurseData &data);
* TODO KDE4: is this functionality available in KHTML now?
* Functionality taken from khtml/css/csshelper.cpp:parseURL
* Filters a href in an element inside the HTML body. This handles
* quirks in browsers that filter out \\n, \\r in URLs.
static QString parseURL(const QString &rawurl);
* Creates unique filenames to be used in the tar archive
QString uniqTarName(const QString &suggestion, KHTMLPart *part);
* Taken from khtml/misc/loader.cpp DOCLOAD_SECCHECK
* Would be better on the public interface of KHTMLPart (or similar)
* Checks if an embedded link like &lt;img src=&quot;...&quot; should be loaded
static bool urlCheckFailed(KHTMLPart *part, const QUrl &fullURL);
* Escapes HTML characters. Does not forget " as @ref Qt::escape() does.
QString escapeHTML(const QString &in);
* Adds a suffix that hints at the mimetypes if such a suffix is not
* present already. If there is no such mimetype in the KDE database
* @p filename is returned unchanged.
* 'filename' -> 'filename.gif'
* 'picture.jpg' -> 'picture.jpg'
* NOTE This function is rather slow
QString appendMimeTypeSuffix(QString filename, const QString &mimetype);
KHTMLPart *m_top;
FramesInPart m_framesInPart;
UrlTarMap m_url2tar;
TarName2Part m_tarName2part;
Part2TarName m_part2tarName;
CSSURLSet m_cssURLs;
URLsInStyleSheet m_URLsInStyleSheet;
URLsInStyleElement m_URLsInStyleElement;
Node2StyleSheet m_topStyleSheets;
KIO::Job *m_job;
CSSURLSet::Iterator m_styleSheets_it;
DownloadList m_objects;
DownloadList::Iterator m_objects_it;
UrlTarMap::Iterator m_dlurl2tar_it;
int m_uniqId;
KTar *m_tarBall;
QDateTime m_archiveTime;
QString m_filename;
ArchiveViewBase *m_widget;
private slots:
void slotObjectFinished(KJob *job);
void slotStyleSheetFinished(KJob *job);
void slotButtonClicked(int button) override;
<ui version="4.0" >
<widget class="QWidget" name="ArchiveViewBase" >
<property name="geometry" >
<property name="windowTitle" >
<string>Web Archiver</string>
<layout class="QVBoxLayout" >
<layout class="QGridLayout" >
<property name="leftMargin" >
<property name="topMargin" >
<property name="rightMargin" >
<property name="bottomMargin" >
<item row="1" column="1" >
<widget class="QLabel" name="targetLabel" >
<property name="sizePolicy" >
<sizepolicy vsizetype="Minimum" hsizetype="Expanding" >
<property name="text" >
<string>Local File</string>
<property name="wordWrap" >
<property name="openExternalLinks" >
<property name="textInteractionFlags" >
<item row="1" column="0" >
<widget class="QLabel" name="textLabel1_2" >
<property name="text" >
<property name="wordWrap" >
<item row="0" column="0" >
<widget class="QLabel" name="textLabel1" >
<property name="text" >
<property name="wordWrap" >
<item row="0" column="1" >
<widget class="QLabel" name="urlLabel" >
<property name="sizePolicy" >
<sizepolicy vsizetype="Minimum" hsizetype="Expanding" >
<property name="text" >
<string>Original URL</string>
<property name="wordWrap" >
<property name="openExternalLinks" >
<property name="textInteractionFlags" >
<widget class="QProgressBar" name="progressBar" />
<widget class="QTreeWidget" name="progressView" >
<property name="rootIsDecorated" >
<property name="columnCount" >
<property name="text" >
<property name="text" >
<layoutdefault spacing="6" margin="11" />
/* This file is part of Webarchiver
* Copyright (C) 2001 by Andreas Schlapbach <>
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
/* $Id$ */
* There are two recursions within this code:
* - Recursively create DOM-Tree for referenced links which get recursively
* converted to HTML
* => This code has the potential to download whole sites to a TarGz-Archive
//#define DEBUG_WAR
#include "plugin_webarchiver.h"
#include <QDir>
#include <QFile>
#include <QIcon>
#include <QUrl>
#include <kcomponentdata.h>
#include <kfiledialog.h>
#include <kmessagebox.h>
#include <KLocalizedString>
#include <khtmlview.h>
#include <khtml_part.h>
#include <kpluginfactory.h>
#include <kactioncollection.h>
#include <kstandardguiitem.h>
#include <QStandardPaths>
#include "archivedialog.h"
K_PLUGIN_FACTORY(PluginWebArchiverFactory, registerPlugin<PluginWebArchiver>();)
PluginWebArchiver::PluginWebArchiver(QObject *parent,
const QVariantList &)
: Plugin(parent)
QAction *a = actionCollection()->addAction(QStringLiteral("archivepage"));
a->setText(i18n("Archive &Web Page..."));
connect(a, SIGNAL(triggered()), this, SLOT(slotSaveToArchive()));
void PluginWebArchiver::slotSaveToArchive()
// ## Unicode ok?
if (!parent() || !parent()->inherits("KHTMLPart")) {
KHTMLPart *part = qobject_cast<KHTMLPart *>(parent());
QString archiveName = QString::fromUtf8(part->htmlDocument().title().string().toUtf8());
if (archiveName.isEmpty()) {
archiveName = i18n("Untitled");
KConfig config(QStringLiteral("webarchiverrc"), KConfig::SimpleConfig);
KConfigGroup configGroup ="Recent");
archiveName = archiveName.simplified();
archiveName.replace(QLatin1String("\\s:"), QLatin1String(" ")); // what is this intended to do?
archiveName.replace(QLatin1String("?"), QLatin1String(""));
archiveName.replace(QLatin1String(":"), QLatin1String(""));
archiveName.replace(QLatin1String("/"), QLatin1String(""));
// Replace space with underscore, proposed Frank Pieczynski <>
archiveName = archiveName.replace(QRegExp("\\s+"), QStringLiteral("_"));
QString lastCWD = configGroup.readPathEntry("savedialogcwd",
archiveName = lastCWD + "/" + archiveName + ".war";
QUrl url = KFileDialog::getSaveUrl(QUrl::fromLocalFile(archiveName), i18n("*.war *.tgz|Web Archives"), part->widget(),
i18n("Save Page as Web-Archive"));
if (url.isEmpty()) {
if (!url.isValid()) {
const QString title = i18nc("@title:window", "Invalid URL");
const QString text = i18n("The URL\n%1\nis not valid.", url.toString());
KMessageBox::sorry(part->widget(), text, title);