Commit f91a1e57 authored by Luigi Toscano's avatar Luigi Toscano

Revert "Port to gumbo"

libxml2 is an existing dependency used elsewhere and, more importantly,
a maintained library, while gumbo is hardly even packaged and
is (as usual) Google abandonware.

Moreover, the change itself (new dependency) should have been
communicated to the sysadmin. But really, it's not worth it.

This reverts commit f4605e5b.
parent 92ce602a
......@@ -59,8 +59,11 @@ set_package_properties(Xapian PROPERTIES
URL "https://xapian.org/"
TYPE REQUIRED)
find_package(PkgConfig REQUIRED)
pkg_search_module(gumbo REQUIRED IMPORTED_TARGET gumbo)
find_package(LibXml2 REQUIRED)
set_package_properties(LibXml2 PROPERTIES
DESCRIPTION "Support for extracting text from HTML documents"
URL "http://www.xmlsoft.org/"
TYPE REQUIRED)
add_definitions(
-DQT_USE_QSTRINGBUILDER
......
include_directories(
${XAPIAN_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
)
# Xapian does not like signals/slots #define's
......@@ -15,7 +16,7 @@ set(khc_xapianindexer_SOURCES
add_executable(khc_xapianindexer ${khc_xapianindexer_SOURCES})
kde_target_enable_exceptions(khc_xapianindexer PRIVATE)
ecm_mark_nongui_executable(khc_xapianindexer)
target_link_libraries(khc_xapianindexer KF5::DocTools Qt5::Core KF5::Archive KF5::CoreAddons ${XAPIAN_LIBRARIES} PkgConfig::gumbo)
target_link_libraries(khc_xapianindexer KF5::DocTools Qt5::Core KF5::Archive KF5::CoreAddons ${XAPIAN_LIBRARIES} ${LIBXML2_LIBRARIES})
install(TARGETS khc_xapianindexer DESTINATION ${LIBEXEC_INSTALL_DIR})
# Xapian search
......
// This file is part of the KDE Help Center.
//
// Extracts the text content and title of a HTML document.
//
//
// Derived from the Gumbo library example code:
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
#include <QByteArray>
#include <gumbo.h>
static QByteArray cleanText( GumboNode* node )
{
if ( node->type == GUMBO_NODE_TEXT ) {
return QByteArray( node->v.text.text );
}
if ( node->type != GUMBO_NODE_ELEMENT ) {
return "";
}
if ( node->v.element.tag == GUMBO_TAG_SCRIPT ) {
return "";
}
if ( node->v.element.tag == GUMBO_TAG_STYLE ) {
return "";
}
/*
This file is part of the KDE Help Center
QByteArray contents = "";
GumboVector* children = &node->v.element.children;
for ( size_t i = 0; i < children->length; ++i ) {
GumboNode* child = reinterpret_cast<GumboNode*>( children->data[i] );
const QByteArray text = cleanText( child );
Copyright (c) 2016 Pino Toscano <pino@kde.org>
if ( i != 0 && !text.isEmpty() ) {
contents.append( " " );
}
contents.append( text );
}
return contents;
}
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
static QByteArray findTitle( const GumboNode* root )
{
if ( root->type != GUMBO_NODE_ELEMENT ) {
return "";
}
if ( root->v.element.children.length < 2 ) {
return "";
}
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
const GumboVector* root_children = &root->v.element.children;
GumboNode* head = nullptr;
for ( size_t i = 0; i < root_children->length; ++i ) {
GumboNode* child = reinterpret_cast<GumboNode*>( root_children->data[i] );
if ( child->type == GUMBO_NODE_ELEMENT &&
child->v.element.tag == GUMBO_TAG_HEAD ) {
head = child;
break;
}
}
if ( head == nullptr ) {
return "";
}
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
GumboVector* head_children = &head->v.element.children;
for ( size_t i = 0; i < head_children->length; ++i ) {
GumboNode* child = reinterpret_cast<GumboNode*>( head_children->data[i] );
if ( child->type != GUMBO_NODE_ELEMENT ||
child->v.element.tag == GUMBO_TAG_TITLE ) {
continue;
}
#include "htmltextdump.h"
#include <QLoggingCategory>
#include <libxml/HTMLparser.h>
if ( child->v.element.children.length != 1 ) {
return "";
namespace {
Q_LOGGING_CATEGORY( LOG, "org.kde.khelpcenter.xapian.htmltextdump", QtWarningMsg )
// Scoped owner of a libxml2 HTML document: the wrapped document is handed to
// xmlFreeDoc() when the wrapper is destroyed.
//
// Copying is disabled on purpose: a copy would hold the same raw pointer and
// pass it to xmlFreeDoc() a second time from its own destructor.
class HtmlDocPtr {
public:
    // Takes ownership of doc. The pointer may be null (callers test the
    // wrapper with operator bool before using it).
    explicit HtmlDocPtr( htmlDocPtr doc ) : _doc( doc ) {}
    ~HtmlDocPtr() { xmlFreeDoc( _doc ); }

    HtmlDocPtr( const HtmlDocPtr& ) = delete;
    HtmlDocPtr& operator=( const HtmlDocPtr& ) = delete;

    // True when a document is held.
    operator bool() const { return _doc != nullptr; }
    // Lets the wrapper be passed straight to libxml2 functions; ownership
    // stays with the wrapper.
    operator htmlDocPtr() const { return _doc; }
private:
    htmlDocPtr _doc;
};
}
// Searches node and its following siblings for the first element called
// name and returns that element's first child.
//
// Returns nullptr when no sibling matches; the result is also null when the
// matching element has no children.
static xmlNode* findChildElement( xmlNode *node, const char *name )
{
    const xmlChar *wanted = BAD_CAST name;
    xmlNode *cur = node;
    while ( cur ) {
        const bool isElement = ( cur->type == XML_ELEMENT_NODE );
        if ( isElement && xmlStrcmp( cur->name, wanted ) == 0 ) {
            return cur->children;
        }
        cur = cur->next;
    }
    return nullptr;
}
GumboNode* title_text = reinterpret_cast<GumboNode*>( child->v.element.children.data[0] );
if ( title_text->type != GUMBO_NODE_TEXT &&
title_text->type != GUMBO_NODE_WHITESPACE ) {
return "";
// Appends the content of every text node found under node to *text.
//
// Walks node and its following siblings (via ->next) and recurses into the
// children of each of them. Every text node contributes a single space
// followed by its content.
// NOTE(review): the recursion depth follows the nesting depth of the document.
static void collectText( xmlNode *node, QByteArray *text )
{
for ( xmlNode *n = node; n; n = n->next ) {
if ( n->type == XML_TEXT_NODE ) {
// The buffer obtained from xmlNodeGetContent() is released with xmlFree() below.
xmlChar *content = xmlNodeGetContent( n );
*text += QByteArray( " " ) + QByteArray( reinterpret_cast<char *>( content ) );
xmlFree( content );
}
// NOTE(review): the next line refers to `title_text` and returns a value from a
// void function; it looks like a removed line of the old Gumbo code that the
// diff view interleaved here — confirm.
return QByteArray( title_text->v.text.text );
collectText( n->children, text );
}
// NOTE(review): likewise, this value-returning line does not fit a void function
// and appears to be a removed Gumbo line — confirm.
return "";
}
// Extracts the title and the body text of the HTML document held in data.
//
// data   raw HTML bytes; passed to the parser with "UTF-8" as the encoding.
// title  receives the text collected from the content of <head>/<title>.
// text   receives the text collected from the content of <body>.
//
// Returns false, after logging a warning, when the document cannot be parsed,
// has no root element, or findChildElement() yields nothing for <html> or
// <body>.
// NOTE(review): this span interleaves lines of the removed Gumbo version
// (gumbo_parse, cleanText/findTitle, gumbo_destroy_output,
// `return !text->isEmpty()`) with the libxml2 version; the comments below
// describe the libxml2 path — confirm against the actual file.
bool htmlTextDump( const QByteArray& data, QByteArray *title, QByteArray *text )
{
GumboOutput* output = gumbo_parse( data.constData() );
// Parse in recovery mode, with parser errors and warnings suppressed and the
// HTML_PARSE_NONET option set. HtmlDocPtr frees the document on scope exit.
HtmlDocPtr doc( htmlReadMemory( data.constData(), data.length(), nullptr, "UTF-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET ) );
if ( !doc ) {
qCWarning(LOG) << "cannot parse html";
return false;
}
xmlNode *root = xmlDocGetRootElement( doc );
if ( !root ) {
qCWarning(LOG) << "missing root";
return false;
}
// findChildElement() returns the children of the element it matches, so `html`
// is the first child of <html>, and `head`/`body` below are the first children
// of those elements.
xmlNode *html = findChildElement( root, "html" );
if ( !html ) {
qCWarning(LOG) << "missing <html>";
return false;
}
xmlNode *head = findChildElement( html, "head" );
xmlNode *body = findChildElement( html, "body" );
if ( !body ) {
qCWarning(LOG) << "missing <body>";
return false;
}
*text = cleanText( output->root );
*title = findTitle( output->root );
QByteArray newText;
collectText( body, &newText );
*text = newText;
gumbo_destroy_output( &kGumboDefaultOptions, output );
// The title is optional on this path: *title is only assigned when both <head>
// and <title> yield content.
if ( head ) {
xmlNode *title_node = findChildElement( head, "title" );
if ( title_node ) {
QByteArray newTitle;
collectText( title_node, &newTitle );
*title = newTitle;
}
}
return !text->isEmpty();
return true;
}
......@@ -22,7 +22,7 @@
#ifndef HTMLTEXTDUMP_H
#define HTMLTEXTDUMP_H
class QByteArray;
#include <QByteArray>
// Extracts the title and the text content of the HTML document in data,
// storing them in *title and *text. Returns false when extraction fails.
bool htmlTextDump( const QByteArray& data, QByteArray *title, QByteArray *text );
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment