Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
System
KHelpCenter
Commits
f4605e5b
Commit
f4605e5b
authored
Oct 21, 2020
by
Martin Tobias Holmedahl Sandsmark
Committed by
Yuri Chornoivan
Oct 24, 2020
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Port to gumbo
A bit faster, lightweight and more robust, and designed for HTML.
parent
3e553421
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
98 additions
and
96 deletions
+98
-96
CMakeLists.txt
CMakeLists.txt
+2
-5
searchhandlers/CMakeLists.txt
searchhandlers/CMakeLists.txt
+1
-2
searchhandlers/htmltextdump.cpp
searchhandlers/htmltextdump.cpp
+94
-88
searchhandlers/htmltextdump.h
searchhandlers/htmltextdump.h
+1
-1
No files found.
CMakeLists.txt
View file @
f4605e5b
...
...
@@ -59,11 +59,8 @@ set_package_properties(Xapian PROPERTIES
URL
"https://xapian.org/"
TYPE REQUIRED
)
find_package
(
LibXml2 REQUIRED
)
set_package_properties
(
LibXml2 PROPERTIES
DESCRIPTION
"Support for extracting text from HTML documents"
URL
"http://www.xmlsoft.org/"
TYPE REQUIRED
)
find_package
(
PkgConfig REQUIRED
)
pkg_search_module
(
gumbo REQUIRED IMPORTED_TARGET gumbo
)
add_definitions
(
-DQT_USE_QSTRINGBUILDER
...
...
searchhandlers/CMakeLists.txt
View file @
f4605e5b
include_directories
(
${
XAPIAN_INCLUDE_DIR
}
${
LIBXML2_INCLUDE_DIR
}
)
# Xapian does not like signals/slots #define's
...
...
@@ -16,7 +15,7 @@ set(khc_xapianindexer_SOURCES
add_executable
(
khc_xapianindexer
${
khc_xapianindexer_SOURCES
}
)
kde_target_enable_exceptions
(
khc_xapianindexer PRIVATE
)
ecm_mark_nongui_executable
(
khc_xapianindexer
)
target_link_libraries
(
khc_xapianindexer KF5::DocTools Qt5::Core KF5::Archive KF5::CoreAddons
${
XAPIAN_LIBRARIES
}
${
LIBXML2_LIBRARIES
}
)
target_link_libraries
(
khc_xapianindexer KF5::DocTools Qt5::Core KF5::Archive KF5::CoreAddons
${
XAPIAN_LIBRARIES
}
PkgConfig::gumbo
)
install
(
TARGETS khc_xapianindexer DESTINATION
${
LIBEXEC_INSTALL_DIR
}
)
# Xapian search
...
...
searchhandlers/htmltextdump.cpp
View file @
f4605e5b
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#include "htmltextdump.h"
#include <QLoggingCategory>
#include <libxml/HTMLparser.h>
namespace
{
Q_LOGGING_CATEGORY
(
LOG
,
"org.kde.khelpcenter.xapian.htmltextdump"
,
QtWarningMsg
)
class
HtmlDocPtr
{
public:
HtmlDocPtr
(
htmlDocPtr
doc
)
:
_doc
(
doc
)
{}
~
HtmlDocPtr
()
{
xmlFreeDoc
(
_doc
);
}
operator
bool
()
const
{
return
_doc
;
}
operator
htmlDocPtr
()
const
{
return
_doc
;
}
private:
htmlDocPtr
_doc
;
};
}
static
xmlNode
*
findChildElement
(
xmlNode
*
node
,
const
char
*
name
)
// This file is part of the KDE Help Center.
//
// Extracts the text content and title of a HTML document.
//
//
// Derived from the Gumbo library example code:
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)
#include <QByteArray>
#include <gumbo.h>
static
QByteArray
cleanText
(
GumboNode
*
node
)
{
for
(
xmlNode
*
n
=
node
;
n
;
n
=
n
->
next
)
{
if
(
n
->
type
==
XML_ELEMENT_NODE
&&
xmlStrcmp
(
n
->
name
,
BAD_CAST
name
)
==
0
)
{
return
n
->
children
;
}
if
(
node
->
type
==
GUMBO_NODE_TEXT
)
{
return
QByteArray
(
node
->
v
.
text
.
text
);
}
if
(
node
->
type
!=
GUMBO_NODE_ELEMENT
)
{
return
""
;
}
if
(
node
->
v
.
element
.
tag
==
GUMBO_TAG_SCRIPT
)
{
return
""
;
}
if
(
node
->
v
.
element
.
tag
==
GUMBO_TAG_STYLE
)
{
return
""
;
}
return
nullptr
;
}
static
void
collectText
(
xmlNode
*
node
,
QByteArray
*
text
)
{
for
(
xmlNode
*
n
=
node
;
n
;
n
=
n
->
next
)
{
if
(
n
->
type
==
XML_TEXT_NODE
)
{
xmlChar
*
content
=
xmlNodeGetContent
(
n
);
*
text
+=
QByteArray
(
" "
)
+
QByteArray
(
reinterpret_cast
<
char
*>
(
content
)
);
xmlFree
(
content
);
QByteArray
contents
=
""
;
GumboVector
*
children
=
&
node
->
v
.
element
.
children
;
for
(
size_t
i
=
0
;
i
<
children
->
length
;
++
i
)
{
GumboNode
*
child
=
reinterpret_cast
<
GumboNode
*>
(
children
->
data
[
i
]
);
const
QByteArray
text
=
cleanText
(
child
);
if
(
i
!=
0
&&
!
text
.
isEmpty
()
)
{
contents
.
append
(
" "
);
}
co
llectText
(
n
->
children
,
text
);
co
ntents
.
append
(
text
);
}
return
contents
;
}
bool
htmlTextDump
(
const
QByteArray
&
data
,
QByteArray
*
title
,
QByteArray
*
tex
t
)
static
QByteArray
findTitle
(
const
GumboNode
*
roo
t
)
{
HtmlDocPtr
doc
(
htmlReadMemory
(
data
.
constData
(),
data
.
length
(),
nullptr
,
"UTF-8"
,
HTML_PARSE_RECOVER
|
HTML_PARSE_NOERROR
|
HTML_PARSE_NOWARNING
|
HTML_PARSE_NONET
)
);
if
(
!
doc
)
{
qCWarning
(
LOG
)
<<
"cannot parse html"
;
return
false
;
if
(
root
->
type
!=
GUMBO_NODE_ELEMENT
)
{
return
""
;
}
xmlNode
*
root
=
xmlDocGetRootElement
(
doc
);
if
(
!
root
)
{
qCWarning
(
LOG
)
<<
"missing root"
;
return
false
;
if
(
root
->
v
.
element
.
children
.
length
<
2
)
{
return
""
;
}
xmlNode
*
html
=
findChildElement
(
root
,
"html"
);
if
(
!
html
)
{
qCWarning
(
LOG
)
<<
"missing <html>"
;
return
false
;
const
GumboVector
*
root_children
=
&
root
->
v
.
element
.
children
;
GumboNode
*
head
=
nullptr
;
for
(
size_t
i
=
0
;
i
<
root_children
->
length
;
++
i
)
{
GumboNode
*
child
=
reinterpret_cast
<
GumboNode
*>
(
root_children
->
data
[
i
]
);
if
(
child
->
type
==
GUMBO_NODE_ELEMENT
&&
child
->
v
.
element
.
tag
==
GUMBO_TAG_HEAD
)
{
head
=
child
;
break
;
}
}
xmlNode
*
head
=
findChildElement
(
html
,
"head"
);
xmlNode
*
body
=
findChildElement
(
html
,
"body"
);
if
(
!
body
)
{
qCWarning
(
LOG
)
<<
"missing <body>"
;
return
false
;
if
(
head
==
nullptr
)
{
return
""
;
}
QByteArray
newText
;
collectText
(
body
,
&
newText
);
*
text
=
newText
;
GumboVector
*
head_children
=
&
head
->
v
.
element
.
children
;
for
(
size_t
i
=
0
;
i
<
head_children
->
length
;
++
i
)
{
GumboNode
*
child
=
reinterpret_cast
<
GumboNode
*>
(
head_children
->
data
[
i
]
);
if
(
child
->
type
!=
GUMBO_NODE_ELEMENT
||
child
->
v
.
element
.
tag
==
GUMBO_TAG_TITLE
)
{
continue
;
}
if
(
head
)
{
xmlNode
*
title_node
=
findChildElement
(
head
,
"title"
);
if
(
title_node
)
{
QByteArray
newTitle
;
collectText
(
title_node
,
&
newTitle
);
*
title
=
newTitle
;
if
(
child
->
v
.
element
.
children
.
length
!=
1
)
{
return
""
;
}
GumboNode
*
title_text
=
reinterpret_cast
<
GumboNode
*>
(
child
->
v
.
element
.
children
.
data
[
0
]
);
if
(
title_text
->
type
!=
GUMBO_NODE_TEXT
&&
title_text
->
type
!=
GUMBO_NODE_WHITESPACE
)
{
return
""
;
}
return
QByteArray
(
title_text
->
v
.
text
.
text
);
}
return
""
;
}
bool
htmlTextDump
(
const
QByteArray
&
data
,
QByteArray
*
title
,
QByteArray
*
text
)
{
GumboOutput
*
output
=
gumbo_parse
(
data
.
constData
()
);
*
text
=
cleanText
(
output
->
root
);
*
title
=
findTitle
(
output
->
root
);
gumbo_destroy_output
(
&
kGumboDefaultOptions
,
output
);
return
true
;
return
!
text
->
isEmpty
()
;
}
searchhandlers/htmltextdump.h
View file @
f4605e5b
...
...
@@ -22,7 +22,7 @@
#ifndef HTMLTEXTDUMP_H
#define HTMLTEXTDUMP_H
#include <
QByteArray
>
class
QByteArray
;
bool
htmlTextDump
(
const
QByteArray
&
data
,
QByteArray
*
title
,
QByteArray
*
text
);
...
...
Luigi Toscano
@ltoscano
mentioned in commit
f91a1e57
·
Oct 29, 2020
mentioned in commit
f91a1e57
mentioned in commit f91a1e57a5c375272ccce3e0104c9f8127e1b294
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment