Commit 583ed246 authored by Pino Toscano's avatar Pino Toscano

Add Xapian support for indexing and searching

Start using Xapian for indexing and searching the documentation: it
works much better than ht://Dig, it provides a C++ API, and it is well
maintained.

Introduce two helper tools to create/update an index, and search on it,
starting from the cache.bz2 files generated by meinproc: this way it is
possible to index the whole text, with the entities already expanded
(something which would not be there when parsing the docbook files
directly). Make use of libxml2 to parse the HTML documents, and get all
the text in them.

Xapian and libxml2 are currently considered mandatory dependencies:
they are portable, generally available everywhere, and do not
themselves require many extra dependencies.

CCBUG: 93664
CCBUG: 103266
CCBUG: 115935
CCBUG: 122437
CCBUG: 125276
BUG: 126710
CCBUG: 152671
CCBUG: 158633
CCBUG: 209415
CCBUG: 256397
CCBUG: 266290
parent 959ec02d
......@@ -24,6 +24,7 @@ find_package(Qt5 ${QT_MIN_VERSION} CONFIG REQUIRED COMPONENTS
)
find_package(KF5 REQUIRED COMPONENTS
Archive
Config
CoreAddons
DBusAddons
......@@ -38,6 +39,18 @@ find_package(KF5 REQUIRED COMPONENTS
WindowSystem
)
find_package(Xapian REQUIRED)
set_package_properties(Xapian PROPERTIES
DESCRIPTION "Support for text indexing and searching"
URL "https://xapian.org/"
TYPE REQUIRED)
find_package(LibXml2 REQUIRED)
set_package_properties(LibXml2 PROPERTIES
DESCRIPTION "Support for extracting text from HTML documents"
URL "http://www.xmlsoft.org/"
TYPE REQUIRED)
add_subdirectory( plugins )
add_subdirectory( searchhandlers )
add_subdirectory( tests )
......
# Make the Xapian and libxml2 headers visible to the targets of this directory.
include_directories(
${XAPIAN_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
)
# Xapian does not like signals/slots #define's
# (note: add_definitions applies to every target declared in this directory)
add_definitions(-DQT_NO_KEYWORDS)
# Xapian indexer
set(khc_xapianindexer_SOURCES
cachereader.cpp
htmltextdump.cpp
xapiancommon.cpp
xapianindexer.cpp
)
add_executable(khc_xapianindexer ${khc_xapianindexer_SOURCES})
# the Xapian API reports errors through C++ exceptions
kde_target_enable_exceptions(khc_xapianindexer PRIVATE)
ecm_mark_nongui_executable(khc_xapianindexer)
target_link_libraries(khc_xapianindexer Qt5::Core KF5::Archive ${XAPIAN_LIBRARIES} ${LIBXML2_LIBRARIES})
install(TARGETS khc_xapianindexer DESTINATION ${LIBEXEC_INSTALL_DIR})
# Xapian search
set(khc_xapiansearch_SOURCES
xapiancommon.cpp
xapiansearch.cpp
)
add_executable(khc_xapiansearch ${khc_xapiansearch_SOURCES})
# the Xapian API reports errors through C++ exceptions
kde_target_enable_exceptions(khc_xapiansearch PRIVATE)
ecm_mark_nongui_executable(khc_xapiansearch)
target_link_libraries(khc_xapiansearch Qt5::Core ${XAPIAN_LIBRARIES})
install(TARGETS khc_xapiansearch DESTINATION ${LIBEXEC_INSTALL_DIR})
# Generate the search handler description; no @ONLY here, so the ${...}
# references in xapian.desktop.cmake are expanded too.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/xapian.desktop.cmake ${CMAKE_CURRENT_BINARY_DIR}/xapian.desktop)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/xapian.desktop DESTINATION ${DATA_INSTALL_DIR}/khelpcenter/searchhandlers)
if (NOT WIN32)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/khc_docbookdig.pl.cmake ${CMAKE_CURRENT_BINARY_DIR}/khc_docbookdig.pl @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/khc_htdig.pl.cmake ${CMAKE_CURRENT_BINARY_DIR}/khc_htdig.pl @ONLY )
......
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#include "cachereader.h"
#include <QDebug>
#include <QLoggingCategory>
#include <QStack>
#include <QTextStream>
#include <KCompressionDevice>
#include <algorithm>
namespace {
Q_LOGGING_CATEGORY( LOG, "org.kde.khelpcenter.xapian.cachereader", QtWarningMsg )
}
static bool readAll( const QString& file, QString* data )
{
KCompressionDevice dev( file, KCompressionDevice::BZip2 );
if ( !dev.open( QIODevice::ReadOnly ) ) {
qCWarning(LOG) << "cannot open" << file << ":" << dev.errorString();
return false;
}
*data = QString::fromUtf8( dev.readAll() );
return true;
}
// Nothing to set up: the members start out empty until parse() is called.
CacheReader::CacheReader() = default;
// The members release their own storage; no manual cleanup is needed.
CacheReader::~CacheReader() = default;
bool CacheReader::parse( const QString& file )
{
mText.clear();
mRanges.clear();
if ( !readAll( file, &mText ) ) {
return false;
}
const int length = mText.length();
static const QString patternStart = QStringLiteral( "<FILENAME filename=\"" );
static const QString patternEnd = QStringLiteral( "</FILENAME>" );
QStack<QString> stack;
int index = 0;
while ( index < length ) {
int start = mText.indexOf( patternStart, index );
int end = mText.indexOf( patternEnd, index );
if ( start >= 0 && start < end ) {
// new document
const int quote = mText.indexOf( '"', start + patternStart.length() );
const QString name = mText.mid( start + patternStart.length(), quote - ( start + patternStart.length() ) );
if ( stack.isEmpty() ) {
} else {
if ( start > index ) {
mRanges.insert( stack.top(), qMakePair( index, start - 1 ) );
}
}
index = quote + 2;
stack.push( name );
} else if ( end >= 0 ) {
// end of current document
Q_ASSERT( !stack.isEmpty() );
mRanges.insert( stack.top(), qMakePair( index, end - 1 ) );
index = end + patternEnd.length();
stack.pop();
} else {
break;
}
}
Q_ASSERT( stack.isEmpty() );
return true;
}
/**
 * @return the names of all the documents for which parse() recorded at
 *         least one range, each name appearing once.
 */
QSet<QString> CacheReader::documents() const
{
    const QList<QString> names = mRanges.uniqueKeys();
    return QSet<QString>::fromList( names );
}
/**
 * Returns the text of the document named @p id, encoded as UTF-8.
 *
 * When the document is made of several ranges, they are concatenated in the
 * reverse of the order in which the hash returns them.
 *
 * @return the document text, or an empty QByteArray for an unknown @p id.
 */
QByteArray CacheReader::document( const QString& id ) const
{
    QList<Range> pieces = mRanges.values( id );
    QByteArray result;

    const int pieceCount = pieces.count();
    if ( pieceCount == 0 ) {
        return result;
    }

    if ( pieceCount == 1 ) {
        const Range only = pieces.at( 0 );
        const int size = only.second - only.first + 1;
        result = mText.midRef( only.first, size ).toUtf8();
        return result;
    }

    std::reverse( pieces.begin(), pieces.end() );
    {
        // scoped so that the stream is flushed into 'result' before returning
        QTextStream out( &result );
        out.setCodec( "UTF-8" );
        for ( QList<Range>::const_iterator it = pieces.constBegin(); it != pieces.constEnd(); ++it ) {
            const int size = it->second - it->first + 1;
            out << mText.mid( it->first, size );
        }
    }
    return result;
}
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#ifndef CACHEREADER_H
#define CACHEREADER_H
#include <QByteArray>
#include <QMultiHash>
#include <QPair>
#include <QSet>
#include <QString>
/**
 * Reader for a compressed documentation cache file containing several
 * documents, each one identified by a name.
 *
 * Call parse() first, then use documents() to list the available documents
 * and document() to get the text of one of them.
 */
class CacheReader
{
public:
CacheReader();
~CacheReader();
/// Loads @p file, replacing the result of any previous call; returns false on failure.
bool parse( const QString& file );
/// Names of the documents found by the last parse().
QSet<QString> documents() const;
/// UTF-8 text of the document @p id; empty if @p id is unknown.
QByteArray document( const QString& id ) const;
private:
// inclusive (first, last) character indexes into mText
typedef QPair<int, int> Range;
// whole decompressed text of the parsed file
QString mText;
// document name -> ranges of mText belonging to it (possibly more than one)
QMultiHash<QString, Range> mRanges;
};
#endif
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#include "htmltextdump.h"
#include <QDebug>
#include <QLoggingCategory>
#include <libxml/HTMLparser.h>
namespace {
Q_LOGGING_CATEGORY( LOG, "org.kde.khelpcenter.xapian.htmltextdump", QtWarningMsg )
/**
 * Owning wrapper around a libxml2 htmlDocPtr: the document is released with
 * xmlFreeDoc() when the wrapper goes out of scope.
 *
 * Copying is disabled, since two wrappers holding the same pointer would
 * both free it.
 */
class HtmlDocPtr {
public:
    /// Takes ownership of @p doc.
    explicit HtmlDocPtr( htmlDocPtr doc ) : _doc( doc ) {}
    ~HtmlDocPtr() { xmlFreeDoc( _doc ); }

    HtmlDocPtr( const HtmlDocPtr& ) = delete;
    HtmlDocPtr& operator=( const HtmlDocPtr& ) = delete;

    /// True when a document is held.
    explicit operator bool() const { return _doc != nullptr; }
    /// Gives access to the wrapped pointer, e.g. to pass it to libxml2 functions.
    operator htmlDocPtr() const { return _doc; }

private:
    htmlDocPtr _doc;
};
}
/**
 * Looks through @p node and its following siblings for the first element
 * called @p name.
 *
 * @return the first child of the element found (which is null when that
 *         element has no children), or null when there is no such element.
 */
static xmlNode* findChildElement( xmlNode *node, const char *name )
{
    xmlNode *current = node;
    while ( current ) {
        const bool isElement = ( current->type == XML_ELEMENT_NODE );
        if ( isElement && xmlStrcmp( current->name, BAD_CAST name ) == 0 ) {
            return current->children;
        }
        current = current->next;
    }
    return 0;
}
/**
 * Appends to @p text the content of every text node found in @p node, in
 * its following siblings, and recursively in all their children.
 *
 * Each piece of content is preceded by a single space.
 */
static void collectText( xmlNode *node, QByteArray *text )
{
    xmlNode *current = node;
    while ( current ) {
        if ( current->type == XML_TEXT_NODE ) {
            xmlChar *content = xmlNodeGetContent( current );
            text->append( ' ' );
            text->append( reinterpret_cast<const char *>( content ) );
            // the content is a copy owned by us
            xmlFree( content );
        }
        collectText( current->children, text );
        current = current->next;
    }
}
bool htmlTextDump( const QByteArray& data, QByteArray *title, QByteArray *text )
{
HtmlDocPtr doc( htmlReadMemory( data.constData(), data.length(), NULL, "UTF-8", HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET ) );
if ( !doc ) {
qCWarning(LOG) << "cannot parse html";
return false;
}
xmlNode *root = xmlDocGetRootElement( doc );
if ( !root ) {
qCWarning(LOG) << "missing root";
return false;
}
xmlNode *html = findChildElement( root, "html" );
if ( !html ) {
qCWarning(LOG) << "missing <html>";
return false;
}
xmlNode *head = findChildElement( html, "head" );
xmlNode *body = findChildElement( html, "body" );
if ( !body ) {
qCWarning(LOG) << "missing <body>";
return false;
}
QByteArray newText;
collectText( body, &newText );
*text = newText;
if ( head ) {
xmlNode *title_node = findChildElement( head, "title" );
if ( title_node ) {
QByteArray newTitle;
collectText( title_node, &newTitle );
*title = newTitle;
}
}
return true;
}
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#ifndef HTMLTEXTDUMP_H
#define HTMLTEXTDUMP_H
#include <QByteArray>
/**
 * Extracts the plain text of the HTML document @p data.
 *
 * @param data the HTML document
 * @param title receives the text of the title of the document, if any
 * @param text receives the text of the body of the document
 * @return whether the extraction succeeded
 */
bool htmlTextDump( const QByteArray& data, QByteArray *title, QByteArray *text );
#endif
[Desktop Entry]
DocumentTypes=application/docbook+xml
SearchCommand=${KDE_INSTALL_FULL_LIBEXECDIR}/khc_xapiansearch --indexdir=%d --identifier=%i --words=%w --method=%o --maxnum=%m --lang=%l
IndexCommand=${KDE_INSTALL_FULL_LIBEXECDIR}/khc_xapianindexer --indexdir=%d --identifier=%i --lang=%l
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#include "xapiancommon.h"
#include <QByteArray>
#include <QDir>
#include <QFileInfo>
#include <QLoggingCategory>
namespace {
Q_LOGGING_CATEGORY( LOG, "org.kde.khelpcenter.xapian.common", QtWarningMsg )
// metadata key under which the version below is stored in each database
const std::string versionKey = "khc-db-version";
}
// Version written into the databases by openWritableDb(); a database carrying
// a different value is discarded (when writing) or rejected (when reading).
enum {
KHC_DB_VERSION = 1
};
DatabaseVersionMismatch::DatabaseVersionMismatch( int aversion )
: version( aversion )
, refVersion( KHC_DB_VERSION )
{
}
/**
 * @return the version stored in the metadata of @p db, as an integer; this
 *         is 0 when the stored text is not a valid number.
 */
static int getDatabaseVersion( const Xapian::Database& db )
{
    const std::string stored = db.get_metadata( versionKey );
    // wrap the bytes without copying them; 'stored' outlives the wrapper
    const QByteArray wrapped = QByteArray::fromRawData( stored.c_str(), stored.size() );
    return wrapped.toInt();
}
/**
 * Opens (creating it if needed) the writable database at @p path, and stamps
 * it with the current KHC_DB_VERSION.
 *
 * When the database is corrupted, has an unsupported Xapian format, or (with
 * @p checkVersion set) is not empty and carries a different KHC_DB_VERSION,
 * its directory is removed and the opening is attempted once more from
 * scratch.
 *
 * @param checkVersion true for the initial attempt; false for the retry done
 *        after the database was discarded. During the retry the Xapian
 *        errors are no longer recovered from but propagated to the caller,
 *        so a database which cannot be removed or recreated does not cause
 *        an endless recursion.
 */
static Xapian::WritableDatabase openWritableDbHelper( const QString& path, bool checkVersion )
{
    try {
        Xapian::WritableDatabase db( QFile::encodeName( path ).constData(), Xapian::DB_CREATE_OR_OPEN );
        // an empty database has nothing worth checking (or discarding)
        if ( checkVersion && db.get_doccount() > 0 ) {
            const int version = getDatabaseVersion( db );
            if ( KHC_DB_VERSION != version ) {
                throw DatabaseVersionMismatch( version );
            }
        }
        db.set_metadata( versionKey, QByteArray::number( KHC_DB_VERSION ).constData() );
        return db;
    } catch ( const Xapian::DatabaseCorruptError& ) {
        if ( !checkVersion ) {
            // already retried once: give up
            throw;
        }
        qCWarning(LOG) << "Xapian DB corrupted, throwing it away";
    } catch ( const Xapian::DatabaseVersionError& ) {
        if ( !checkVersion ) {
            // already retried once: give up
            throw;
        }
        qCWarning(LOG) << "Xapian DB version mismatch, throwing it away";
    } catch ( const DatabaseVersionMismatch& e ) {
        // only thrown above when checkVersion is set, i.e. never while retrying
        qCWarning(LOG) << "Own version mismatch in Xapian DB: found" << e.version << "vs wanted" << e.refVersion << "- throwing it away";
    }

    // common recovery path of all the handlers above: start from scratch
    QDir( path ).removeRecursively();
    return openWritableDbHelper( path, false );
}
/**
 * Opens for writing the database at @p path, with the check of the version
 * stored in it enabled.
 */
Xapian::WritableDatabase openWritableDb( const QString& path )
{
return openWritableDbHelper( path, true );
}
/**
 * Opens for reading the database at @p path.
 *
 * @throws DatabaseVersionMismatch when the version stored in the database
 *         is not KHC_DB_VERSION.
 */
Xapian::Database openDb( const QString& path )
{
    const QByteArray nativePath = QFile::encodeName( path );
    Xapian::Database db( nativePath.constData() );

    const int found = getDatabaseVersion( db );
    if ( found == KHC_DB_VERSION ) {
        return db;
    }
    throw DatabaseVersionMismatch( found );
}
/**
 * Extracts information from the terms of @p doc, according to their prefix:
 *  - "L" terms: the rest of the term is stored in @p lang
 *  - "U" terms: the rest of the term is stored in @p uid
 *  - "XHTML" terms with a non-empty rest: the rest is stored in @p xhtml
 *
 * Any of the output parameters can be null, in case that piece of
 * information is not wanted. When more terms have the same prefix, the last
 * one in the term list wins.
 */
void getDocInfo( const Xapian::Document& doc, std::string* lang, std::string* uid, std::string* xhtml )
{
    Xapian::TermIterator it = doc.termlist_begin();
    while ( it != doc.termlist_end() ) {
        const std::string term = *it;
        ++it;
        if ( term.empty() ) {
            continue;
        }

        const char prefix = term[0];
        if ( prefix == 'L' ) {
            if ( lang ) {
                lang->assign( term, 1, std::string::npos );
            }
        } else if ( prefix == 'U' ) {
            if ( uid ) {
                uid->assign( term, 1, std::string::npos );
            }
        } else if ( prefix == 'X' ) {
            const bool isXhtml = term.size() > 5 && term.compare( 0, 5, "XHTML" ) == 0;
            if ( xhtml && isXhtml ) {
                xhtml->assign( term, 5, std::string::npos );
            }
        }
    }
}
// Lets std::string be streamed to QDebug, printing it as a C string.
QDebug operator<<( QDebug dbg, const std::string& s )
{
    const char *text = s.c_str();
    dbg << text;
    return dbg;
}
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#ifndef KHC_XAPIANCOMMON_H
#define KHC_XAPIANCOMMON_H
// must be put before any Qt include
#include <xapian.h>
#include <QDebug>
#include <QString>
// NOTE(review): presumably the numbers of the Xapian document value slots
// holding the last modification time and the title of each indexed document;
// confirm against the indexer and search tools.
enum {
VALUE_LASTMOD = 0,
VALUE_TITLE = 1,
};
/**
 * Thrown when the version stored in a database is not the one wanted by
 * this code.
 */
struct DatabaseVersionMismatch
{
/// @p aversion is the version found in the database.
DatabaseVersionMismatch( int aversion );
/// version found in the database
const int version;
/// version wanted by this code
const int refVersion;
};
/// Opens the database at @p path for writing, creating it if needed; an
/// unusable or outdated database is discarded and recreated.
Xapian::WritableDatabase openWritableDb( const QString& path );
/// Opens the database at @p path for reading; throws DatabaseVersionMismatch
/// when it was written with a different version.
Xapian::Database openDb( const QString& path );
/// Reads language, unique id, and "XHTML" data from the terms of @p doc;
/// any of the output pointers can be null.
void getDocInfo( const Xapian::Document& doc, std::string* lang, std::string* uid, std::string* xhtml );
/// Streams @p s to @p dbg.
QDebug operator<<( QDebug dbg, const std::string& s );
#endif
/*
This file is part of the KDE Help Center
Copyright (c) 2016 Pino Toscano <pino@kde.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
MA 02110-1301, USA
*/
#include "cachereader.h"
#include "htmltextdump.h"
#include "xapiancommon.h"
#include <QCoreApplication>
#include <QCommandLineParser>
#include <QDebug>
#include <QDateTime>
#include <QDirIterator>
#include <QFile>
#include <QLoggingCategory>
#include <QStandardPaths>
namespace {
Q_LOGGING_CATEGORY( LOG, "org.kde.khelpcenter.xapian.indexer", QtWarningMsg )
}
static Xapian::Document createDocument( Xapian::TermGenerator& xgen, const std::string& uid, const std::string& lang, const std::string& modTime, const QString& html, const QByteArray& data )
{
Xapian::Document doc;
xgen.set_document( doc );