Verified Commit 674dfa14 authored by Daniel Vrátil's avatar Daniel Vrátil 🤖
Browse files

Discard duplicate results during contact completion

Summary:
Drop duplicate results from contact completion to return
more relevant results. This is still limited by the indexing
side as we are unable to deduplicate easily based on the email
address itself (or merge the results in some clever way).

Reviewers: #kde_pim, dfaure

Reviewed By: dfaure

Subscribers: kde-pim

Tags: #kde_pim

Differential Revision: https://phabricator.kde.org/D28615
parent 1f18e82c
......@@ -24,10 +24,12 @@
#include "contactcompleter.h"
#include "query.h"
#include "akonadi_search_pim_debug.h"
#include <QStandardPaths>
#include <QDebug>
#include <QFile>
#include <QElapsedTimer>
using namespace Akonadi::Search::PIM;
......@@ -37,6 +39,51 @@ ContactCompleter::ContactCompleter(const QString &prefix, int limit)
{
}
// Runs the query already configured on `enq` and gathers at most `limit`
// distinct document payloads, in the order the enquire object returns them.
//
// A payload is considered a duplicate when the list collected so far already
// contains it under a case-insensitive comparison; duplicates are skipped and
// only counted for the final debug message. Because skipped entries do not
// count towards `limit`, the query is re-run page by page until either enough
// unique entries were collected or the enquire object yields an empty page.
//
// Exceptions thrown by the Xapian calls are not caught here.
static QStringList processEnquire(Xapian::Enquire &enq, int limit)
{
    QElapsedTimer stopwatch;
    stopwatch.start();

    // Asking for zero documents fetches no results but still provides the match statistics.
    auto statsOnly = enq.get_mset(0, 0);
    qCDebug(AKONADI_SEARCH_PIM_LOG) << "Query:" << QString::fromStdString(enq.get_query().get_description());
    qCDebug(AKONADI_SEARCH_PIM_LOG) << "Estimated matches:" << statsOnly.get_matches_estimated();
    const int estimatedMatches = statsOnly.get_matches_estimated();

    QStringList results;
    results.reserve(std::min(limit, estimatedMatches));

    int skipped = 0; // number of entries dropped as duplicates
    int offset = 0;  // index of the first not-yet-inspected match

    // Keep paging through the matches: some of them may be thrown away as duplicates,
    // so a single page is not guaranteed to fill the list.
    while (results.size() < limit) {
        // Every page asks for exactly "limit" matches:
        // * an estimate below the limit may be wrong, so we must not rely on it and miss any results
        // * an estimate above the limit must not make us fetch more documents than needed
        Xapian::MSet page = enq.get_mset(offset, limit);
        if (page.empty()) {
            // Nothing left to inspect, so no further unique results can show up.
            break;
        }

        const auto pageEnd = page.end();
        auto hit = page.begin();
        while (hit != pageEnd && results.size() < limit) {
            const auto candidate = QString::fromStdString(hit.get_document().get_data());
            // TODO: Make deduplication smarter by fixing the indexing code: once the mailbox
            // name and the address are stored as separate named terms, entries could be
            // deduplicated purely on the email address.
            if (results.contains(candidate, Qt::CaseInsensitive)) {
                ++skipped;
                qCDebug(AKONADI_SEARCH_PIM_LOG, "Skipped duplicate match \"%s\" (%d%%) docid %u", qUtf8Printable(candidate), hit.get_percent(), *hit);
            } else {
                qCDebug(AKONADI_SEARCH_PIM_LOG, "Match: \"%s\" (%d%%), docid %u", qUtf8Printable(candidate), hit.get_percent(), *hit);
                results.push_back(candidate);
            }
            // Every inspected match, kept or skipped, moves the start of the next page.
            ++offset;
            ++hit;
        }
    }

    qCDebug(AKONADI_SEARCH_PIM_LOG) << "Collected" << results.size() << "results in" << stopwatch.elapsed() << "ms, skipped" << skipped << "duplicates.";
    return results;
}
QStringList ContactCompleter::complete()
{
const QString dir = Query::defaultLocation(QStringLiteral("emailContacts"));
......@@ -44,45 +91,36 @@ QStringList ContactCompleter::complete()
try {
db = Xapian::Database(QFile::encodeName(dir).constData());
} catch (const Xapian::DatabaseOpeningError &) {
qWarning() << "Xapian Database does not exist at " << dir;
qCWarning(AKONADI_SEARCH_PIM_LOG) << "Xapian Database does not exist at " << dir;
return QStringList();
} catch (const Xapian::DatabaseCorruptError &) {
qWarning() << "Xapian Database corrupted";
qCWarning(AKONADI_SEARCH_PIM_LOG) << "Xapian Database corrupted";
return QStringList();
} catch (const Xapian::DatabaseError &e) {
qWarning() << QString::fromStdString(e.get_type()) << QString::fromStdString(e.get_description());
qCWarning(AKONADI_SEARCH_PIM_LOG) << QString::fromStdString(e.get_type()) << QString::fromStdString(e.get_description());
return QStringList();
} catch (...) {
qWarning() << "Random exception, but we do not want to crash";
qCWarning(AKONADI_SEARCH_PIM_LOG) << "Random exception, but we do not want to crash";
return QStringList();
}
Xapian::QueryParser parser;
parser.set_database(db);
std::string prefix(m_prefix.toUtf8().constData());
int flags = Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_PARTIAL;
Xapian::Query q = parser.parse_query(prefix, flags);
const int flags = Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_PARTIAL;
const Xapian::Query q = parser.parse_query(m_prefix.toStdString(), flags);
Xapian::Enquire enq(db);
enq.set_query(q);
enq.set_sort_by_relevance();
// TODO: extend the indexer to use value slots for the normalized email address so that
// duplicates can be collapsed by Xapian::Enquire::set_collapse_key()
Xapian::MSet mset = enq.get_mset(0, m_limit);
Xapian::MSetIterator mit = mset.begin();
QStringList list;
Xapian::MSetIterator end = mset.end();
list.reserve(mset.size());
Q_FOREVER {
try {
for (; mit != end; ++mit) {
std::string str = mit.get_document().get_data();
const QString entry = QString::fromUtf8(str.c_str(), str.length());
list << entry;
}
return list;
return processEnquire(enq, m_limit);
} catch (const Xapian::DatabaseCorruptError &e) {
qWarning() << "The emailContacts Xapian database is corrupted:" << QString::fromStdString(e.get_description());
qCWarning(AKONADI_SEARCH_PIM_LOG) << "The emailContacts Xapian database is corrupted:" << QString::fromStdString(e.get_description());
return QStringList();
} catch (const Xapian::DatabaseModifiedError &e) {
db.reopen();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment