Commit 3208955e authored by Martin Koller's avatar Martin Koller

Do not use KEncodingProber - it gives false results; Try dirname or UTF8

The auto-detection of the man page file content encoding with KEncodingProber
was not successful - there are some bug reports showing that it does not work
reliably - often giving EUC-JP or gb18030 as the encoding, which is wrong.

I now try to find the encoding inside the man page file
(according to manconv) or from the name of the directory in which the
file resides. However, on my openSUSE system, neither the definition
inside nor the directory name tells me it's UTF-8, but all pages are in
UTF-8. Therefore I now use UTF-8 as the default, which can be overridden
with the env var MAN_ICONV_INPUT_CHARSET.

BUG: 321074
FIXED-IN: 14.12.1
parent 73c431ec
......@@ -40,7 +40,6 @@
#include <kstandarddirs.h>
#include <KProcess>
#include <klocale.h>
#include <kencodingprober.h>
#include "kio_man.moc"
#include "man2html.h"
......@@ -226,6 +225,8 @@ QMap<QString, QString> MANProtocol::buildIndexMap(const QString &section)
return i;
}
//---------------------------------------------------------------------
QStringList MANProtocol::manDirectories()
{
checkManPaths();
......@@ -535,7 +536,7 @@ void MANProtocol::get(const KUrl& url )
char *MANProtocol::readManPage(const char *_filename)
{
QByteArray filename = _filename;
QByteArray array;
QByteArray array, dirName;
/* Determine type of man page file by checking its path. Determination by
* MIME type with KMimeType doesn't work reliablely. E.g., Solaris 7:
......@@ -564,6 +565,11 @@ char *MANProtocol::readManPage(const char *_filename)
lastdir = filename.left(filename.lastIndexOf('/'));
// get the last directory name (which might be a language name, to be able to guess the encoding)
QDir dir(lastdir);
dir.cdUp();
dirName = QFile::encodeName(dir.dirName());
if ( !QFile::exists(QFile::decodeName(filename)) ) // if given file does not exist, find with suffix
{
kDebug(7107) << "not existing " << filename;
......@@ -589,22 +595,7 @@ char *MANProtocol::readManPage(const char *_filename)
if (array.isEmpty())
return 0;
// as we do not know in which encoding the man source is, try to automatically
// detect it and always return it as UTF-8
KEncodingProber encodingProber;
encodingProber.feed(array);
kDebug(7107) << "auto-detect encoding for" << filename << "guess=" << encodingProber.encoding()
<< "confidence=" << encodingProber.confidence();
QString out = QTextCodec::codecForName(encodingProber.encoding())->toUnicode(array);
array = out.toUtf8();
const int len = array.size();
char *buf = new char[len + 4];
memmove(buf + 1, array.data(), len);
buf[0] = buf[len+1] = '\n'; // Start and end with an end of line
buf[len+2] = buf[len+3] = '\0'; // Two NUL characters at end
return buf;
return manPageToUtf8(array, dirName);
}
//---------------------------------------------------------------------
......
......@@ -129,14 +129,18 @@
#include <QtCore/QString>
#include <QTextCodec>
//#include <kencodingprober.h>
#ifdef SIMPLE_MAN2HTML
# include <stdlib.h>
# include <iostream>
# include <dirent.h>
# include <sys/stat.h>
# include <QDebug>
#include <QFile>
#include <kencodingprober.h>
# include <QFile>
# include <QFileInfo>
# include <QDir>
# include <kfilterdev.h>
# define kDebug(x) QDebug(QtDebugMsg)
# define kWarning(x) QDebug(QtWarningMsg) << "WARNING "
# define BYTEARRAY(x) x.constData()
......@@ -6054,6 +6058,84 @@ void scan_man_page(const char *man_page)
//---------------------------------------------------------------------
char *manPageToUtf8(const QByteArray &input, const QByteArray &dirName)
{
// as we do not know in which encoding the man source is, try to automatically
// detect it and always return it as UTF-8
QByteArray encoding;
// some pages contain "coding:" information. See "man manconv"
// (but I find pages which do not excactly obey the format described in manconv, e.g.
// the control char is either "." or "'")
// Therefore use a QRegExp
QRegExp regex("[\\.']\\\\\".*coding:\\s*(\\S*)\\s", Qt::CaseInsensitive);
if ( regex.indexIn(QLatin1String(input)) == 0 )
{
encoding = regex.cap(1).toLatin1();
kDebug(7107) << "found embedded encoding" << encoding;
}
else
{
// check according to the dirName the man page is in
// if the dirName contains a ".", the encoding follows, e.g. "de.UTF-8"
int dot = dirName.indexOf('.');
if ( dot != -1 )
{
encoding = dirName.mid(dot + 1);
}
else
{
/* wanted to use KEncodingProber ... however it fails and gives very unreliable
results ... telling me often UTF-8 encoded pages are EUC-JP or gb18030 ...
In fact all man pages here on openSuse are encoded in UTF-8
KEncodingProber encodingProber;
encodingProber.feed(input);
kDebug(7107) << "auto-detect encoding; guess=" << encodingProber.encoding()
<< "confidence=" << encodingProber.confidence();
encoding = encodingProber.encoding();
*/
// the original bug report #141340
// mentioned the env var MAN_ICONV_INPUT_CHARSET ... let's check if it is set
// This seems not be a std. man-db env var, but I find several traces of it on the web
encoding = qgetenv("MAN_ICONV_INPUT_CHARSET");
if ( encoding.isEmpty() )
encoding = "UTF-8";
}
}
QTextCodec *codec = 0;
if ( !encoding.isEmpty() )
codec = QTextCodec::codecForName(encoding);
if ( !codec ) // fallback encoding
codec = QTextCodec::codecForName("ISO-8859-1");
kDebug(7107) << "using the encoding" << codec->name() << "for file in dir" << dirName;
QString out = codec->toUnicode(input);
QByteArray array = out.toUtf8();
// TODO get rid of this double allocation and scan a QByteArray
const int len = array.size();
char *buf = new char[len + 4];
memmove(buf + 1, array.data(), len);
buf[0] = buf[len+1] = '\n'; // Start and end with an end of line
buf[len+2] = buf[len+3] = '\0'; // Two NUL characters at end
return buf;
}
//---------------------------------------------------------------------
#ifdef SIMPLE_MAN2HTML
void output_real(const char *insert)
{
......@@ -6064,32 +6146,22 @@ void output_real(const char *insert)
char *read_man_page(const char *filename)
{
QFile f(QFile::decodeName(filename));
QIODevice *fd = KFilterDev::deviceForFile(filename);
if ( !f.open(QIODevice::ReadOnly) )
if ( !fd || !fd->open(QIODevice::ReadOnly) )
{
std::cerr << "read_man_page: can not open " << filename << std::endl;
return 0;
}
QByteArray array = f.readAll();
// as we do not know in which encoding the man source is, try to automatically
// detect it and always return it as UTF-8
KEncodingProber encodingProber;
encodingProber.feed(array);
kDebug(7107) << "auto-detect encoding for" << filename << "guess=" << encodingProber.encoding()
<< "confidence=" << encodingProber.confidence();
QString out = QTextCodec::codecForName(encodingProber.encoding())->toUnicode(array);
array = out.toUtf8();
QDir dir(QFileInfo(QFile::decodeName(filename)).dir());
dir.cdUp();
char *data = manPageToUtf8(fd->readAll(), QFile::encodeName(dir.dirName()));
const int len = array.size();
char *buf = new char[len + 4];
memmove(buf + 1, array.data(), len);
buf[0] = buf[len+1] = '\n'; // Start and end with an end of line
buf[len+2] = buf[len+3] = '\0'; // Two NUL characters at end
fd->close();
delete fd;
return buf;
return data;
}
//---------------------------------------------------------------------
......
......@@ -11,6 +11,12 @@
class QByteArray;
/**
Try to detect the encoding of the given man page content
and convert it to UTF-8.

The encoding is looked up in an embedded "coding:" tag at the start of
the content, then in the part of dirName following a '.' (e.g. "de.UTF-8"),
then in the environment variable MAN_ICONV_INPUT_CHARSET; if none of these
applies, UTF-8 is assumed.

@param input   raw content of the man page
@param dirName directory name used as a hint for the encoding
@return buffer allocated with new[] holding the converted text, which
        starts and ends with a newline and is followed by two NUL bytes
        (presumably to be released by the caller with delete[])
*/
char *manPageToUtf8(const QByteArray &input, const QByteArray &dirName);
/** call this with the buffer you have */
void scan_man_page(const char *man_page);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment