Commit 3e2b9a16 authored by Thomas Friedrichsmeier's avatar Thomas Friedrichsmeier
Browse files

Replace locale update hack with Qt's new mechanism for updating the current default QTextCodec.

Also, use QTextEncoder / QTextDecoder to get defined behavior on non-representable characters.

This should hopefully fix a crash on startup on Windows.
parent f8515deb
/***************************************************************************
rklocalesupport - description
-------------------
begin : Sun Mar 11 2007
copyright : (C) 2007, 2009 by Thomas Friedrichsmeier
email : thomas.friedrichsmeier@kdemail.net
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#include "rklocalesupport.h"
#include <qtextcodec.h>
#include <QByteArray>
#ifndef Q_OS_WIN
// see http://sourceforge.net/p/rkward/patches/4/
// seems to be needed for GCC 4.3 as well.
# include <langinfo.h>
#endif
#include <stdlib.h>
#include <locale.h>
#include <ctype.h>
/* NOTE: The code in this file is an almost literal copy taken from setupLocaleMapper in qtextcodec.cpp in Qt 3.3.8 !*/
/** Look up a codec by name. If no codec is registered under the full name and the name
 *  contains an '@' (as in "ISO-8859-15@euro"), the lookup is repeated with only the part
 *  in front of the '@'.
 *  @param name codec name to look up
 *  @return the codec found, or 0 if neither lookup succeeded */
QTextCodec *checkForCodec(const char *name) {
	// First attempt: the name exactly as given.
	if (QTextCodec *exact = QTextCodec::codecForName(name))
		return exact;

	// Second attempt: strip everything from the first '@' onwards.
	const char *modifier = strchr(name, '@');
	if (!modifier)
		return 0;

	QByteArray base_name(name, modifier - name);
	return QTextCodec::codecForName(base_name.data());
}
/* Locale names, mostly copied from XFree86.
 *
 * Each table below is a 0-terminated list of locale names (language or language_COUNTRY,
 * without any ".codeset" or "@modifier" part). RKGetCurrentLocaleCodec() passes the tables to
 * try_locale_list() when it has to guess the encoding from the locale name alone; the codec
 * chosen for a matching table is spelled out at the call site.
 * NOTE: tcvnlocales is defined here, but not referenced by the guessing code in this file. */
static const char * const iso8859_2locales[] = {
"croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
"hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
"ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
"sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };
static const char * const iso8859_3locales[] = {
"eo", 0 };
static const char * const iso8859_4locales[] = {
"ee", "ee_EE", 0 };
static const char * const iso8859_5locales[] = {
"mk", "mk_MK", "sp", "sp_YU", 0 };
static const char * const cp_1251locales[] = {
"be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };
static const char * const pt_154locales[] = {
"ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };
static const char * const iso8859_6locales[] = {
"ar_AA", "ar_SA", "arabic", 0 };
static const char * const iso8859_7locales[] = {
"el", "el_GR", "greek", 0 };
static const char * const iso8859_8locales[] = {
"hebrew", "he", "he_IL", "iw", "iw_IL", 0 };
static const char * const iso8859_9locales[] = {
"tr", "tr_TR", "turkish", 0 };
static const char * const iso8859_13locales[] = {
"lt", "lt_LT", "lv", "lv_LV", 0 };
static const char * const iso8859_15locales[] = {
"et", "et_EE",
// Euro countries
"br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
"es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
"fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
"nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
0 };
static const char * const koi8_ulocales[] = {
"uk", "uk_UA", "ru_UA", "ukrainian", 0 };
static const char * const tis_620locales[] = {
"th", "th_TH", "thai", 0 };
static const char * const tcvnlocales[] = {
"vi", "vi_VN", 0 };
/** Search a 0-terminated table of locale names for an exact (case-sensitive) match.
 *  @param locale 0-terminated array of locale names
 *  @param lang   locale name to look for; must not be 0
 *  @return true if the search stopped on a table entry, i.e. on an entry equal to lang or on
 *          an empty-string entry (which ends the search as well); false if the terminating 0
 *          was reached first */
static bool try_locale_list( const char * const locale[], const char * lang )
{
	const char * const *entry = locale;
	while ( *entry ) {
		// An empty entry stops the scan just like a match does.
		if ( **entry == '\0' || strcmp( *entry, lang ) == 0 )
			return true;
		++entry;
	}
	return false;
}
// For the locales in probably_koi8_rlocales we have to look at runtime. The standard says
// these are 8859-5, but almost all Russian users use KOI8-R and
// incorrectly set $LANG to ru_RU. ru_RU_hack() checks tolower() to see what
// tolower() thinks ru_RU means.
// If you read the history, it seems that many Russians blame ISO and
// Perestroika for the confusion.
//
// The real bug is that some programs break if the user specifies
// ru_RU.KOI8-R.
static const char * const probably_koi8_rlocales[] = {
"ru", "ru_SU", "ru_RU", "russian", 0 };
/** Decide between KOI8-R and ISO 8859-5 for a Russian locale whose name carries no codeset.
 *
 *  LC_CTYPE is temporarily switched to locale @p i, and tolower() is applied to the byte
 *  values that encode CYRILLIC CAPITAL LETTER YU in either charset. Whichever mapping the C
 *  library gets right determines the codec. LC_CTYPE is switched back afterwards.
 *  @param i locale name to probe (e.g. "ru_RU")
 *  @return the KOI8-R or ISO 8859-5 codec; KOI8-R (with a warning) if the probe is inconclusive */
static QTextCodec * ru_RU_hack( const char * i ) {
	QTextCodec * ru_RU_codec = 0;

	// setlocale() with a non-null locale argument returns the name of the *new* locale, so the
	// locale to restore has to be queried (null argument) before switching. Copy the result
	// right away, as the returned string may be overwritten by the next setlocale() call.
	const QByteArray origlocale( setlocale( LC_CTYPE, 0 ) );
	setlocale( LC_CTYPE, i );

	// unicode koi8r latin5 name
	// 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU
	// 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU
	int latin5 = tolower( 0xCE );
	int koi8r = tolower( 0xE0 );
	if ( koi8r == 0xC0 && latin5 != 0xEE ) {
		ru_RU_codec = QTextCodec::codecForName( "KOI8-R" );
	} else if ( koi8r != 0xC0 && latin5 == 0xEE ) {
		ru_RU_codec = QTextCodec::codecForName( "ISO 8859-5" );
	} else {
		// something else again... let's assume... *throws dice*
		ru_RU_codec = QTextCodec::codecForName( "KOI8-R" );
		qWarning( "QTextCodec: using KOI8-R, probe failed (%02x %02x %s)",
			koi8r, latin5, i );
	}

	// Restore only if the original locale could be queried: passing an empty name would
	// select the locale from the environment instead of the previous one.
	if ( !origlocale.isEmpty() )
		setlocale( LC_CTYPE, origlocale.constData() );
	return ru_RU_codec;
}
/** Determine the QTextCodec matching the character encoding of the current LC_CTYPE locale.
 *
 *  On Windows the "System" codec is used. Elsewhere, the codeset reported by
 *  nl_langinfo(CODESET) is tried first (where available). If that yields no codec, the
 *  encoding is derived from the locale name as reported by setlocale() and by the environment
 *  variables LC_ALL, LC_CTYPE and LANG, and finally guessed from tables of known locale names.
 *  @return the codec found; on non-Windows platforms the "ISO 8859-1" codec is used as the
 *          last resort if nothing else could be determined */
QTextCodec *RKGetCurrentLocaleCodec () {
	QTextCodec *localeMapper = 0;
#ifdef Q_OS_WIN32
	localeMapper = QTextCodec::codecForName( "System" );
#else
#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX6) && !defined(Q_OS_OSF) && !defined(Q_OS_MAC)
	char *charset = nl_langinfo (CODESET);
	if ( charset )
		localeMapper = QTextCodec::codecForName( charset );
#endif
	if ( !localeMapper ) {
		// Very poorly defined and followed standards causes lots of code
		// to try to get all the cases...
		// Try to determine locale codeset from locale name assigned to
		// LC_CTYPE category.
		// First part is getting that locale name. First try setlocale() which
		// definitely knows it, but since we cannot fully trust it, get ready
		// to fall back to environment variables.
		char * ctype = qstrdup( setlocale( LC_CTYPE, 0 ) );

		// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
		// environment variables. (qstrdup() yields 0 for an unset variable, and
		// delete [] on 0 is a no-op.)
		char * lang = qstrdup( getenv("LC_ALL") );
		if ( !lang || lang[0] == 0 || strcmp( lang, "C" ) == 0 ) {
			delete [] lang;
			lang = qstrdup( getenv("LC_CTYPE") );
		}
		if ( !lang || lang[0] == 0 || strcmp( lang, "C" ) == 0 ) {
			delete [] lang;
			lang = qstrdup( getenv("LANG") );
		}

		// Now try these in order:
		// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
		// 2. CODESET from lang if it contains a .CODESET part
		// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
		// 4. locale (ditto)
		// 5. check for "@euro"
		// 6. guess locale from ctype unless ctype is "C"
		// 7. guess locale from lang

		// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
		char * codeset = ctype ? strchr( ctype, '.' ) : 0;
		if ( codeset && *codeset == '.' )
			localeMapper = checkForCodec( codeset + 1 );

		// 2. CODESET from lang if it contains a .CODESET part
		codeset = lang ? strchr( lang, '.' ) : 0;
		if ( !localeMapper && codeset && *codeset == '.' )
			localeMapper = checkForCodec( codeset + 1 );

		// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
		if ( !localeMapper && ctype && *ctype != 0 && strcmp (ctype, "C") != 0 )
			localeMapper = checkForCodec( ctype );

		// 4. locale (ditto)
		if ( !localeMapper && lang && *lang != 0 )
			localeMapper = checkForCodec( lang );

		// 5. "@euro"
		// The parentheses matter: without them "&&" binds tighter than "||", and an "@euro"
		// in lang would override a codec that steps 1-4 have already found.
		if ( !localeMapper && ( ( ctype && strstr( ctype, "@euro" ) ) || ( lang && strstr( lang, "@euro" ) ) ) )
			localeMapper = QTextCodec::codecForName( "ISO 8859-15" );

		// 6. guess locale from ctype unless ctype is "C"
		// 7. guess locale from lang
		// NOTE(review): try_by_name only gates the guessing below; the tables themselves are
		// always matched against lang. Kept as is.
		char * try_by_name = ctype;
		if ( ctype && *ctype != 0 && strcmp (ctype, "C") != 0 )
			try_by_name = lang;

		// Now do the guessing.
		if ( lang && *lang && !localeMapper && try_by_name && *try_by_name ) {
			if ( try_locale_list( iso8859_15locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-15" );
			else if ( try_locale_list( iso8859_2locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-2" );
			else if ( try_locale_list( iso8859_3locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-3" );
			else if ( try_locale_list( iso8859_4locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-4" );
			else if ( try_locale_list( iso8859_5locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-5" );
			else if ( try_locale_list( iso8859_6locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-6" );
			else if ( try_locale_list( iso8859_7locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-7" );
			else if ( try_locale_list( iso8859_8locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-8-I" );
			else if ( try_locale_list( iso8859_9locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-9" );
			else if ( try_locale_list( iso8859_13locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-13" );
			else if ( try_locale_list( tis_620locales, lang ) )
				localeMapper = QTextCodec::codecForName( "ISO 8859-11" );
			else if ( try_locale_list( koi8_ulocales, lang ) )
				localeMapper = QTextCodec::codecForName( "KOI8-U" );
			else if ( try_locale_list( cp_1251locales, lang ) )
				localeMapper = QTextCodec::codecForName( "CP 1251" );
			else if ( try_locale_list( pt_154locales, lang ) )
				localeMapper = QTextCodec::codecForName( "PT 154" );
			else if ( try_locale_list( probably_koi8_rlocales, lang ) )
				localeMapper = ru_RU_hack( lang );
		}

		delete [] ctype;
		delete [] lang;
	}

	// A codec with MIB enum 11 is replaced by the "ISO 8859-8-I" codec.
	if ( localeMapper && localeMapper->mibEnum() == 11 )
		localeMapper = QTextCodec::codecForName( "ISO 8859-8-I" );

	// If everything failed, we default to 8859-1
	// We could perhaps default to 8859-15.
	if ( !localeMapper )
		localeMapper = QTextCodec::codecForName( "ISO 8859-1" );
#endif
	return localeMapper;
}
/***************************************************************************
rklocalesupport - description
-------------------
begin : Sun Mar 11 2007
copyright : (C) 2007 by Thomas Friedrichsmeier
email : thomas.friedrichsmeier@kdemail.net
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#ifndef RKLOCALESUPPORT_H
#define RKLOCALESUPPORT_H
// Forward declaration only: this header merely passes a QTextCodec pointer around.
class QTextCodec;
/** Helper function to determine the QTextCodec best suited to recode text in the encoding of
the current CTYPE locale.
On Windows, the "System" codec is looked up. Elsewhere the codeset is taken from
nl_langinfo(CODESET) where available, then derived from the locale name (setlocale() and the
LC_ALL / LC_CTYPE / LANG environment variables), with "ISO 8859-1" as the last fallback. */
QTextCodec *RKGetCurrentLocaleCodec ();
#endif
......@@ -45,7 +45,6 @@ void* RKRBackend::default_global_context = 0;
#include "rkrsupport.h"
#include "rkstructuregetter.h"
#include "rklocalesupport.h"
#include "rksignalsupport.h"
#include "rkreventloop.h"
#include "../misc/rkcommonfunctions.h"
......@@ -307,7 +306,7 @@ int RReadConsole (const char* prompt, unsigned char* buf, int buflen, int hist)
RKRBackend::repl_status.user_command_completely_transmitted = false;
RKRBackend::repl_status.user_command_parsed_up_to = 0;
RKRBackend::repl_status.user_command_successful_up_to = 0;
RKRBackend::repl_status.user_command_buffer = RKRBackend::this_pointer->current_locale_codec->fromUnicode (command->command);
RKRBackend::repl_status.user_command_buffer = RKRBackend::fromUtf8 (command->command);
RKTransmitNextUserCommandChunk (buf, buflen);
RKRBackend::repl_status.user_command_status = RKRBackend::RKReplStatus::UserCommandTransmitted;
return 1;
......@@ -317,7 +316,7 @@ int RReadConsole (const char* prompt, unsigned char* buf, int buflen, int hist)
// fully transmitted, but R is still asking for more? This looks like an incomplete statement.
// HOWEVER: It may also have been an empty statement such as " ", so let's check whether the prompt looks like a "continue" prompt
bool incomplete = false;
if (RKRBackend::this_pointer->current_locale_codec->toUnicode (prompt) == RKRSupport::SEXPToString (Rf_GetOption (Rf_install ("continue"), R_BaseEnv))) {
if (RKRBackend::toUtf8 (prompt) == RKRSupport::SEXPToString (Rf_GetOption (Rf_install ("continue"), R_BaseEnv))) {
incomplete = true;
}
if (incomplete) RKRBackend::this_pointer->current_command->status |= RCommand::Failed | RCommand::ErrorIncomplete;
......@@ -416,7 +415,7 @@ int RReadConsole (const char* prompt, unsigned char* buf, int buflen, int hist)
RK_ASSERT (false); // should not reach this point.
}
QByteArray localres = RKRBackend::this_pointer->current_locale_codec->fromUnicode (request.params["result"].toString ());
QByteArray localres = RKRBackend::fromUtf8 (request.params["result"].toString ());
// need to append a newline, here. TODO: theoretically, RReadConsole comes back for more, if \0 was encountered before \n.
qstrncpy ((char *) buf, localres.left (buflen - 2).append ('\n').data (), buflen);
return 1;
......@@ -448,7 +447,7 @@ bool RKRBackend::fetchStdoutStderr (bool forcibly) {
if (bytes <= 0) break;
buffer[bytes] = '\0';
// NOTE: we must not risk blocking inside handleOutput, while the stdout_stderr_mutex is locked!
handleOutput (current_locale_codec->toUnicode (buffer, bytes), bytes, ROutput::Warning, false);
handleOutput (RKRBackend::toUtf8 (buffer), bytes, ROutput::Warning, false);
}
stdout_stderr_mutex.unlock ();
......@@ -479,7 +478,7 @@ void RWriteConsoleEx (const char *buf, int buflen, int type) {
if (RKRBackend::this_pointer->killed == RKRBackend::AlreadyDead) return; // this check is mostly for fork()ed clients
if (RKRBackend::repl_status.browser_context == RKRBackend::RKReplStatus::InBrowserContextPreventRecursion) return;
RKRBackend::this_pointer->fetchStdoutStderr (true);
RKRBackend::this_pointer->handleOutput (RKRBackend::this_pointer->current_locale_codec->toUnicode (buf, buflen), buflen, type == 0 ? ROutput::Output : ROutput::Warning);
RKRBackend::this_pointer->handleOutput (RKRBackend::toUtf8 (buf), buflen, type == 0 ? ROutput::Output : ROutput::Warning);
}
/** For R callbacks that we want to disable, entirely */
......@@ -596,7 +595,7 @@ int RChooseFile (int isnew, char *buf, int len) {
RKRBackend::this_pointer->handleRequest (&request);
QByteArray localres = RKRBackend::this_pointer->current_locale_codec->fromUnicode (request.params["result"].toString ());
QByteArray localres = RKRBackend::fromUtf8 (request.params["result"].toString ());
qstrncpy ((char *) buf, localres.data (), len);
// return length of filename (strlen (buf))
......@@ -731,7 +730,7 @@ void RBusy (int busy) {
if (RKRBackend::repl_status.user_command_status == RKRBackend::RKReplStatus::UserCommandTransmitted) {
if (RKRBackend::this_pointer->current_command->type & RCommand::CCCommand) {
QByteArray chunk = RKRBackend::repl_status.user_command_buffer.mid (RKRBackend::repl_status.user_command_parsed_up_to, RKRBackend::repl_status.user_command_transmitted_up_to - RKRBackend::repl_status.user_command_parsed_up_to);
RKRBackend::this_pointer->printCommand (RKRBackend::this_pointer->current_locale_codec->toUnicode (chunk));
RKRBackend::this_pointer->printCommand (RKRBackend::toUtf8 (chunk));
}
if (RKRBackend::this_pointer->current_command->type & RCommand::CCOutput) {
// flush any previous output capture and start a new one
......@@ -746,11 +745,13 @@ void RBusy (int busy) {
// ############## R Standard callback overrides END ####################
SEXP doUpdateLocale ();
// NOTE: stdout_stderr_mutex is recursive to support fork()s, better
RKRBackend::RKRBackend () : stdout_stderr_mutex (QMutex::Recursive) {
RK_TRACE (RBACKEND);
current_locale_codec = QTextCodec::codecForLocale ();
current_locale_encoder = 0; // marks locale as not yet initialized
doUpdateLocale ();
r_running = false;
current_command = 0;
......@@ -915,8 +916,15 @@ SEXP doUpdateLocale () {
RK_TRACE (RBACKEND);
RK_DEBUG (RBACKEND, DL_WARNING, "Changing locale");
RKRBackend::this_pointer->current_locale_codec = RKGetCurrentLocaleCodec ();
RK_DEBUG (RBACKEND, DL_WARNING, "New locale codec is %s", RKRBackend::this_pointer->current_locale_codec->name ().data ());
if (RKRBackend::this_pointer->current_locale_encoder) {
delete (RKRBackend::this_pointer->current_locale_encoder);
delete (RKRBackend::this_pointer->current_locale_decoder);
QTextCodec::setCodecForLocale (0);
RK_ASSERT (QTextCodec::codecForLocale ());
RKRBackend::this_pointer->current_locale_encoder = QTextCodec::codecForLocale ()->makeEncoder (QTextCodec::DefaultConversion); // NOTE: shall pass non-representable characters unmodified, rather than stripping them.
RKRBackend::this_pointer->current_locale_decoder = QTextCodec::codecForLocale ()->makeDecoder (QTextCodec::DefaultConversion);
}
RK_DEBUG (RBACKEND, DL_WARNING, "New locale codec is %s", QTextCodec::codecForLocale ()->name ().data ());
return R_NilValue;
}
......@@ -925,10 +933,10 @@ SEXP doUpdateLocale () {
SEXP doLocaleName () {
RK_TRACE (RBACKEND);
RK_ASSERT (RKRBackend::this_pointer->current_locale_codec);
RK_ASSERT (QTextCodec::codecForLocale());
SEXP res = Rf_allocVector(STRSXP, 1);
PROTECT (res);
SET_STRING_ELT (res, 0, Rf_mkChar (RKRBackend::this_pointer->current_locale_codec->name ().data ()));
SET_STRING_ELT (res, 0, Rf_mkChar (QTextCodec::codecForLocale()->name ().data ()));
UNPROTECT (1);
return res;
}
......@@ -1031,9 +1039,6 @@ bool RKRBackend::startR () {
RKSignalSupport::installSignalProxies (); // for the crash signals
RKSignalSupport::installSigIntAndUsrHandlers (RK_scheduleIntr);
RKRBackend::this_pointer->current_locale_codec = RKGetCurrentLocaleCodec (); // Ok, why is this needed? Beats me (mostly), but the result is different form codecForLocale() used in initialization:
// This one will turn non-representable characters into unicode numbers, the other one will just strip them...
// KF5 TODO: Use makeEncoder() and makeDecoder() to get defined behavior on this
// register our functions
R_CallMethodDef callMethods [] = {
......@@ -1140,7 +1145,7 @@ SEXP parseCommand (const QString &command_qstring, RKRBackend::RKWardRError *err
SafeParseWrap wrap;
wrap.status = PARSE_NULL;
QByteArray localc = RKRBackend::this_pointer->current_locale_codec->fromUnicode (command_qstring); // needed so the string below does not go out of scope
QByteArray localc = RKRBackend::fromUtf8 (command_qstring); // needed so the string below does not go out of scope
const char *command = localc.data ();
PROTECT(wrap.cv=Rf_allocVector(STRSXP, 1));
......@@ -1366,7 +1371,7 @@ void RKRBackend::catToOutputFile (const QString &out) {
RK_ASSERT (false);
return;
}
f.write (current_locale_codec->fromUnicode (out));
f.write (RKRBackend::fromUtf8 (out));
f.close ();
}
......@@ -1427,7 +1432,7 @@ void RKRBackend::commandFinished (bool check_object_updates_needed) {
// This method may look a bit over-complex, but remember that repl_status.user_command_successful_up_to works on an *encoded* buffer
QByteArray remainder_encoded = repl_status.user_command_buffer.mid (repl_status.user_command_successful_up_to);
QString remainder = current_locale_codec->toUnicode (remainder_encoded);
QString remainder = RKRBackend::toUtf8 (remainder_encoded);
current_command->has_been_run_up_to = current_command->command.length () - remainder.length ();
}
......
......@@ -25,6 +25,8 @@
#include <QMutex>
#include <QStringList>
#include <QEvent>
#include <QTextEncoder>
#include <QTextDecoder>
#include "rcommand.h"
#include "rcommandstack.h"
......@@ -34,8 +36,6 @@
void RK_scheduleIntr();
#endif
class QStringList;
class QTextCodec;
class RInterface;
struct ROutput;
......@@ -139,7 +139,14 @@ handleHistoricalSubstackRequest(). Exactly which requests get handled by which f
void kill () { killed = ExitNow; };
bool isKilled () { return (killed != NotKilled); };
QTextCodec *current_locale_codec;
/** Decode a 0-terminated string in the current locale encoding, using the backend's
 *  current_locale_decoder. Despite the name, the result is a QString. */
static QString toUtf8 (const char *local_coded) {
	QTextDecoder *decoder = this_pointer->current_locale_decoder;
	return decoder->toUnicode (local_coded);
}
/** Encode a QString into the current locale encoding, using the backend's
 *  current_locale_encoder. */
static QByteArray fromUtf8 (const QString &uni_coded) {
	QTextEncoder *encoder = this_pointer->current_locale_encoder;
	return encoder->fromUnicode (uni_coded);
}
QTextEncoder *current_locale_encoder;
QTextDecoder *current_locale_decoder;
struct RKReplStatus {
QByteArray user_command_buffer;
......
......@@ -118,7 +118,7 @@ QStringList RKRSupport::SEXPToStringList (SEXP from_exp) {
} else if (IS_LATIN1 (dummy)) {
list.append (QString::fromLatin1 ((char *) STRING_PTR (dummy)));
} else {
list.append (RKRBackend::this_pointer->current_locale_codec->toUnicode ((char *) STRING_PTR (dummy)));
list.append (RKRBackend::toUtf8 ((char *) STRING_PTR (dummy)));
}
}
}
......@@ -136,7 +136,7 @@ SEXP RKRSupport::StringListToSEXP (const QStringList& list) {
SET_STRING_ELT (ret, i, Rf_mkCharCE (list[i].toUtf8 (), CE_UTF8));
#else
// TODO Rf_mkCharCE _might_ have been introduced earlier. Check if still an ongoing concern.
SET_STRING_ELT (ret, i, Rf_mkChar (RKRBackend::this_pointer->current_locale_codec->fromUnicode (list[i]).data ()));
SET_STRING_ELT (ret, i, Rf_mkChar (RKRBackend::fromUtf8 (list[i]).data ()));
#endif
}
return ret;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment