indexededictfile.cpp 11.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/*****************************************************************************
 * This file is part of Kiten, a KDE Japanese Reference Tool                 *
 * Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com>                 *
 * Copyright (C) 2008 Joseph Kerian <jkerian@gmail.com>                      *
 *                                                                           *
 * This library is free software; you can redistribute it and/or             *
 * modify it under the terms of the GNU Library General Public               *
 * License as published by the Free Software Foundation; either              *
 * version 2 of the License, or (at your option) any later version.          *
 *                                                                           *
 * This library is distributed in the hope that it will be useful,           *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of            *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU         *
 * Library General Public License for more details.                          *
 *                                                                           *
 * You should have received a copy of the GNU Library General Public License *
 * along with this library; see the file COPYING.LIB.  If not, write to      *
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,      *
 * Boston, MA 02110-1301, USA.                                               *
 *****************************************************************************/

#include "indexededictfile.h"

24
#include <QApplication>
25
26
27
28
#include <KProcess>

#include <QFile>
#include <QFileInfo>
29
#include <QStandardPaths>
30
31
32
#include <QString>
#include <QTextCodec>
#include <QVector>
33
34
35

#include <sys/mman.h>

36
/**
 * Constructs an unloaded dictionary handle.
 * The object starts out invalid and both mapping pointers carry the
 * MAP_FAILED sentinel, i.e. nothing is mapped yet.
 */
IndexedEdictFile::IndexedEdictFile()
  : m_valid( false ),
    m_dictPtr( static_cast<unsigned char*>( MAP_FAILED ) ),
    m_indexPtr( static_cast<uint32_t*>( MAP_FAILED ) )
{
}

43
/**
 * Releases both memory maps and closes both files, but only when a
 * dictionary was successfully loaded (m_valid is set).
 */
IndexedEdictFile::~IndexedEdictFile()
{
  if( ! m_valid )
  {
    return; //Nothing was mapped or left open
  }

  //Unmap first, while the files (and therefore their sizes) are still available
  void *dictMapping = m_dictPtr;
  void *indexMapping = m_indexPtr;
  munmap( dictMapping, m_dictFile.size() );
  munmap( indexMapping, m_indexFile.size() );

  m_dictFile.close();
  m_indexFile.close();
}
53

54
55
//Warning: This assumes that both files are CLOSED
bool IndexedEdictFile::buildIndex()
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
56
{
57
  KProcess proc;
Laurent Montel's avatar
Laurent Montel committed
58
  proc << QStandardPaths::findExecutable(QStringLiteral("kitengen")) << m_dictFile.fileName() << m_indexFile.fileName();
59
60
  proc.start();
  proc.waitForStarted();
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
61

62
  do
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
63
  {
64
    QApplication::processEvents();
65
  } while( proc.waitForFinished( 5000 ) ); //FIXME: This just cuts the index generator off after 5 sec
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
66

67
68
69
70
71
72
73
74
75
76
77
78
79
  //FIXME: Check for the result of this operation
  return proc.exitStatus() == QProcess::NormalExit && proc.exitCode() == 0;
}

//Warning: This assumes that both files have already been opened
bool IndexedEdictFile::checkIndex() const
{
  //Verify the index file version and size
  uint32_t dictionaryLength = static_cast<uint32_t>( m_dictFile.size() );
  dictionaryLength++;
  uint32_t indexVersionTest;

  if( 4 == m_indexFile.read( reinterpret_cast<char*>( &indexVersionTest ), 4 ) )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
80
  {
81
    if( indexVersionTest == dictionaryLength + indexFileVersion )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
82
    {
83
      return true;
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
84
85
86
    }
  }

87
88
  return false;
}
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
89

90
91
92
93
94
95
96
97
98
99
100
/**
 * Prefix comparison of two EUC encoded, NUL terminated strings.
 *
 * Bytes are compared position by position after two normalisations:
 *  - on even offsets (treated as the high byte of a kana pair) 0xA5 is
 *    folded to 0xA4, so that katakana and hiragana compare as equal;
 *  - ASCII 'A'..'Z' are folded to lower case.
 *
 * @param str1 the string that must be equal to, or a prefix of, str2
 * @param str2 the string to compare against
 * @return 0 if str1 is equal to or the same as the beginning of str2,
 *         otherwise (byte of str2) - (byte of str1) at the first difference.
 */
int IndexedEdictFile::equalOrSubstring( const char *str1, const char *str2 ) const
{
  for( unsigned offset = 0; ; ++offset )
  {
    unsigned char lhs = static_cast<unsigned char>( str1[ offset ] );
    unsigned char rhs = static_cast<unsigned char>( str2[ offset ] );

    //Running out of str1 without a mismatch means it is a prefix of str2
    if( lhs == '\0' )
    {
      return 0;
    }

    const bool onHighByte = ( offset % 2 ) == 0;
    if( onHighByte )
    {
      //Make katakana and hiragana equivalent
      rhs = ( rhs == 0xA5 ) ? 0xA4 : rhs;
      lhs = ( lhs == 0xA5 ) ? 0xA4 : lhs;
    }

    //Fold upper case latin letters to lower case
    if( lhs >= 'A' && lhs <= 'Z' )
    {
      lhs |= 0x20;
    }
    if( rhs >= 'A' && rhs <= 'Z' )
    {
      rhs |= 0x20;
    }

    if( lhs != rhs )
    {
      return static_cast<int>( rhs ) - static_cast<int>( lhs );
    }
  }
}

141
142
143
144
145
/**
 * Simple binary search through the index to find the query
 * Returns 0 on failure.
 */
uint32_t IndexedEdictFile::findFirstMatch( const QByteArray &query ) const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
146
{
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
  int low = 0;
  int high = m_indexFile.size() / sizeof( uint32_t ) - 1;
  int cur;
  int comp = 0;

  do
  {
    cur = ( high + low ) / 2;
    comp = equalOrSubstring( query, lookupDictLine( cur ) );
    if( comp < 0 )
    {
      low = cur + 1;
    }
    if( comp > 0 )
    {
      high = cur - 1;
    }
  } while( high >= low && comp != 0 && ! ( high == 0 && low == 0 ) );

  if( comp != 0 )
  {
    return 0;
  }

  while( cur - 1 && 0 == equalOrSubstring( query,lookupDictLine( cur ) ) )
  {
    --cur;
  }

  return cur;
177
178
}

179
/**
 * Looks up every dictionary line that has an indexed word starting with
 * the given query.
 *
 * The query is converted to EUC-JP, located in the index with
 * findFirstMatch(), and every consecutive matching index entry is resolved
 * to the offset of the dictionary line containing it. Duplicate lines are
 * reported once, in ascending file-offset order.
 *
 * @param query the text to search for
 * @return the matching dictionary lines as Unicode strings; empty if the
 *         dictionary is not loaded, the codec is unavailable or nothing matches.
 */
QVector<QString> IndexedEdictFile::findMatches( const QString &query ) const
{
  QVector<QString> results;
  if( ! m_valid )
  {
    return results;
  }

  QTextCodec *codec = QTextCodec::codecForName( "eucJP" );
  if( ! codec )
  {
    return results;
  }

  const QByteArray searchString = codec->fromUnicode( query );
  const int indexSize = static_cast<int>( m_indexFile.size() / sizeof( uint32_t ) );

  //findFirstMatch() reports failure as 0; test it before touching the value
  const uint32_t located = findFirstMatch( searchString );
  if( located == 0 )
  {
    return results;
  }

  //The returned slot is normally the one just before the first match, but it
  //is itself a match when the backwards walk stopped at the first slot.
  int entry = static_cast<int>( located );
  if( entry < indexSize
      && 0 != equalOrSubstring( searchString.constData(), lookupDictLine( entry ).constData() ) )
  {
    ++entry;
  }

  QVector<uint32_t> possibleHits;
  for( ; entry < indexSize
         && 0 == equalOrSubstring( searchString.constData(), lookupDictLine( entry ).constData() )
       ; ++entry )
  {
    //Index values are 1-based offsets of a word; scan backwards from the
    //byte before the word until a newline. lookupDictChar() answers 0x0A
    //for out-of-range offsets, so wrapping below 0 ends the scan at the
    //start of the file.
    uint32_t probe = m_indexPtr[ entry ] - 2;
    while( lookupDictChar( probe ) != 0x0A )
    {
      --probe;
    }
    possibleHits.push_back( probe + 1 ); //First byte of the containing line
  }

  if( possibleHits.isEmpty() )
  {
    return results;
  }

  std::sort( possibleHits.begin(), possibleHits.end() );

  //Emit each distinct line once; a flag (not a magic offset) marks "no
  //previous hit" so that a line starting at offset 0 is not dropped.
  bool havePrevious = false;
  uint32_t previous = 0;
  for( int k = 0; k < possibleHits.size(); ++k )
  {
    const uint32_t lineStart = possibleHits.at( k );
    if( havePrevious && previous == lineStart )
    {
      continue;
    }
    havePrevious = true;
    previous = lineStart;
    results.push_back( codec->toUnicode( lookupFullLine( lineStart ) ) );
  }

  return results;
}

Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
236
/**
237
238
239
240
241
242
 * A small set of EUC formatted string comparison functions
 * that will handle katakana and hiragana as the same, and compare both
 * kanji and latin chars in the same function. Handy for a few sections
 * of the doSearch() routine.
 * findMatches() is another string comparer, but unlike the other one
 * compares strictly (with an exepmption for punctuation).
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
243
 */
244
int IndexedEdictFile::findMatches( const char *str1, const char *str2 ) const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
245
{
246
#define EUC_LATIN_CHARACTER(x) (('a'<=x && x<='z')||(x==0xA4)||(x==0x80))
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
247

248
  for(unsigned i=0; ; ++i)
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
249
  {
250
251
252
253
    unsigned char c1 = static_cast<unsigned char>( str1[ i ] );
    unsigned char c2 = static_cast<unsigned char>( str2[ i ] );

    if( ( i % 2 ) == 0 )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
254
    {
255
      //on the highbyte (of kana)
256
      if( c2 == 0xA5 ) //Make katakana and hiragana equivalent
257
258
259
260
261
262
263
264
      {
        c2 = 0xA4;
      }

      if( c1 == 0xA5 )
      {
        c1 = 0xA4;
      }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
265
    }
266
267

    if( ( 'A' <= c1 ) && ( c1 <= 'Z' ) )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
268
    {
269
270
271
272
273
      c1 |= 0x20; // 'fix' uppercase
    }
    if( ( 'A' <= c2 ) && ( c2 <= 'Z' ) )
    {
      c2 |= 0x20;
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
274
275
    }

276
277
278
279
280
281
    if( c1 == '\0' )
    {
      if( ! EUC_LATIN_CHARACTER( c2 ) )
      {
        return 0;
      }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
282

283
284
285
286
287
288
289
      return c2;
    }

    if( c1 != c2 )
    {
      return (int)c2 - (int)c1;
    }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
290
291
  }

292
  return 0; //shouldn't happen... but gcc will warn if this isn't here
293
294
}

295
bool IndexedEdictFile::loadFile( const QString &fileName )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
296
{
297
298
299
300
  if( m_valid )
  {
    return false;
  }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
301

302
303
  m_dictFile.setFileName( fileName );
  if( ! m_dictFile.exists() )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
304
  {
305
306
    return false;		//Bail if the file doesn't exist
  }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
307

308
  m_dictPtr = static_cast<unsigned char*>( MAP_FAILED );
309
310
  m_indexFile.setFileName( QStandardPaths::writableLocation(QStandardPaths::GenericDataLocation) + QLatin1Char('/') + "kiten/xjdx/"
                        + QFileInfo( fileName ).baseName() + ".xjdx" );
311
312
313
314
315
316
317
318
319
320
  m_indexPtr = static_cast<uint32_t*>( MAP_FAILED );
  if( ! m_indexFile.exists() )
  {
    //If the index file isn't there, build it
    //TODO: Verify the format if the index doesn't exist?
    if( ! buildIndex() ) //If we can't build the file, bail
    {
      return false;
    }
  }
321

322
323
324
325
  if( ! m_dictFile.open( QIODevice::ReadOnly ) )
  {
    return false;
  }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
326

327
  if( m_indexFile.open( QIODevice::ReadOnly ) )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
328
  {
329
    if( checkIndex() )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
330
    {
331
332
333
334
335
      if( loadmmaps() )
      {
        m_valid = true;
        return true;
      }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
336
    }
337
338

    m_indexFile.close();
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
339
340
  }

341
342
343
  //Success is actually in the middle of that if statement, so if we get here
  //something failed and we need to clean up
  m_dictFile.close();
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
344
  return false;
345
346
347
}

//Warning: This assumes that both files have already been opened
348
bool IndexedEdictFile::loadmmaps()
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
349
350
{
  m_indexPtr = static_cast<uint32_t*>(
Laurent Montel's avatar
Laurent Montel committed
351
                  mmap(nullptr, m_indexFile.size(), PROT_READ, MAP_SHARED, m_indexFile.handle(), 0));
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
352
353
354
355
356
  if( m_indexPtr == static_cast<uint32_t*>( MAP_FAILED ) )
  {
    return false;
  }

Laurent Montel's avatar
Laurent Montel committed
357
  m_dictPtr = static_cast<unsigned char*>( mmap(  nullptr
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
358
359
360
361
362
363
364
365
366
367
368
369
370
                                                , m_dictFile.size()
                                                , PROT_READ
                                                , MAP_SHARED
                                                , m_dictFile.handle()
                                                , 0 ) );
  if( m_dictPtr == static_cast<unsigned char*>( MAP_FAILED ) )
  {
    munmap( static_cast<void*>( m_indexPtr ), m_indexFile.size() );
    m_indexPtr = static_cast<uint32_t*>( MAP_FAILED );
    return false;
  }

  return true;
371
372
}

Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
373
/**
374
 * Similar to it's larger cousin below, this scans the dictionary at
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
375
376
377
378
 * a particular location and extracts the unsigned char at a particular
 * location within the file/memorymap, offset from the start of the
 * dictionary.
 */
379
inline unsigned char IndexedEdictFile::lookupDictChar( uint32_t i ) const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
380
{
381
  if( i > static_cast<uint32_t>( m_dictFile.size() ) /*|| i < 0*/ )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
382
383
384
  {
    return 0x0A; //If out of bounds, return endl
  }
385

Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
386
  return m_dictPtr[i];
387
388
}

Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
389
390
/**
 * This quick utility method looks in index at location i and pulls,
391
 * the corresponding line from the dictionary returning it as an euc
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
392
393
 * formatted QCString. i=1 is the first entry that the index points to.
 */
394
QByteArray IndexedEdictFile::lookupDictLine( uint32_t i ) const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
395
{
396
  if( i > static_cast<uint32_t>( m_dictFile.size()) /*|| i < 0*/ )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
  {
    return QByteArray( "" );
  }

  uint32_t start = m_indexPtr[ i ] - 1;
  uint32_t pos = start;
  const unsigned size = m_dictFile.size();
  //Grab the whole word
  //As long as we don't get EOF, null or newline... keep going forward
  while( pos<=size && m_dictPtr[ pos ] != 0 && m_dictPtr[ pos ] != 0x0A )
  {
    ++pos;
  }

  //Copy the word to a QCString
  QByteArray retval(  (const char*)( m_dictPtr + start )
                    , 1 + pos - start );
  //and away we go
  return retval;
416
417
}

Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
418
/**
419
 * This is just like lookupDictChar, but grabs till EOL
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
420
 */
421
QByteArray IndexedEdictFile::lookupFullLine( uint32_t i ) const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
422
{
423
  if( i > static_cast<uint32_t>( m_dictFile.size() ) /*|| i < 0*/ )
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
424
  {
425
426
    return QByteArray (0x0A, 1 );	//If out of bounds, return endl
  }
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
427

428
429
430
431
432
433
  uint32_t start = i;
  uint32_t pos = start;
  const unsigned max = m_dictFile.size();
  while( pos <= max && m_dictPtr[ pos ] != 0 && m_dictPtr[ pos ] != 0x0A )
  {
    ++pos;
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
434
435
  }

436
437
438
439
  QByteArray retval(  (const char*)( m_dictPtr + start )
                    , 1 + pos - start );
  //and away we go
  return retval;
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
440
}
441

442
bool IndexedEdictFile::valid() const
Daniel E. Moctezuma's avatar
Daniel E. Moctezuma committed
443
{
444
  return m_valid;
445
}