/*
  Copyright (c) 2002 Dave Corrie <kde@davecorrie.com>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Library General Public License for more details.

  You should have received a copy of the GNU Library General Public License
  along with this library; see the file COPYING.LIB.  If not, write to
  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  Boston, MA 02110-1301, USA.
*/

/**
  @file
  This file is part of the KDEPIM Utilities library and provides the
  LinkLocator class.

  @brief
  Identifies URLs and email addresses embedded in plaintext.

  @author Dave Corrie \<kde@davecorrie.com\>
*/
29
#include "linklocator.h"
Allen Winter's avatar
Allen Winter committed
30

31
#include <KEmoticons>
32

33
#include <QtCore/QCoreApplication>
Allen Winter's avatar
Allen Winter committed
34 35
#include <QtCore/QFile>
#include <QtCore/QRegExp>
36
#include <QTextDocument>
37

38
#include <climits>
39

40
using namespace KPIMUtils;
41

42 43 44 45 46 47 48 49 50 51 52 53 54
/**
  Private class that helps to provide binary compatibility between releases.
  @internal
*/
//@cond PRIVATE
class KPIMUtils::LinkLocator::Private
{
  public:
    // Upper bound on the length of a URL returned by getUrl(); longer
    // candidates are rejected. Set by the constructor and setMaxUrlLen().
    int mMaxUrlLen;
    // Upper bound on the length of an address returned by getEmailAddress();
    // longer candidates are rejected. Set by the constructor and
    // setMaxAddressLen().
    int mMaxAddressLen;
};
//@endcond

55
// Use a static for this as calls to the KEmoticons constructor are expensive.
Pino Toscano's avatar
Pino Toscano committed
56
K_GLOBAL_STATIC( KEmoticons, sEmoticons )
57

Allen Winter's avatar
Allen Winter committed
58
/**
  Constructs a LinkLocator operating on @p text, with the scan position
  initially set to @p pos.

  The maximum URL length defaults to 4096 and the maximum email address
  length to 255.
*/
LinkLocator::LinkLocator( const QString &text, int pos )
  : mText( text ), mPos( pos ), d( new KPIMUtils::LinkLocator::Private )
{
  d->mMaxUrlLen = 4096;
  d->mMaxAddressLen = 255;

  // If you change either of the above values for maxUrlLen or
  // maxAddressLen, then please also update the documentation for
  // setMaxUrlLen()/setMaxAddressLen() in the header file AND the
  // default values used for the maxUrlLen/maxAddressLen parameters
  // of convertToHtml().
}

71 72 73 74 75
/**
  Destroys the LinkLocator and frees its private data.
*/
LinkLocator::~LinkLocator()
{
  delete d; // allocated with new in the constructor
}

Allen Winter's avatar
Allen Winter committed
76
void LinkLocator::setMaxUrlLen( int length )
77
{
78
  d->mMaxUrlLen = length;
79 80 81 82
}

int LinkLocator::maxUrlLen() const
{
83
  return d->mMaxUrlLen;
84 85
}

Allen Winter's avatar
Allen Winter committed
86
void LinkLocator::setMaxAddressLen( int length )
87
{
88
  d->mMaxAddressLen = length;
89 90 91 92
}

int LinkLocator::maxAddressLen() const
{
93
  return d->mMaxAddressLen;
94 95 96 97 98
}

/**
  Extracts the URL that starts at the current position, if any.

  On success the URL is returned and the current position is moved to the
  last character of the text that belongs to the URL. If there is no URL at
  the current position, or the candidate is too long or consists of nothing
  but a scheme/prefix, an empty string is returned and the current position
  is left unchanged.
*/
QString LinkLocator::getUrl()
{
  QString url;
  if ( !atUrl() ) {
    return url;
  }

  // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C
  // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall
  // be allowed and should be ignored when the URI is extracted.

  // This implementation follows this recommendation and
  // allows the URL to be enclosed within different kinds of brackets/quotes.
  // If a URL is enclosed, whitespace characters are allowed and removed, otherwise
  // the URL ends with the first whitespace.
  // Also, if the URL is enclosed in brackets, the URL itself is not allowed
  // to contain the closing bracket, as this would be detected as the end of the URL.

  QChar beforeUrl, afterUrl;

  // detect if the url has been surrounded by brackets or quotes
  // (round brackets are deliberately not handled here)
  if ( mPos > 0 ) {
    beforeUrl = mText[mPos - 1];

    if ( beforeUrl == QLatin1Char('[') ) {
      afterUrl = QLatin1Char(']');
    } else if ( beforeUrl == QLatin1Char('<') ) {
      afterUrl = QLatin1Char('>');
    } else if ( beforeUrl == QLatin1Char('>') ) { // for e.g. <link>http://.....</link>
      afterUrl = QLatin1Char('<');
    } else if ( beforeUrl == QLatin1Char('"') ) {
      afterUrl = QLatin1Char('"');
    }
  }

  url.reserve( maxUrlLen() );  // avoid allocs
  const int start = mPos;
  const int textLength = mText.length();

  while ( mPos < textLength ) {
    const QChar current = mText.at( mPos );
    if ( !current.isPrint() && !current.isSpace() ) {
      break; // control characters always terminate the URL
    }
    // an enclosed URL ends at the closing delimiter, a bare one at the first whitespace
    const bool atEnd = afterUrl.isNull() ? current.isSpace() : ( current == afterUrl );
    if ( atEnd ) {
      break;
    }

    if ( !current.isSpace() ) {   // skip whitespace inside enclosed URLs
      url.append( current );
      if ( url.length() > maxUrlLen() ) {
        break;
      }
    }

    ++mPos;
  }

  bool accepted = ( url.length() <= maxUrlLen() );
  if ( accepted ) {
    // HACK: This is actually against the RFC. However, most people don't properly escape the URL in
    //       their text with "" or <>. That leads to people writing a URL, followed immediately by
    //       a dot to finish the sentence. That would lead the parser to include the dot in the URL,
    //       even though that is not wanted. So work around that here.
    //       Most real-life URLs hopefully don't end with dots or commas.
    static const QString wordBoundaries = QLatin1String( ".,:!?)>" );
    while ( url.length() > 1 && wordBoundaries.contains( url.at( url.length() - 1 ) ) ) {
      url.chop( 1 );
      --mPos;
    }

    // Check for emptiness only after the trailing punctuation is gone, so that
    // a bare "mailto:", "www." or "http://." is not turned into a link.
    accepted = !isEmptyUrl( url );
  }

  if ( accepted ) {
    --mPos; // leave the position on the last character of the URL
  } else {
    mPos = start;
    url.clear();
  }

  return url;
}

// keep this in sync with KMMainWin::slotUrlClicked()
bool LinkLocator::atUrl() const
{
  // the following characters are allowed in a dot-atom (RFC 2822):
  // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
180
  static const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" );
181 182 183

  // the character directly before the URL must not be a letter, a number or
  // any other character allowed in a dot-atom (RFC 2822).
Allen Winter's avatar
Allen Winter committed
184 185 186
  if ( ( mPos > 0 ) &&
       ( mText[mPos-1].isLetterOrNumber() ||
         ( allowedSpecialChars.indexOf( mText[mPos-1] ) != -1 ) ) ) {
187
    return false;
Allen Winter's avatar
Allen Winter committed
188
  }
189 190

  QChar ch = mText[mPos];
191
  return
192
    ( ch == QLatin1Char('h') && ( mText.mid( mPos, 7 ) == QLatin1String( "http://" ) ||
Allen Winter's avatar
Allen Winter committed
193
                     mText.mid( mPos, 8 ) == QLatin1String( "https://" ) ) ) ||
194 195
    ( ch == QLatin1Char('v') && mText.mid( mPos, 6 ) == QLatin1String( "vnc://" ) ) ||
    ( ch == QLatin1Char('f') && ( mText.mid( mPos, 7 ) == QLatin1String( "fish://" ) ||
Allen Winter's avatar
Allen Winter committed
196 197
                     mText.mid( mPos, 6 ) == QLatin1String( "ftp://" ) ||
                     mText.mid( mPos, 7 ) == QLatin1String( "ftps://" ) ) ) ||
198
    ( ch == QLatin1Char('s') && ( mText.mid( mPos, 7 ) == QLatin1String( "sftp://" ) ||
Allen Winter's avatar
Allen Winter committed
199
                     mText.mid( mPos, 6 ) == QLatin1String( "smb://" ) ) ) ||
200 201 202
    ( ch == QLatin1Char('m') && mText.mid( mPos, 7 ) == QLatin1String( "mailto:" ) ) ||
    ( ch == QLatin1Char('w') && mText.mid( mPos, 4 ) == QLatin1String( "www." ) ) ||
    ( ch == QLatin1Char('f') && ( mText.mid( mPos, 4 ) == QLatin1String( "ftp." ) ||
Allen Winter's avatar
Allen Winter committed
203
                     mText.mid( mPos, 7 ) == QLatin1String( "file://" ) ) )||
204
    ( ch == QLatin1Char('n') && mText.mid( mPos, 5 ) == QLatin1String( "news:" ) );
205 206
}

Allen Winter's avatar
Allen Winter committed
207
bool LinkLocator::isEmptyUrl( const QString &url ) const
208 209
{
  return url.isEmpty() ||
Allen Winter's avatar
Allen Winter committed
210 211 212 213 214 215 216 217 218 219 220 221 222
    url == QLatin1String( "http://" ) ||
    url == QLatin1String( "https://" ) ||
    url == QLatin1String( "fish://" ) ||
    url == QLatin1String( "ftp://" ) ||
    url == QLatin1String( "ftps://" ) ||
    url == QLatin1String( "sftp://" ) ||
    url == QLatin1String( "smb://" ) ||
    url == QLatin1String( "vnc://" ) ||
    url == QLatin1String( "mailto" ) ||
    url == QLatin1String( "www" ) ||
    url == QLatin1String( "ftp" ) ||
    url == QLatin1String( "news" ) ||
    url == QLatin1String( "news://" );
223 224 225 226 227 228
}

/**
  Extracts the email address surrounding the '@' at the current position.

  The local part is searched backwards and the domain part forwards from the
  current position. On success the address is returned and the current
  position is moved to the last character of the address. An empty string is
  returned (and the position left unchanged) if the current character is not
  '@', if either part is empty or contains another '@', if the domain has no
  dot, or if the address is longer than maxAddressLen().
*/
QString LinkLocator::getEmailAddress()
{
  QString address;

  const int textLength = mText.length();
  if ( mPos < 0 || mPos >= textLength || mText.at( mPos ) != QLatin1Char('@') ) {
    return address;
  }

  // the following characters are allowed in a dot-atom (RFC 2822):
  // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~
  static const QString allowedSpecialChars = QLatin1String( ".!#$%&'*+-/=?^_`{|}~" );

  // determine the local part of the email address (ASCII only)
  int start = mPos - 1;
  while ( start >= 0 && mText[start].unicode() < 128 &&
          ( mText[start].isLetterOrNumber() ||
            mText[start] == QLatin1Char('@') || // allow @ to find invalid email addresses
            allowedSpecialChars.indexOf( mText[start] ) != -1 ) ) {
    if ( mText[start] == QLatin1Char('@') ) {
      return QString(); // local part contains '@' -> no email address
    }
    --start;
  }
  ++start;
  // we assume that an email address starts with a letter or a digit
  while ( ( start < mPos ) && !mText[start].isLetterOrNumber() ) {
    ++start;
  }
  if ( start == mPos ) {
    return QString(); // local part is empty -> no email address
  }

  // determine the domain part of the email address
  int dotPos = INT_MAX;
  int end = mPos + 1;
  while ( end < textLength &&
          ( mText[end].isLetterOrNumber() ||
            mText[end] == QLatin1Char('@') || // allow @ to find invalid email addresses
            mText[end] == QLatin1Char('.') ||
            mText[end] == QLatin1Char('-') ) ) {
    if ( mText[end] == QLatin1Char('@') ) {
      return QString(); // domain part contains '@' -> no email address
    }
    if ( mText[end] == QLatin1Char('.') ) {
      dotPos = qMin( dotPos, end ); // remember index of first dot in domain
    }
    ++end;
  }
  // we assume that an email address ends with a letter or a digit
  while ( ( end > mPos ) && !mText[end - 1].isLetterOrNumber() ) {
    --end;
  }
  if ( end == mPos ) {
    return QString(); // domain part is empty -> no email address
  }
  if ( dotPos >= end ) {
    return QString(); // domain part doesn't contain a dot
  }

  if ( end - start > maxAddressLen() ) {
    return QString(); // too long -> most likely no email address
  }

  address = mText.mid( start, end - start );
  mPos = end - 1; // leave the position on the last character of the address
  return address;
}

Allen Winter's avatar
Allen Winter committed
291 292
// Escapes @p value so that it can be placed inside a double-quoted HTML
// attribute without being able to terminate the attribute.
static QString escapeForHtmlAttribute( const QString &value )
{
  QString escaped = Qt::escape( value );
  // Do not rely on Qt::escape() handling the double quote; replacing it
  // explicitly is a no-op if it has already been converted.
  escaped.replace( QLatin1Char('"'), QLatin1String("&quot;") );
  return escaped;
}

/**
  Converts @p plainText to HTML.

  HTML metacharacters are replaced by entities and newlines by "<br />\n".
  Depending on @p flags, spaces and tabs are preserved (PreserveSpaces),
  URLs and email addresses are turned into links (unless IgnoreUrls is set),
  *bold*, _underlined_, /italic/ and -struck- words are marked up
  (HighlightText) and emoticons are replaced (ReplaceSmileys).

  @param plainText the text to convert.
  @param flags a bitwise OR of the conversion flags described above.
  @param maxUrlLen URLs longer than this are not turned into links.
  @param maxAddressLen email addresses longer than this are not turned into links.
  @return the HTML representation of @p plainText.
*/
QString LinkLocator::convertToHtml( const QString &plainText, int flags,
                                    int maxUrlLen, int maxAddressLen )
{
  LinkLocator locator( plainText );
  locator.setMaxUrlLen( maxUrlLen );
  locator.setMaxAddressLen( maxAddressLen );

  const QString nbsp = QLatin1String("&nbsp;");
  const int textLength = locator.mText.length();

  QString str;
  QString result;
  // Constructing the string from a null QChar pointer and a size does not
  // reserve capacity in Qt 4, so reserve explicitly.
  result.reserve( textLength * 2 );
  QChar ch;
  int x; // column within the current line, used for tab expansion
  bool startOfLine = true;

  for ( locator.mPos = 0, x = 0; locator.mPos < textLength;
        locator.mPos++, x++ ) {
    ch = locator.mText[locator.mPos];
    if ( flags & PreserveSpaces ) {
      if ( ch == QLatin1Char(' ') ) {
        if ( locator.mPos + 1 < textLength ) {
          const QChar next = locator.mText[locator.mPos + 1];
          if ( next != QLatin1Char(' ') ) {

            // A single space, make it breaking if not at the start or end of the line
            const bool endOfLine = ( next == QLatin1Char('\n') );
            if ( !startOfLine && !endOfLine ) {
              result += QLatin1Char(' ');
            } else {
              result += nbsp;
            }
          } else {

            // Whitespace of more than one space, make it all non-breaking
            while ( locator.mPos < textLength &&
                    locator.mText[locator.mPos] == QLatin1Char(' ') ) {
              result += nbsp;
              locator.mPos++;
              x++;
            }

            // We incremented once too often, undo that
            locator.mPos--;
            x--;
          }
        } else {
          // Last space in the text, it is non-breaking
          result += nbsp;
        }

        startOfLine = false;
        continue;
      } else if ( ch == QLatin1Char('\t') ) {
        // expand the tab to the next multiple of 8 columns
        do {
          result += nbsp;
          x++;
        } while ( ( x & 7 ) != 0 );
        x--; // the loop header increments x again
        startOfLine = false;
        continue;
      }
    }
    if ( ch == QLatin1Char('\n') ) {
      result += QLatin1String("<br />\n"); // Keep the \n, so apps can figure out the quoting levels correctly.
      startOfLine = true;
      x = -1; // becomes 0 with the increment in the loop header
      continue;
    }

    startOfLine = false;
    if ( ch == QLatin1Char('&') ) {
      result += QLatin1String("&amp;");
    } else if ( ch == QLatin1Char('"') ) {
      result += QLatin1String("&quot;");
    } else if ( ch == QLatin1Char('<') ) {
      result += QLatin1String("&lt;");
    } else if ( ch == QLatin1Char('>') ) {
      result += QLatin1String("&gt;");
    } else {
      const int start = locator.mPos;
      if ( !( flags & IgnoreUrls ) ) {
        str = locator.getUrl();
        if ( !str.isEmpty() ) {
          QString hyperlink;
          if ( str.left( 4 ) == QLatin1String("www.") ) {
            hyperlink = QLatin1String("http://") + str;
          } else if ( str.left( 4 ) == QLatin1String("ftp.") ) {
            hyperlink = QLatin1String("ftp://") + str;
          } else {
            hyperlink = str;
          }

          // The URL comes from untrusted text and may contain '"', '<' or '>';
          // escape it so that it cannot break out of the href attribute.
          result += QLatin1String("<a href=\"") + escapeForHtmlAttribute( hyperlink ) +
                    QLatin1String("\">") + Qt::escape( str ) + QLatin1String("</a>");
          x += locator.mPos - start;
          continue;
        }
        str = locator.getEmailAddress();
        if ( !str.isEmpty() ) {
          // len is the length of the local part
          const int len = str.indexOf( QLatin1Char('@') );
          const QString localPart = str.left( len );

          // remove the local part from the result (as '&'s have been expanded to
          // &amp; we have to take care of the 4 additional characters per '&')
          result.truncate( result.length() -
                           len - ( localPart.count( QLatin1Char('&') ) * 4 ) );
          x -= len;

          // the local part may contain '&', which has to be escaped again
          const QString escapedAddress = Qt::escape( str );
          result += QLatin1String("<a href=\"mailto:") + escapedAddress +
                    QLatin1String("\">") + escapedAddress + QLatin1String("</a>");
          x += str.length() - 1;
          continue;
        }
      }
      if ( flags & HighlightText ) {
        str = locator.highlightedText();
        if ( !str.isEmpty() ) {
          result += str;
          x += locator.mPos - start;
          continue;
        }
      }
      result += ch;
    }
  }

  if ( flags & ReplaceSmileys ) {
    // character sequences that must not be replaced by an emoticon
    static const char *const excludedSmileys[] = {
      "(c)", "(C)", "&gt;:-(", "&gt;:(", "(B)", "(b)", "(P)", "(p)",
      "(O)", "(o)", "(D)", "(d)", "(E)", "(e)", "(K)", "(k)",
      "(I)", "(i)", "(L)", "(l)", "(8)", "(T)", "(t)", "(G)",
      "(g)", "(F)", "(f)", "(H)",
      "8)", "(N)", "(n)", "(Y)", "(y)", "(U)", "(u)", "(W)", "(w)"
    };

    QStringList exclude;
    for ( unsigned int i = 0; i < sizeof( excludedSmileys ) / sizeof( excludedSmileys[0] ); ++i ) {
      exclude << QLatin1String( excludedSmileys[i] );
    }

    // the theme name is looked up once and then reused for all later calls
    static QString cachedEmoticonsThemeName;
    if ( cachedEmoticonsThemeName.isEmpty() ) {
      cachedEmoticonsThemeName = KEmoticons::currentThemeName();
    }
    result =
      sEmoticons->theme( cachedEmoticonsThemeName ).parseEmoticons(
        result, KEmoticonsTheme::StrictParse | KEmoticonsTheme::SkipHTML, exclude );
  }

  return result;
}

Allen Winter's avatar
Allen Winter committed
433
/**
  Reads the file at @p iconPath and returns its contents as a
  "data:image/png;base64,..." URL.

  Returns an empty string if @p iconPath is empty or the file cannot be
  opened for reading. The file contents are not checked for being a PNG.
*/
QString LinkLocator::pngToDataUrl( const QString &iconPath )
{
  if ( iconPath.isEmpty() ) {
    return QString();
  }

  QFile pngFile( iconPath );
  if ( !pngFile.open( QIODevice::ReadOnly | QIODevice::Unbuffered ) ) {
    return QString();
  }

  const QByteArray ba = pngFile.readAll();
  pngFile.close();

  const QByteArray encoded = ba.toBase64();
  return QString::fromLatin1( "data:image/png;base64,%1" ).arg( QLatin1String( encoded.constData() ) );
}

/**
  Returns the HTML markup for a highlighted phrase (*bold*, _underlined_,
  /italic/ or -struck-) that starts at the current position.

  The formatting symbols are kept in the output. On success the current
  position is moved to the closing formatting symbol. An empty string is
  returned, and the position left unchanged, if no such phrase starts here.
*/
QString LinkLocator::highlightedText()
{
  // nothing can start outside the text
  if ( mPos < 0 || mPos >= mText.length() ) {
    return QString();
  }

  // formatting symbols must be preceded by whitespace (or start the text)
  if ( ( mPos > 0 ) && !mText[mPos - 1].isSpace() ) {
    return QString();
  }

  const QChar ch = mText[mPos];
  if ( ch != QLatin1Char('/') && ch != QLatin1Char('*') &&
       ch != QLatin1Char('_') && ch != QLatin1Char('-') ) {
    return QString();
  }

  // <symbol> word(s), optionally followed by one punctuation mark, <symbol>
  QRegExp re =
    QRegExp( QString::fromLatin1( "\\%1((\\w+)([\\s-']\\w+)*( ?[,.:\\?!;])?)\\%2" ).arg( ch ).arg( ch ) );
  re.setMinimal( true );
  if ( re.indexIn( mText, mPos ) != mPos ) {
    return QString();
  }

  const int length = re.matchedLength();
  // there must be whitespace (or the end of the text) after the closing formatting symbol
  if ( mPos + length < mText.length() && !mText[mPos + length].isSpace() ) {
    return QString();
  }

  mPos += length - 1; // leave the position on the closing symbol
  switch ( ch.toLatin1() ) {
  case '*':
    return QLatin1String("<b>*") + re.cap( 1 ) + QLatin1String("*</b>");
  case '_':
    return QLatin1String("<u>_") + re.cap( 1 ) + QLatin1String("_</u>");
  case '/':
    return QLatin1String("<i>/") + re.cap( 1 ) + QLatin1String("/</i>");
  case '-':
    return QLatin1String("<strike>-") + re.cap( 1 ) + QLatin1String("-</strike>");
  }
  return QString();
}