NewImageFinder.cpp 31.2 KB
Newer Older
1
/* Copyright (C) 2003-2020 The KPhotoAlbum Development Team
Jesper Pedersen's avatar
Jesper Pedersen committed
2 3 4 5 6 7 8 9 10 11 12 13 14

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
Dirk Mueller's avatar
Dirk Mueller committed
15
   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Jesper Pedersen's avatar
Jesper Pedersen committed
16 17
   Boston, MA 02110-1301, USA.
*/
18
#include "NewImageFinder.h"
19

Jesper Pedersen's avatar
Jesper Pedersen committed
20
#include "FastDir.h"
21
#include "ImageDB.h"
22
#include "ImageScout.h"
23
#include "Logging.h"
24
#include "MD5Map.h"
Henner Zeller's avatar
Henner Zeller committed
25

26
#include <BackgroundJobs/ReadVideoLengthJob.h>
27
#include <BackgroundJobs/SearchForVideosWithoutVideoThumbnailsJob.h>
28 29 30 31 32 33
#include <BackgroundTaskManager/JobManager.h>
#include <Exif/Database.h>
#include <ImageManager/RawImageDecoder.h>
#include <ImageManager/ThumbnailBuilder.h>
#include <ImageManager/ThumbnailCache.h>
#include <MainWindow/FeatureDialog.h>
Johannes Zarl-Zierl's avatar
Johannes Zarl-Zierl committed
34
#include <MainWindow/Logging.h>
35 36
#include <MainWindow/Window.h>
#include <Settings/SettingsData.h>
37
#include <Utilities/FileNameUtil.h>
38
#include <Utilities/FileUtil.h>
39
#include <Utilities/VideoUtil.h>
40

41 42
#include <KLocalizedString>
#include <KMessageBox>
43
#include <QApplication>
44 45
#include <QDataStream>
#include <QElapsedTimer>
46
#include <QEventLoop>
47
#include <QFile>
48
#include <QFileInfo>
49
#include <QImageReader>
Johannes Zarl-Zierl's avatar
Johannes Zarl-Zierl committed
50
#include <QLoggingCategory>
51
#include <QMimeDatabase>
52
#include <QProgressBar>
53 54 55
#include <QProgressDialog>
#include <QStringList>

56 57
using namespace DB;

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
/*****************************************************************
 *
 * NOTES ON PERFORMANCE
 * ===== == ===========
 *
 * - Robert Krawitz <rlk@alum.mit.edu> 2018-05-24
 *
 *
 * GENERAL NOTES ON STORAGE I/O
 * ------- ----- -- ------- ---
 *
 * The two main gates to loading new images are:
 *
 * 1) I/O (how fast can we read images off mass storage)
 *
 *    Different I/O devices have different characteristics in terms of
 *    throughput, media latency, and protocol latency.
 *
 *    - Throughput is the raw speed at which data can be transferred,
 *      limited by the physical and/or electronic characteristics of
 *      the medium and the interface.  Short of reducing the amount of
 *      data that's transferred, or clever games with using the most
 *      efficient part of the medium (the outer tracks only for HDD's,
 *      a practice referred to as "short stroking" because it reduces
 *      the distance the head has to seek, at the cost of wasting a
 *      lot of capacity), there's nothing that can be done about this.
 *
 *    - Media latency is the latency component due to characteristics
 *      of the underlying storage medium.  For spinning disks, this is
 *      a function of rotational latency and seek latency.  In some
 *      cases, particularly with hard disks, it is possible to reduce
 *      media latency by arranging to access the data in a way that
 *      reduces seeking.  See DB/FastDir.cpp for an example of this.
 *
 *      While media latency can sometimes be hidden by overlapping
 *      I/O, it is generally not possible to avoid it.  Sometimes trying too
 *      hard can actually increase media latency if it results in I/O
 *      operations competing against each other requiring additional
 *      seeks.
 *
 *      Overlapping I/O with computation is another matter; that can
 *      easily yield benefit, especially if it eliminates rotational
 *      latency.
 *
 *    - Protocol latency.  This refers to things like SATA overhead,
 *      network overhead (for images stored on a network), and so
 *      forth.  This can encompass multiple things, and often they can
 *      be pipelined by means of multiple queued I/O operations.  For
 *      example, multiple commands can be issued to modern interfaces
 *      (SATA, NVMe) and many network interfaces without waiting for
 *      earlier operations to return.
 *
 *      If protocol latency is high compared with media latency,
 *      having multiple requests outstanding simultaneously can
 *      yield significant benefits.
 *
 *    iostat is a valuable tool for investigating throughput and
 *    looking for possible optimizations.  The IO/sec and data
 *    read/written per second when compared against known media
 *    characteristics (disk and SSD throughput, network bandwidth)
 *    provides valuable information about whether we're getting close
 *    to full performance from the I/O, and user and system CPU time
 *    give us additional clues about whether we're I/O-bound or
 *    CPU-bound.
 *
 *    Historically in the computer field, operations that require
 *    relatively simple processing on large volumes of data are I/O
 *    bound.  But with very fast I/O devices such as NVMe SSDs, some
 *    of which reach 3 GB/sec, that's not always the case.
 *
 * 2) Image (mostly JPEG) loading.
 *
 *    This is a function of image characteristics and image processing
 *    libraries.  Sometimes it's possible to apply parameters to
 *    the underlying image loader to speed it up.  This shows up as user
 *    CPU time.  Usually the only way to improve this performance
 *    characteristic is to use more or faster CPU cores (sometimes GPUs
 *    can assist here) or use better image loading routines (better
 *    libraries).
 *
 *
 * DESCRIPTION OF KPHOTOALBUM IMAGE LOAD PROCESS
 * ----------- -- ----------- ----- ---- -------
 *
 * KPhotoAlbum, when it loads an image, performs three processing steps:
 *
 * 1) Compute the MD5 checksum
 *
Antoni Bella Pérez's avatar
Antoni Bella Pérez committed
146
 * 2) Extract the Exif metadata
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
 *
 * 3) Generate a thumbnail
 *
 * Previous to this round of performance tuning, the first two steps
 * were performed in the first pass, and thumbnails were generated in
 * a separate pass.  Assuming that the set of new images is large enough
 * that they cannot all fit in RAM buffers, this results in the I/O
 * being performed twice.  The rewrite results in I/O being performed once.
 *
 * In addition, I have made many other changes:
 *
 * 1) Prior to the MD5 calculation step, a new thread, called a "scout
 *    thread", reads the files into memory.  While this memory is not
 *    directly used in the later computations, it results in the images
 *    being in RAM when they are later needed, making the I/O very fast
 *    (copying data in memory rather than reading it from storage).
 *
 *    This is a way to overlap I/O with computation.
 *
 * 2) The MD5 checksum uses its own I/O to read the data in in larger
 *    chunks than the Qt MD5 routine does.  The Qt routine reads it in
 *    in 4KiB chunks; my experimentation has found that 256KiB chunks
 *    are more efficient, even with a scout thread (it reduces the
 *    number of system calls).
 *
 * 3) When searching for other images to stack with the image being
 *    loaded, the new image loader no longer attempts to determine
 *    whether other candidate filenames are present, nor does it
 *    compute the MD5 checksum of any such files it does find.  Rather,
 *    it only checks for files that are already in KPhotoAlbum, either
 *    previously or as a result of the current load.  Merely checking
 *    for the presence of another file is not cheap, and it's not
 *    necessary; if an image will belong to a stack, we'll either know
 *    it now or when other images that can be stacked are loaded.
 *
Antoni Bella Pérez's avatar
Antoni Bella Pérez committed
182
 * 4) The Exif metadata extraction is now done only once; previously
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
 *    it was performed several times at different stages of the loading
 *    process.
 *
 * 5) The thumbnail index is now written out incrementally rather than
 *    the entire index (which can be many megabytes in a large image
 *    database) being rewritten frequently.  The index is fully rewritten
 *    prior to exit.
 *
 *
 * BASELINE PERFORMANCE
 * -------- -----------
 *
 * These measurements were all taken on a Lenovo ThinkPad P70 with 32
 * GB of dual-channel DDR4-2400 DRAM, a Xeon E3-1505M CPU (4 cores/8
 * total hyperthreads, 2.8-3.7 GHz Skylake; usually runs around
 * 3.1-3.2 GHz in practice), a Seagate ST2000LM015-2E8174 2TB HDD, and
 * a Crucial MX300 1TB SATA SSD.  Published numbers and measurements I
 * took otherwise indicate that the HDD can handle about 105-110
 * MB/sec with a maximum of 180 IO/sec (in a favorable case).  The SSD
 * is rated to handle 530 MB/sec read, 510 MB/sec write, 92K random
 * reads/sec, and 83K random writes/sec.
 *
 * The image set I used for all measurements, except as noted,
 * consists of 10839 total files of which about 85% are 20 MP JPEG and
 * the remainder (with a few exceptions) are 20 MP RAW files from a
 * Canon EOS 7D mkII camera.  The total dataset is about 92 GB in
 * size.
 *
 * I baselined both drives by reading the same dataset by means of
 *
 * % ls | xargs cat | dd bs=1048576 of=/dev/null
 *
 * The HDD required between 850 and 870 seconds (14'10" to 14'30") to
 * perform this operation, yielding about 105-108 MB/sec.  The SSD
 * achieved about 271 MB/sec, which is well under its rated throughput
 * (hdparm -Tt yields 355 MB/sec, which is likewise nowhere close to
 * its rated throughput).  hdparm -Tt on the HDD yields about 120
 * MB/sec, but throughput to an HDD depends upon which part of the
 * disk is being read.  The outer tracks have a greater angular
 * density to achieve the same linear density (in other words, the
 * circumference of an outer track is longer than that of an inner
 * track, and the data is stored at a constant linear density).  So
 * hdparm isn't very useful on an HDD except as a best case.
 *
 * Note also that hdparm does a single stream read from the device.
 * It does not take advantage of the ability to queue multiple
 * requests.
230
 *
231 232 233 234 235 236 237 238 239 240 241 242 243
 *
 * ANALYSIS OF KPHOTOALBUM LOAD PERFORMANCE
 * -------- -- ----------- ---- -----------
 *
 * I analyzed the following cases, with images stored both on the
 * HDD and the SSD:
 *
 * A) Images loaded (All, JPEG only, RAW only)
 *
 * B) Thumbnail creation (Including, Excluding)
 *
 * C) Scout threads (0, 1, 2, 3)
 *
244 245 246 247 248 249 250 251
 * The JPG image set constitutes 9293 images totaling about 55 GB.  The
 *   JPEG files are mostly 20 MP high quality files, in the range of
 *   6-10 MB.
 * The RAW image set constitutes 1544 images totaling about 37 GB.  The
 *   RAW files are 20 MP files, in the range of 25 MB.
 * The ALL set consists of 10839 or 10840 images totaling about 92 GB
 *   (the above set plus 2 .MOV files and in some cases one additional
 *   JPEG file).
252
 *
253
 * Times are elapsed times; CPU consumption is approximate user+system
254 255 256 257
 * CPU consumption.  Numbers in parentheses are with thumbnail
 * building disabled.  Note that in the cases with no scout threads on
 * the SSD the times were reproducibly shorter with thumbnail building
 * enabled (reasons are not determined at this time).
258
 *
259 260 261 262 263 264 265 266
 * Cases building RAW thumbnails generally consumed somewhat more
 * system CPU (in the range of 10-15%) than JPEG-only cases.  This may
 * be due to custom I/O routines used for generating thumbnails with
 * JPEG files; RAW files used the I/O provided by libkdcraw, which
 * uses smaller I/O operations.
 *
 * Estimating CPU time for mixed workloads proved very problematic,
 * as there were significant changes over time.
267
 *
268 269
 * Elapsed Time
 * ------- ----
270
 *
271
 *                                 SSD                     HDD
272
 *
273 274 275 276
 * JPG - 0 scouts                  4:03 (3:59)
 * JPG - 1 scout                   2:46 (2:44)
 * JPG - 2 scouts                  2:20 (2:07)
 * JPG - 3 scouts                  2:21 (1:58)
277
 *
278
 * ALL - 0 scouts                  6:32 (7:03)            16:01
279 280 281
 * ALL - 1 scout                   4:33 (4:33)            15:01
 * ALL - 2 scouts                  3:37 (3:28)            16:59
 * ALL - 3 scouts                  3:36 (3:15)
282
 *
283 284
 * RAW - 0 scouts                  2:18 (2:46)
 * RAW - 1 scout                   1:46 (1:46)
285 286
 * RAW - 2 scouts                  1:17 (1:17)
 * RAW - 3 scouts                  1:13 (1:13)
287
 *
288 289
 * User+System CPU
 * ----------- ---
290
 *
291
 *                                 SSD                     HDD
292
 *
293 294 295 296
 * JPG - 0 scouts                  40% (12%)
 * JPG - 1 scout                   70% (20%)
 * JPG - 2 scouts                  85% (15%)
 * JPG - 3 scouts                  85% (15%)
297
 *
298 299 300 301
 * RAW - 0 scouts                  15% (10%)
 * RAW - 1 scout                   18% (12%)
 * RAW - 2 scouts                  25% (15%)
 * RAW - 3 scouts                  25% (15%)
302 303 304 305
 *
 * I also used kcachegrind to measure CPU consumption on smaller
 * subsets of images (with and without thumbnail creation).  In terms
 * of user CPU consumption, thumbnail creation constitutes the large
306
 * majority of CPU cycles for processing JPEG files, followed by MD5
Antoni Bella Pérez's avatar
Antoni Bella Pérez committed
307
 * computation, with Exif parsing lagging far behind.  For RAW files,
308 309 310 311
 * MD5 computation consumes more cycles, likely in part due to the
 * larger size of RAW files but possibly also related to the smaller
 * filesize of embedded thumbnails (on the Canon 7D mkII, the embedded
 * thumbnail is full size but low quality).
312
 *
313 314
 * With thumbnail generation:
 * ---- --------- -----------
315
 *
316
 *                                 RAW             JPEG
317
 *
318 319 320
 * Thumbnail generation            44%             82%
 *   libjpeg processing              43%             82%
 * MD5 computation                 51%             13%
Antoni Bella Pérez's avatar
Antoni Bella Pérez committed
321
 * Read Exif                        1%              1.0%
322
 *
323 324
 * Without thumbnail generation:
 * ------- --------- -----------
325
 *
326
 *                                 RAW             JPEG
327
 *
328
 * MD5 computation                 92%             80%
Antoni Bella Pérez's avatar
Antoni Bella Pérez committed
329
 * Read Exif                        4%             10%
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
 *
 *
 * CONCLUSIONS
 * -----------
 *
 * For loading files from hard disk (likely the most common case),
 * there's no reason to consider any loading method other than using a
 * single scout thread and computing thumbnails concurrently.  Even
 * with thumbnail computation, there is very little CPU utilization.
 *
 * Loading from SATA SSD benefits from two scout threads, and possibly
 * more.  For minimal time to regain control, there is some benefit
 * seen from separating thumbnail generation from the rest of the
 * processing stages at the cost of more total elapsed time.  This is
 * more evident with JPEG files than with RAW files in this test case.
 * RAW files typically have smaller thumbnail images which can be
 * extracted and processed more quickly than full-size JPEG files.  On
 * a slower CPU, it may be desirable to return control to the user
 * even if the thumbnails are not built yet.
 *
 * Two other cases would be NVMe (or other very fast) SSDs and network
 * storage.  Since we're seeing evidence of CPU saturation on SATA
 * SSDs, we would likely see this even more strongly with NVMe; with
 * large numbers of images it may be desirable to separate the
 * thumbnail building from the rest of the processing.  It may also be
 * beneficial to use more scout threads.
 *
 * Network storage presents a different problem.  It is likely to have
 * lower throughput -- and certainly much higher latency -- than even
 * HDD, unless the underlying storage medium is SSD and the data is
 * located on a very fast, low latency network.  So there would be no
 * benefit to separating thumbnail processing.  However, due to
 * protocol vs. media latency discussed above, it may well work to use
 * more scout threads.  However, this may saturate the network and the
 * storage, to the detriment of other users, and there's probably no
 * general (or easily discoverable) optimum for this.
 *
 * It's my judgment that most images will be stored on HDDs for at
 * least the next few years, so tuning for that use case is probably
 * the best single choice to be made.
 *
 *****************************************************************/

373 374
namespace
{
375

376
bool canReadImage(const DB::FileName &fileName)
377 378 379 380
{
    bool fastMode = !Settings::SettingsData::instance()->ignoreFileExtension();
    QMimeDatabase::MatchMode mode = fastMode ? QMimeDatabase::MatchExtension : QMimeDatabase::MatchDefault;
    QMimeDatabase db;
381
    QMimeType mimeType = db.mimeTypeForFile(fileName.absolute(), mode);
382

383 384
    return QImageReader::supportedMimeTypes().contains(mimeType.name().toUtf8())
        || ImageManager::ImageDecoder::mightDecode(fileName);
385
}
Johannes Zarl-Zierl's avatar
Johannes Zarl-Zierl committed
386
}
387

388 389 390
bool NewImageFinder::findImages()
{
    // Load the information from the XML file.
Jesper Pedersen's avatar
Jesper Pedersen committed
391
    DB::FileNameSet loadedFiles;
392

Robert Krawitz's avatar
Robert Krawitz committed
393 394 395
    QElapsedTimer timer;

    timer.start();
396
    // TODO: maybe the database interface should allow to query if it
Henner Zeller's avatar
Henner Zeller committed
397 398 399
    // knows about an image ? Here we've to iterate through all of them and it
    // might be more efficient do do this in the database without fetching the
    // whole info.
400
    for (const DB::FileName &fileName : DB::ImageDB::instance()->files()) {
Jesper Pedersen's avatar
ZZZ--  
Jesper Pedersen committed
401
        loadedFiles.insert(fileName);
402 403
    }

404
    m_pendingLoad.clear();
405
    searchForNewFiles(loadedFiles, Settings::SettingsData::instance()->imageDirectory());
Robert Krawitz's avatar
Robert Krawitz committed
406 407
    int filesToLoad = m_pendingLoad.count();
    loadExtraFiles();
408

Johannes Zarl-Zierl's avatar
Johannes Zarl-Zierl committed
409
    qCDebug(TimingLog) << "Loaded " << filesToLoad << " images in " << timer.elapsed() / 1000.0 << " seconds";
410 411

    // Man this is not super optimal, but will be changed onces the image finder moves to become a background task.
412
    if (MainWindow::FeatureDialog::hasVideoThumbnailer()) {
413
        BackgroundTaskManager::JobManager::instance()->addJob(
414
            new BackgroundJobs::SearchForVideosWithoutVideoThumbnailsJob);
415
    }
416

417 418
    // To avoid deciding if the new images are shown in a given thumbnail view or in a given search
    // we rather just go to home.
419
    return (!m_pendingLoad.isEmpty()); // returns if new images was found.
420 421
}

422
/**
 * Recursively scan a directory for image and video files that are not yet
 * known, and append them to m_pendingLoad.
 *
 * All files of a directory are handled before any of its subdirectories is
 * entered, to keep the files within a directory more local.
 *
 * @param loadedFiles the set of files that are already known and must be skipped
 * @param directory   the directory to scan; a trailing slash is stripped
 */
void NewImageFinder::searchForNewFiles(const DB::FileNameSet &loadedFiles, QString directory)
{
    // Keep the UI responsive; this function is called once per directory.
    qApp->processEvents(QEventLoop::AllEvents);
    directory = Utilities::stripEndingForwardSlash(directory);

    qCDebug(DBFileOpsLog) << "searching for new files in" << directory;
    FastDir dir(directory);
    const QStringList dirList = dir.entryList();
    ImageManager::RAWImageDecoder rawDec;

    // The setting is a single comma-separated string of entry names to skip.
    const QStringList excluded = Settings::SettingsData::instance()->excludeDirectories().split(QString::fromLatin1(","));
    const bool skipSymlinks = Settings::SettingsData::instance()->skipSymlinks();

    // Keep files within a directory more local by processing all files within the
    // directory, and then all subdirectories.
    QStringList subdirList;

    for (const QString &entry : dirList) {
        const DB::FileName file = DB::FileName::fromAbsolutePath(directory + QString::fromLatin1("/") + entry);
        if (entry == QString::fromLatin1(".") || entry == QString::fromLatin1("..")
            || excluded.contains(entry) || loadedFiles.contains(file)
            || rawDec.fileCanBeSkipped(loadedFiles, file)
            || entry == QString::fromLatin1("CategoryImages"))
            continue;

        QFileInfo fi(file.absolute());

        if (!fi.isReadable())
            continue;
        if (skipSymlinks && fi.isSymLink())
            continue;

        if (fi.isFile()) {
            // Files for which the database reports isBlocking() are never queued.
            if (!DB::ImageDB::instance()->isBlocking(file)) {
                if (canReadImage(file)) {
                    qCDebug(DBFileOpsLog) << "Found new image:" << file.relative();
                    m_pendingLoad.append(qMakePair(file, DB::Image));
                } else if (Utilities::isVideo(file)) {
                    qCDebug(DBFileOpsLog) << "Found new video:" << file.relative();
                    m_pendingLoad.append(qMakePair(file, DB::Video));
                }
            }
        } else if (fi.isDir()) {
            subdirList.append(file.absolute());
        }
    }

    for (const QString &subdir : subdirList)
        searchForNewFiles(loadedFiles, subdir);
}

Robert Krawitz's avatar
Robert Krawitz committed
476
void NewImageFinder::loadExtraFiles()
477
{
478
    // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
479
    QProgressDialog dialog;
480
    QElapsedTimer timeSinceProgressUpdate;
481 482 483
    dialog.setLabelText(i18n("<p><b>Loading information from new files</b></p>"
                             "<p>Depending on the number of images, this may take some time.<br/>"
                             "However, there is only a delay when new images are found.</p>"));
484
    QProgressBar *progressBar = new QProgressBar;
485
    progressBar->setFormat(QLatin1String("%v/%m"));
486
    dialog.setBar(progressBar);
487 488
    dialog.setMaximum(m_pendingLoad.count());
    dialog.setMinimumDuration(1000);
489
    QAtomicInt loadedCount = 0;
490

491 492
    setupFileVersionDetection();

493
    int count = 0;
494

Robert Krawitz's avatar
Robert Krawitz committed
495
    MD5::resetMD5Cache();
496
    ImageScoutQueue asyncPreloadQueue;
497
    for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it) {
498
        asyncPreloadQueue.enqueue((*it).first);
499 500
    }

501 502
    ImageScout scout(asyncPreloadQueue, loadedCount, Settings::SettingsData::instance()->getPreloadThreadCount());
    if (Settings::SettingsData::instance()->getOverlapLoadMD5())
503
        scout.setPreloadFunc(DB::PreloadMD5Sum);
Robert Krawitz's avatar
Robert Krawitz committed
504
    scout.start();
505 506

    Exif::Database::instance()->startInsertTransaction();
507
    dialog.setValue(count); // ensure to call setProgress(0)
508
    timeSinceProgressUpdate.start();
509 510
    for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it, ++count) {
        qApp->processEvents(QEventLoop::AllEvents);
511

512
        if (dialog.wasCanceled()) {
513 514 515
            m_pendingLoad.clear();
            Exif::Database::instance()->abortInsertTransaction();
            return;
516
        }
Johannes Zarl-Zierl's avatar
Johannes Zarl-Zierl committed
517 518
        // (*it).first: DB::FileName
        // (*it).second: DB::MediaType
519 520 521 522
        loadExtraFile((*it).first, (*it).second);
        loadedCount++; // Atomic
        if (timeSinceProgressUpdate.elapsed() >= 1000) {
            dialog.setValue(count);
523 524
            timeSinceProgressUpdate.restart();
        }
525
    }
526
    dialog.setValue(count);
527 528 529
    // loadExtraFile() has already inserted all images into the
    // database, but without committing the changes
    DB::ImageDB::instance()->commitDelayedImages();
530
    Exif::Database::instance()->commitInsertTransaction();
531

532
    ImageManager::ThumbnailBuilder::instance()->save();
533 534
}

535 536
void NewImageFinder::setupFileVersionDetection()
{
537
    // should be cached because loading once per image is expensive
538 539
    m_modifiedFileCompString = Settings::SettingsData::instance()->modifiedFileComponent();
    m_modifiedFileComponent = QRegExp(m_modifiedFileCompString);
540

541 542
    m_originalFileComponents << Settings::SettingsData::instance()->originalFileComponent();
    m_originalFileComponents = m_originalFileComponents.at(0).split(QString::fromLatin1(";"));
543
}
544

545
/**
 * Load a single new file into the database.
 *
 * Steps, in order:
 *  - compute the MD5 sum; if handleIfImageHasBeenMoved() recognizes the file
 *    as a moved image, nothing else is done;
 *  - if detection of modified files is enabled, try to find the original
 *    version of the file among the images already known, and copy its extra
 *    data to the new image;
 *  - add the new image to the database (delayed commit) and to the MD5 map;
 *  - optionally stack the new file on top of the original;
 *  - mark the image as untagged, build its thumbnail and, for videos,
 *    schedule a ReadVideoLengthJob.
 *
 * @param newFileName the file to load
 * @param type        the media type of the file
 */
void NewImageFinder::loadExtraFile(const DB::FileName &newFileName, DB::MediaType type)
{
    qCDebug(DBFileOpsLog) << "loadExtraFile(" << newFileName.relative() << ")";
    MD5 sum = MD5Sum(newFileName);
    if (handleIfImageHasBeenMoved(newFileName, sum))
        return;

    // check to see if this is a new version of a previous image
    // We'll get the Exif data later, when we get the MD5 checksum.
    ImageInfoPtr info = ImageInfoPtr(new ImageInfo(newFileName, type, DB::FileInformation::Ignore));
    ImageInfoPtr originalInfo;
    DB::FileName originalFileName;

    if (Settings::SettingsData::instance()->detectModifiedFiles()) {
        // Requires at least *something* in the modifiedFileComponent: an empty
        // pattern would match every file name and the replacement below would
        // only produce bogus candidate names.
        if (!m_modifiedFileCompString.isEmpty() && newFileName.relative().contains(m_modifiedFileComponent)) {

            for (QStringList::const_iterator it = m_originalFileComponents.constBegin();
                 it != m_originalFileComponents.constEnd(); ++it) {
                // Candidate name of the original: the modified component
                // replaced by one of the configured original components.
                QString tmp = newFileName.relative();
                tmp.replace(m_modifiedFileComponent, (*it));
                originalFileName = DB::FileName::fromRelativePath(tmp);

                MD5 originalSum;
                if (newFileName == originalFileName)
                    originalSum = sum;
                else if (DB::ImageDB::instance()->md5Map()->containsFile(originalFileName))
                    originalSum = DB::ImageDB::instance()->md5Map()->lookupFile(originalFileName);
                else
                    // Do *not* attempt to compute the checksum here.  It forces a filesystem
                    // lookup on a file that may not exist and substantially degrades
                    // performance by about 25% on an SSD and about 30% on a spinning disk.
                    // If one of these other files exist, it will be found later in
                    // the image search at which point we'll detect the modified file.
                    continue;
                if (DB::ImageDB::instance()->md5Map()->contains(originalSum)) {
                    // we have a previous copy of this file; copy its data
                    // from the original.
                    originalInfo = DB::ImageDB::instance()->info(originalFileName);
                    if (!originalInfo) {
                        qCDebug(DBLog) << "Original info not found by name for " << originalFileName.absolute() << ", trying by MD5 sum.";
                        originalFileName = DB::ImageDB::instance()->md5Map()->lookup(originalSum);

                        if (!originalFileName.isNull()) {
                            qCDebug(DBLog) << "Substitute image " << originalFileName.absolute() << " found.";
                            originalInfo = DB::ImageDB::instance()->info(originalFileName);
                        }

                        if (!originalInfo) {
                            qCWarning(DBLog, "How did that happen? We couldn't find info for the original image %s; can't copy the original data to %s",
                                      qPrintable(originalFileName.absolute()), qPrintable(newFileName.absolute()));
                            continue;
                        }
                    }
                    info->copyExtraData(*originalInfo);

                    /* if requested to move, then delete old data from original */
                    if (Settings::SettingsData::instance()->moveOriginalContents()) {
                        originalInfo->removeExtraData();
                    }

                    break;
                }
            }
        }
    }
    ImageInfoList newImages;
    newImages.append(info);
    DB::ImageDB::instance()->addImages(newImages, false);

    // also inserts image into exif db if present:
    info->setMD5Sum(sum);
    DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());

    if (originalInfo && Settings::SettingsData::instance()->autoStackNewFiles()) {

        // stack the files together
        DB::FileName olderfile = originalFileName;
        DB::FileName newerfile = info->fileName();
        DB::FileNameList tostack;

        // the newest file should go to the top of the stack
        tostack.append(newerfile);

        // Append either the whole existing stack of the original, or just the
        // original itself if it is not stacked yet.
        DB::FileNameList oldStack;
        if ((oldStack = DB::ImageDB::instance()->getStackFor(olderfile)).isEmpty()) {
            tostack.append(olderfile);
        } else {
            for (const DB::FileName &tmp : oldStack) {
                tostack.append(tmp);
            }
        }
        DB::ImageDB::instance()->stack(tostack);
        MainWindow::Window::theMainWindow()->setStackHead(newerfile);

        // ordering: XXX we ideally want to place the new image right
        // after the older one in the list.
    }

    markUnTagged(info);
    ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
    if (info->isVideo() && MainWindow::FeatureDialog::hasVideoThumbnailer()) {
        // needs to be done *after* insertion into database
        BackgroundTaskManager::JobManager::instance()->addJob(
            new BackgroundJobs::ReadVideoLengthJob(info->fileName(), BackgroundTaskManager::BackgroundVideoPreviewRequest));
    }
}

653
bool NewImageFinder::handleIfImageHasBeenMoved(const FileName &newFileName, const MD5 &sum)
654
{
655
    if (DB::ImageDB::instance()->md5Map()->contains(sum)) {
656
        const DB::FileName matchedFileName = DB::ImageDB::instance()->md5Map()->lookup(sum);
657
        QFileInfo fi(matchedFileName.absolute());
658

659
        if (!fi.exists()) {
660
            // The file we had a collapse with didn't exists anymore so it is likely moved to this new name
661 662
            ImageInfoPtr info = DB::ImageDB::instance()->info(matchedFileName);
            if (!info)
663
                qCWarning(DBLog, "How did that happen? We couldn't find info for the images %s", qPrintable(matchedFileName.relative()));
664
            else {
665 666 667 668
                fi = QFileInfo(matchedFileName.relative());
                if (info->label() == fi.completeBaseName()) {
                    fi = QFileInfo(newFileName.absolute());
                    info->setLabel(fi.completeBaseName());
669 670
                }

671
                DB::ImageDB::instance()->renameImage(info, newFileName);
672 673 674 675

                // We need to insert the new name into the MD5 map,
                // as it is a map, the value for the moved file will automatically be deleted.

676
                DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
677

678 679 680
                Exif::Database::instance()->remove(matchedFileName);
                Exif::Database::instance()->add(newFileName);
                ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
681 682 683 684 685 686 687
                return true;
            }
        }
    }
    return false; // The image wasn't just moved
}

688 689 690 691
/**
 * Calculate the MD5 checksum for every file in \a list.
 *
 * For each readable file, the checksum is compared with the one stored in
 * the file's image info. If they differ, the image info is updated and the
 * cached thumbnail for the file is removed. The checksum is then inserted
 * into \a md5Map.
 *
 * A progress dialog is shown during the operation; it is refreshed (and
 * pending events are processed) once every 10 files. Files whose checksum
 * could not be computed are collected and reported in a message box once
 * the loop has finished.
 *
 * \param list the files to process
 * \param md5Map the map that receives the checksum -> file name entries
 * \param wasCanceled optional out parameter; if not null, it is set to true
 *        when the user canceled via the progress dialog and to false when
 *        the whole list was processed
 * \return true if the checksum of at least one image info was changed
 */
bool NewImageFinder::calculateMD5sums(
    const DB::FileNameList &list,
    DB::MD5Map *md5Map,
    bool *wasCanceled)
{
    // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
    QProgressDialog dialog;
    dialog.setLabelText(
        i18np("<p><b>Calculating checksum for %1 file</b></p>", "<p><b>Calculating checksums for %1 files</b></p>", list.size())
        + i18n("<p>By storing a checksum for each image "
               "KPhotoAlbum is capable of finding images "
               "even when you have moved them on the disk.</p>"));
    dialog.setMaximum(list.size());
    dialog.setMinimumDuration(1000);

    int count = 0;
    DB::FileNameList cantRead;
    bool dirty = false;

    for (const FileName &fileName : list) {
        if (count % 10 == 0) {
            dialog.setValue(count); // the first iteration sets the progress to 0
            qApp->processEvents(QEventLoop::AllEvents);

            if (dialog.wasCanceled()) {
                if (wasCanceled)
                    *wasCanceled = true;
                return dirty;
            }
        }
        // Count every visited file, including the ones skipped below. Otherwise
        // a run of unreadable files would stall the progress value and could
        // keep the dialog from being refreshed or canceled.
        ++count;

        MD5 md5 = MD5Sum(fileName);
        if (md5.isNull()) {
            cantRead << fileName;
            continue;
        }

        ImageInfoPtr info = ImageDB::instance()->info(fileName);
        if (!info) {
            // Without an image info there is nothing to update; skip the file
            // instead of dereferencing a null pointer.
            qCWarning(DBLog, "No image info found for %s - skipping checksum update", qPrintable(fileName.relative()));
            continue;
        }

        if (info->MD5Sum() != md5) {
            info->setMD5Sum(md5);
            dirty = true;
            // The checksum changed, so drop the cached thumbnail for this file.
            MainWindow::Window::theMainWindow()->thumbnailCache()->removeThumbnail(fileName);
        }

        md5Map->insert(md5, fileName);
    }
    if (wasCanceled)
        *wasCanceled = false;

    if (!cantRead.empty())
        KMessageBox::informationList(nullptr, i18n("Following files could not be read:"), cantRead.toStringList(DB::RelativeToImageRoot));

    return dirty;
}
744

745
/**
 * Add the configured "untagged" category/tag pair to \a info.
 *
 * Nothing is done unless the image database reports that the untagged
 * category feature is configured.
 */
void DB::NewImageFinder::markUnTagged(ImageInfoPtr info)
{
    if (!DB::ImageDB::instance()->untaggedCategoryFeatureConfigured())
        return;

    const auto settings = Settings::SettingsData::instance();
    const auto category = settings->untaggedCategory();
    const auto tag = settings->untaggedTag();
    info->addCategoryInfo(category, tag);
}
752
// vi:expandtab:tabstop=4 shiftwidth=4: