Commit 6880a0d2 authored by Eric Jiang's avatar Eric Jiang Committed by Nathan Hinton
Browse files

Guess subtitle encoding before importing

Since many subtitle files are not UTF-8, we need to guess the encoding
of the file before reading it. For example, SubRip's default encoding is
Windows-1252 (according to Wikipedia).

This also adds KF5 Codecs as a dependency in order to use KEncodingProber.

Future work could be done to allow the user to select the encoding in the import dialog. Currently there is no way to manually select the encoding if it's not guessed correctly, but this should at least be an improvement over only supporting UTF-8.

BUG: 456871
parent d7ff1457
Pipeline #207239 passed with stage
in 7 minutes and 38 seconds
......@@ -58,7 +58,7 @@ if (ECM_VERSION VERSION_GREATER_EQUAL "5.91.0")
endif()
add_definitions(-DTRANSLATION_DOMAIN=\"kdenlive\")
find_package(KF5 ${KF_DEP_VERSION} REQUIRED COMPONENTS Archive Bookmarks CoreAddons Config ConfigWidgets
find_package(KF5 ${KF_DEP_VERSION} REQUIRED COMPONENTS Archive Bookmarks Codecs CoreAddons Config ConfigWidgets
KIO WidgetsAddons NotifyConfig NewStuff XmlGui Notifications GuiAddons TextWidgets IconThemes Declarative Solid
OPTIONAL_COMPONENTS DocTools FileMetaData Crash Purpose)
......
......@@ -57,7 +57,7 @@ sudo apt install libkf5archive-dev libkf5bookmarks-dev libkf5coreaddons-dev libk
libkf5configwidgets-dev libkf5dbusaddons-dev libkf5kio-dev libkf5widgetsaddons-dev \
libkf5notifyconfig-dev libkf5newstuff-dev libkf5xmlgui-dev libkf5declarative-dev \
libkf5notifications-dev libkf5guiaddons-dev libkf5textwidgets-dev libkf5purpose-dev \
libkf5iconthemes-dev libkf5crash-dev libkf5filemetadata-dev kio \
libkf5iconthemes-dev libkf5crash-dev libkf5filemetadata-dev libkf5codecs-dev kio \
kinit qtdeclarative5-dev libqt5svg5-dev qml-module-qtquick-controls libqt5networkauth5-dev \
qtmultimedia5-dev qtquickcontrols2-5-dev breeze-icon-theme breeze
......
......@@ -18,6 +18,7 @@
#include <mlt++/Mlt.h>
#include <mlt++/MltProperties.h>
#include <KEncodingProber>
#include <KLocalizedString>
#include <KMessageBox>
#include <QApplication>
......@@ -101,7 +102,35 @@ void SubtitleModel::unsetModel()
m_timeline.reset();
}
void SubtitleModel::importSubtitle(const QString &filePath, int offset, bool externalImport, float startFramerate, float targetFramerate)
/** @brief Guess the text encoding of the file at the provided path.
 *
 * The file is read as raw bytes and fed to a KEncodingProber in small chunks
 * until the prober reaches a final verdict, the end of the file is reached,
 * or an upper bound of probed bytes is hit.
 *
 * @param file The path to the text file
 * @return The encoding name reported by KEncodingProber, or "" if the file
 *         could not be opened, yielded no data, or the prober reported that
 *         the encoding is not one it recognizes
 */
QByteArray SubtitleModel::guessFileEncoding(const QString &file)
{
    QFile textFile{file};
    // Open in raw mode on purpose: QIODevice::Text translates end-of-line
    // terminators while reading, which would alter the very bytes the prober
    // has to inspect (e.g. the CR bytes of UTF-16 encoded text).
    if (!textFile.open(QIODevice::ReadOnly)) {
        qWarning() << "Could not open" << file;
        return "";
    }

    // Subtitle files often start with plain ASCII (indices and timestamps), so a
    // single small sample may not contain any byte that distinguishes encodings.
    // Keep feeding while the prober asks for more, but bound the total work.
    constexpr qint64 chunkSize = 1024;
    constexpr qint64 maxProbedBytes = 64 * chunkSize;

    KEncodingProber prober{};
    auto state = KEncodingProber::ProberState::Probing;
    qint64 probedBytes = 0;
    while (state == KEncodingProber::ProberState::Probing && probedBytes < maxProbedBytes) {
        const QByteArray sample = textFile.read(chunkSize);
        if (sample.isEmpty()) {
            // End of file (or read error): nothing more to feed
            break;
        }
        probedBytes += sample.size();
        state = prober.feed(sample);
    }

    if (probedBytes == 0) {
        qWarning() << "Tried to guess the encoding of an empty file";
        return "";
    }

    switch (state) {
    case KEncodingProber::ProberState::FoundIt:
        qDebug() << "Guessed subtitle file encoding to be " << prober.encoding();
        break;
    case KEncodingProber::ProberState::NotMe:
        qWarning() << "Subtitle file encoding not recognized";
        return "";
    case KEncodingProber::ProberState::Probing:
        // Not a definitive answer; return the best guess and log how sure the prober is
        qWarning() << "Subtitle file encoding indeterminate, confidence is" << prober.confidence();
        break;
    }
    return prober.encoding();
}
void SubtitleModel::importSubtitle(const QString &filePath, int offset, bool externalImport, float startFramerate, float targetFramerate, const QByteArray &encoding)
{
QString start, end, comment;
QString timeLine;
......@@ -137,9 +166,13 @@ void SubtitleModel::importSubtitle(const QString &filePath, int offset, bool ext
qDebug() << "srt/vtt/sbv File";
//parsing srt file
QTextStream stream(&srtFile);
#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
stream.setCodec(QTextCodec::codecForName("UTF-8"));
#endif
QTextCodec *inputEncoding = QTextCodec::codecForName(encoding);
if (inputEncoding) {
stream.setCodec(inputEncoding);
} else {
qWarning() << "No QTextCodec named" << encoding;
stream.setCodec("UTF-8");
}
QString line;
QStringList srtTime;
QRegExp rx("([0-9]{1,2}):([0-9]{2})");
......@@ -186,8 +219,8 @@ void SubtitleModel::importSubtitle(const QString &filePath, int offset, bool ext
timeLine.clear();
r = 0;
turn = defaultTurn;
}
}
}
}
srtFile.close();
} else if (filePath.endsWith(QLatin1String(".ass"))) {
qDebug() << "ass File";
......@@ -201,9 +234,7 @@ void SubtitleModel::importSubtitle(const QString &filePath, int offset, bool ext
return;
}
QTextStream stream(&assFile);
#if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
stream.setCodec(QTextCodec::codecForName("UTF-8"));
#endif
stream.setCodec(QTextCodec::codecForName(encoding));
QString line;
qDebug() << " correct ass file " << filePath;
scriptInfoSection.clear();
......
......@@ -89,8 +89,14 @@ public:
bool moveSubtitle(int subId, GenTime newPos, bool updateModel, bool updateView);
void requestSubtitleMove(int clipId, GenTime position);
/** @brief Guess the text encoding of the file at the provided path.
* The guess is made by KEncodingProber from data sampled at the start of the
* file; when the prober is not certain, its best guess is still returned.
* @param file The path to the text file
* @return The name of the text encoding, as guessed by KEncodingProber, or
* "" if the file could not be opened, contained no data, or its encoding
* was not recognized
*/
static QByteArray guessFileEncoding(const QString &file);
/** @brief Function that imports a subtitle file */
void importSubtitle(const QString &filePath, int offset = 0, bool externalImport = false, float startFramerate = 30.00, float targetFramerate = 30.00);
void importSubtitle(const QString &filePath, int offset = 0, bool externalImport = false, float startFramerate = 30.00, float targetFramerate = 30.00, const QByteArray &encoding = "UTF-8");
/** @brief Exports the subtitle model to json */
QString toJson();
......
......@@ -4896,11 +4896,14 @@ void TimelineController::importSubtitle(const QString &path)
if (view.cursor_pos->isChecked()) {
offset = pCore->getTimelinePosition();
}
if (view.transform_framerate_check_box->isChecked()) {
startFramerate = view.caption_original_framerate->value();
targetFramerate = view.caption_target_framerate->value();
}
subtitleModel->importSubtitle(view.subtitle_url->url().toLocalFile(), offset, true, startFramerate, targetFramerate);
if (view.transform_framerate_check_box->isChecked()) {
startFramerate = view.caption_original_framerate->value();
targetFramerate = view.caption_target_framerate->value();
}
const auto localPath = view.subtitle_url->url().toLocalFile();
QByteArray guessedEncoding = SubtitleModel::guessFileEncoding(localPath);
qDebug() << "Guessed subtitle encoding is" << guessedEncoding;
subtitleModel->importSubtitle(localPath, offset, true, startFramerate, targetFramerate, guessedEncoding);
}
emit regainFocus();
}
......
1
00:00:05,600 --> 00:00:10,600
J'hésite à vérifier
2
00:00:20,120 --> 00:00:25,120
Ce test de sous-titres
3
00:00:25,120 --> 00:00:35,000
!! Quand même !!
......@@ -55,7 +55,9 @@ TEST_CASE("Read subtitle file", "[Subtitles]")
SECTION("Load a subtitle file")
{
QString subtitleFile = sourcesPath + "/dataset/01.srt";
subtitleModel->importSubtitle(subtitleFile);
QByteArray guessedEncoding = SubtitleModel::guessFileEncoding(subtitleFile);
CHECK(guessedEncoding == "UTF-8");
subtitleModel->importSubtitle(subtitleFile, 0, false, 30.00, 30.00, guessedEncoding);
// Ensure the 3 dialogues are loaded
REQUIRE(subtitleModel->rowCount() == 3);
QList<SubtitledTime> allSubs = subtitleModel->getAllSubtitles();
......@@ -64,7 +66,7 @@ TEST_CASE("Read subtitle file", "[Subtitles]")
controleTime << GenTime(140, 25) << GenTime(265, 25) << GenTime(503, 25) << GenTime(628, 25) << GenTime(628, 25) << GenTime(875, 25);
QStringList subtitlesText;
QStringList control = {QStringLiteral("J'hésite à vérifier"), QStringLiteral("Ce test de sous-titres"), QStringLiteral("!! Quand même !!")};
for (const auto &s : allSubs) {
for (const auto &s : qAsConst(allSubs)) {
subtitlesText << s.subtitle();
sTime << s.start();
sTime << s.end();
......@@ -74,6 +76,24 @@ TEST_CASE("Read subtitle file", "[Subtitles]")
// Ensure timing is correct
REQUIRE(sTime == controleTime);
}
// Imports a subtitle file using the guessed encoding and checks that the
// accented (non-ASCII) characters of its three dialogues come out intact.
SECTION("Load a non-UTF-8 subtitle")
{
// NOTE(review): going by its name, this fixture is presumably the ISO-8859-1
// encoded counterpart of 01.srt - confirm against the dataset
QString subtitleFile = sourcesPath + "/dataset/01-iso-8859-1.srt";
// The guessed encoding is only logged here, not asserted
QByteArray guessedEncoding = SubtitleModel::guessFileEncoding(subtitleFile);
qDebug() << "Guessed encoding: " << guessedEncoding;
// Arguments after the path: offset 0, not an external import, 30 -> 30 framerate
subtitleModel->importSubtitle(subtitleFile, 0, false, 30.00, 30.00, guessedEncoding);
// Ensure the 3 dialogues are loaded
REQUIRE(subtitleModel->rowCount() == 3);
QList<SubtitledTime> allSubs = subtitleModel->getAllSubtitles();
QStringList subtitlesText;
// Expected dialogue texts, in order
QStringList control = {QStringLiteral("J'hésite à vérifier"), QStringLiteral("Ce test de sous-titres"), QStringLiteral("!! Quand même !!")};
for (const auto &s : qAsConst(allSubs)) {
subtitlesText << s.subtitle();
}
// Ensure that non-ASCII characters are read correctly
CHECK(subtitlesText == control);
}
binModel->clean();
pCore->m_projectManager = nullptr;
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment