Commit a4e7b25c authored by Jean-Baptiste Mardelle
Browse files

First draft of speech to text (requires the python vosk and srt modules)

parent ad96891a
Pipeline #50019 passed with stage
in 10 minutes and 27 seconds
# Install the speech recognition helper script (speech.py) into the
# kdenlive/scripts sub-folder of DATA_INSTALL_DIR.
INSTALL(FILES
speech.py
DESTINATION ${DATA_INSTALL_DIR}/kdenlive/scripts)
#!/usr/bin/env python3
#pip3 install vosk
#pip3 install srt
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import subprocess
import srt
import json
import datetime
# NOTE(review): -1 is presumably the value that silences Vosk/Kaldi log output.
SetLogLevel(-1)

# Expected command line:
#   speech.py <models folder> <model name> <input media file> <output .srt file>
# Fail with a readable message instead of an IndexError traceback when the
# script is called with too few arguments.
if len(sys.argv) < 5:
    print("Usage:", sys.argv[0], "MODELS_FOLDER MODEL_NAME INPUT_FILE OUTPUT_SRT")
    sys.exit(1)

# The model name (argv[2]) is resolved relative to the models folder (argv[1]).
os.chdir(sys.argv[1])
if not os.path.exists(sys.argv[2]):
    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as ", sys.argv[2]," in the current folder.")
    sys.exit(1)

sample_rate=16000
model = Model(sys.argv[2])
rec = KaldiRecognizer(model, sample_rate)

# Let ffmpeg decode the input (argv[3]) to raw mono signed 16-bit PCM at
# sample_rate Hz on its stdout; transcribe() reads that stream.
process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i',
                            sys.argv[3],
                            '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'],
                            stdout=subprocess.PIPE)

# Maximum number of recognised words grouped into one subtitle entry.
WORDS_PER_LINE = 7
def transcribe():
    """Run the recognizer over ffmpeg's PCM stream and build subtitle entries.

    Reads the module-level ``process`` stdout in 4000-byte chunks, feeds each
    chunk to the module-level recognizer ``rec`` and collects every JSON
    result it reports, plus the final one. Each result's word list is then
    split into groups of at most ``WORDS_PER_LINE`` words; one subtitle is
    created per group, spanning from the first word's start to the last
    word's end.

    Returns:
        list of srt.Subtitle, in recognition order.
    """
    results = []
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())
    # stdout hit EOF, so ffmpeg has finished writing: reap the child process
    # instead of leaving it un-waited.
    process.wait()

    subs = []
    for res in results:
        # Results without a (non-empty) 'result' word list carry no timing
        # information and produce no subtitle.
        words = json.loads(res).get('result')
        if not words:
            continue
        for j in range(0, len(words), WORDS_PER_LINE):
            line = words[j : j + WORDS_PER_LINE]
            # SRT entries are numbered from 1.
            s = srt.Subtitle(index=len(subs) + 1,
                             content=" ".join([l['word'] for l in line]),
                             start=datetime.timedelta(seconds=line[0]['start']),
                             end=datetime.timedelta(seconds=line[-1]['end']))
            subs.append(s)
    return subs
# Serialise the recognised subtitles to SRT text, echo it on stdout and save
# it to the output file given as argv[4].
subtitle = srt.compose(transcribe())
print (subtitle)
# Write explicitly as UTF-8: the default encoding of open() depends on the
# platform locale and can fail or mangle non-ASCII transcripts.
# `subtitle` is a single string, so write() is used rather than writelines(),
# and the `with` block already closes the file.
with open(sys.argv[4], 'w', encoding='utf-8') as f:
    f.write(subtitle)
......@@ -6,6 +6,7 @@ set(kdenlive_SRCS
dialogs/markerdialog.cpp
dialogs/profilesdialog.cpp
dialogs/renderwidget.cpp
dialogs/speechdialog.cpp
dialogs/subtitleedit.cpp
dialogs/titletemplatedialog.cpp
dialogs/wizard.cpp
......
/***************************************************************************
* Copyright (C) 2008 by Jean-Baptiste Mardelle (jb@kdenlive.org) *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
***************************************************************************/
#include "speechdialog.h"
#include "core.h"
#include "kdenlivesettings.h"
#include "monitor/monitor.h"
#include "bin/model/subtitlemodel.hpp"
#include "kdenlive_debug.h"
#include "mlt++/MltProfile.h"
#include "mlt++/MltTractor.h"
#include "mlt++/MltConsumer.h"
#include <QFontDatabase>
#include <QDir>
#include <QProcess>
#include <KLocalizedString>
#include <KUrlRequesterDialog>
#include <KArchive>
#include <KZip>
#include <KTar>
#include <KIO/FileCopyJob>
#include <KIO/OpenUrlJob>
#include <KIO/JobUiDelegate>
#include <KArchiveDirectory>
#include <KMessageWidget>
/**
 * @brief Build the speech recognition dialog.
 * @param timeline timeline model handed to slotProcessSpeech() when "Process" is clicked
 * @param zone zone handed to slotProcessSpeech() when "Process" is clicked
 * @param activeTrackOnly currently unused
 * @param selectionOnly currently unused
 * @param parent parent widget of the dialog
 */
SpeechDialog::SpeechDialog(const std::shared_ptr<TimelineItemModel> &timeline, QPoint zone, bool activeTrackOnly, bool selectionOnly, QWidget *parent)
    : QDialog(parent)
{
    // Accepted for the interface but not used by this first draft
    Q_UNUSED(activeTrackOnly)
    Q_UNUSED(selectionOnly)
    setFont(QFontDatabase::systemFont(QFontDatabase::SmallestReadableFont));
    setupUi(this);
    buttonBox->button(QDialogButtonBox::Apply)->setText(i18n("Process"));
    dict_info->hide();
    speech_info->hide();
    // Fill the dictionary list / language box and update the "Process" button state
    slotParseDictionaries();
    button_add->setIcon(QIcon::fromTheme(QStringLiteral("list-add")));
    button_delete->setIcon(QIcon::fromTheme(QStringLiteral("edit-delete")));
    connect(button_add, &QToolButton::clicked, this, &SpeechDialog::getDictionary);
    // The lambdas are stored by the connection: give them `this` as context object and
    // capture explicitly (timeline and zone by copy) instead of relying on [&].
    connect(buttonBox->button(QDialogButtonBox::Apply), &QPushButton::clicked, this, [this, timeline, zone]() {
        slotProcessSpeech(timeline, zone);
    });
    // Open the links shown in the dictionary info message (download page)
    connect(dict_info, &KMessageWidget::linkActivated, this, [this](const QString &contents) {
        qDebug()<<"=== LINK CLICKED: "<<contents;
        auto *job = new KIO::OpenUrlJob(QUrl(contents));
        job->setUiDelegate(new KIO::JobUiDelegate(KJobUiDelegate::AutoHandlingEnabled, this));
        // methods like setRunExecutables, setSuggestedFilename, setEnableExternalBrowser, setFollowRedirections
        // exist in both classes
        job->start();
    });
    //TODO: check for the python scripts vosk and srt
    connect(this, &SpeechDialog::parseDictionaries, this, &SpeechDialog::slotParseDictionaries);
}
void SpeechDialog::slotProcessSpeech(const std::shared_ptr<TimelineItemModel> &timeline, QPoint zone)
{
QString pyExec = QStandardPaths::findExecutable(QStringLiteral("python3"));
if (pyExec.isEmpty()) {
//TODO
}
speech_info->setMessageType(KMessageWidget::Information);
speech_info->setText(i18n("Starting audio export"));
speech_info->show();
qApp->processEvents();
QString sceneList;
QString speech;
QString audio;
QTemporaryFile tmpPlaylist(QDir::tempPath() + QStringLiteral("/XXXXXX.mlt"));
QTemporaryFile tmpSpeech(QDir::tempPath() + QStringLiteral("/XXXXXX.srt"));
QTemporaryFile tmpAudio(QDir::tempPath() + QStringLiteral("/XXXXXX.wav"));
if (tmpPlaylist.open()) {
sceneList = tmpPlaylist.fileName();
}
tmpPlaylist.close();
if (tmpSpeech.open()) {
speech = tmpSpeech.fileName();
}
tmpSpeech.close();
if (tmpAudio.open()) {
audio = tmpAudio.fileName();
}
tmpAudio.close();
pCore->getMonitor(Kdenlive::ProjectMonitor)->sceneList(QDir::temp().absolutePath(), sceneList);
Mlt::Producer producer(*timeline->tractor()->profile(), "xml", sceneList.toUtf8().constData());
qDebug()<<"=== STARTING RENDER B";
Mlt::Consumer xmlConsumer(*timeline->tractor()->profile(), "avformat", audio.toUtf8().constData());
qApp->processEvents();
if (!xmlConsumer.is_valid() || !producer.is_valid()) {
qDebug()<<"=== STARTING CONSUMER ERROR";
if (!producer.is_valid()) {
qDebug()<<"=== PRODUCER INVALID";
}
speech_info->setMessageType(KMessageWidget::Warning);
speech_info->setText(i18n("Audio export failed"));
qApp->processEvents();
return;
}
xmlConsumer.set("terminate_on_pause", 1);
xmlConsumer.set("properties", "WAV");
producer.set_in_and_out(zone.x(), zone.y());
xmlConsumer.connect(producer);
qDebug()<<"=== STARTING RENDER C, IN:"<<zone.x()<<" - "<<zone.y();
qApp->processEvents();
xmlConsumer.run();
qApp->processEvents();
qDebug()<<"=== STARTING RENDER D";
QString language = language_box->currentText();
QString speechScript = QStandardPaths::locate(QStandardPaths::AppDataLocation, QStringLiteral("scripts/speech.py"));
qDebug()<<"=== RUNNING SPEECH ANALYSIS: "<<speechScript;
QProcess speechJob;
speech_info->setMessageType(KMessageWidget::Information);
speech_info->setText(i18n("Starting speech recognition"));
qApp->processEvents();
QString modelDirectory = QStandardPaths::locate(QStandardPaths::AppDataLocation, QStringLiteral("speechmodels"), QStandardPaths::LocateDirectory);
qDebug()<<"==== ANALYSIS SPEECH: "<<modelDirectory<<" - "<<language<<" - "<<audio<<" - "<<speech;
speechJob.start(pyExec, {speechScript, modelDirectory, language, audio, speech});
speechJob.waitForFinished();
if (QFile::exists(speech)) {
timeline->getSubtitleModel()->importSubtitle(speech, zone.x(), true);
speech_info->setMessageType(KMessageWidget::Positive);
speech_info->setText(i18n("Subtitles imported"));
} else {
speech_info->setMessageType(KMessageWidget::Warning);
speech_info->setText(i18n("Speech recognition failed"));
}
}
void SpeechDialog::getDictionary()
{
QUrl url = KUrlRequesterDialog::getUrl(QUrl(), this, i18n("Enter url for the new dictionary"));
if (url.isEmpty()) {
return;
}
QString tmpFile;
if (!url.isLocalFile()) {
KIO::FileCopyJob *copyjob = KIO::file_copy(url, QUrl::fromLocalFile(QDir::temp().absoluteFilePath(url.fileName())));
dict_info->setMessageType(KMessageWidget::Information);
dict_info->setText(i18n("Downloading model..."));
dict_info->animatedShow();
connect(copyjob, &KIO::FileCopyJob::result, this, &SpeechDialog::processArchive);
/*if (copyjob->exec()) {
qDebug()<<"=== GOT REST: "<<copyjob->destUrl();
//
} else {
qDebug()<<"=== CANNOT DOWNLOAD";
}*/
} else {
//KMessageBox::error(this, KIO::NetAccess::lastErrorString());
//KArchive ar(tmpFile);
}
}
/**
 * @brief Extract a downloaded dictionary archive into the "speechmodels" folder.
 *
 * Called with the finished copy job started by getDictionary(). On success the
 * downloaded archive is removed and the dictionary list is refreshed; every failure
 * is reported in the dictionary info message.
 * @param job the finished job, expected to be a KIO::FileCopyJob
 */
void SpeechDialog::processArchive(KJob* job)
{
    qDebug()<<"=== DOWNLOAD FINISHED!!";
    // NOTE(review): 112 looks like KIO::ERR_FILE_ALREADY_EXIST (archive already in the
    // temporary folder), in which case the existing file is used - confirm.
    const int acceptedJobError = 112;
    if (job->error() != 0 && job->error() != acceptedJobError) {
        qDebug()<<"=== GOT JOB ERROR: "<<job->error();
        dict_info->setMessageType(KMessageWidget::Warning);
        dict_info->setText(i18n("Download error %1", job->errorString()));
        return;
    }
    qDebug()<<"=== NO ERROR ON DWNLD!!";
    // qobject_cast returns nullptr for a job of another type, unlike static_cast
    auto *jb = qobject_cast<KIO::FileCopyJob *>(job);
    if (jb == nullptr) {
        qDebug()<<"=== JOB NOT FOUND!!";
        dict_info->setMessageType(KMessageWidget::Warning);
        dict_info->setText(i18n("Download error"));
        return;
    }
    qDebug()<<"=== JOB FOUND!!";
    QMimeDatabase db;
    // toLocalFile() gives a path usable by the file APIs on every platform
    const QString archiveFile = jb->destUrl().toLocalFile();
    const QMimeType type = db.mimeTypeForFile(archiveFile);
    // Zip archives get a KZip, anything else is handed to KTar
    std::unique_ptr<KArchive> archive;
    if (type.inherits(QStringLiteral("application/zip"))) {
        archive.reset(new KZip(archiveFile));
    } else {
        archive.reset(new KTar(archiveFile));
    }
    QDir dir(QStandardPaths::writableLocation(QStandardPaths::AppDataLocation));
    dir.mkdir(QStringLiteral("speechmodels"));
    if (!dir.cd(QStringLiteral("speechmodels"))) {
        qDebug()<<"=== /// CANNOT ACCESS SPEECH DICTIONARIES FOLDER";
        dict_info->setMessageType(KMessageWidget::Warning);
        dict_info->setText(i18n("Cannot access dictionary folder"));
        return;
    }
    if (!archive->open(QIODevice::ReadOnly)) {
        qDebug()<<"=== CANNOT OPEN ARCHIVE!!";
        // Replace the "Downloading model..." message that is still displayed
        dict_info->setMessageType(KMessageWidget::Warning);
        dict_info->setText(i18n("Cannot open the downloaded archive"));
        return;
    }
    dict_info->setText(i18n("Extracting archive..."));
    const KArchiveDirectory *archiveDir = archive->directory();
    if (!archiveDir->copyTo(dir.absolutePath())) {
        qDebug()<<"=== Error extracting archive!!";
        dict_info->setMessageType(KMessageWidget::Warning);
        dict_info->setText(i18n("Cannot extract the downloaded archive"));
        return;
    }
    // Release the file before deleting it
    archive->close();
    QFile::remove(archiveFile);
    emit parseDictionaries();
    dict_info->setMessageType(KMessageWidget::Positive);
    dict_info->setText(i18n("New dictionary installed"));
    // slotParseDictionaries() hides the message once a dictionary is available,
    // show it again so that the confirmation is visible.
    dict_info->animatedShow();
}
void SpeechDialog::slotParseDictionaries()
{
listWidget->clear();
language_box->clear();
buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
QString modelDirectory = QStandardPaths::writableLocation(QStandardPaths::AppDataLocation);
QDir dir(modelDirectory);
if (!dir.cd(QStringLiteral("speechmodels"))) {
qDebug()<<"=== /// CANNOT ACCESS SPEECH DICTIONARIES FOLDER";
tabWidget->setCurrentIndex(1);
dict_info->setMessageType(KMessageWidget::Information);
dict_info->setText(i18n("Download dictionaries from: <a href=\"https://alphacephei.com/vosk/models\">https://alphacephei.com/vosk/models</a>"));
dict_info->animatedShow();
return;
}
QStringList dicts = dir.entryList(QDir::Dirs | QDir::NoDotAndDotDot);
listWidget->addItems(dicts);
language_box->addItems(dicts);
if (!dicts.isEmpty()) {
buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
dict_info->animatedHide();
} else {
tabWidget->setCurrentIndex(1);
dict_info->setMessageType(KMessageWidget::Information);
dict_info->setText(i18n("Download dictionaries from: <a href=\"https://alphacephei.com/vosk/models\">https://alphacephei.com/vosk/models</a>"));
dict_info->animatedShow();
}
}
/***************************************************************************
* Copyright (C) 2008 by Jean-Baptiste Mardelle (jb@kdenlive.org) *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *
***************************************************************************/
#ifndef SPEECHDIALOG_H
#define SPEECHDIALOG_H
#include "ui_speechdialog_ui.h"
#include "timeline2/model/timelineitemmodel.hpp"
#include "definitions.h"
class KJob;
/**
* @class SpeechDialog
* @brief A dialog to run speech recognition on a timeline zone and to manage the speech dictionaries.
* @author Jean-Baptiste Mardelle
*/
class SpeechDialog : public QDialog, public Ui::SpeechDialog_UI
{
Q_OBJECT
public:
/** @brief Build the dialog.
* @param timeline timeline model passed to slotProcessSpeech() when processing is requested
* @param zone zone passed to slotProcessSpeech() when processing is requested
* @param activeTrackOnly not used by the current implementation
* @param selectionOnly not used by the current implementation
* @param parent parent widget
*/
explicit SpeechDialog(const std::shared_ptr<TimelineItemModel> &timeline, QPoint zone, bool activeTrackOnly = false, bool selectionOnly = false, QWidget *parent = nullptr);
private slots:
/** @brief Ask for the url of a new dictionary and start downloading it to the temporary folder */
void getDictionary();
/** @brief Extract the archive fetched by the finished copy job into the "speechmodels" folder
* @param job the finished job started by getDictionary()
*/
void processArchive(KJob* job);
/** @brief Rebuild the dictionary list and language box from the "speechmodels" folder and update the Process button state */
void slotParseDictionaries();
/** @brief Export the zone's audio, run the speech script on it and import the resulting subtitle file
* @param timeline timeline model providing the profile and the subtitle model
* @param zone zone to process, x() used as in point and y() as out point
*/
void slotProcessSpeech(const std::shared_ptr<TimelineItemModel> &timeline, QPoint zone);
signals:
/** @brief Emitted after a dictionary was installed; connected to slotParseDictionaries() */
void parseDictionaries();
};
#endif
......@@ -35,6 +35,8 @@
<Separator />
<Action name="import_subtitle" />
<Action name="export_subtitle" />
<Separator />
<Action name="audio_recognition" />
</Menu>
<Separator />
<Action name="bin_view_mode" />
......
......@@ -1741,6 +1741,7 @@ void MainWindow::setupActions()
addAction(QStringLiteral("import_subtitle"), i18n("Import Subtitle File"), this, SLOT(slotImportSubtitle()), QIcon::fromTheme(QStringLiteral("document-import")));
addAction(QStringLiteral("export_subtitle"), i18n("Export Subtitle File"), this, SLOT(slotExportSubtitle()), QIcon::fromTheme(QStringLiteral("document-export")));
addAction(QStringLiteral("delete_subtitle_clip"), i18n("Delete Subtitle"), this, SLOT(slotDeleteItem()), QIcon::fromTheme(QStringLiteral("edit-delete")));
addAction(QStringLiteral("audio_recognition"), i18n("Speech Recognition"), this, SLOT(slotSpeechRecognition()), QIcon::fromTheme(QStringLiteral("autocorrection")));
m_saveAction = KStandardAction::save(pCore->projectManager(), SLOT(saveFile()), actionCollection());
m_saveAction->setIcon(QIcon::fromTheme(QStringLiteral("document-save")));
......@@ -4321,6 +4322,14 @@ void MainWindow::slotExportSubtitle()
getCurrentTimeline()->controller()->exportSubtitle();
}
/** Open the speech recognition dialog for the current timeline. */
void MainWindow::slotSpeechRecognition()
{
    // NOTE(review): slotEditSubtitle() presumably creates the subtitle model that
    // the recognised text is imported into - confirm.
    const bool subtitleModelMissing = (pCore->getSubtitleModel() == nullptr);
    if (subtitleModelMissing) {
        slotEditSubtitle();
    }
    auto controller = getCurrentTimeline()->controller();
    controller->subtitleSpeechRecognition();
}
#ifdef DEBUG_MAINW
#undef DEBUG_MAINW
#endif
......@@ -533,6 +533,8 @@ private slots:
void slotImportSubtitle();
/** @brief Export a subtitle file */
void slotExportSubtitle();
/** @brief Start a speech recognition on timeline zone */
void slotSpeechRecognition();
signals:
Q_SCRIPTABLE void abortRenderJob(const QString &url);
......
/*
Copyright (C) 2017 Jean-Baptiste Mardelle <jb@kdenlive.org>
This file is part of Kdenlive. See www.kdenlive.org.
......
......@@ -821,6 +821,36 @@ Rectangle {
width: childrenRect.width
x: Math.max(2 * root.collapsedHeight + 2, parent.width - width - 4)
spacing: 0
ToolButton {
    // Triggers the 'audio_recognition' action (speech recognition)
    id: analyseButton
    focusPolicy: Qt.NoFocus
    contentItem: Item {
        Image {
            source: "image://icon/autocorrection"
            anchors.centerIn: parent
            width: root.collapsedHeight - 4
            height: root.collapsedHeight - 4
            cache: root.paletteUnchanged
        }
    }
    width: root.collapsedHeight
    height: root.collapsedHeight
    onClicked: timeline.triggerAction('audio_recognition')
    ToolTip {
        // Follow this button's own hover state; it was bound to muteButton, so the
        // tooltip appeared while hovering the mute button instead.
        visible: analyseButton.hovered
        font: miniFont
        delay: 1500
        timeout: 5000
        background: Rectangle {
            color: activePalette.alternateBase
            border.color: activePalette.light
        }
        contentItem: Label {
            color: activePalette.text
            text: i18n("Speech recognition")
        }
    }
}
ToolButton {
id: muteButton
focusPolicy: Qt.NoFocus
......
......@@ -32,6 +32,7 @@
#include "core.h"
#include "dialogs/spacerdialog.h"
#include "dialogs/speeddialog.h"
#include "dialogs/speechdialog.h"
#include "doc/kdenlivedoc.h"
#include "effects/effectsrepository.hpp"
#include "effects/effectstack/model/effectstackmodel.hpp"
......@@ -4121,6 +4122,12 @@ void TimelineController::exportSubtitle()
}
}
void TimelineController::subtitleSpeechRecognition()
{
SpeechDialog d(m_model, m_zone, false, false, qApp->activeWindow());
d.exec();
}
void TimelineController::deleteSubtitle(int startframe, int endframe, QString text)
{
auto subtitleModel = pCore->getSubtitleModel();
......
......@@ -610,6 +610,8 @@ public:
void importSubtitle(const QString path = QString());
/** @brief Export a subtitle file*/
void exportSubtitle();
/** @brief Launch speech recognition on timeline zone*/
void subtitleSpeechRecognition();
public slots:
void resetView();
......
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>SpeechDialog_UI</class>
<widget class="QDialog" name="SpeechDialog_UI">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>338</width>
<height>336</height>
</rect>
</property>
<property name="windowTitle">
<string>Dialog</string>
</property>
<layout class="QGridLayout" name="gridLayout_2">
<item row="0" column="0">
<widget class="QTabWidget" name="tabWidget">
<property name="currentIndex">
<number>0</number>
</property>
<widget class="QWidget" name="tab">
<attribute name="title">
<string>Speech recognition</string>
</attribute>
<layout class="QGridLayout" name="gridLayout">
<item row="2" column="0" colspan="2">
<widget class="QRadioButton" name="timeline_track">
<property name="text">
<string>Timeline zone (selected track)</string>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>Language</string>
</property>
</widget>
</item>
<item row="1" column="0" colspan="2">
<widget class="QRadioButton" name="timeline_zone">
<property name="text">
<string>Timeline zone (all tracks)</string>
</property>
<property name="checked">
<bool>true</bool>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<widget class="QRadioButton" name="timeline_clips">
<property name="text">
<string>Selected clips</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QComboBox" name="language_box"/>
</item>
<item row="4" column="0">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
<item row="5" column="0" colspan="2">
<widget class="KMessageWidget" name="speech_info"/>
</item>
</layout>
</widget>
<widget class="QWidget" name="tab_2">
<attribute name="title">
<string>Dictionaries</string>
</attribute>
<layout class="QGridLayout" name="gridLayout_3">
<item row="2" column="0">
<widget class="QToolButton" name="button_add">
<property name="text">
<string>...</string>
</property>
</widget>
</item>
<item row="2" column="2">
<spacer name="horizontalSpacer">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>209</width>
<