Commit 3d10fed3 authored by Harald Sitter's avatar Harald Sitter 🏳️‍🌈
Browse files

report smartctl failure codes to the user

As it turns out, there are a number of issues that do not result in an
actually bad SMART status and would not get reported to the user. To
keep the code lean we'll simply look at the exit code bits of smartctl,
as they more or less cover all failure scenarios anyway. We'll then
translate those to pretty strings and expose them on the Device objects
for consumption in the QML KCM as "instabilities". They are set visually
apart from an actual bad status through a different icon and description
because it's hard to say if an instability is in fact indicative of
imminent hard failure or merely a hiccup (e.g. power loss during a disk
operation).

BUG: 429804
FIXED-IN: 5.22
parent dc6903b9
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#include <QDebug>
#include <QDir>
......@@ -24,6 +24,7 @@ private Q_SLOTS:
SMARTData data(doc);
QCOMPARE(data.m_device, "/dev/testfoobarpass");
QCOMPARE(data.m_status.m_passed, true);
QVERIFY(!data.m_smartctl.failure());
}
void testFail()
......@@ -35,6 +36,7 @@ private Q_SLOTS:
SMARTData data(doc);
QCOMPARE(data.m_device, "/dev/testfoobarfail");
QCOMPARE(data.m_status.m_passed, false);
QVERIFY(!data.m_smartctl.failure());
}
void testBroken()
......@@ -46,6 +48,10 @@ private Q_SLOTS:
SMARTData data(doc);
QCOMPARE(data.m_device, "/dev/sdc");
QCOMPARE(data.m_status.m_passed, false);
QCOMPARE(data.m_smartctl.failure(),
SMART::Failures({SMART::Failure::Disk, SMART::Failure::Prefail, SMART::Failure::ErrorsRecorded, SMART::Failure::SelfTestErrors}));
QVERIFY(data.m_smartctl.failure());
QVERIFY(!!data.m_smartctl.failure());
}
void testTimeout()
......@@ -58,6 +64,21 @@ private Q_SLOTS:
SMARTData data(doc);
QCOMPARE(data.m_device, "/dev/nvme0n1");
QCOMPARE(data.m_status.m_passed, true);
QCOMPARE(data.m_smartctl.failure(), SMART::Failures({SMART::Failure::InternalCommand}));
QVERIFY(data.m_smartctl.failure());
QVERIFY(!!data.m_smartctl.failure());
}
void testFailingSectorsButPassingStatus()
{
// SMART status is a pass but there are problems.
QFile file(QFINDTESTDATA("fixtures/failing-sectors-passing-status.json"));
QVERIFY(file.open(QFile::ReadOnly));
auto doc = QJsonDocument::fromJson(file.readAll());
SMARTData data(doc);
QCOMPARE(data.m_device, "/dev/sdb");
QCOMPARE(data.m_status.m_passed, true);
QCOMPARE(data.m_smartctl.failure(), SMART::Failures({SMART::Failure::ErrorsRecorded | SMART::Failure::SelfTestErrors}));
}
};
......
......@@ -12,6 +12,8 @@ set(kded_SRCS
device.cpp
devicenotifier.cpp
soliddevicenotifier.cpp
instabilities.cpp
smartfailure.cpp
)
if(WITH_SIMULATION)
......
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#include "device.h"
......@@ -75,3 +75,17 @@ void Device::setIgnore(bool ignored)
m_ignored = ignored;
emit ignoreChanged();
}
// Returns the list of instability descriptions currently stored on this device.
QStringList Device::instabilities() const
{
return m_instabilities;
}
// Stores a new list of instability descriptions.
// instabilitiesChanged() is emitted only when the list actually differs from the stored one.
void Device::setInstabilities(const QStringList &instabilities)
{
    const bool differs = !(m_instabilities == instabilities);
    if (differs) {
        m_instabilities = instabilities;
        Q_EMIT instabilitiesChanged();
    }
}
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#pragma once
......@@ -17,8 +17,16 @@ class Device : public QObject
Q_PROPERTY(QString udi READ udi CONSTANT)
Q_PROPERTY(QString product READ product CONSTANT)
Q_PROPERTY(QString path READ path CONSTANT)
/**
* A list of hints at problems that aren't failures.
* This may be non-empty even when failure is true, failure does outrank this information though!
* An instability is for example a pre-fail attribute or a self-test failure.
* They may point at (imminent) problems but may just as well be nothing. Think of them as soft failures.
*/
Q_PROPERTY(QStringList instabilities READ instabilities WRITE setInstabilities NOTIFY instabilitiesChanged)
// We dbus-expose objects without adaptor so the property API reflects the dbus API
// and so be mindful of what is available as writable property.
// 'failed' is writable for ease of testing and nothing more.
Q_PROPERTY(bool failed READ failed WRITE setFailed NOTIFY failedChanged)
Q_PROPERTY(bool ignore READ ignore WRITE setIgnore NOTIFY ignoreChanged)
public:
......@@ -48,7 +56,11 @@ public:
return m_path;
}
QStringList instabilities() const;
void setInstabilities(const QStringList &instabilities);
signals:
void instabilitiesChanged();
void failedChanged();
void ignoreChanged();
......@@ -56,6 +68,7 @@ private:
const QString m_udi;
const QString m_product;
const QString m_path;
QStringList m_instabilities;
bool m_failed = false;
bool m_ignored = false;
};
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2021 Harald Sitter <sitter@kde.org>
#include "instabilities.h"
#include <KLocalizedString>
#include <QMetaEnum>
#include "smartdata.h"
/**
 * Translates a single smartctl failure flag into a localized string for display.
 *
 * @param failure exactly one SMART::Failure enumerator (not a combination of flags)
 * @return the translated description, or an empty string when the flag is not
 *         meant to be presented as an instability
 */
static QString smartCtlDataFailureToInstabilityString(SMART::Failure failure)
{
    switch (failure) {
    case SMART::Failure::None:
    case SMART::Failure::CmdLineParse:
    case SMART::Failure::DeviceOpen:
    case SMART::Failure::InternalCommand:
        // These are kind of internal failures that the user cannot really
        // do anything about and they aren't necessarily indicative of
        // anything wrong with the drive.
        return {};
    case SMART::Failure::Disk:
        // This is reflected as failure.
        return {};
    case SMART::Failure::Prefail:
        return i18nc("@label", "Prefail attributes <= threshold.");
    case SMART::Failure::PastPrefail:
        return i18nc(
            "@label",
            "SMART status check returned 'DISK OK' but we found that some (usage or prefail) attributes have been <= threshold at some time in the past.");
    case SMART::Failure::ErrorsRecorded:
        return i18nc("@label", "The device error log contains records of errors.");
    case SMART::Failure::SelfTestErrors:
        // The message is split across two adjacent literals, which the compiler
        // joins into one. Binary '+' on two string literals does not compile.
        return i18nc("@label",
                     "The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test "
                     "are ignored.");
    }
    // Every enumerator is handled above; reaching this point means an invalid value was passed in.
    Q_UNREACHABLE();
    return {};
}
// Builds the list of user-facing instability strings for the smartctl failure
// flags found in @p data. Flags that map to an empty string are left out.
QStringList Instabilities::from(const SMARTData &data)
{
    const SMARTCtlData ctlData = data.m_smartctl;
    const auto metaEnum = QMetaEnum::fromType<SMART::Failure>();
    const auto entries = metaEnum.keyCount();

    QStringList descriptions;
    // Walk every enumerator of SMART::Failure and pick up the ones set in the flags.
    for (auto index = 0; index < entries; ++index) {
        const auto candidate = static_cast<SMART::Failure>(metaEnum.value(index));
        if (ctlData.failure().testFlag(candidate)) {
            const QString text = smartCtlDataFailureToInstabilityString(candidate);
            if (!text.isEmpty()) {
                descriptions << text;
            }
        }
    }
    return descriptions;
}
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2021 Harald Sitter <sitter@kde.org>
#pragma once
#include <QStringList>
class SMARTData;
// Stringifies smartctl exit codes (= failures) into pretty strings fit for
// display to the user. Also filters out uninteresting failures.
namespace Instabilities
{
// Returns the display strings for the smartctl failures carried by data.
QStringList from(const SMARTData &data);
};
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
import org.kde.kcm 1.2 as KCM
import QtQuick 2.14
import QtQml.Models 2.14
import QtQuick.Layouts 1.14
import SMART 1.0 as SMART
import org.kde.kirigami 2.12 as Kirigami
import QtQuick.Controls 2.14
......@@ -49,7 +50,38 @@ KCM.SimpleKCM {
delegate: Kirigami.Card {
banner.title: "%1 (%2)".arg(product).arg(path)
banner.titleIcon: failed ? "data-warning" : ""
banner.titleIcon: {
if (failed) {
return "data-warning"
}
if (instabilities.length !== 0) {
return "data-information"
}
return ""
}
contentItem: Label {
width: parent.width
wrapMode: Text.Wrap
text: {
if (failed) {
return i18nc("@info",
"The SMART system of this device is reporting problems. This may be a sign of imminent device failure or data reliability being compromised. " +
"Back up your data and replace this drive as soon as possible to avoid losing any data.")
}
if (instabilities.length !== 0) {
var items = instabilities.map(item => "<li>%1</li>".arg(item))
return i18nc("@info %1 is a bunch of <li> with the strings from instabilities.cpp",
"<p>The SMART firmware is not reporting a failure, but there are early signs of malfunction. " +
"This might point at imminent device failure but requires longer term analysis. " +
"Back up your data and contact the manufacturer of this disk, or replace it preemptively just to be safe.</p>" +
"<ul>%1</ul>", items.join(''))
}
return i18nc("@info",
"This device appears to be working as expected.")
}
}
actions: [
Kirigami.Action {
visible: partitionManagerRunner.canRun
......@@ -73,15 +105,6 @@ KCM.SimpleKCM {
}
}
]
contentItem: Label {
width: parent.width
wrapMode: Text.Wrap
text: failed
? i18nc("@info",
"The SMART system of this device is reporting problems. This may be a sign of imminent device failure or data reliability being compromised. It is highly recommended that you backup your data and replace this drive as soon as possible to avoid losing any data.")
: i18nc("@info",
"This device appears to be working as expected.")
}
}
}
}
......@@ -10,6 +10,7 @@
#include <QFileInfo>
#include "kded_debug.h"
#include "smartfailure.h"
void SMARTCtl::run(const QString &devicePath)
{
......@@ -44,11 +45,11 @@ void SMARTCtl::run(const QString &devicePath)
KAuth::ExecuteJob *job = action.execute();
connect(job, &KJob::result, this, [this, job, devicePath] {
const auto data = job->data();
const auto code = data.value(QStringLiteral("exitCode"), QByteArray()).toInt();
const auto code = SMART::Failures(data.value(QStringLiteral("exitCode"), QByteArray()).toInt());
const auto json = data.value(QStringLiteral("data"), QByteArray()).toByteArray();
QJsonDocument document;
if (json.isEmpty() || code & Failure::CmdLineParse || code & Failure::DeviceOpen) {
if (json.isEmpty() || code & SMART::Failure::CmdLineParse || code & SMART::Failure::DeviceOpen) {
qCDebug(KDED) << "looks like we got no data back for" << devicePath << code << json.isEmpty();
} else {
document = QJsonDocument::fromJson(json);
......
......@@ -29,34 +29,6 @@ private:
class SMARTCtl : public AbstractSMARTCtl
{
public:
/** smartctl manpage
Bit 0: Command line did not parse.
Bit 1: Device open failed, device did not return an IDENTIFY DEVICE structure,
or device is in a low-power mode (see '-n' option above).
Bit 2: Some SMART or other ATA command to the disk failed, or there was a
checksum error in a SMART data structure (see '-b' option above).
Bit 3: SMART status check returned "DISK FAILING".
Bit 4: We found prefail Attributes <= threshold.
Bit 5: SMART status check returned "DISK OK" but we found that some (usage or
prefail) Attributes have been <= threshold at some time in the past.
Bit 6: The device error log contains records of errors.
Bit 7: The device self-test log contains records of errors.
[ATA only] Failed self-tests outdated by a newer successful extended
self-test are ignored.
*/
enum Failure {
CmdLineParse = 0x1,
DeviceOpen = 0x2,
InternalCommand = 0x4,
Disk = 0x8,
Prefail = 0x10,
PastPrefail = 0x20,
ErrorsRecorded = 0x40,
SelfTestErrors = 0x80,
// The entire thing doesn't exceed 8 bits because it's a posix exit code.
};
void run(const QString &devicePath) override;
private:
......
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#include "smartdata.h"
......@@ -13,9 +13,19 @@ SMARTStatus::SMARTStatus(const QJsonObject &object)
// Should we decide to map the value. Its meaning is "defined" in
// nvmeprint.cpp of smartmontools
}
// Reads the "exit_status" entry of the given object; when it cannot be read as
// an integer the status falls back to SMART::Failure::None.
SMARTCtlData::SMARTCtlData(const QJsonObject &object)
    : m_exitStatus(object.value(QStringLiteral("exit_status")).toInt(static_cast<int>(SMART::Failure::None)))
{
}
// Exposes the stored exit status as a set of SMART::Failure flags.
SMART::Failures SMARTCtlData::failure() const
{
    const auto asFailure = static_cast<SMART::Failure>(m_exitStatus);
    return SMART::Failures(asFailure);
}
SMARTData::SMARTData(const QJsonDocument &document)
: m_status(SMARTStatus(document.object()[QStringLiteral("smart_status")].toObject()))
: m_smartctl(SMARTCtlData(document.object()[QStringLiteral("smartctl")].toObject()))
, m_status(SMARTStatus(document.object()[QStringLiteral("smart_status")].toObject()))
, m_device(document.object()[QStringLiteral("device")].toObject()[QStringLiteral("name")].toString())
{
}
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020 Harald Sitter <sitter@kde.org>
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#ifndef SMARTDATA_H
#define SMARTDATA_H
#include <QObject>
#include <QString>
#include "smartfailure.h"
class QJsonObject;
class QJsonDocument;
/** Models "smart_status" blobs */
class SMARTStatus
{
public:
......@@ -17,11 +21,25 @@ public:
bool m_passed;
};
/** Models "smartctl" blobs */
class SMARTCtlData
{
public:
// Constructs the model from the "smartctl" JSON object.
SMARTCtlData(const QJsonObject &object);
// The exit status expressed as SMART::Failure flags.
SMART::Failures failure() const;
private:
int m_exitStatus = -1; // only the 8 least significant bits are filled for posix reasons.
};
/** Models the entire json output document */
class SMARTData
{
public:
// Constructs the model from the complete JSON document.
SMARTData(const QJsonDocument &document);
// Model of the "smartctl" blob.
SMARTCtlData m_smartctl;
// Model of the "smart_status" blob.
SMARTStatus m_status;
// Name of the device the document is about.
QString m_device;
};
......
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2021 Harald Sitter <sitter@kde.org>
#include "smartfailure.h"
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
// SPDX-FileCopyrightText: 2020-2021 Harald Sitter <sitter@kde.org>
#pragma once
#include <QFlag>
#include <QObject>
namespace SMART
{
Q_NAMESPACE
/** smartctl manpage (exit codes are flags)
Bit 0: Command line did not parse.
Bit 1: Device open failed, device did not return an IDENTIFY DEVICE structure,
or device is in a low-power mode (see '-n' option above).
Bit 2: Some SMART or other ATA command to the disk failed, or there was a
checksum error in a SMART data structure (see '-b' option above).
Bit 3: SMART status check returned "DISK FAILING".
Bit 4: We found prefail Attributes <= threshold.
Bit 5: SMART status check returned "DISK OK" but we found that some (usage or
prefail) Attributes have been <= threshold at some time in the past.
Bit 6: The device error log contains records of errors.
Bit 7: The device self-test log contains records of errors.
[ATA only] Failed self-tests outdated by a newer successful extended
self-test are ignored.
*/
enum class Failure {
// No bit set.
None = 0x0,
CmdLineParse = 0x1,
DeviceOpen = 0x2,
InternalCommand = 0x4,
Disk = 0x8,
Prefail = 0x10,
PastPrefail = 0x20,
ErrorsRecorded = 0x40,
SelfTestErrors = 0x80,
// The entire thing doesn't exceed 8 bits because it's a posix exit code.
};
// Makes Failure known to the meta-object system (needed for QMetaEnum lookups).
Q_ENUM_NS(Failure);
// Failures is the QFlags type that holds a combination of Failure values.
Q_DECLARE_FLAGS(Failures, Failure)
}
Q_DECLARE_OPERATORS_FOR_FLAGS(SMART::Failures)
......@@ -7,6 +7,7 @@
#include "device.h"
#include "devicenotifier.h"
#include "instabilities.h"
#include "kded_debug.h"
#include "smartctl.h"
#include "smartdata.h"
......@@ -82,10 +83,12 @@ void SMARTMonitor::onSMARTCtlFinished(const QString &devicePath, const QJsonDocu
Device *existing = *existingIt;
// update failure and call it a day. Notification is handled by the Device.
existing->setInstabilities(Instabilities::from(data));
existing->setFailed(!data.m_status.m_passed);
return;
}
device->setInstabilities(Instabilities::from(data));
device->setFailed(!data.m_status.m_passed);
m_devices << device;
......@@ -97,5 +100,3 @@ void SMARTMonitor::addDevice(Device *device)
m_pendingDevices[device->path()] = device;
m_ctl->run(device->path());
}
#include "smartmonitor.moc"
......@@ -21,12 +21,23 @@ public:
: QObject(parent)
{
m_notification->setComponentName("org.kde.kded.smart");
m_notification->setIconName(QStringLiteral("data-warning"));
if (device->failed()) {
m_notification->setIconName(QStringLiteral("data-warning"));
} else {
m_notification->setIconName(QStringLiteral("data-information"));
}
m_notification->setTitle(i18nc("@title notification", "Storage Device Problems"));
m_notification->setText(xi18nc("@info notification; text %1 is a pretty product name; %2 the device path e.g. /dev/sda",
"The storage device <emphasis>%1</emphasis> (<filename>%2</filename>) is likely to fail soon!",
device->product(),
device->path()));
if (device->failed()) {
m_notification->setText(xi18nc("@info notification; text %1 is a pretty product name; %2 the device path e.g. /dev/sda",
"The storage device <emphasis>%1</emphasis> (<filename>%2</filename>) is likely to fail soon!",
device->product(),
device->path()));
} else {
m_notification->setText(xi18nc("@info notification; text %1 is a pretty product name; %2 the device path e.g. /dev/sda",
"The storage device <emphasis>%1</emphasis> (<filename>%2</filename>) is showing indications of instability.",
device->product(),
device->path()));
}
KService::Ptr kcm = KService::serviceByStorageId(QStringLiteral("smart"));
Q_ASSERT(kcm); // there's a bug or installation is broken; mustn't happen in production
......@@ -75,7 +86,7 @@ void SMARTNotifier::maybeFailed(const Device *device)
Q_ASSERT(device);
// We notify on instabilities in the hopes that there won't be false positives.
// Might need revisiting.
if (!device->failed() || device->ignore()) {
if ((!device->failed() && device->instabilities().isEmpty()) || device->ignore()) {
return;
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment