Commit 2b25bb4f authored by Robby Stephenson's avatar Robby Stephenson
Browse files

Fix bug with serializing some unicode code points

QDomDocument::createTextNode() ends up calling fixedCharData() which
uses QXmlUtils::isChar(). When serializing code points with surrogates,
the high surrogate character was getting dropped after Tellico 3.4.3
started using QDomImplementation::DropInvalidChars.

Saving a collection with these characters, like emojis, with Tellico
3.4.3 will silently lose data.

BUG: 449244
FIXED-IN: 3.4.4
parent be5c588f
2022-02-13 Robby Stephenson <robby@periapsis.org>
* Fixed bug with saving and viewing surrogate code points (Bug 449244).
2022-02-04 Robby Stephenson <robby@periapsis.org>
* Fixed bug with fetchdialog layout preventing clicking (Bug 449636).
......
......@@ -40,6 +40,7 @@
#include "../entry.h"
#include "../document.h"
#include "../utils/xmlhandler.h"
#include "../utils/string_utils.h"
#include <QTest>
#include <QNetworkInterface>
......@@ -588,11 +589,19 @@ void TellicoReadTest::testBug443845() {
}
void TellicoReadTest::testEmoji() {
// https://www.fileformat.info/info/unicode/char/1f3e1/index.htm
QString textWithEmoji = QString::fromUtf8("Title 🏡️");
// stripping control codes should not affect the emoji
QCOMPARE(Tellico::removeControlCodes(textWithEmoji), textWithEmoji);
Tellico::Data::CollPtr coll(new Tellico::Data::Collection(true)); // add default fields
QVERIFY(coll->hasField(QStringLiteral("title")));
Tellico::Data::FieldPtr field = coll->fieldByName(QStringLiteral("title"));
QVERIFY(field);
field->setTitle(textWithEmoji);
Tellico::Data::EntryPtr entry1(new Tellico::Data::Entry(coll));
entry1->setField(QStringLiteral("title"), textWithEmoji);
QCOMPARE(entry1->title(), textWithEmoji);
coll->addEntries(entry1);
Tellico::Export::TellicoXMLExporter exporter(coll);
exporter.setEntries(coll->entries());
......@@ -600,8 +609,12 @@ void TellicoReadTest::testEmoji() {
Tellico::Import::TellicoImporter importer(exporter.text());
Tellico::Data::CollPtr coll2 = importer.collection();
QVERIFY(coll2);
Tellico::Data::FieldPtr field2 = coll2->fieldByName(QStringLiteral("title"));
QVERIFY(field2);
QCOMPARE(field2->title(), textWithEmoji);
Tellico::Data::EntryPtr entry2 = coll2->entries().at(0);
QVERIFY(entry2);
QCOMPARE(entry2->title(), textWithEmoji);
}
......@@ -93,11 +93,11 @@ QDomDocument TellicoXMLExporter::exportXML() const {
}
QDomImplementation impl;
// Silently drop invalid XML characters when saving the document
// the default setting would allow invalid characters in the exported XML
// which might then fail to load or validate
// Bug 443845
impl.setInvalidDataPolicy(QDomImplementation::DropInvalidChars);
// Bug 443845 - but do not just silently drop the invalid characters
// since that drops emojis and unicode points with surrogate encoding
// instead Tellico::removeControlCodes is used everywhere that
// QDomDocument::createTextNode() is called
// impl.setInvalidDataPolicy(QDomImplementation::DropInvalidChars);
QDomDocumentType doctype = impl.createDocumentType(QStringLiteral("tellico"),
XML::pubTellico(exportVersion),
XML::dtdTellico(exportVersion));
......@@ -156,7 +156,7 @@ void TellicoXMLExporter::exportCollectionXML(QDomDocument& dom_, QDomElement& pa
const Data::BibtexCollection* c = static_cast<const Data::BibtexCollection*>(coll.data());
if(!c->preamble().isEmpty()) {
QDomElement preElem = dom_.createElement(QStringLiteral("bibtex-preamble"));
preElem.appendChild(dom_.createTextNode(c->preamble()));
preElem.appendChild(dom_.createTextNode(removeControlCodes(c->preamble())));
collElem.appendChild(preElem);
}
......@@ -165,7 +165,7 @@ void TellicoXMLExporter::exportCollectionXML(QDomDocument& dom_, QDomElement& pa
if(!macroIt.value().isEmpty()) {
QDomElement macroElem = dom_.createElement(QStringLiteral("macro"));
macroElem.setAttribute(QStringLiteral("name"), macroIt.key());
macroElem.appendChild(dom_.createTextNode(macroIt.value()));
macroElem.appendChild(dom_.createTextNode(removeControlCodes(macroIt.value())));
macrosElem.appendChild(macroElem);
}
}
......@@ -241,7 +241,7 @@ void TellicoXMLExporter::exportFieldXML(QDomDocument& dom_, QDomElement& parent_
}
QDomElement e = dom_.createElement(QStringLiteral("prop"));
e.setAttribute(QStringLiteral("name"), it.key());
e.appendChild(dom_.createTextNode(it.value()));
e.appendChild(dom_.createTextNode(removeControlCodes(it.value())));
elem.appendChild(e);
}
......@@ -301,7 +301,7 @@ void TellicoXMLExporter::exportEntryXML(QDomDocument& dom_, QDomElement& parent_
}
for(int col = 0; col < columnValues.count(); ++col) {
QDomElement elem = dom_.createElement(QStringLiteral("column"));
elem.appendChild(dom_.createTextNode(columnValues.at(col)));
elem.appendChild(dom_.createTextNode(removeControlCodes(columnValues.at(col))));
fieldElem.appendChild(elem);
}
}
......@@ -320,7 +320,7 @@ void TellicoXMLExporter::exportEntryXML(QDomDocument& dom_, QDomElement& parent_
for(QStringList::ConstIterator it = fields.constBegin(); it != fields.constEnd(); ++it) {
// element for field value, child of either entryElem or ParentElem
QDomElement fieldElem = dom_.createElement(fieldName);
fieldElem.appendChild(dom_.createTextNode(*it));
fieldElem.appendChild(dom_.createTextNode(removeControlCodes(*it)));
parElem.appendChild(fieldElem);
}
} else {
......@@ -368,10 +368,10 @@ void TellicoXMLExporter::exportEntryXML(QDomDocument& dom_, QDomElement& parent_
fieldElem.appendChild(dom_.createTextNode(old_url.url()));
}
} else {
fieldElem.appendChild(dom_.createTextNode(fieldValue));
fieldElem.appendChild(dom_.createTextNode(removeControlCodes(fieldValue)));
}
} else {
fieldElem.appendChild(dom_.createTextNode(fieldValue));
fieldElem.appendChild(dom_.createTextNode(removeControlCodes(fieldValue)));
}
}
......
......@@ -39,6 +39,28 @@
namespace {
static const int STRING_STORE_SIZE = 4999; // too big, too small?
class StringIterator {
QString::const_iterator pos, e;
public:
#if (QT_VERSION >= QT_VERSION_CHECK(5, 10, 0))
explicit StringIterator(QStringView string)
#else
explicit StringIterator(const QString& string)
#endif
: pos(string.begin()), e(string.end()) {}
inline bool hasNext() const { return pos < e; }
inline uint next() {
Q_ASSERT(hasNext());
const QChar uc = *pos++;
if(uc.isSurrogate()) {
if(uc.isHighSurrogate() && pos < e && pos->isLowSurrogate())
return QChar::surrogateToUcs4(uc, *pos++);
return QChar::ReplacementCharacter;
}
return uc.unicode();
}
};
}
QString Tellico::decodeHTML(const QByteArray& data_) {
......@@ -229,12 +251,14 @@ QString Tellico::reverseObfuscate(const QByteArray& bytes) {
QString Tellico::removeControlCodes(const QString& string) {
QString result;
result.reserve(string.size());
for(int i = 0; i < string.size(); ++i) {
const ushort c = string.at(i).unicode();
StringIterator it(string);
while(it.hasNext()) {
const auto c = it.next();
// legal control codes in XML 1.0 are U+0009, U+000A, U+000D
// https://www.w3.org/TR/xml/#charsets
if(c > 0x1F || c == 0x9 || c == 0xA || c == 0xD) {
result += string.at(i);
if(c < 0xd800) result += QChar(c);
else result += QString::fromUcs4(&c, 1);
}
}
return result;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment