Autodetect weird encodings in ID3v1 tags.

Fixes issue #254
This commit is contained in:
John Maguire 2010-05-11 12:03:55 +00:00
parent 72f5307524
commit 5dd0a9c35f
5 changed files with 36 additions and 5 deletions

View File

@ -114,7 +114,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
{
mStart = false;
if (aLen > 3)
qDebug() << aBuf[0];
switch (aBuf[0])
{
case '\xEF':
@ -241,7 +240,6 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
break;
default: //pure ascii
mDetectedCharset = "ASCII";
;//do nothing here
}
return NS_OK;
@ -292,6 +290,8 @@ void nsUniversalDetector::DataEnd()
break;
case eEscAscii:
break;
case ePureAscii:
Report("ASCII");
default:
;
}

View File

@ -107,11 +107,11 @@ TagLib::String UniversalEncodingHandler::parse(const TagLib::ByteVector& data) c
// Detected codec -> QString (UTF-16) -> UTF8 -> UTF16-BE (TagLib::String)
// That's probably expensive.
QString unicode = current_codec_->toUnicode(data.data(), data.size());
qDebug() << "Decoded to:" << unicode;
return TagLib::String(unicode.toUtf8().constData(), TagLib::String::UTF8);
}
}
/*
TagLib::ByteVector UniversalEncodingHandler::render(const TagLib::String& s) const {
// TODO: what should we do here?
// 1. Coerce to ASCII
@ -120,12 +120,19 @@ TagLib::ByteVector UniversalEncodingHandler::render(const TagLib::String& s) con
// 4. Nothing and rewrite the tag as ID3v2 & UTF8
return TagLib::ByteVector();
}
*/
void UniversalEncodingHandler::Report(const char* charset) {
qDebug() << "Detected as" << charset;
if (qstrcmp(charset, "ASCII") == 0) {
current_codec_ = 0;
return;
}
QTextCodec* codec = QTextCodec::codecForName(charset);
if (!codec) {
qWarning() << "Could not identify encoding in ID3v1 tag. Assuming ASCII.";
} else {
qWarning() << "Detected non-ASCII encoding in ID3v1 tag:" << charset;
}
current_codec_ = codec;
}

View File

@ -60,7 +60,7 @@ class UniversalEncodingHandler : public TagLib::ID3v1::StringHandler,
// TagLib::ID3v1::StringHandler
virtual TagLib::String parse(const TagLib::ByteVector& data) const;
virtual TagLib::ByteVector render(const TagLib::String& s) const;
//virtual TagLib::ByteVector render(const TagLib::String& s) const;
private:
// nsUniversalDetector

View File

@ -103,6 +103,10 @@ int main(int argc, char *argv[]) {
lastfm::ws::ApiKey = LastFMService::kApiKey;
lastfm::ws::SharedSecret = LastFMService::kSecret;
// Detect technically invalid usage of non-ASCII in ID3v1 tags.
UniversalEncodingHandler handler;
TagLib::ID3v1::Tag::setStringHandler(&handler);
QtSingleApplication a(argc, argv);
a.setQuitOnLastWindowClosed(false);

View File

@ -65,4 +65,24 @@ TEST_F(SongTest, InitsFromFile) {
EXPECT_EQ("Baz", song.album());
}
// Passes a Windows-1251 encoded byte string to the handler's parse() and
// expects the result to be neither ASCII nor Latin-1, and to equal the
// original Cyrillic text when rendered as UTF-8.
TEST_F(SongTest, DetectsWindows1251) {
  // "Выдыхай" encoded as Windows-1251. Spelled as a string literal with hex
  // escapes rather than a brace-initialised list of int constants: values
  // above 0x7f in a braced char initialiser are a narrowing conversion, which
  // C++11 and later reject where plain char is signed. The literal also
  // supplies the terminating NUL that the original array listed explicitly.
  const char cp1251[] = "\xc2\xfb\xe4\xfb\xf5\xe0\xe9";

  UniversalEncodingHandler handler;
  TagLib::ByteVector bytes(cp1251);
  TagLib::String str = handler.parse(bytes);

  EXPECT_FALSE(str.isAscii());
  EXPECT_FALSE(str.isLatin1());
  // to8Bit(true) requests UTF-8 output.
  // NOTE(review): this comparison assumes the test source file itself is
  // compiled as UTF-8 - confirm against the build settings.
  EXPECT_STREQ("Выдыхай", str.to8Bit(true).c_str());
}
// Passes a pure ASCII string to the handler's parse() and expects it to come
// back unchanged: still ASCII, still Latin-1, and byte-for-byte "foobar".
TEST_F(SongTest, LeavesASCIIAlone) {
  // const-qualified: binding a string literal to a plain char* is deprecated
  // in C++03 and ill-formed from C++11 onwards.
  const char* ascii = "foobar";

  UniversalEncodingHandler handler;
  TagLib::ByteVector bytes(ascii);
  TagLib::String str = handler.parse(bytes);

  EXPECT_TRUE(str.isAscii());
  EXPECT_TRUE(str.isLatin1());
  // to8Bit(false) requests the non-UTF-8 (Latin-1) rendering.
  EXPECT_STREQ("foobar", str.to8Bit(false).c_str());
}
} // namespace