Detect cp866 & windows-1251 in "latin1" tags.

This commit is contained in:
John Maguire 2010-06-02 12:31:40 +00:00
parent 3dad151608
commit f4385d4545
3 changed files with 104 additions and 13 deletions

View File

@ -96,6 +96,11 @@ UniversalEncodingHandler::UniversalEncodingHandler()
current_codec_(NULL) {
}
// Builds a handler whose nsUniversalDetector base is constructed with
// |language_filter|. The handler starts with no codec selected.
UniversalEncodingHandler::UniversalEncodingHandler(uint32_t language_filter)
    : nsUniversalDetector(language_filter), current_codec_(NULL) {}
TagLib::String UniversalEncodingHandler::parse(const TagLib::ByteVector& data) const {
const_cast<UniversalEncodingHandler*>(this)->Reset();
const_cast<UniversalEncodingHandler*>(this)->HandleData(data.data(), data.size());
@ -129,14 +134,55 @@ void UniversalEncodingHandler::Report(const char* charset) {
}
QTextCodec* codec = QTextCodec::codecForName(charset);
if (!codec) {
qWarning() << "Could not identify encoding in ID3v1 tag. Assuming ASCII.";
} else {
qWarning() << "Detected non-ASCII encoding in ID3v1 tag:" << charset;
}
current_codec_ = codec;
}
// Runs charset detection over the NUL-terminated byte string |data| and
// returns the codec to decode it with, or NULL when nothing could be guessed.
//
// The universal detector is consulted first. If it leaves no codec behind, a
// simple fallback is applied: a run of more than three consecutive bytes
// >= 0xc0 is treated as windows-1251 text.
QTextCodec* UniversalEncodingHandler::Guess(const char* data) {
  // Forget any codec left over from an earlier run on this instance. The only
  // assignment to current_codec_ visible here happens in Report(), which
  // presumably is not invoked on every detection run, so without this a stale
  // result could be returned for unrelated data.
  current_codec_ = NULL;

  // Nothing to detect in a NULL buffer; the heuristic below would otherwise
  // dereference it.
  if (!data) {
    return NULL;
  }

  Reset();
  HandleData(data, qstrlen(data));
  DataEnd();
  if (current_codec_) {
    return current_codec_;
  }

  // Windows-1251 heuristic: count consecutive high bytes and give up the
  // count as soon as a lower byte interrupts the run.
  int run_length = 0;
  for (const uchar* p = reinterpret_cast<const uchar*>(data); *p; ++p) {
    run_length = (*p >= 0xc0) ? run_length + 1 : 0;
    if (run_length > 3) {
      qWarning() << "Heuristic guessed windows-1251";
      return QTextCodec::codecForName("windows-1251");
    }
  }
  return NULL;
}
// Converts |input| to a trimmed QString. Strings that are Latin-1 but not
// plain ASCII are re-examined: their original bytes are run through charset
// detection and, when a codec is guessed, decoded with that codec instead.
// Everything else (ASCII, non-Latin-1 strings, or no guess) is converted
// unchanged.
QString UniversalEncodingHandler::FixEncoding(const TagLib::String& input) {
  if (!input.isLatin1() || input.isAscii()) {
    return TStringToQString(input).trimmed();
  }
  qWarning() << "Extended ASCII... possibly should be CP866 or windows-1251 instead";

  // Recover the original single-byte data. Every character is in the Latin-1
  // range here, so toLatin1() maps each one back to exactly one byte.
  // QString::toStdString() is deliberately avoided: which 8-bit encoding it
  // produces depends on the Qt version and on the configured C-string codec,
  // and anything other than Latin-1 would feed the detector the wrong bytes.
  const QByteArray raw_bytes =
      QString::fromUtf8(input.toCString(true)).toLatin1();

  // This is single byte encoding, therefore can't be CJK.
  UniversalEncodingHandler detector(NS_FILTER_NON_CJK);
  QTextCodec* codec = detector.Guess(raw_bytes.constData());
  if (codec) {
    return codec->toUnicode(raw_bytes.constData()).trimmed();
  }

  qDebug() << "Could not guess encoding. Using extended ASCII.";
  return TStringToQString(input).trimmed();
}
Song::Private::Private()
: valid_(false),
@ -206,15 +252,13 @@ void Song::InitFromFile(const QString& filename, int directory_id) {
TagLib::Tag* tag = fileref->tag();
if (tag) {
#define strip(x) TStringToQString( x ).trimmed()
d->title_ = strip(tag->title());
d->artist_ = strip(tag->artist());
d->album_ = strip(tag->album());
d->comment_ = strip(tag->comment());
d->genre_ = strip(tag->genre());
d->title_ = UniversalEncodingHandler::FixEncoding(tag->title());
d->artist_ = UniversalEncodingHandler::FixEncoding(tag->artist());
d->album_ = UniversalEncodingHandler::FixEncoding(tag->album());
d->comment_ = UniversalEncodingHandler::FixEncoding(tag->comment());
d->genre_ = UniversalEncodingHandler::FixEncoding(tag->genre());
d->year_ = tag->year();
d->track_ = tag->track();
#undef strip
d->valid_ = true;
}

View File

@ -57,11 +57,14 @@ class UniversalEncodingHandler : public TagLib::ID3v1::StringHandler,
nsUniversalDetector {
public:
UniversalEncodingHandler();
explicit UniversalEncodingHandler(uint32_t language_filter);
// TagLib::ID3v1::StringHandler
virtual TagLib::String parse(const TagLib::ByteVector& data) const;
//virtual TagLib::ByteVector render(const TagLib::String& s) const;
QTextCodec* Guess(const char* data);
static QString FixEncoding(const TagLib::String& input);
private:
// nsUniversalDetector
virtual void Report(const char* charset);

View File

@ -25,6 +25,7 @@
#include "mock_taglib.h"
#include <QTemporaryFile>
#include <QTextCodec>
namespace {
@ -85,4 +86,47 @@ TEST_F(SongTest, LeavesASCIIAlone) {
EXPECT_STREQ("foobar", str.to8Bit(false).c_str());
}
// CP866 bytes that TagLib read as Latin-1 must be re-decoded into the
// intended four-character Cyrillic word.
TEST_F(SongTest, FixesCP866) {
  // Spelled as a string literal: constants >= 0x80 inside a braced char
  // initializer are a narrowing conversion where char is signed, which C++11
  // and later reject.
  const char cp866[] = "\x8a\xa8\xad\xae";  // Кино
  const TagLib::ByteVector bytes(cp866);
  const TagLib::String str(bytes);

  const QString fixed = UniversalEncodingHandler::FixEncoding(str);

  EXPECT_EQ(4, fixed.length());
  EXPECT_STREQ("Кино", fixed.toUtf8().constData());
}
// windows-1251 bytes that TagLib read as Latin-1 must be re-decoded into the
// intended four-character Cyrillic word.
TEST_F(SongTest, FixesWindows1251) {
  // Spelled as a string literal: constants >= 0x80 inside a braced char
  // initializer are a narrowing conversion where char is signed, which C++11
  // and later reject.
  const char w1251[] = "\xca\xe8\xed\xee";  // Кино
  const TagLib::ByteVector bytes(w1251);
  const TagLib::String str(bytes);

  const QString fixed = UniversalEncodingHandler::FixEncoding(str);

  EXPECT_EQ(4, fixed.length());
  EXPECT_STREQ("Кино", fixed.toUtf8().constData());
}
// Plain ASCII input must pass through FixEncoding untouched.
TEST_F(SongTest, DoesNotFixAscii) {
  const TagLib::ByteVector raw("foobar");
  const TagLib::String tag_text(raw);

  const QString result = UniversalEncodingHandler::FixEncoding(tag_text);
  const QByteArray result_utf8 = result.toUtf8();

  EXPECT_EQ(6, result.length());
  EXPECT_STREQ("foobar", result_utf8.constData());
}
// Text that TagLib already decoded as UTF-8 must come back unchanged.
TEST_F(SongTest, DoesNotFixUtf8) {
  const TagLib::ByteVector raw("Кино");
  const TagLib::String tag_text(raw, TagLib::String::UTF8);

  const QString result = UniversalEncodingHandler::FixEncoding(tag_text);
  const QByteArray result_utf8 = result.toUtf8();

  EXPECT_EQ(4, result.length());
  EXPECT_STREQ("Кино", result_utf8.constData());
}
// Genuine Latin-1 text (a single accented letter among ASCII) must decode to
// the same string the latin1 codec produces.
TEST_F(SongTest, DoesNotFixExtendedAscii) {
  // Spelled as a string literal: 0xf6 inside a braced char initializer is a
  // narrowing conversion where char is signed, which C++11 and later reject.
  // The escape ends at 'y', which is not a hex digit.
  const char latin1[] = "R\xf6yksopp";

  QTextCodec* codec = QTextCodec::codecForName("latin1");
  ASSERT_TRUE(codec != NULL);
  const QString unicode = codec->toUnicode(latin1);

  const TagLib::String str(latin1);
  const QString fixed = UniversalEncodingHandler::FixEncoding(str);

  // gtest convention: expected value first, actual value second.
  EXPECT_EQ(unicode, fixed);
}
} // namespace