Add voting for codecs, e.g. if 3 of the 4 tags in a file are detected as windows-1251, then all of them will be decoded as windows-1251.

This commit is contained in:
John Maguire 2010-06-03 12:36:43 +00:00
parent c0103cc1e0
commit b9df2a09d4
3 changed files with 105 additions and 21 deletions

View File

@ -154,14 +154,63 @@ QTextCodec* UniversalEncodingHandler::Guess(const char* data) {
}
if (repeats > 3) {
qWarning() << "Heuristic guessed windows-1251";
return QTextCodec::codecForName("windows-1251");
current_codec_ = QTextCodec::codecForName("windows-1251");
}
}
}
return current_codec_;
}
// Chooses one codec for a whole tag by majority vote.
//
// Each non-empty text field (title, artist, album, comment, genre) is guessed
// individually and casts one vote for the codec it was guessed as. The codec
// with the most votes is returned. Ties are broken by QHash iteration order,
// which is unspecified.
//
// Returns NULL when no field voted (all fields empty) or when NULL itself won
// the vote (the per-string Guess() returns NULL for plain ASCII input).
// Callers must therefore treat NULL as "no re-decoding needed".
QTextCodec* UniversalEncodingHandler::Guess(const TagLib::Tag& tag) {
  QHash<QTextCodec*, int> usages;
  Guess(tag.title(), &usages);
  Guess(tag.artist(), &usages);
  Guess(tag.album(), &usages);
  Guess(tag.comment(), &usages);
  Guess(tag.genre(), &usages);

  // Empty strings do not vote, so a tag with no text leaves the hash empty.
  // begin() would then equal end() and must not be dereferenced.
  if (usages.isEmpty()) {
    return NULL;
  }

  QHash<QTextCodec*, int>::const_iterator max = usages.constBegin();
  for (QHash<QTextCodec*, int>::const_iterator it = usages.constBegin();
       it != usages.constEnd(); ++it) {
    if (it.value() > max.value()) {
      max = it;
    }
  }
  return max.key();
}
// Casts a single vote on behalf of |input|: guesses its codec and bumps that
// codec's counter in |usages|. Empty strings abstain and leave |usages|
// untouched. The guessed codec may be NULL; NULL is counted like any other key.
void UniversalEncodingHandler::Guess(const TagLib::String& input,
                                     QHash<QTextCodec*, int>* usages) {
  if (!input.isEmpty()) {
    QTextCodec* const guessed = Guess(input);
    // operator[] inserts a default-constructed count (0) for a codec that has
    // not been seen yet, so the first vote ends up as 1.
    ++(*usages)[guessed];
  }
}
// Guesses the codec of a single tag string.
//
//  * ASCII-only input         -> NULL (nothing to fix).
//  * Input outside Latin-1    -> the UTF-8 codec.
//  * Latin-1 with high bytes  -> the string is converted back to a byte
//    sequence and passed to the raw-byte detector, Guess(const char*). NULL is
//    returned if the C-string form is not longer than the string itself.
QTextCodec* UniversalEncodingHandler::Guess(const TagLib::String& input) {
  if (input.isAscii()) {
    return NULL;
  }
  if (!input.isLatin1()) {
    return QTextCodec::codecForName("UTF-8");
  }

  qWarning() << "Extended ASCII... possibly should be CP866 or windows-1251 instead";

  // NOTE(review): toCString(true) presumably yields the UTF-8 form - confirm
  // against the TagLib documentation.
  const std::string encoded = input.toCString(true);
  if (encoded.size() <= input.size()) {
    return NULL;
  }

  // Round-trip through QString to get the byte string handed to the detector.
  const std::string raw_bytes = QString::fromUtf8(encoded.c_str()).toStdString();
  return Guess(raw_bytes.c_str());
}
QString UniversalEncodingHandler::FixEncoding(const TagLib::String& input) {
if (input.isLatin1() && !input.isAscii()) {
qWarning() << "Extended ASCII... possibly should be CP866 or windows-1251 instead";
@ -169,12 +218,11 @@ QString UniversalEncodingHandler::FixEncoding(const TagLib::String& input) {
std::string fixed;
if (broken.size() > input.size()) {
fixed = QString::fromUtf8(broken.c_str()).toStdString();
// This is single byte encoding, therefore can't be CJK.
UniversalEncodingHandler detector(NS_FILTER_NON_CJK);
QTextCodec* codec = detector.Guess(fixed.c_str());
QTextCodec* codec = Guess(fixed.c_str());
if (!codec) {
qDebug() << "Could not guess encoding. Using extended ASCII.";
} else {
qDebug() << "Guessed:" << codec->name();
QString foo = codec->toUnicode(fixed.c_str());
return foo.trimmed();
}
@ -235,6 +283,15 @@ void Song::Init(const QString& title, const QString& artist, const QString& albu
d->length_ = length;
}
// Converts a tag string to a trimmed QString.
//
// With a NULL |codec| the string goes through TStringToQString unchanged.
// Otherwise the string is first turned back into a byte sequence (via its
// C-string form and QString::fromUtf8) and those bytes are decoded with |codec|.
QString Song::Decode(const TagLib::String tag, const QTextCodec* codec) const {
  if (!codec) {
    return TStringToQString(tag).trimmed();
  }

  const QString round_tripped = QString::fromUtf8(tag.toCString(true));
  const std::string raw_bytes = round_tripped.toStdString();
  return codec->toUnicode(raw_bytes.c_str()).trimmed();
}
void Song::InitFromFile(const QString& filename, int directory_id) {
d->filename_ = filename;
d->directory_id_ = directory_id;
@ -250,13 +307,18 @@ void Song::InitFromFile(const QString& filename, int directory_id) {
d->mtime_ = info.lastModified().toTime_t();
d->ctime_ = info.created().toTime_t();
// This is single byte encoding, therefore can't be CJK.
UniversalEncodingHandler detector(NS_FILTER_NON_CJK);
TagLib::Tag* tag = fileref->tag();
QTextCodec* codec = NULL;
if (tag) {
d->title_ = UniversalEncodingHandler::FixEncoding(tag->title());
d->artist_ = UniversalEncodingHandler::FixEncoding(tag->artist());
d->album_ = UniversalEncodingHandler::FixEncoding(tag->album());
d->comment_ = UniversalEncodingHandler::FixEncoding(tag->comment());
d->genre_ = UniversalEncodingHandler::FixEncoding(tag->genre());
codec = detector.Guess(*tag);
d->title_ = Decode(tag->title(), codec);
d->artist_ = Decode(tag->artist(), codec);
d->album_ = Decode(tag->album(), codec);
d->comment_ = Decode(tag->comment(), codec);
d->genre_ = Decode(tag->genre(), codec);
d->year_ = tag->year();
d->track_ = tag->track();
@ -274,10 +336,10 @@ void Song::InitFromFile(const QString& filename, int directory_id) {
d->bpm_ = TStringToQString(file->ID3v2Tag()->frameListMap()["TBPM"].front()->toString()).trimmed().toFloat();
if (!file->ID3v2Tag()->frameListMap()["TCOM"].isEmpty())
d->composer_ = UniversalEncodingHandler::FixEncoding(file->ID3v2Tag()->frameListMap()["TCOM"].front()->toString());
d->composer_ = Decode(file->ID3v2Tag()->frameListMap()["TCOM"].front()->toString(), codec);
if (!file->ID3v2Tag()->frameListMap()["TPE2"].isEmpty()) // non-standard: Apple, Microsoft
d->albumartist_ = UniversalEncodingHandler::FixEncoding(file->ID3v2Tag()->frameListMap()["TPE2"].front()->toString());
d->albumartist_ = Decode(file->ID3v2Tag()->frameListMap()["TPE2"].front()->toString(), codec);
if (!file->ID3v2Tag()->frameListMap()["TCMP"].isEmpty())
compilation = TStringToQString(file->ID3v2Tag()->frameListMap()["TCMP"].front()->toString()).trimmed();
@ -286,7 +348,7 @@ void Song::InitFromFile(const QString& filename, int directory_id) {
else if (TagLib::Ogg::Vorbis::File* file = dynamic_cast<TagLib::Ogg::Vorbis::File*>(fileref->file())) {
if (file->tag()) {
if ( !file->tag()->fieldListMap()["COMPOSER"].isEmpty() )
d->composer_ = UniversalEncodingHandler::FixEncoding(file->tag()->fieldListMap()["COMPOSER"].front());
d->composer_ = Decode(file->tag()->fieldListMap()["COMPOSER"].front(), codec);
if ( !file->tag()->fieldListMap()["BPM"].isEmpty() )
d->bpm_ = TStringToQString(file->tag()->fieldListMap()["BPM"].front()).trimmed().toFloat();
@ -301,7 +363,7 @@ void Song::InitFromFile(const QString& filename, int directory_id) {
else if (TagLib::FLAC::File* file = dynamic_cast<TagLib::FLAC::File*>(fileref->file())) {
if ( file->xiphComment() ) {
if (!file->xiphComment()->fieldListMap()["COMPOSER"].isEmpty())
d->composer_ = UniversalEncodingHandler::FixEncoding( file->xiphComment()->fieldListMap()["COMPOSER"].front() );
d->composer_ = Decode(file->xiphComment()->fieldListMap()["COMPOSER"].front(), codec);
if (!file->xiphComment()->fieldListMap()["BPM"].isEmpty() )
d->bpm_ = TStringToQString( file->xiphComment()->fieldListMap()["BPM"].front() ).trimmed().toFloat();

View File

@ -17,6 +17,7 @@
#ifndef SONG_H
#define SONG_H
#include <QHash>
#include <QImage>
#include <QList>
#include <QSharedData>
@ -63,12 +64,16 @@ class UniversalEncodingHandler : public TagLib::ID3v1::StringHandler,
virtual TagLib::String parse(const TagLib::ByteVector& data) const;
QTextCodec* Guess(const char* data);
QTextCodec* Guess(const TagLib::Tag& tag);
QTextCodec* Guess(const TagLib::String& input);
static QString FixEncoding(const TagLib::String& input);
QString FixEncoding(const TagLib::String& input);
private:
// nsUniversalDetector
virtual void Report(const char* charset);
void Guess(const TagLib::String& input, QHash<QTextCodec*, int>* usages);
QTextCodec* current_codec_;
};
@ -111,6 +116,8 @@ class Song {
void InitFromLastFM(const lastfm::Track& track);
void MergeFromSimpleMetaBundle(const Engine::SimpleMetaBundle& bundle);
QString Decode(const TagLib::String tag, const QTextCodec* codec) const;
// Save
void BindToQuery(QSqlQuery* query) const;
void ToLastFM(lastfm::Track* track) const;

View File

@ -27,6 +27,8 @@
#include <QTemporaryFile>
#include <QTextCodec>
#include <taglib/id3v2tag.h>
namespace {
class SongTest : public ::testing::Test {
@ -90,7 +92,7 @@ TEST_F(SongTest, FixesCP866) {
const char cp866[] = { 0x8a, 0xa8, 0xad, 0xae, '\0' }; // Кино
TagLib::ByteVector bytes(cp866);
TagLib::String str(bytes);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(4, fixed.length());
EXPECT_STREQ("Кино", fixed.toUtf8().constData());
}
@ -99,7 +101,7 @@ TEST_F(SongTest, FixesWindows1251) {
const char w1251[] = { 0xca, 0xe8, 0xed, 0xee, '\0' }; // Кино
TagLib::ByteVector bytes(w1251);
TagLib::String str(bytes);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(4, fixed.length());
EXPECT_STREQ("Кино", fixed.toUtf8().constData());
}
@ -107,7 +109,7 @@ TEST_F(SongTest, FixesWindows1251) {
TEST_F(SongTest, DoesNotFixAscii) {
TagLib::ByteVector bytes("foobar");
TagLib::String str(bytes);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(6, fixed.length());
EXPECT_STREQ("foobar", fixed.toUtf8().constData());
}
@ -115,7 +117,7 @@ TEST_F(SongTest, DoesNotFixAscii) {
TEST_F(SongTest, DoesNotFixUtf8) {
TagLib::ByteVector bytes("Кино");
TagLib::String str(bytes, TagLib::String::UTF8);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(4, fixed.length());
EXPECT_STREQ("Кино", fixed.toUtf8().constData());
}
@ -125,16 +127,29 @@ TEST_F(SongTest, DoesNotFixExtendedAscii) {
QTextCodec* codec = QTextCodec::codecForName("latin1");
QString unicode = codec->toUnicode(latin1);
TagLib::String str(latin1);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(fixed, unicode);
}
TEST_F(SongTest, FixesUtf8MungedIntoLatin1) {
char latin1[] = { 'E', 's', 't', 'h', 'e', 'r', 0xe2, 0x80, 0x99, 's', 0x00 };
TagLib::String str(latin1, TagLib::String::Latin1);
QString fixed = UniversalEncodingHandler::FixEncoding(str);
QString fixed = UniversalEncodingHandler(NS_FILTER_NON_CJK).FixEncoding(str);
EXPECT_EQ(8, fixed.length());
EXPECT_EQ(QString::fromUtf8("Esthers"), fixed);
}
// Two fields that are detected as windows-1251 must outvote a single field
// that is detected as windows-1252.
TEST_F(SongTest, TakesMajorityVote) {
  // "Кино" encoded as windows-1251.
  const char majority_bytes[] = { 0xca, 0xe8, 0xed, 0xee, '\0' };
  // Two dotted initials: really windows-1251 as well, but the detector
  // reports them as windows-1252.
  const char minority_bytes[] = { 0xcf, '.', 0xc7, '.', '\0' };

  TagLib::ID3v2::Tag tag;
  tag.setTitle(majority_bytes);
  tag.setArtist(majority_bytes);
  tag.setAlbum(minority_bytes);

  QTextCodec* const expected = QTextCodec::codecForName("windows-1251");
  UniversalEncodingHandler handler(NS_FILTER_NON_CJK);
  EXPECT_EQ(expected, handler.Guess(tag));
}
} // namespace