From 9f7803d9640675c1f2403e97d0be80fb90746c22 Mon Sep 17 00:00:00 2001
From: John Maguire
Date: Sun, 20 Jun 2010 19:46:51 +0000
Subject: [PATCH] The atrocity that is unicode-aware FTS. Yes, this code is horrible. Maybe I'll fix it.

---
 src/core/database.cpp | 192 +++++++++++++++++++++++++++++++++++++++++-
 src/core/database.h   | 126 +++++++++++++++++++++++++++
 2 files changed, 316 insertions(+), 2 deletions(-)

diff --git a/src/core/database.cpp b/src/core/database.cpp
index aecfd5888..d462dc277 100644
--- a/src/core/database.cpp
+++ b/src/core/database.cpp
@@ -40,9 +40,177 @@ const uchar* (*Database::_sqlite3_value_text) (sqlite3_value*) = NULL;
 void (*Database::_sqlite3_result_int64) (sqlite3_context*, sqlite_int64) = NULL;
 void* (*Database::_sqlite3_user_data) (sqlite3_context*) = NULL;
 
+int (*Database::_sqlite3_prepare_v2) (
+    sqlite3*, const char*, int, sqlite3_stmt**, const char**) = NULL;
+int (*Database::_sqlite3_bind_text) (
+    sqlite3_stmt*, int, const char*, int, void(*)(void*)) = NULL;
+int (*Database::_sqlite3_bind_blob) (
+    sqlite3_stmt*, int, const void*, int, void(*)(void*)) = NULL;
+int (*Database::_sqlite3_step) (sqlite3_stmt*) = NULL;
+int (*Database::_sqlite3_finalize) (sqlite3_stmt*) = NULL;
+
 bool Database::sStaticInitDone = false;
 bool Database::sLoadedSqliteSymbols = false;
+sqlite3_tokenizer_module* Database::sFTSTokenizer = NULL;
+
+// One normalised token plus the byte range of the input it was read from:
+// start_offset is the index of its first byte, end_offset the index of the
+// first byte after it.
+struct Token {
+  QString token;
+  int start_offset;
+  int end_offset;
+};
+
+extern "C" {
+// Based on sqlite3_tokenizer.
+struct UnicodeTokenizer {
+  const sqlite3_tokenizer_module* pModule;
+};
+
+// Based on sqlite3_tokenizer_cursor, followed by the state of one
+// tokenization run.
+struct UnicodeTokenizerCursor {
+  const sqlite3_tokenizer* pTokenizer;
+
+  QList<Token> tokens;       // Every token of the input, filled in by FTSOpen.
+  int position;              // Index of the token FTSNext returns next.
+  QByteArray current_utf8;   // Buffer behind the token FTSNext returned last.
+};
+}
+
+// xCreate callback: allocates a tokenizer.  argc and argv are not used.
+int Database::FTSCreate(int argc, const char* const* argv, sqlite3_tokenizer** tokenizer) {
+  *tokenizer = reinterpret_cast<sqlite3_tokenizer*>(new UnicodeTokenizer);
+
+  return SQLITE_OK;
+}
+
+// xDestroy callback: frees a tokenizer allocated by FTSCreate.
+int Database::FTSDestroy(sqlite3_tokenizer* tokenizer) {
+  UnicodeTokenizer* real_tokenizer = reinterpret_cast<UnicodeTokenizer*>(tokenizer);
+  qDebug() << Q_FUNC_INFO;
+  delete real_tokenizer;
+  return SQLITE_OK;
+}
+
+// xOpen callback: tokenizes all of input up front and stores the tokens in a
+// new cursor.  The text is lower-cased and split at every character that is
+// not a letter or a number; a character that has a decomposition is replaced
+// by the first character of that decomposition.
+//
+// NOTE(review): offsets are counted on the lower-cased copy, which assumes
+// that input is valid UTF-8 and that lower-casing keeps every character's
+// encoded length - confirm.
+int Database::FTSOpen(
+    sqlite3_tokenizer* pTokenizer,
+    const char* input,
+    int bytes,
+    sqlite3_tokenizer_cursor** cursor) {
+  UnicodeTokenizerCursor* new_cursor = new UnicodeTokenizerCursor;
+  new_cursor->pTokenizer = pTokenizer;
+  new_cursor->position = 0;
+
+  QString str = QString::fromUtf8(input, bytes).toLower();
+  QChar* data = str.data();
+  // Decompose and strip punctuation.
+  QList<Token> tokens;
+  QString token;
+  int start_offset = 0;
+  int offset = 0;
+  for (int i = 0; i < str.length(); ++i) {
+    QChar c = data[i];
+    // Byte index at which c starts in the UTF-8 text.
+    const int char_start = offset;
+
+    // Add the number of UTF-8 bytes this UTF-16 code unit stands for.  A
+    // surrogate pair is one 4 byte sequence, so each half counts for 2.
+    ushort unicode = c.unicode();
+    if (unicode <= 0x007f) {
+      offset += 1;
+    } else if (unicode <= 0x07ff) {
+      offset += 2;
+    } else if (c.isHighSurrogate() || c.isLowSurrogate()) {
+      offset += 2;
+    } else {
+      offset += 3;
+    }
+
+    if (!c.isLetterOrNumber()) {
+      // Token finished: it ends where this separator starts.
+      if (token.length() != 0) {
+        Token t;
+        t.token = token;
+        t.start_offset = start_offset;
+        t.end_offset = char_start;
+        tokens << t;
+        token.clear();
+      }
+      // The next token cannot start before the end of the separator.
+      start_offset = offset;
+    } else {
+      if (c.decompositionTag() != QChar::NoDecomposition) {
+        token.push_back(c.decomposition()[0]);
+      } else {
+        token.push_back(c);
+      }
+    }
+  }
+
+  // Flush the token that runs up to the end of the input.
+  if (token.length() != 0) {
+    Token t;
+    t.token = token;
+    t.start_offset = start_offset;
+    t.end_offset = offset;
+    tokens << t;
+  }
+
+  new_cursor->tokens = tokens;
+  *cursor = reinterpret_cast<sqlite3_tokenizer_cursor*>(new_cursor);
+
+  return SQLITE_OK;
+}
+
+// xClose callback: frees a cursor allocated by FTSOpen.
+int Database::FTSClose(sqlite3_tokenizer_cursor* cursor) {
+  UnicodeTokenizerCursor* real_cursor = reinterpret_cast<UnicodeTokenizerCursor*>(cursor);
+  delete real_cursor;
+
+  return SQLITE_OK;
+}
+
+// xNext callback: returns the tokens found by FTSOpen one call at a time,
+// then SQLITE_DONE once there are none left.
+int Database::FTSNext(
+    sqlite3_tokenizer_cursor* cursor,
+    const char** token,
+    int* bytes,
+    int* start_offset,
+    int* end_offset,
+    int* position) {
+  UnicodeTokenizerCursor* real_cursor = reinterpret_cast<UnicodeTokenizerCursor*>(cursor);
+
+  const QList<Token>& tokens = real_cursor->tokens;
+  if (real_cursor->position >= tokens.size()) {
+    return SQLITE_DONE;
+  }
+
+  const Token& t = tokens[real_cursor->position];
+
+  // Keep the UTF-8 bytes in the cursor and point *token at that copy, so the
+  // pointer stays valid after this function returns.
+  real_cursor->current_utf8 = t.token.toUtf8();
+  *token = real_cursor->current_utf8.constData();
+  *bytes = real_cursor->current_utf8.size();
+  *start_offset = t.start_offset;
+  *end_offset = t.end_offset;
+  *position = real_cursor->position++;
+
+  return SQLITE_OK;
+}
+
 
 
 void Database::StaticInit() {
   if (sStaticInitDone) {
@@ -84,18 +252,45 @@ void Database::StaticInit() {
   _sqlite3_user_data = reinterpret_cast<void* (*) (sqlite3_context*)>(
       library.resolve("sqlite3_user_data"));
 
+  _sqlite3_prepare_v2 = reinterpret_cast<
+      int (*) (sqlite3*, const char*, int, sqlite3_stmt**, const char**)>(
+          library.resolve("sqlite3_prepare_v2"));
+  _sqlite3_bind_text = reinterpret_cast<
+      int (*) (sqlite3_stmt*, int, const char*, int, void(*)(void*))>(
+          library.resolve("sqlite3_bind_text"));
+  _sqlite3_bind_blob = reinterpret_cast<
+      int (*) (sqlite3_stmt*, int, const void*, int, void(*)(void*))>(
+          library.resolve("sqlite3_bind_blob"));
+  _sqlite3_step = reinterpret_cast<int (*) (sqlite3_stmt*)>(
+      library.resolve("sqlite3_step"));
+  _sqlite3_finalize = reinterpret_cast<int (*) (sqlite3_stmt*)>(
+      library.resolve("sqlite3_finalize"));
+
   if (!_sqlite3_create_function ||
       !_sqlite3_value_type ||
       !_sqlite3_value_int64 ||
       !_sqlite3_value_text ||
       !_sqlite3_result_int64 ||
-      !_sqlite3_user_data) {
+      !_sqlite3_user_data ||
+      !_sqlite3_prepare_v2 ||
+      !_sqlite3_bind_text ||
+      !_sqlite3_bind_blob ||
+      !_sqlite3_step ||
+      !_sqlite3_finalize) {
     qDebug() << "Couldn't resolve sqlite symbols";
     sLoadedSqliteSymbols = false;
   } else {
     sLoadedSqliteSymbols = true;
   }
 #endif
+
+  sFTSTokenizer = new sqlite3_tokenizer_module;
+  sFTSTokenizer->iVersion = 0;
+  sFTSTokenizer->xCreate = &Database::FTSCreate;
+  sFTSTokenizer->xDestroy = &Database::FTSDestroy;
+  sFTSTokenizer->xOpen = &Database::FTSOpen;
+  sFTSTokenizer->xNext = &Database::FTSNext;
+  sFTSTokenizer->xClose = &Database::FTSClose;
 }
 
 bool Database::Like(const char* needle, const char* haystack) {
@@ -199,7 +374,7 @@ QSqlDatabase Database::Connect() {
   // Find Sqlite3 functions in the Qt plugin.
   StaticInit();
 
-  // We want Unicode aware LIKE clauses if possible
+  // We want Unicode aware LIKE clauses and FTS if possible.
   if (sLoadedSqliteSymbols) {
     QVariant v = db.driver()->handle();
     if (v.isValid() && qstrcmp(v.typeName(), "sqlite3*") == 0) {
@@ -213,6 +388,28 @@ QSqlDatabase Database::Connect() {
             this, // Custom data available via sqlite3_user_data().
             &Database::SqliteLike, // Our function :-)
             NULL, NULL);
+
+        // Register the tokenizer with FTS3 under the name "unicode".  The second
+        // argument of fts3_tokenizer() is a blob holding the module pointer.
+        sqlite3_stmt* statement = NULL;
+        const char* sql = "SELECT fts3_tokenizer(?, ?)";
+        int rc = _sqlite3_prepare_v2(handle, sql, -1, &statement, 0);
+        if (rc == SQLITE_OK) {
+          rc = _sqlite3_bind_text(statement, 1, "unicode", -1, SQLITE_STATIC);
+        }
+        if (rc == SQLITE_OK) {
+          rc = _sqlite3_bind_blob(statement, 2, &sFTSTokenizer, sizeof(sFTSTokenizer), SQLITE_STATIC);
+        }
+        if (rc == SQLITE_OK) {
+          rc = _sqlite3_step(statement);
+        }
+        if (statement) {
+          _sqlite3_finalize(statement);
+        }
+        // Stepping the SELECT ends in SQLITE_ROW or SQLITE_DONE when it worked.
+        if (rc != SQLITE_ROW && rc != SQLITE_DONE) {
+          qWarning() << "Couldn't register the unicode FTS tokenizer:" << rc;
+        }
       }
     }
   }
@@ -257,8 +454,10 @@ void Database::UpdateDatabaseSchema(int version, QSqlDatabase &db) {
 
   // Run each command
   QStringList commands(schema.split(";\n\n"));
+  db.exec("DROP TABLE songs_fts");
   db.transaction();
   foreach (const QString& command, commands) {
+    qDebug() << command;
     QSqlQuery query(db.exec(command));
     if (CheckErrors(query.lastError()))
       qFatal("Unable to update music library database");
diff --git a/src/core/database.h b/src/core/database.h
index bd896d999..bd91f2b07 100644
--- a/src/core/database.h
+++ b/src/core/database.h
@@ -27,6 +27,109 @@
 
 #include "gtest/gtest_prod.h"
 
+extern "C" {
+
+struct sqlite3_tokenizer;
+struct sqlite3_tokenizer_cursor;
+
+struct sqlite3_tokenizer_module {
+
+  /*
+  ** Structure version. Should always be set to 0.
+  */
+  int iVersion;
+
+  /*
+  ** Create a new tokenizer. The values in the argv[] array are the
+  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
+  ** TABLE statement that created the fts3 table. For example, if
+  ** the following SQL is executed:
+  **
+  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
+  **
+  ** then argc is set to 2, and the argv[] array contains pointers
+  ** to the strings "arg1" and "arg2".
+  **
+  ** This method should return either SQLITE_OK (0), or an SQLite error
+  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
+  ** to point at the newly created tokenizer structure. The generic
+  ** sqlite3_tokenizer.pModule variable should not be initialised by
+  ** this callback. The caller will do so.
+  */
+  int (*xCreate)(
+    int argc,                           /* Size of argv array */
+    const char *const*argv,             /* Tokenizer argument strings */
+    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
+  );
+
+  /*
+  ** Destroy an existing tokenizer. The fts3 module calls this method
+  ** exactly once for each successful call to xCreate().
+  */
+  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+
+  /*
+  ** Create a tokenizer cursor to tokenize an input buffer. The caller
+  ** is responsible for ensuring that the input buffer remains valid
+  ** until the cursor is closed (using the xClose() method).
+  */
+  int (*xOpen)(
+    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
+    const char *pInput, int nBytes,      /* Input buffer */
+    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
+  );
+
+  /*
+  ** Destroy an existing tokenizer cursor. The fts3 module calls this
+  ** method exactly once for each successful call to xOpen().
+  */
+  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+
+  /*
+  ** Retrieve the next token from the tokenizer cursor pCursor. This
+  ** method should either return SQLITE_OK and set the values of the
+  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
+  ** the end of the buffer has been reached, or an SQLite error code.
+  **
+  ** *ppToken should be set to point at a buffer containing the
+  ** normalized version of the token (i.e. after any case-folding and/or
+  ** stemming has been performed). *pnBytes should be set to the length
+  ** of this buffer in bytes. The input text that generated the token is
+  ** identified by the byte offsets returned in *piStartOffset and
+  ** *piEndOffset. *piStartOffset should be set to the index of the first
+  ** byte of the token in the input buffer. *piEndOffset should be set
+  ** to the index of the first byte just past the end of the token in
+  ** the input buffer.
+  **
+  ** The buffer *ppToken is set to point at is managed by the tokenizer
+  ** implementation. It is only required to be valid until the next call
+  ** to xNext() or xClose().
+  */
+  /* TODO(shess) current implementation requires pInput to be
+  ** nul-terminated. This should either be fixed, or pInput/nBytes
+  ** should be converted to zInput.
+  */
+  int (*xNext)(
+    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
+    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
+    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
+    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
+    int *piPosition      /* OUT: Number of tokens returned before this one */
+  );
+};
+
+struct sqlite3_tokenizer {
+  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
+  /* Tokenizer implementations will typically add additional fields */
+};
+
+struct sqlite3_tokenizer_cursor {
+  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
+  /* Tokenizer implementations will typically add additional fields */
+};
+
+}
+
 class Database : public QObject {
   Q_OBJECT
 
@@ -87,8 +190,31 @@ class Database : public QObject {
   static void (*_sqlite3_result_int64) (sqlite3_context*, sqlite_int64);
   static void* (*_sqlite3_user_data) (sqlite3_context*);
 
+  static int (*_sqlite3_prepare_v2) (sqlite3*, const char*, int, sqlite3_stmt**, const char**);
+  static int (*_sqlite3_bind_text) (sqlite3_stmt*, int, const char*, int, void(*)(void*));
+  static int (*_sqlite3_bind_blob) (sqlite3_stmt*, int, const void*, int, void(*)(void*));
+  static int (*_sqlite3_step) (sqlite3_stmt*);
+  static int (*_sqlite3_finalize) (sqlite3_stmt*);
+
+
   static bool sStaticInitDone;
   static bool sLoadedSqliteSymbols;
+
+  static sqlite3_tokenizer_module* sFTSTokenizer;
+
+  static int FTSCreate(int argc, const char* const* argv, sqlite3_tokenizer** tokenizer);
+  static int FTSDestroy(sqlite3_tokenizer* tokenizer);
+  static int FTSOpen(sqlite3_tokenizer* tokenizer,
+                     const char* input,
+                     int bytes,
+                     sqlite3_tokenizer_cursor** cursor);
+  static int FTSClose(sqlite3_tokenizer_cursor* cursor);
+  static int FTSNext(sqlite3_tokenizer_cursor* cursor,
+                     const char** token,
+                     int* bytes,
+                     int* start_offset,
+                     int* end_offset,
+                     int* position);
 };
 
 class MemoryDatabase : public Database {