The atrocity that is unicode-aware FTS.

Yes, this code is horrible. Maybe I'll fix it.
This commit is contained in:
John Maguire 2010-06-20 19:46:51 +00:00
parent dafedbf00c
commit 9f7803d964
2 changed files with 316 additions and 2 deletions

View File

@ -40,9 +40,157 @@ const uchar* (*Database::_sqlite3_value_text) (sqlite3_value*) = NULL;
// Function pointers into the SQLite library. Each one starts out NULL;
// StaticInit() checks every pointer and only sets sLoadedSqliteSymbols when
// none of them is still NULL.
void (*Database::_sqlite3_result_int64) (sqlite3_context*, sqlite_int64) = NULL;
void* (*Database::_sqlite3_user_data) (sqlite3_context*) = NULL;
// The statement API below (prepare/bind/step/finalize) is what Connect() uses
// to run "SELECT fts3_tokenizer(?, ?)" and register the custom tokenizer.
int (*Database::_sqlite3_prepare_v2) (
sqlite3*, const char*, int, sqlite3_stmt**, const char**) = NULL;
int (*Database::_sqlite3_bind_text) (
sqlite3_stmt*, int, const char*, int, void(*)(void*)) = NULL;
int (*Database::_sqlite3_bind_blob) (
sqlite3_stmt*, int, const void*, int, void(*)(void*)) = NULL;
int (*Database::_sqlite3_step) (sqlite3_stmt*) = NULL;
int (*Database::_sqlite3_finalize) (sqlite3_stmt*) = NULL;
// Guards StaticInit() so that its body only runs once.
bool Database::sStaticInitDone = false;
// True only when every SQLite symbol above was resolved successfully.
bool Database::sLoadedSqliteSymbols = false;
// Callback table for the custom tokenizer; allocated and filled in by
// StaticInit() and handed to SQLite by address in Connect().
sqlite3_tokenizer_module* Database::sFTSTokenizer = NULL;
// One token produced by FTSOpen() and later handed to SQLite by FTSNext().
struct Token {
// Normalised token text (FTSOpen() lower-cases the input and keeps only the
// first character of any decomposition).
QString token;
// Offset of the token within the input buffer, as computed by FTSOpen() and
// reported through FTSNext()'s start_offset out-parameter.
int start_offset;
// End offset of the token within the input buffer, as computed by FTSOpen()
// and reported through FTSNext()'s end_offset out-parameter.
int end_offset;
};
extern "C" {
// Based on sqlite3_tokenizer.
// Pointers to this struct are reinterpret_cast to and from sqlite3_tokenizer*
// (see FTSCreate/FTSDestroy), so pModule has to stay the first member.
struct UnicodeTokenizer {
const sqlite3_tokenizer_module* pModule;
};
// Per-input tokenizer state. Pointers to this struct are reinterpret_cast to
// and from sqlite3_tokenizer_cursor* (see FTSOpen/FTSNext/FTSClose), so
// pTokenizer has to stay the first member.
struct UnicodeTokenizerCursor {
const sqlite3_tokenizer* pTokenizer;
// Every token found in the input; filled in once by FTSOpen().
QList<Token> tokens;
// Index into |tokens| of the next token FTSNext() will return.
int position;
// UTF-8 bytes of the token most recently returned by FTSNext(). Stored here
// so the buffer handed to SQLite outlives the FTSNext() call.
QByteArray current_utf8;
};
}
int Database::FTSCreate(int argc, const char* const* argv, sqlite3_tokenizer** tokenizer) {
*tokenizer = reinterpret_cast<sqlite3_tokenizer*>(new UnicodeTokenizer);
return SQLITE_OK;
}
// xDestroy callback: logs the call and frees a tokenizer that was allocated by
// FTSCreate().
int Database::FTSDestroy(sqlite3_tokenizer* tokenizer) {
  qDebug() << __PRETTY_FUNCTION__;
  // The object was created as a UnicodeTokenizer, so delete it as one.
  delete reinterpret_cast<UnicodeTokenizer*>(tokenizer);
  return SQLITE_OK;
}
int Database::FTSOpen(
sqlite3_tokenizer* pTokenizer,
const char* input,
int bytes,
sqlite3_tokenizer_cursor** cursor) {
UnicodeTokenizerCursor* new_cursor = new UnicodeTokenizerCursor;
new_cursor->pTokenizer = pTokenizer;
new_cursor->position = 0;
QString str = QString::fromUtf8(input, bytes).toLower();
QChar* data = str.data();
// Decompose and strip punctuation.
QList<Token> tokens;
QString token;
int start_offset = 0;
int offset = 0;
for (int i = 0; i < str.length(); ++i) {
QChar c = data[i];
ushort unicode = c.unicode();
if (unicode >= 0x00 && unicode <= 0x007f) {
offset += 1;
} else if (unicode >= 0x0080 && unicode <= 0x07ff) {
offset += 2;
} else if (unicode >= 0x0800 && unicode <= 0xffff) {
offset += 3;
} else if (unicode >= 0x010000 && unicode <= 0x10ffff) {
offset += 4;
}
if (!data[i].isLetterOrNumber()) {
// Token finished.
if (token.length() != 0) {
Token t;
t.token = token;
t.start_offset = start_offset;
t.end_offset = offset - 1;
start_offset = offset;
tokens << t;
token.clear();
} else {
++start_offset;
}
} else {
if (data[i].decompositionTag() != QChar::NoDecomposition) {
token.push_back(data[i].decomposition()[0]);
} else {
token.push_back(data[i]);
}
}
if (i == str.length() - 1) {
if (token.length() != 0) {
Token t;
t.token = token;
t.start_offset = start_offset;
t.end_offset = offset;
start_offset = offset;
tokens << t;
token.clear();
}
}
}
new_cursor->tokens = tokens;
*cursor = reinterpret_cast<sqlite3_tokenizer_cursor*>(new_cursor);
return SQLITE_OK;
}
// xClose callback: frees a cursor that was allocated by FTSOpen(), together
// with the token list it holds.
int Database::FTSClose(sqlite3_tokenizer_cursor* cursor) {
  // The object was created as a UnicodeTokenizerCursor, so delete it as one.
  delete reinterpret_cast<UnicodeTokenizerCursor*>(cursor);
  return SQLITE_OK;
}
// xNext callback: hands the next pre-computed token to SQLite.
//
// Returns SQLITE_DONE once every token stored by FTSOpen() has been returned.
// Otherwise returns SQLITE_OK and fills in the out-parameters:
//   token / bytes             - UTF-8 text of the normalised token and its length
//   start_offset / end_offset - offsets recorded for the token by FTSOpen()
//   position                  - zero-based index of the token
// The buffer behind |token| is owned by the cursor and stays valid until the
// next call to FTSNext() or FTSClose() on the same cursor.
int Database::FTSNext(
    sqlite3_tokenizer_cursor* cursor,
    const char** token,
    int* bytes,
    int* start_offset,
    int* end_offset,
    int* position) {
  UnicodeTokenizerCursor* real_cursor = reinterpret_cast<UnicodeTokenizerCursor*>(cursor);

  const int index = real_cursor->position;
  if (index >= real_cursor->tokens.size()) {
    return SQLITE_DONE;
  }

  // Read through a const reference: indexing a non-const copy of the list
  // would detach it and deep-copy every token on each call.
  const QList<Token>& tokens = real_cursor->tokens;
  const Token& t = tokens.at(index);

  // Store the bytes in the cursor first and point SQLite at that copy, so the
  // buffer's lifetime does not depend on a local variable.
  real_cursor->current_utf8 = t.token.toUtf8();
  *token = real_cursor->current_utf8.constData();
  *bytes = real_cursor->current_utf8.size();
  *start_offset = t.start_offset;
  *end_offset = t.end_offset;
  *position = index;
  real_cursor->position = index + 1;
  return SQLITE_OK;
}
void Database::StaticInit() {
if (sStaticInitDone) {
@ -84,18 +232,45 @@ void Database::StaticInit() {
_sqlite3_user_data = reinterpret_cast<void* (*) (sqlite3_context*)>(
library.resolve("sqlite3_user_data"));
_sqlite3_prepare_v2 = reinterpret_cast<
int (*) (sqlite3*, const char*, int, sqlite3_stmt**, const char**)>(
library.resolve("sqlite3_prepare_v2"));
_sqlite3_bind_text = reinterpret_cast<
int (*) (sqlite3_stmt*, int, const char*, int, void(*)(void*))>(
library.resolve("sqlite3_bind_text"));
_sqlite3_bind_blob = reinterpret_cast<
int (*) (sqlite3_stmt*, int, const void*, int, void(*)(void*))>(
library.resolve("sqlite3_bind_blob"));
_sqlite3_step = reinterpret_cast<int (*) (sqlite3_stmt*)>(
library.resolve("sqlite3_step"));
_sqlite3_finalize = reinterpret_cast<int (*) (sqlite3_stmt*)>(
library.resolve("sqlite3_finalize"));
if (!_sqlite3_create_function ||
!_sqlite3_value_type ||
!_sqlite3_value_int64 ||
!_sqlite3_value_text ||
!_sqlite3_result_int64 ||
!_sqlite3_user_data) {
!_sqlite3_user_data ||
!_sqlite3_prepare_v2 ||
!_sqlite3_bind_text ||
!_sqlite3_bind_blob ||
!_sqlite3_step ||
!_sqlite3_finalize) {
qDebug() << "Couldn't resolve sqlite symbols";
sLoadedSqliteSymbols = false;
} else {
sLoadedSqliteSymbols = true;
}
#endif
sFTSTokenizer = new sqlite3_tokenizer_module;
sFTSTokenizer->iVersion = 0;
sFTSTokenizer->xCreate = &Database::FTSCreate;
sFTSTokenizer->xDestroy = &Database::FTSDestroy;
sFTSTokenizer->xOpen = &Database::FTSOpen;
sFTSTokenizer->xNext = &Database::FTSNext;
sFTSTokenizer->xClose = &Database::FTSClose;
}
bool Database::Like(const char* needle, const char* haystack) {
@ -199,7 +374,7 @@ QSqlDatabase Database::Connect() {
// Find Sqlite3 functions in the Qt plugin.
StaticInit();
// We want Unicode aware LIKE clauses if possible
// We want Unicode aware LIKE clauses and FTS if possible.
if (sLoadedSqliteSymbols) {
QVariant v = db.driver()->handle();
if (v.isValid() && qstrcmp(v.typeName(), "sqlite3*") == 0) {
@ -213,6 +388,17 @@ QSqlDatabase Database::Connect() {
this, // Custom data available via sqlite3_user_data().
&Database::SqliteLike, // Our function :-)
NULL, NULL);
sqlite3_stmt* statement;
const char* sql = "SELECT fts3_tokenizer(?, ?)";
int rc = _sqlite3_prepare_v2(handle, sql, -1, &statement, 0);
if (rc == SQLITE_OK) {
_sqlite3_bind_text(statement, 1, "unicode", -1, SQLITE_STATIC);
_sqlite3_bind_blob(statement, 2, &sFTSTokenizer, sizeof(sFTSTokenizer), SQLITE_STATIC);
qDebug() << _sqlite3_step(statement);
qDebug() << _sqlite3_finalize(statement);
}
}
}
}
@ -257,8 +443,10 @@ void Database::UpdateDatabaseSchema(int version, QSqlDatabase &db) {
// Run each command
QStringList commands(schema.split(";\n\n"));
db.exec("DROP TABLE songs_fts");
db.transaction();
foreach (const QString& command, commands) {
qDebug() << command;
QSqlQuery query(db.exec(command));
if (CheckErrors(query.lastError()))
qFatal("Unable to update music library database");

View File

@ -27,6 +27,109 @@
#include "gtest/gtest_prod.h"
extern "C" {

// Forward declarations; both structs are defined after the module struct.
struct sqlite3_tokenizer;
struct sqlite3_tokenizer_cursor;

// Table of callbacks that makes up one FTS3 tokenizer implementation.
struct sqlite3_tokenizer_module {
  // Version of this structure. Should always be set to 0.
  int iVersion;

  // Creates a new tokenizer.
  //
  // The argument array holds whatever was written after the tokenizer name in
  // the "tokenizer" clause of the CREATE VIRTUAL TABLE statement that created
  // the fts3 table. Given
  //
  //   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  //
  // the count is 2 and the array points at the strings "arg1" and "arg2".
  //
  // Returns SQLITE_OK (0) or an SQLite error code. On SQLITE_OK the output
  // pointer must refer to the newly created tokenizer structure. The callback
  // should not initialise the generic sqlite3_tokenizer.pModule field; the
  // caller does that.
  int (*xCreate)(
      int nArg,                       // Number of entries in azArg
      const char* const* azArg,       // Tokenizer argument strings
      sqlite3_tokenizer** ppCreated   // OUT: the created tokenizer
  );

  // Destroys an existing tokenizer. The fts3 module calls this exactly once
  // for every successful call to xCreate().
  int (*xDestroy)(sqlite3_tokenizer* pDoomed);

  // Creates a cursor that tokenizes one input buffer. The caller guarantees
  // that the input buffer stays valid until the cursor has been closed with
  // xClose().
  int (*xOpen)(
      sqlite3_tokenizer* pOwner,            // Tokenizer object
      const char* zText, int nText,         // Input buffer and its size
      sqlite3_tokenizer_cursor** ppOpened   // OUT: the created cursor
  );

  // Destroys an existing tokenizer cursor. The fts3 module calls this exactly
  // once for every successful call to xOpen().
  int (*xClose)(sqlite3_tokenizer_cursor* pDoomed);

  // Fetches the next token from a cursor.
  //
  // Returns SQLITE_OK after setting all of the OUT values below, SQLITE_DONE
  // when the end of the buffer has been reached, or an SQLite error code.
  //
  // The token pointer must refer to a buffer holding the normalized form of
  // the token (i.e. after any case-folding and/or stemming), and the byte
  // count must be the length of that buffer. The start and end offsets
  // identify the input text that produced the token: the start offset is the
  // index of the token's first byte in the input buffer, the end offset is
  // the index of the first byte just past the end of the token.
  //
  // The token buffer is managed by the tokenizer implementation and only has
  // to remain valid until the next call to xNext() or xClose().
  //
  // TODO(shess) current implementation requires the input to be
  // nul-terminated. This should either be fixed, or the pointer/length pair
  // passed to xOpen() should be converted to a single nul-terminated string.
  int (*xNext)(
      sqlite3_tokenizer_cursor* pFrom,      // Tokenizer cursor
      const char** pzToken, int* pnToken,   // OUT: normalized text for token
      int* piBegin,                         // OUT: byte offset of token in input buffer
      int* piEnd,                           // OUT: byte offset of end of token in input buffer
      int* piIndex                          // OUT: number of tokens returned before this one
  );
};

// Generic part of a tokenizer instance.
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module* pModule;  // The module for this tokenizer
  // Tokenizer implementations will typically add additional fields.
};

// Generic part of a tokenizer cursor.
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer* pTokenizer;  // Tokenizer for this cursor
  // Tokenizer implementations will typically add additional fields.
};

}
class Database : public QObject {
Q_OBJECT
@ -87,8 +190,31 @@ class Database : public QObject {
static void (*_sqlite3_result_int64) (sqlite3_context*, sqlite_int64);
static void* (*_sqlite3_user_data) (sqlite3_context*);
static int (*_sqlite3_prepare_v2) (sqlite3*, const char*, int, sqlite3_stmt**, const char**);
static int (*_sqlite3_bind_text) (sqlite3_stmt*, int, const char*, int, void(*)(void*));
static int (*_sqlite3_bind_blob) (sqlite3_stmt*, int, const void*, int, void(*)(void*));
static int (*_sqlite3_step) (sqlite3_stmt*);
static int (*_sqlite3_finalize) (sqlite3_stmt*);
static bool sStaticInitDone;
static bool sLoadedSqliteSymbols;
static sqlite3_tokenizer_module* sFTSTokenizer;
static int FTSCreate(int argc, const char* const* argv, sqlite3_tokenizer** tokenizer);
static int FTSDestroy(sqlite3_tokenizer* tokenizer);
static int FTSOpen(sqlite3_tokenizer* tokenizer,
const char* input,
int bytes,
sqlite3_tokenizer_cursor** cursor);
static int FTSClose(sqlite3_tokenizer_cursor* cursor);
static int FTSNext(sqlite3_tokenizer_cursor* cursor,
const char** token,
int* bytes,
int* start_offset,
int* end_offset,
int* position);
};
class MemoryDatabase : public Database {