diff --git a/binding.gyp b/binding.gyp
index e3a5a9636..e0d7b51a6 100644
--- a/binding.gyp
+++ b/binding.gyp
@@ -27,7 +27,8 @@
             "sources": [
                 "src/database.cc",
                 "src/node_sqlite3.cc",
-                "src/statement.cc"
+                "src/statement.cc",
+                "src/character_tokenizer.cc"
             ]
         },
         {
diff --git a/src/character_tokenizer.cc b/src/character_tokenizer.cc
new file mode 100644
index 000000000..aacef578f
--- /dev/null
+++ b/src/character_tokenizer.cc
@@ -0,0 +1,170 @@
+//
+// character_tokenizer.cc
+//
+// Created by Hai Feng Kao on 4/6/13.
+// All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// Implementation of the "character" full-text-search tokenizer: every UTF-8
+// character of the input becomes its own token.
+
+#include <sqlite3.h>
+#include <ctype.h>  // for tolower
+#include <string.h> // for memset
+#include "character_tokenizer.h"
+
+typedef struct character_tokenizer {
+    sqlite3_tokenizer base;
+} character_tokenizer;
+
+typedef struct character_tokenizer_cursor {
+    sqlite3_tokenizer_cursor base;
+    const char *pInput; // input we are tokenizing
+    int nBytes;         // size of the input
+    int iPosition;      // current position in pInput
+    int iToken;         // index of next token to be returned
+    char *pToken;       // storage for current token
+} character_tokenizer_cursor;
+
+static int characterCreate(
+    int argc, const char * const *argv,
+    sqlite3_tokenizer **ppTokenizer
+){
+    character_tokenizer *t;
+    t = (character_tokenizer *) sqlite3_malloc(sizeof(*t));
+    if( t == NULL ) return SQLITE_NOMEM;
+    memset(t, 0, sizeof(*t));
+
+    *ppTokenizer = &t->base;
+    return SQLITE_OK;
+}
+
+static int characterDestroy(sqlite3_tokenizer *pTokenizer){
+    sqlite3_free(pTokenizer);
+    return SQLITE_OK;
+}
+
+static int characterOpen(
+    sqlite3_tokenizer *pTokenizer,      /* The tokenizer */
+    const char *pInput, int nBytes,     /* String to be tokenized */
+    sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
+){
+    character_tokenizer_cursor *c;
+    if(pInput == 0){
+        nBytes = 0;
+    }else if(nBytes < 0){
+        nBytes = (int)strlen(pInput);
+    }
+    c = (character_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
+    if(c == NULL){
+        return SQLITE_NOMEM;
+    }
+    c->iToken = c->iPosition = 0;
+    c->pToken = NULL;
+    c->nBytes = nBytes;
+    c->pInput = pInput;
+    *ppCursor = &c->base;
+    return SQLITE_OK;
+}
+
+static int characterClose(sqlite3_tokenizer_cursor *pCursor){
+    character_tokenizer_cursor *c = (character_tokenizer_cursor *) pCursor;
+
+    if(c->pToken != NULL){
+        sqlite3_free(c->pToken);
+        c->pToken = NULL;
+    }
+
+    sqlite3_free(c);
+    return SQLITE_OK;
+}
+
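+// How character boundaries are found: in UTF-8, a byte either begins a
+// character (ASCII bytes look like 0xxxxxxx, multi-byte lead bytes like
+// 11xxxxxx) or continues one (continuation bytes look like 10xxxxxx).
+// characterNext() below scans forward until it sees the next lead byte,
+// so every token it returns is exactly one UTF-8 character.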
+static int characterNext(
+    sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by characterOpen */
+    const char **ppToken,              /* OUT: *ppToken is the token text */
+    int *pnBytes,                      /* OUT: Number of bytes in token */
+    int *piStartOffset,                /* OUT: Starting offset of token */
+    int *piEndOffset,                  /* OUT: Ending offset of token */
+    int *piPosition                    /* OUT: Position integer of token */
+){
+    character_tokenizer_cursor *c = (character_tokenizer_cursor *) pCursor;
+    if(c->pToken != NULL){
+        sqlite3_free(c->pToken);
+        c->pToken = NULL;
+    }
+
+    if (c->iPosition >= c->nBytes) {
+        return SQLITE_DONE;
+    }
+
+    int length = 1; // size of the current character, at most 4 bytes in UTF-8
+
+    const char* token = &(c->pInput[c->iPosition]);
+    *piStartOffset = c->iPosition;
+
+    // find the beginning of the next UTF-8 character
+    c->iPosition++;
+    while (c->iPosition < c->nBytes) {
+        unsigned char byte = c->pInput[c->iPosition];
+        if (((byte & 0x80) == 0) || ((byte & 0xc0) == 0xc0)) {
+            // we have reached the first byte of the next UTF-8 character
+            break;
+        }
+        length++;
+        c->iPosition++;
+    }
+
+    c->pToken = (char *)sqlite3_malloc(length+1);
+    if(c->pToken == NULL){
+        return SQLITE_NOMEM;
+    }
+
+    c->pToken[length] = 0;
+    memcpy(c->pToken, token, length);
+
+    for (int i = 0; i < length; ++i) {
+        unsigned char byte = c->pToken[i];
+
+        if (byte < 0x80) {
+            // ASCII character: fold to lower case so matching is case-insensitive
+            c->pToken[i] = tolower(byte);
+        }
+    }
+
+    *ppToken = c->pToken;
+    *pnBytes = length;
+
+    *piEndOffset = *piStartOffset + length;
+    *piPosition = c->iToken++;
+    return SQLITE_OK;
+}
+
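+// The module vtable handed to SQLite. iVersion is 0, so the version-1
+// xLanguageid() slot is deliberately left out of the initializer (it is
+// zero-initialized) and FTS3/FTS4 will never call it.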
+static const sqlite3_tokenizer_module characterTokenizerModule = {
+    0,
+    characterCreate,
+    characterDestroy,
+    characterOpen,
+    characterClose,
+    characterNext,
+};
+
+void get_character_tokenizer_module(const sqlite3_tokenizer_module **ppModule){
+    *ppModule = &characterTokenizerModule;
+}
diff --git a/src/character_tokenizer.h b/src/character_tokenizer.h
new file mode 100644
index 000000000..5bae350b2
--- /dev/null
+++ b/src/character_tokenizer.h
@@ -0,0 +1,33 @@
+//
+// character_tokenizer.h
+//
+// Created by Hai Feng Kao on 4/6/13.
+// All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+
+#ifndef SQLITE_CHARACTERTOKENIZER_H
+#define SQLITE_CHARACTERTOKENIZER_H
+
+#include "fts3_tokenizer.h"
+
+void get_character_tokenizer_module(const sqlite3_tokenizer_module **ppModule);
+
+#endif
diff --git a/src/database.cc b/src/database.cc
index 27e4252e9..3aa5ef429 100644
--- a/src/database.cc
+++ b/src/database.cc
@@ -4,6 +4,7 @@
 #include "macros.h"
 #include "database.h"
 #include "statement.h"
+#include "character_tokenizer.h"
 
 using namespace node_sqlite3;
 
@@ -24,6 +25,8 @@ void Database::Init(Handle<Object> target) {
     NODE_SET_PROTOTYPE_METHOD(t, "serialize", Serialize);
     NODE_SET_PROTOTYPE_METHOD(t, "parallelize", Parallelize);
     NODE_SET_PROTOTYPE_METHOD(t, "configure", Configure);
+
+    NODE_SET_PROTOTYPE_METHOD(t, "loadCharacterTokenizer", LoadCharacterTokenizer);
 
     NODE_SET_GETTER(t, "open", OpenGetter);
 
@@ -589,6 +592,19 @@ void Database::Work_Wait(Baton* baton) {
     delete baton;
 }
 
+NAN_METHOD(Database::LoadCharacterTokenizer) {
+    NanScope();
+    Database* db = ObjectWrap::Unwrap<Database>(args.This());
+
+    REQUIRE_ARGUMENT_STRING(0, filename);
+    OPTIONAL_ARGUMENT_FUNCTION(1, callback);
+
+    Baton* baton = new LoadExtensionBaton(db, callback, *filename);
+    db->Schedule(Work_BeginLoadCharacterTokenizer, baton, true);
+
+    NanReturnValue(args.This());
+}
+
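+// Intended JavaScript usage of the method above (sketch; the string argument
+// is only there because the shared LoadExtensionBaton expects a filename):
+//
+//     db.loadCharacterTokenizer('', function(err) {
+//         if (err) throw err;
+//         db.exec('CREATE VIRTUAL TABLE t1 USING fts4(content, tokenize=character)');
+//     });
+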
 NAN_METHOD(Database::LoadExtension) {
     NanScope();
     Database* db = ObjectWrap::Unwrap<Database>(args.This());
@@ -602,6 +618,16 @@ NAN_METHOD(Database::LoadExtension) {
     NanReturnValue(args.This());
 }
 
+void Database::Work_BeginLoadCharacterTokenizer(Baton* baton) {
+    assert(baton->db->locked);
+    assert(baton->db->open);
+    assert(baton->db->_handle);
+    assert(baton->db->pending == 0);
+    // completion is handled by Work_AfterLoadExtension since the baton type is shared
+    int status = uv_queue_work(uv_default_loop(),
+        &baton->request, Work_LoadCharacterTokenizer, (uv_after_work_cb)Work_AfterLoadExtension);
+    assert(status == 0);
+}
+
 void Database::Work_BeginLoadExtension(Baton* baton) {
     assert(baton->db->locked);
     assert(baton->db->open);
@@ -612,6 +638,53 @@ void Database::Work_BeginLoadExtension(Baton* baton) {
     assert(status == 0);
 }
 
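+/*
+** NOTE: since SQLite 3.11 the two-argument form of fts3_tokenizer() used
+** below is disabled unless the library is compiled with
+** SQLITE_ENABLE_FTS3_TOKENIZER or the option is switched on via
+** sqlite3_db_config(). This code assumes the bundled SQLite allows it.
+*/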
+/*
+** Register a tokenizer implementation with FTS3 or FTS4.
+*/
+static int registerTokenizer(
+    sqlite3 *db,
+    char *zName,
+    const sqlite3_tokenizer_module *p
+){
+    int rc;
+    sqlite3_stmt *pStmt;
+    const char *zSql = "SELECT fts3_tokenizer(?, ?)";
+
+    rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+    if( rc!=SQLITE_OK ){
+        return rc;
+    }
+
+    // the second parameter is the address of the module structure itself,
+    // passed to fts3_tokenizer() as a blob
+    sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
+    sqlite3_step(pStmt);
+
+    return sqlite3_finalize(pStmt);
+}
+
+void Database::Work_LoadCharacterTokenizer(uv_work_t* req) {
+    LoadExtensionBaton* baton = static_cast<LoadExtensionBaton*>(req->data);
+
+    char token_name[] = "character";
+    const sqlite3_tokenizer_module *ptr;
+
+    // get the tokenizer module
+    get_character_tokenizer_module(&ptr);
+
+    // register the character tokenizer; note that this has to be repeated
+    // every time the database is opened
+    baton->status = registerTokenizer(baton->db->_handle, token_name, ptr);
+
+    if (baton->status != SQLITE_OK) {
+        baton->message = std::string(sqlite3_errmsg(baton->db->_handle));
+    }
+}
+
 void Database::Work_LoadExtension(uv_work_t* req) {
     LoadExtensionBaton* baton = static_cast<LoadExtensionBaton*>(req->data);
 
diff --git a/src/database.h b/src/database.h
index af83ee715..bfa07a771 100644
--- a/src/database.h
+++ b/src/database.h
@@ -142,6 +142,11 @@ class Database : public ObjectWrap {
     static void Work_Close(uv_work_t* req);
     static void Work_AfterClose(uv_work_t* req);
 
+    static NAN_METHOD(LoadCharacterTokenizer);
+    static void Work_BeginLoadCharacterTokenizer(Baton* baton);
+    static void Work_LoadCharacterTokenizer(uv_work_t* req);
+    static void Work_AfterLoadCharacterTokenizer(uv_work_t* req);
+
     static NAN_METHOD(LoadExtension);
     static void Work_BeginLoadExtension(Baton* baton);
     static void Work_LoadExtension(uv_work_t* req);
diff --git a/src/fts3_tokenizer.h b/src/fts3_tokenizer.h
new file mode 100644
index 000000000..c3f091b5f
--- /dev/null
+++ b/src/fts3_tokenizer.h
@@ -0,0 +1,161 @@
+/*
+ ** 2006 July 10
+ **
+ ** The author disclaims copyright to this source code.
+ **
+ *************************************************************************
+ ** Defines the interface to tokenizers used by fulltext-search. There
+ ** are three basic components:
+ **
+ ** sqlite3_tokenizer_module is a singleton defining the tokenizer
+ ** interface functions. This is essentially the class structure for
+ ** tokenizers.
+ **
+ ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
+ ** including customization information defined at creation time.
+ **
+ ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
+ ** tokens from a particular input.
+ */
+#ifndef _FTS3_TOKENIZER_H_
+#define _FTS3_TOKENIZER_H_
+
+/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
+ ** If tokenizers are to be allowed to call sqlite3_*() functions, then
+ ** we will need a way to register the API consistently.
+ */
+#include "sqlite3.h"
+
+/*
+ ** Structures used by the tokenizer interface. When a new tokenizer
+ ** implementation is registered, the caller provides a pointer to
+ ** an sqlite3_tokenizer_module containing pointers to the callback
+ ** functions that make up an implementation.
+ **
+ ** When an fts3 table is created, it passes any arguments passed to
+ ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
+ ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
+ ** implementation. The xCreate() function in turn returns an
+ ** sqlite3_tokenizer structure representing the specific tokenizer to
+ ** be used for the fts3 table (customized by the tokenizer clause arguments).
+ **
+ ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
+ ** method is called. It returns an sqlite3_tokenizer_cursor object
+ ** that may be used to tokenize a specific input buffer based on
+ ** the tokenization rules supplied by a specific sqlite3_tokenizer
+ ** object.
+ */
+typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
+typedef struct sqlite3_tokenizer sqlite3_tokenizer;
+typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
+
+struct sqlite3_tokenizer_module {
+
+  /*
+  ** Structure version. Should always be set to 0 or 1.
+  */
+  int iVersion;
+
+  /*
+  ** Create a new tokenizer. The values in the argv[] array are the
+  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
+  ** TABLE statement that created the fts3 table. For example, if
+  ** the following SQL is executed:
+  **
+  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
+  **
+  ** then argc is set to 2, and the argv[] array contains pointers
+  ** to the strings "arg1" and "arg2".
+  **
+  ** This method should return either SQLITE_OK (0), or an SQLite error
+  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
+  ** to point at the newly created tokenizer structure. The generic
+  ** sqlite3_tokenizer.pModule variable should not be initialized by
+  ** this callback. The caller will do so.
+  */
+  int (*xCreate)(
+    int argc,                        /* Size of argv array */
+    const char *const*argv,         /* Tokenizer argument strings */
+    sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
+  );
+
+  /*
+  ** Destroy an existing tokenizer. The fts3 module calls this method
+  ** exactly once for each successful call to xCreate().
+  */
+  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+
+  /*
+  ** Create a tokenizer cursor to tokenize an input buffer. The caller
+  ** is responsible for ensuring that the input buffer remains valid
+  ** until the cursor is closed (using the xClose() method).
+  */
+  int (*xOpen)(
+    sqlite3_tokenizer *pTokenizer,      /* Tokenizer object */
+    const char *pInput, int nBytes,     /* Input buffer */
+    sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
+  );
+
+  /*
+  ** Destroy an existing tokenizer cursor. The fts3 module calls this
+  ** method exactly once for each successful call to xOpen().
+  */
+  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+
+  /*
+  ** Retrieve the next token from the tokenizer cursor pCursor. This
+  ** method should either return SQLITE_OK and set the values of the
+  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
+  ** the end of the buffer has been reached, or an SQLite error code.
+  **
+  ** *ppToken should be set to point at a buffer containing the
+  ** normalized version of the token (i.e. after any case-folding and/or
+  ** stemming has been performed). *pnBytes should be set to the length
+  ** of this buffer in bytes. The input text that generated the token is
+  ** identified by the byte offsets returned in *piStartOffset and
+  ** *piEndOffset. *piStartOffset should be set to the index of the first
+  ** byte of the token in the input buffer. *piEndOffset should be set
+  ** to the index of the first byte just past the end of the token in
+  ** the input buffer.
+  **
+  ** The buffer that *ppToken is set to point at is managed by the
+  ** tokenizer implementation. It is only required to be valid until the
+  ** next call to xNext() or xClose().
+  */
+  /* TODO(shess) current implementation requires pInput to be
+  ** nul-terminated. This should either be fixed, or pInput/nBytes
+  ** should be converted to zInput.
+  */
+  int (*xNext)(
+    sqlite3_tokenizer_cursor *pCursor,  /* Tokenizer cursor */
+    const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
+    int *piStartOffset, /* OUT: Byte offset of token in input buffer */
+    int *piEndOffset,   /* OUT: Byte offset of end of token in input buffer */
+    int *piPosition     /* OUT: Number of tokens returned before this one */
+  );
+
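+  /* Illustration (not part of the upstream header): the "character"
+  ** tokenizer in character_tokenizer.cc, given the 5-byte input
+  ** "a\xe2\x82\xacb" ("a", Euro sign, "b"), produces three tokens:
+  **   "a"  *pnBytes=1  *piStartOffset=0  *piEndOffset=1  *piPosition=0
+  **   "€"  *pnBytes=3  *piStartOffset=1  *piEndOffset=4  *piPosition=1
+  **   "b"  *pnBytes=1  *piStartOffset=4  *piEndOffset=5  *piPosition=2
+  */
+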
+  /***********************************************************************
+  ** Methods below this point are only available if iVersion>=1.
+  */
+
+  /*
+  ** Configure the language id of a tokenizer cursor.
+  */
+  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
+};
+
+struct sqlite3_tokenizer {
+  const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
+  /* Tokenizer implementations will typically add additional fields */
+};
+
+struct sqlite3_tokenizer_cursor {
+  sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
+  /* Tokenizer implementations will typically add additional fields */
+};
+
+int fts3_global_term_cnt(int iTerm, int iCol);
+int fts3_term_cnt(int iTerm, int iCol);
+
+
+#endif /* _FTS3_TOKENIZER_H_ */
\ No newline at end of file
diff --git a/test/character-tokenizer.test.js b/test/character-tokenizer.test.js
new file mode 100644
index 000000000..f8e905ccd
--- /dev/null
+++ b/test/character-tokenizer.test.js
@@ -0,0 +1,34 @@
+var sqlite3 = require('..');
+var assert = require('assert');
+
+describe('character tokenizer', function() {
+    var db;
+    before(function(done) {
+        db = new sqlite3.Database(':memory:', done);
+    });
+
+    it('should create a new fts4 table with tokenize=character', function(done) {
+        db.loadCharacterTokenizer('', function(err) {
+            if (err) throw err;
+            db.exec('CREATE VIRTUAL TABLE t1 USING fts4(content TEXT, tokenize=character);', done);
+        });
+    });
+
+    it('should allow phrase queries to match substrings', function(done) {
+        db.run('insert into t1 values ("aaabbbccc")');
+        db.run('insert into t1 values ("vvv aaabbbccc")');
+        db.run('insert into t1 values ("aaazzzccc")');
+
+        var stmt = db.prepare('select * from t1 where t1 match "bbb"');
+        stmt.all(function(err, rows) {
+            if (err) throw err;
+            assert.deepEqual(rows, [{"content":"aaabbbccc"},{"content":"vvv aaabbbccc"}]);
+        });
+
+        stmt = db.prepare('select * from t1 where t1 match "ccc"');
+        stmt.all(function(err, rows) {
+            if (err) throw err;
+            assert.deepEqual(rows, [{"content":"aaabbbccc"},{"content":"vvv aaabbbccc"},{"content":"aaazzzccc"}]);
+            done();
+        });
+    });
+});
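+// Why this works: with tokenize=character every UTF-8 character is its own
+// token, so MATCH "bbb" is a phrase query for the characters b,b,b and hits
+// substrings inside words. The default "simple" tokenizer indexes whole
+// words and would match none of the rows above.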