1 /*
2 ** 2007 June 22
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements a tokenizer for fts2 based on the ICU library.
13 **
14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
15 */
16
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18 #ifdef SQLITE_ENABLE_ICU
19
20 #include <assert.h>
21 #include <string.h>
22 #include "fts2_tokenizer.h"
23
24 #include <unicode/ubrk.h>
25 #include <unicode/ucol.h>
26 #include <unicode/ustring.h>
27 #include <unicode/utf16.h>
28
29 typedef struct IcuTokenizer IcuTokenizer;
30 typedef struct IcuCursor IcuCursor;
31
32 struct IcuTokenizer {
33 sqlite3_tokenizer base;
34 char *zLocale;
35 };
36
37 struct IcuCursor {
38 sqlite3_tokenizer_cursor base;
39
40 UBreakIterator *pIter; /* ICU break-iterator object */
41 int nChar; /* Number of UChar elements in pInput */
42 UChar *aChar; /* Copy of input using utf-16 encoding */
43 int *aOffset; /* Offsets of each character in utf-8 input */
44
45 int nBuffer;
46 char *zBuffer;
47
48 int iToken;
49 };
50
51 /*
52 ** Create a new tokenizer instance.
53 */
icuCreate(int argc,const char * const * argv,sqlite3_tokenizer ** ppTokenizer)54 static int icuCreate(
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58 ){
59 IcuTokenizer *p;
60 int n = 0;
61
62 if( argc>0 ){
63 n = strlen(argv[0])+1;
64 }
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66 if( !p ){
67 return SQLITE_NOMEM;
68 }
69 memset(p, 0, sizeof(IcuTokenizer));
70
71 if( n ){
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
74 }
75
76 *ppTokenizer = (sqlite3_tokenizer *)p;
77
78 return SQLITE_OK;
79 }
80
81 /*
82 ** Destroy a tokenizer
83 */
icuDestroy(sqlite3_tokenizer * pTokenizer)84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86 sqlite3_free(p);
87 return SQLITE_OK;
88 }
89
90 /*
91 ** Prepare to begin tokenizing a particular string. The input
92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor
93 ** used to incrementally tokenize this string is returned in
94 ** *ppCursor.
95 */
icuOpen(sqlite3_tokenizer * pTokenizer,const char * zInput,int nInput,sqlite3_tokenizer_cursor ** ppCursor)96 static int icuOpen(
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
98 const char *zInput, /* Input string */
99 int nInput, /* Length of zInput in bytes */
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
101 ){
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103 IcuCursor *pCsr;
104
105 const int32_t opt = U_FOLD_CASE_DEFAULT;
106 UErrorCode status = U_ZERO_ERROR;
107 int nChar;
108
109 UChar32 c;
110 int iInput = 0;
111 int iOut = 0;
112
113 *ppCursor = 0;
114
115 if( nInput<0 ){
116 nInput = strlen(zInput);
117 }
118 nChar = nInput+1;
119 pCsr = (IcuCursor *)sqlite3_malloc(
120 sizeof(IcuCursor) + /* IcuCursor */
121 (nChar+1) * sizeof(int) + /* IcuCursor.aOffset[] */
122 nChar * sizeof(UChar) /* IcuCursor.aChar[] */
123 );
124 if( !pCsr ){
125 return SQLITE_NOMEM;
126 }
127 memset(pCsr, 0, sizeof(IcuCursor));
128 pCsr->aOffset = (int *)&pCsr[1];
129 pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
130
131 pCsr->aOffset[iOut] = iInput;
132 U8_NEXT(zInput, iInput, nInput, c);
133 while( c>0 ){
134 int isError = 0;
135 c = u_foldCase(c, opt);
136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
137 if( isError ){
138 sqlite3_free(pCsr);
139 return SQLITE_ERROR;
140 }
141 pCsr->aOffset[iOut] = iInput;
142
143 if( iInput<nInput ){
144 U8_NEXT(zInput, iInput, nInput, c);
145 }else{
146 c = 0;
147 }
148 }
149
150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
151 if( !U_SUCCESS(status) ){
152 sqlite3_free(pCsr);
153 return SQLITE_ERROR;
154 }
155 pCsr->nChar = iOut;
156
157 ubrk_first(pCsr->pIter);
158 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
159 return SQLITE_OK;
160 }
161
162 /*
163 ** Close a tokenization cursor previously opened by a call to icuOpen().
164 */
icuClose(sqlite3_tokenizer_cursor * pCursor)165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
166 IcuCursor *pCsr = (IcuCursor *)pCursor;
167 ubrk_close(pCsr->pIter);
168 sqlite3_free(pCsr->zBuffer);
169 sqlite3_free(pCsr);
170 return SQLITE_OK;
171 }
172
173 /*
174 ** Extract the next token from a tokenization cursor.
175 */
icuNext(sqlite3_tokenizer_cursor * pCursor,const char ** ppToken,int * pnBytes,int * piStartOffset,int * piEndOffset,int * piPosition)176 static int icuNext(
177 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
178 const char **ppToken, /* OUT: *ppToken is the token text */
179 int *pnBytes, /* OUT: Number of bytes in token */
180 int *piStartOffset, /* OUT: Starting offset of token */
181 int *piEndOffset, /* OUT: Ending offset of token */
182 int *piPosition /* OUT: Position integer of token */
183 ){
184 IcuCursor *pCsr = (IcuCursor *)pCursor;
185
186 int iStart = 0;
187 int iEnd = 0;
188 int nByte = 0;
189
190 while( iStart==iEnd ){
191 UChar32 c;
192
193 iStart = ubrk_current(pCsr->pIter);
194 iEnd = ubrk_next(pCsr->pIter);
195 if( iEnd==UBRK_DONE ){
196 return SQLITE_DONE;
197 }
198
199 while( iStart<iEnd ){
200 int iWhite = iStart;
201 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
202 if( u_isspace(c) ){
203 iStart = iWhite;
204 }else{
205 break;
206 }
207 }
208 assert(iStart<=iEnd);
209 }
210
211 do {
212 UErrorCode status = U_ZERO_ERROR;
213 if( nByte ){
214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
215 if( !zNew ){
216 return SQLITE_NOMEM;
217 }
218 pCsr->zBuffer = zNew;
219 pCsr->nBuffer = nByte;
220 }
221
222 u_strToUTF8(
223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
225 &status /* Output success/failure */
226 );
227 } while( nByte>pCsr->nBuffer );
228
229 *ppToken = pCsr->zBuffer;
230 *pnBytes = nByte;
231 *piStartOffset = pCsr->aOffset[iStart];
232 *piEndOffset = pCsr->aOffset[iEnd];
233 *piPosition = pCsr->iToken++;
234
235 return SQLITE_OK;
236 }
237
238 /*
239 ** The set of routines that implement the simple tokenizer
240 */
241 static const sqlite3_tokenizer_module icuTokenizerModule = {
242 0, /* iVersion */
243 icuCreate, /* xCreate */
244 icuDestroy, /* xCreate */
245 icuOpen, /* xOpen */
246 icuClose, /* xClose */
247 icuNext, /* xNext */
248 };
249
250 /*
251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
252 */
sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const ** ppModule)253 void sqlite3Fts2IcuTokenizerModule(
254 sqlite3_tokenizer_module const**ppModule
255 ){
256 *ppModule = &icuTokenizerModule;
257 }
258
259 #endif /* defined(SQLITE_ENABLE_ICU) */
260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
261