• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ** 2007 June 22
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements a tokenizer for fts2 based on the ICU library.
13 **
14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
15 */
16 
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18 #ifdef SQLITE_ENABLE_ICU
19 
20 #include <assert.h>
21 #include <string.h>
22 #include "fts2_tokenizer.h"
23 
24 #include <unicode/ubrk.h>
25 #include <unicode/ucol.h>
26 #include <unicode/ustring.h>
27 #include <unicode/utf16.h>
28 
29 typedef struct IcuTokenizer IcuTokenizer;
30 typedef struct IcuCursor IcuCursor;
31 
32 struct IcuTokenizer {
33   sqlite3_tokenizer base;
34   char *zLocale;
35 };
36 
37 struct IcuCursor {
38   sqlite3_tokenizer_cursor base;
39 
40   UBreakIterator *pIter;      /* ICU break-iterator object */
41   int nChar;                  /* Number of UChar elements in pInput */
42   UChar *aChar;               /* Copy of input using utf-16 encoding */
43   int *aOffset;               /* Offsets of each character in utf-8 input */
44 
45   int nBuffer;
46   char *zBuffer;
47 
48   int iToken;
49 };
50 
51 /*
52 ** Create a new tokenizer instance.
53 */
icuCreate(int argc,const char * const * argv,sqlite3_tokenizer ** ppTokenizer)54 static int icuCreate(
55   int argc,                            /* Number of entries in argv[] */
56   const char * const *argv,            /* Tokenizer creation arguments */
57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
58 ){
59   IcuTokenizer *p;
60   int n = 0;
61 
62   if( argc>0 ){
63     n = strlen(argv[0])+1;
64   }
65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66   if( !p ){
67     return SQLITE_NOMEM;
68   }
69   memset(p, 0, sizeof(IcuTokenizer));
70 
71   if( n ){
72     p->zLocale = (char *)&p[1];
73     memcpy(p->zLocale, argv[0], n);
74   }
75 
76   *ppTokenizer = (sqlite3_tokenizer *)p;
77 
78   return SQLITE_OK;
79 }
80 
81 /*
82 ** Destroy a tokenizer
83 */
icuDestroy(sqlite3_tokenizer * pTokenizer)84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86   sqlite3_free(p);
87   return SQLITE_OK;
88 }
89 
90 /*
91 ** Prepare to begin tokenizing a particular string.  The input
92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
93 ** used to incrementally tokenize this string is returned in
94 ** *ppCursor.
95 */
icuOpen(sqlite3_tokenizer * pTokenizer,const char * zInput,int nInput,sqlite3_tokenizer_cursor ** ppCursor)96 static int icuOpen(
97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
98   const char *zInput,                    /* Input string */
99   int nInput,                            /* Length of zInput in bytes */
100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
101 ){
102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103   IcuCursor *pCsr;
104 
105   const int32_t opt = U_FOLD_CASE_DEFAULT;
106   UErrorCode status = U_ZERO_ERROR;
107   int nChar;
108 
109   UChar32 c;
110   int iInput = 0;
111   int iOut = 0;
112 
113   *ppCursor = 0;
114 
115   if( nInput<0 ){
116     nInput = strlen(zInput);
117   }
118   nChar = nInput+1;
119   pCsr = (IcuCursor *)sqlite3_malloc(
120       sizeof(IcuCursor) +                /* IcuCursor */
121       (nChar+1) * sizeof(int) +          /* IcuCursor.aOffset[] */
122       nChar * sizeof(UChar)              /* IcuCursor.aChar[] */
123   );
124   if( !pCsr ){
125     return SQLITE_NOMEM;
126   }
127   memset(pCsr, 0, sizeof(IcuCursor));
128   pCsr->aOffset = (int *)&pCsr[1];
129   pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
130 
131   pCsr->aOffset[iOut] = iInput;
132   U8_NEXT(zInput, iInput, nInput, c);
133   while( c>0 ){
134     int isError = 0;
135     c = u_foldCase(c, opt);
136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
137     if( isError ){
138       sqlite3_free(pCsr);
139       return SQLITE_ERROR;
140     }
141     pCsr->aOffset[iOut] = iInput;
142 
143     if( iInput<nInput ){
144       U8_NEXT(zInput, iInput, nInput, c);
145     }else{
146       c = 0;
147     }
148   }
149 
150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
151   if( !U_SUCCESS(status) ){
152     sqlite3_free(pCsr);
153     return SQLITE_ERROR;
154   }
155   pCsr->nChar = iOut;
156 
157   ubrk_first(pCsr->pIter);
158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
159   return SQLITE_OK;
160 }
161 
162 /*
163 ** Close a tokenization cursor previously opened by a call to icuOpen().
164 */
icuClose(sqlite3_tokenizer_cursor * pCursor)165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
166   IcuCursor *pCsr = (IcuCursor *)pCursor;
167   ubrk_close(pCsr->pIter);
168   sqlite3_free(pCsr->zBuffer);
169   sqlite3_free(pCsr);
170   return SQLITE_OK;
171 }
172 
173 /*
174 ** Extract the next token from a tokenization cursor.
175 */
icuNext(sqlite3_tokenizer_cursor * pCursor,const char ** ppToken,int * pnBytes,int * piStartOffset,int * piEndOffset,int * piPosition)176 static int icuNext(
177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
178   const char **ppToken,               /* OUT: *ppToken is the token text */
179   int *pnBytes,                       /* OUT: Number of bytes in token */
180   int *piStartOffset,                 /* OUT: Starting offset of token */
181   int *piEndOffset,                   /* OUT: Ending offset of token */
182   int *piPosition                     /* OUT: Position integer of token */
183 ){
184   IcuCursor *pCsr = (IcuCursor *)pCursor;
185 
186   int iStart = 0;
187   int iEnd = 0;
188   int nByte = 0;
189 
190   while( iStart==iEnd ){
191     UChar32 c;
192 
193     iStart = ubrk_current(pCsr->pIter);
194     iEnd = ubrk_next(pCsr->pIter);
195     if( iEnd==UBRK_DONE ){
196       return SQLITE_DONE;
197     }
198 
199     while( iStart<iEnd ){
200       int iWhite = iStart;
201       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
202       if( u_isspace(c) ){
203         iStart = iWhite;
204       }else{
205         break;
206       }
207     }
208     assert(iStart<=iEnd);
209   }
210 
211   do {
212     UErrorCode status = U_ZERO_ERROR;
213     if( nByte ){
214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
215       if( !zNew ){
216         return SQLITE_NOMEM;
217       }
218       pCsr->zBuffer = zNew;
219       pCsr->nBuffer = nByte;
220     }
221 
222     u_strToUTF8(
223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
225         &status                                  /* Output success/failure */
226     );
227   } while( nByte>pCsr->nBuffer );
228 
229   *ppToken = pCsr->zBuffer;
230   *pnBytes = nByte;
231   *piStartOffset = pCsr->aOffset[iStart];
232   *piEndOffset = pCsr->aOffset[iEnd];
233   *piPosition = pCsr->iToken++;
234 
235   return SQLITE_OK;
236 }
237 
238 /*
239 ** The set of routines that implement the simple tokenizer
240 */
241 static const sqlite3_tokenizer_module icuTokenizerModule = {
242   0,                           /* iVersion */
243   icuCreate,                   /* xCreate  */
244   icuDestroy,                  /* xCreate  */
245   icuOpen,                     /* xOpen    */
246   icuClose,                    /* xClose   */
247   icuNext,                     /* xNext    */
248 };
249 
250 /*
251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
252 */
sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const ** ppModule)253 void sqlite3Fts2IcuTokenizerModule(
254   sqlite3_tokenizer_module const**ppModule
255 ){
256   *ppModule = &icuTokenizerModule;
257 }
258 
259 #endif /* defined(SQLITE_ENABLE_ICU) */
260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
261