From 53b13f3aa2f41a8d30eac3702fe066d3a4a616ee Mon Sep 17 00:00:00 2001
From: MartinChoo <214582617@qq.com>
Date: Wed, 23 Jul 2025 17:39:10 +0800
Subject: [PATCH 02/12] Enable and optimize ICU

---
 src/sqlite3.c    | 1061 ++++------------------------------------------
 src/sqlite3icu.c |  888 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1011 insertions(+), 938 deletions(-)
 create mode 100644 src/sqlite3icu.c

diff --git a/src/sqlite3.c b/src/sqlite3.c
index b132937..efc4cd4 100644
--- a/src/sqlite3.c
+++ b/src/sqlite3.c
@@ -2502,6 +2502,7 @@ struct sqlite3_mem_methods {
 #define SQLITE_CONFIG_SORTERREF_SIZE 28 /* int nByte */
 #define SQLITE_CONFIG_MEMDB_MAXSIZE 29 /* sqlite3_int64 */
 #define SQLITE_CONFIG_ROWID_IN_VIEW 30 /* int* */
+#define SQLITE_CONFIG_ENABLE_ICU 41 /* boolean */
 
 /*
 ** CAPI3REF: Database Connection Configuration Options
@@ -3289,6 +3290,7 @@ SQLITE_API void sqlite3_free_table(char **result);
 # define EXPORT_SYMBOLS
 #endif
 #endif
+
 /*
 ** CAPI3REF: Formatted String Printing Functions
 **
@@ -178413,6 +178415,7 @@ SQLITE_PRIVATE int sqlite3Fts3Init(sqlite3 *db);
 /************** End of fts3.h ************************************************/
 /************** Continuing where we left off in main.c ***********************/
 #endif
+
 #ifdef SQLITE_ENABLE_RTREE
 /************** Include rtree.h in the middle of main.c **********************/
 /************** Begin file rtree.h *******************************************/
@@ -178475,13 +178478,97 @@ SQLITE_PRIVATE int sqlite3RtreeInit(sqlite3 *db);
 extern "C" {
 #endif /* __cplusplus */
 
-SQLITE_PRIVATE int sqlite3IcuInit(sqlite3 *db);
+SQLITE_PRIVATE int sqlite3IcuInitInner(sqlite3 *db);
 
 #if 0
 } /* extern "C" */
 #endif /* __cplusplus */
 
 /************** End of sqliteicu.h *******************************************/
+#ifndef _WIN32
+#include <dlfcn.h>
+#endif
+
+/* Signatures of the two entry points resolved from the ICU shared library. */
+typedef void (*sqlite3Fts3IcuTokenizerModule_ptr)(sqlite3_tokenizer_module const** ppModule);
+typedef int (*sqlite3IcuInit_ptr)(sqlite3 *db);
+
+/*
+** Process-wide state of the optional, dynamically loaded ICU support.
+** icuEnable is written only by sqlite3_config(SQLITE_CONFIG_ENABLE_ICU);
+** the other variables are written only by sqlite3IcuModuleInit().
+*/
+static sqlite3Fts3IcuTokenizerModule_ptr tokenModulePtr = NULL;
+static sqlite3IcuInit_ptr icuInitPtr = NULL;
+static u32 icuEnable = 0u;      /* Non-zero if the application enabled ICU */
+static u32 icuInit = 0u;        /* Non-zero once both symbols are resolved */
+static void *g_library = NULL;  /* dlopen() handle of libsqliteicu.z.so */
+
+/*
+** Load libsqliteicu.z.so and resolve sqlite3Fts3IcuTokenizerModule() and
+** sqlite3IcuInit() from it. Returns SQLITE_OK once both entry points are
+** available (also on every later call) and SQLITE_ERROR otherwise.
+**
+** The globals are published only after both lookups succeed and a
+** half-usable library is closed again, so a failed attempt leaks no handle
+** and can be retried by the next connection. The static main mutex
+** serializes first calls made concurrently from different threads.
+*/
+int sqlite3IcuModuleInit(void){
+  int rc = SQLITE_OK;
+#if SQLITE_THREADSAFE
+  sqlite3_mutex *mutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);
+#endif
+  sqlite3_mutex_enter(mutex);
+  if( !icuInit ){
+#ifndef _WIN32
+    void *pLib = dlopen("libsqliteicu.z.so", RTLD_LAZY);
+    if( pLib==NULL ){
+      sqlite3_log(SQLITE_ERROR, "load icu so failed");
+      rc = SQLITE_ERROR;
+    }else{
+      sqlite3Fts3IcuTokenizerModule_ptr xModule =
+          (sqlite3Fts3IcuTokenizerModule_ptr)dlsym(pLib, "sqlite3Fts3IcuTokenizerModule");
+      sqlite3IcuInit_ptr xInit = (sqlite3IcuInit_ptr)dlsym(pLib, "sqlite3IcuInit");
+      if( xModule==NULL || xInit==NULL ){
+        sqlite3_log(SQLITE_ERROR, "load icu init function failed");
+        dlclose(pLib);  /* do not leak the handle of an unusable library */
+        rc = SQLITE_ERROR;
+      }else{
+        g_library = pLib;
+        tokenModulePtr = xModule;
+        icuInitPtr = xInit;
+        icuInit = 1u;
+      }
+    }
+#else
+    /* No loader exists for Windows: report failure instead of returning
+    ** SQLITE_OK with both entry points still NULL. */
+    sqlite3_log(SQLITE_ERROR, "icu dynamic loading is not supported");
+    rc = SQLITE_ERROR;
+#endif
+  }
+  sqlite3_mutex_leave(mutex);
+  return rc;
+}
+
+/*
+** Built-in extension hook listed in sqlite3BuiltinExtensions[]. It does
+** nothing unless ICU was enabled through sqlite3_config(); otherwise it
+** forwards to the sqlite3IcuInit() entry point of the loaded library.
+** SQLITE_ERROR is returned, instead of calling through a NULL pointer,
+** when ICU is enabled but the entry point was never resolved.
+*/
+SQLITE_PRIVATE int sqlite3IcuInitInner(sqlite3 *db){
+  if( !icuEnable ){
+    return SQLITE_OK;
+  }
+  if( icuInitPtr==NULL ){
+    sqlite3_log(SQLITE_ERROR, "icu init ptr is null");
+    return SQLITE_ERROR;
+  }
+  return icuInitPtr(db);
+}
 /************** Continuing where we left off in main.c ***********************/
 #endif
 
@@ -178521,7 +178608,7 @@ static int (*const sqlite3BuiltinExtensions[])(sqlite3*) = {
   sqlite3Fts5Init,
 #endif
 #if defined(SQLITE_ENABLE_ICU) || defined(SQLITE_ENABLE_ICU_COLLATIONS)
-  sqlite3IcuInit,
+  sqlite3IcuInitInner,
 #endif
 #ifdef SQLITE_ENABLE_RTREE
   sqlite3RtreeInit,
@@ -178913,6 +179000,21 @@ SQLITE_API int sqlite3_shutdown(void){
 SQLITE_API int sqlite3_config(int op, ...){
   va_list ap;
   int rc = SQLITE_OK;
+
+#if defined(SQLITE_ENABLE_ICU) || defined(SQLITE_ENABLE_ICU_COLLATIONS)
+  /* SQLITE_CONFIG_ENABLE_ICU takes a single int argument (zero disables,
+  ** non-zero enables) and is handled before the SQLITE_MISUSE check below,
+  ** so it is accepted even after the library has been initialized. It
+  ** uses its own va_start()/va_end() pair so that no return path leaves
+  ** the argument list open.
+  */
+  if( op==SQLITE_CONFIG_ENABLE_ICU ){
+    va_start(ap, op);
+    icuEnable = va_arg(ap, int)!=0 ? 1u : 0u;
+    va_end(ap);
+    return rc;
+  }
+#endif /* SQLITE_ENABLE_ICU */
 
   /* sqlite3_config() normally returns SQLITE_MISUSE if it is invoked while
   ** the SQLite library is in use. Except, a few selected opcodes
@@ -182053,6 +182155,16 @@ static int openDatabase(
   sqlite3RegisterPerConnectionBuiltinFunctions(db);
   rc = sqlite3_errcode(db);
 
+#if defined(SQLITE_ENABLE_ICU) || defined(SQLITE_ENABLE_ICU_COLLATIONS)
+  /* Resolve the ICU entry points before the built-in extensions run. A
+  ** failure is left in rc rather than returned from here: the loop below
+  ** is skipped when rc!=SQLITE_OK, so the failure takes the same path as
+  ** a failing built-in extension. An error already in rc is preserved.
+  */
+  if( rc==SQLITE_OK && icuEnable ){
+    rc = sqlite3IcuModuleInit();
+  }
+#endif
 
   /* Load compiled-in extensions */
   for(i=0; rc==SQLITE_OK && i<ArraySize(sqlite3BuiltinExtensions); i++){
@@ -184344,114 +184456,6 @@ SQLITE_EXTENSION_INIT3
 ** the tokenization rules supplied by a specific sqlite3_tokenizer
 ** object.
 */
-typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
-typedef struct sqlite3_tokenizer sqlite3_tokenizer;
-typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
-
-struct sqlite3_tokenizer_module {
-
-  /*
-  ** Structure version. Should always be set to 0 or 1.
-  */
-  int iVersion;
-
-  /*
-  ** Create a new tokenizer. The values in the argv[] array are the
-  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
-  ** TABLE statement that created the fts3 table. For example, if
-  ** the following SQL is executed:
-  **
-  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
-  **
-  ** then argc is set to 2, and the argv[] array contains pointers
-  ** to the strings "arg1" and "arg2".
-  **
-  ** This method should return either SQLITE_OK (0), or an SQLite error
-  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
-  ** to point at the newly created tokenizer structure. The generic
-  ** sqlite3_tokenizer.pModule variable should not be initialized by
-  ** this callback. The caller will do so.
-  */
-  int (*xCreate)(
-    int argc,                           /* Size of argv array */
-    const char *const*argv,             /* Tokenizer argument strings */
-    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
-  );
-
-  /*
-  ** Destroy an existing tokenizer. The fts3 module calls this method
-  ** exactly once for each successful call to xCreate().
-  */
-  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
-
-  /*
-  ** Create a tokenizer cursor to tokenize an input buffer. The caller
-  ** is responsible for ensuring that the input buffer remains valid
-  ** until the cursor is closed (using the xClose() method).
-  */
-  int (*xOpen)(
-    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
-    const char *pInput, int nBytes,      /* Input buffer */
-    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
-  );
-
-  /*
-  ** Destroy an existing tokenizer cursor. The fts3 module calls this
-  ** method exactly once for each successful call to xOpen().
-  */
-  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
-
-  /*
-  ** Retrieve the next token from the tokenizer cursor pCursor. This
-  ** method should either return SQLITE_OK and set the values of the
-  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
-  ** the end of the buffer has been reached, or an SQLite error code.
-  **
-  ** *ppToken should be set to point at a buffer containing the
-  ** normalized version of the token (i.e. after any case-folding and/or
-  ** stemming has been performed). *pnBytes should be set to the length
-  ** of this buffer in bytes. The input text that generated the token is
-  ** identified by the byte offsets returned in *piStartOffset and
-  ** *piEndOffset. *piStartOffset should be set to the index of the first
-  ** byte of the token in the input buffer. *piEndOffset should be set
-  ** to the index of the first byte just past the end of the token in
-  ** the input buffer.
-  **
-  ** The buffer *ppToken is set to point at is managed by the tokenizer
-  ** implementation. It is only required to be valid until the next call
-  ** to xNext() or xClose().
-  */
-  /* TODO(shess) current implementation requires pInput to be
-  ** nul-terminated. This should either be fixed, or pInput/nBytes
-  ** should be converted to zInput.
-  */
-  int (*xNext)(
-    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
-    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
-    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
-    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
-    int *piPosition      /* OUT: Number of tokens returned before this one */
-  );
-
-  /***********************************************************************
-  ** Methods below this point are only available if iVersion>=1.
-  */
-
-  /*
-  ** Configure the language id of a tokenizer cursor.
-  */
-  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
-};
-
-struct sqlite3_tokenizer {
-  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
-  /* Tokenizer implementations will typically add additional fields */
-};
-
-struct sqlite3_tokenizer_cursor {
-  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
-  /* Tokenizer implementations will typically add additional fields */
-};
 
 int fts3_global_term_cnt(int iTerm, int iCol);
 int fts3_term_cnt(int iTerm, int iCol);
@@ -189003,9 +189007,6 @@ SQLITE_PRIVATE void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module co
 #ifndef SQLITE_DISABLE_FTS3_UNICODE
 SQLITE_PRIVATE void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const**ppModule);
 #endif
-#ifdef SQLITE_ENABLE_ICU
-SQLITE_PRIVATE void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
-#endif
 
 /*
 ** Initialize the fts3 extension. If this extension is built as part
@@ -189024,7 +189025,14 @@ SQLITE_PRIVATE int sqlite3Fts3Init(sqlite3 *db){
 
 #ifdef SQLITE_ENABLE_ICU
   const sqlite3_tokenizer_module *pIcu = 0;
-  sqlite3Fts3IcuTokenizerModule(&pIcu);
+  if( icuEnable ){
+    if( tokenModulePtr!=NULL ){
+      tokenModulePtr(&pIcu);
+    }else{
+      sqlite3_log(SQLITE_ERROR, "icu module ptr is null");
+      return SQLITE_ERROR;
+    }
+  }
 #endif
 
 #ifndef SQLITE_DISABLE_FTS3_UNICODE
@@ -189060,7 +189068,7 @@ SQLITE_PRIVATE int sqlite3Fts3Init(sqlite3 *db){
      || sqlite3Fts3HashInsert(&pHash->hash, "unicode61", 10, (void *)pUnicode)
 #endif
 #ifdef SQLITE_ENABLE_ICU
-     || (pIcu && sqlite3Fts3HashInsert(&pHash->hash, "icu", 4, (void *)pIcu))
+     || (icuEnable && pIcu && sqlite3Fts3HashInsert(&pHash->hash, "icu", 4, (void *)pIcu))
 #endif
     ){
       rc = SQLITE_NOMEM;
@@ -213799,829 +213807,6 @@ SQLITE_API int sqlite3_rtree_init(
 #endif
 
 /************** End of rtree.c ***********************************************/
-/************** Begin file icu.c *********************************************/
-/*
-** 2007 May 6
-**
-** The author disclaims copyright to this source code. In place of
-** a legal notice, here is a blessing:
-**
-** May you do good and not evil.
308-** May you find forgiveness for yourself and forgive others. 309-** May you share freely, never taking more than you give. 310-** 311-************************************************************************* 312-** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $ 313-** 314-** This file implements an integration between the ICU library 315-** ("International Components for Unicode", an open-source library 316-** for handling unicode data) and SQLite. The integration uses 317-** ICU to provide the following to SQLite: 318-** 319-** * An implementation of the SQL regexp() function (and hence REGEXP 320-** operator) using the ICU uregex_XX() APIs. 321-** 322-** * Implementations of the SQL scalar upper() and lower() functions 323-** for case mapping. 324-** 325-** * Integration of ICU and SQLite collation sequences. 326-** 327-** * An implementation of the LIKE operator that uses ICU to 328-** provide case-independent matching. 329-*/ 330- 331-#if !defined(SQLITE_CORE) \ 332- || defined(SQLITE_ENABLE_ICU) \ 333- || defined(SQLITE_ENABLE_ICU_COLLATIONS) 334- 335-/* Include ICU headers */ 336-#include <unicode/utypes.h> 337-#include <unicode/uregex.h> 338-#include <unicode/ustring.h> 339-#include <unicode/ucol.h> 340- 341-/* #include <assert.h> */ 342- 343-#ifndef SQLITE_CORE 344-/* #include "sqlite3ext.h" */ 345- SQLITE_EXTENSION_INIT1 346-#else 347-/* #include "sqlite3.h" */ 348-#endif 349- 350-/* 351-** This function is called when an ICU function called from within 352-** the implementation of an SQL scalar function returns an error. 353-** 354-** The scalar function context passed as the first argument is 355-** loaded with an error message based on the following two args. 
356-*/ 357-static void icuFunctionError( 358- sqlite3_context *pCtx, /* SQLite scalar function context */ 359- const char *zName, /* Name of ICU function that failed */ 360- UErrorCode e /* Error code returned by ICU function */ 361-){ 362- char zBuf[128]; 363- sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e)); 364- zBuf[127] = '\0'; 365- sqlite3_result_error(pCtx, zBuf, -1); 366-} 367- 368-#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 369- 370-/* 371-** Maximum length (in bytes) of the pattern in a LIKE or GLOB 372-** operator. 373-*/ 374-#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH 375-# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000 376-#endif 377- 378-/* 379-** Version of sqlite3_free() that is always a function, never a macro. 380-*/ 381-static void xFree(void *p){ 382- sqlite3_free(p); 383-} 384- 385-/* 386-** This lookup table is used to help decode the first byte of 387-** a multi-byte UTF8 character. It is copied here from SQLite source 388-** code file utf8.c. 389-*/ 390-static const unsigned char icuUtf8Trans1[] = { 391- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 392- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 393- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 394- 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 395- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 396- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 397- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 398- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 399-}; 400- 401-#define SQLITE_ICU_READ_UTF8(zIn, c) \ 402- c = *(zIn++); \ 403- if( c>=0xc0 ){ \ 404- c = icuUtf8Trans1[c-0xc0]; \ 405- while( (*zIn & 0xc0)==0x80 ){ \ 406- c = (c<<6) + (0x3f & *(zIn++)); \ 407- } \ 408- } 409- 410-#define SQLITE_ICU_SKIP_UTF8(zIn) \ 411- assert( *zIn ); \ 412- if( *(zIn++)>=0xc0 ){ \ 413- while( (*zIn & 0xc0)==0x80 ){zIn++;} \ 414- } 415- 416- 417-/* 418-** Compare two UTF-8 strings for equality where the first string is 419-** a "LIKE" expression. 
Return true (1) if they are the same and 420-** false (0) if they are different. 421-*/ 422-static int icuLikeCompare( 423- const uint8_t *zPattern, /* LIKE pattern */ 424- const uint8_t *zString, /* The UTF-8 string to compare against */ 425- const UChar32 uEsc /* The escape character */ 426-){ 427- static const uint32_t MATCH_ONE = (uint32_t)'_'; 428- static const uint32_t MATCH_ALL = (uint32_t)'%'; 429- 430- int prevEscape = 0; /* True if the previous character was uEsc */ 431- 432- while( 1 ){ 433- 434- /* Read (and consume) the next character from the input pattern. */ 435- uint32_t uPattern; 436- SQLITE_ICU_READ_UTF8(zPattern, uPattern); 437- if( uPattern==0 ) break; 438- 439- /* There are now 4 possibilities: 440- ** 441- ** 1. uPattern is an unescaped match-all character "%", 442- ** 2. uPattern is an unescaped match-one character "_", 443- ** 3. uPattern is an unescaped escape character, or 444- ** 4. uPattern is to be handled as an ordinary character 445- */ 446- if( uPattern==MATCH_ALL && !prevEscape && uPattern!=(uint32_t)uEsc ){ 447- /* Case 1. */ 448- uint8_t c; 449- 450- /* Skip any MATCH_ALL or MATCH_ONE characters that follow a 451- ** MATCH_ALL. For each MATCH_ONE, skip one character in the 452- ** test string. 453- */ 454- while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){ 455- if( c==MATCH_ONE ){ 456- if( *zString==0 ) return 0; 457- SQLITE_ICU_SKIP_UTF8(zString); 458- } 459- zPattern++; 460- } 461- 462- if( *zPattern==0 ) return 1; 463- 464- while( *zString ){ 465- if( icuLikeCompare(zPattern, zString, uEsc) ){ 466- return 1; 467- } 468- SQLITE_ICU_SKIP_UTF8(zString); 469- } 470- return 0; 471- 472- }else if( uPattern==MATCH_ONE && !prevEscape && uPattern!=(uint32_t)uEsc ){ 473- /* Case 2. */ 474- if( *zString==0 ) return 0; 475- SQLITE_ICU_SKIP_UTF8(zString); 476- 477- }else if( uPattern==(uint32_t)uEsc && !prevEscape ){ 478- /* Case 3. */ 479- prevEscape = 1; 480- 481- }else{ 482- /* Case 4. 
*/ 483- uint32_t uString; 484- SQLITE_ICU_READ_UTF8(zString, uString); 485- uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT); 486- uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT); 487- if( uString!=uPattern ){ 488- return 0; 489- } 490- prevEscape = 0; 491- } 492- } 493- 494- return *zString==0; 495-} 496- 497-/* 498-** Implementation of the like() SQL function. This function implements 499-** the build-in LIKE operator. The first argument to the function is the 500-** pattern and the second argument is the string. So, the SQL statements: 501-** 502-** A LIKE B 503-** 504-** is implemented as like(B, A). If there is an escape character E, 505-** 506-** A LIKE B ESCAPE E 507-** 508-** is mapped to like(B, A, E). 509-*/ 510-static void icuLikeFunc( 511- sqlite3_context *context, 512- int argc, 513- sqlite3_value **argv 514-){ 515- const unsigned char *zA = sqlite3_value_text(argv[0]); 516- const unsigned char *zB = sqlite3_value_text(argv[1]); 517- UChar32 uEsc = 0; 518- 519- /* Limit the length of the LIKE or GLOB pattern to avoid problems 520- ** of deep recursion and N*N behavior in patternCompare(). 521- */ 522- if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){ 523- sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); 524- return; 525- } 526- 527- 528- if( argc==3 ){ 529- /* The escape character string must consist of a single UTF-8 character. 530- ** Otherwise, return an error. 531- */ 532- int nE= sqlite3_value_bytes(argv[2]); 533- const unsigned char *zE = sqlite3_value_text(argv[2]); 534- int i = 0; 535- if( zE==0 ) return; 536- U8_NEXT(zE, i, nE, uEsc); 537- if( i!=nE){ 538- sqlite3_result_error(context, 539- "ESCAPE expression must be a single character", -1); 540- return; 541- } 542- } 543- 544- if( zA && zB ){ 545- sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc)); 546- } 547-} 548- 549-/* 550-** Function to delete compiled regexp objects. 
Registered as 551-** a destructor function with sqlite3_set_auxdata(). 552-*/ 553-static void icuRegexpDelete(void *p){ 554- URegularExpression *pExpr = (URegularExpression *)p; 555- uregex_close(pExpr); 556-} 557- 558-/* 559-** Implementation of SQLite REGEXP operator. This scalar function takes 560-** two arguments. The first is a regular expression pattern to compile 561-** the second is a string to match against that pattern. If either 562-** argument is an SQL NULL, then NULL Is returned. Otherwise, the result 563-** is 1 if the string matches the pattern, or 0 otherwise. 564-** 565-** SQLite maps the regexp() function to the regexp() operator such 566-** that the following two are equivalent: 567-** 568-** zString REGEXP zPattern 569-** regexp(zPattern, zString) 570-** 571-** Uses the following ICU regexp APIs: 572-** 573-** uregex_open() 574-** uregex_matches() 575-** uregex_close() 576-*/ 577-static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 578- UErrorCode status = U_ZERO_ERROR; 579- URegularExpression *pExpr; 580- UBool res; 581- const UChar *zString = sqlite3_value_text16(apArg[1]); 582- 583- (void)nArg; /* Unused parameter */ 584- 585- /* If the left hand side of the regexp operator is NULL, 586- ** then the result is also NULL. 587- */ 588- if( !zString ){ 589- return; 590- } 591- 592- pExpr = sqlite3_get_auxdata(p, 0); 593- if( !pExpr ){ 594- const UChar *zPattern = sqlite3_value_text16(apArg[0]); 595- if( !zPattern ){ 596- return; 597- } 598- pExpr = uregex_open(zPattern, -1, 0, 0, &status); 599- 600- if( U_SUCCESS(status) ){ 601- sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete); 602- pExpr = sqlite3_get_auxdata(p, 0); 603- } 604- if( !pExpr ){ 605- icuFunctionError(p, "uregex_open", status); 606- return; 607- } 608- } 609- 610- /* Configure the text that the regular expression operates on. 
*/ 611- uregex_setText(pExpr, zString, -1, &status); 612- if( !U_SUCCESS(status) ){ 613- icuFunctionError(p, "uregex_setText", status); 614- return; 615- } 616- 617- /* Attempt the match */ 618- res = uregex_matches(pExpr, 0, &status); 619- if( !U_SUCCESS(status) ){ 620- icuFunctionError(p, "uregex_matches", status); 621- return; 622- } 623- 624- /* Set the text that the regular expression operates on to a NULL 625- ** pointer. This is not really necessary, but it is tidier than 626- ** leaving the regular expression object configured with an invalid 627- ** pointer after this function returns. 628- */ 629- uregex_setText(pExpr, 0, 0, &status); 630- 631- /* Return 1 or 0. */ 632- sqlite3_result_int(p, res ? 1 : 0); 633-} 634- 635-/* 636-** Implementations of scalar functions for case mapping - upper() and 637-** lower(). Function upper() converts its input to upper-case (ABC). 638-** Function lower() converts to lower-case (abc). 639-** 640-** ICU provides two types of case mapping, "general" case mapping and 641-** "language specific". Refer to ICU documentation for the differences 642-** between the two. 643-** 644-** To utilise "general" case mapping, the upper() or lower() scalar 645-** functions are invoked with one argument: 646-** 647-** upper('ABC') -> 'abc' 648-** lower('abc') -> 'ABC' 649-** 650-** To access ICU "language specific" case mapping, upper() or lower() 651-** should be invoked with two arguments. The second argument is the name 652-** of the locale to use. Passing an empty string ("") or SQL NULL value 653-** as the second argument is the same as invoking the 1 argument version 654-** of upper() or lower(). 
655-** 656-** lower('I', 'en_us') -> 'i' 657-** lower('I', 'tr_tr') -> '\u131' (small dotless i) 658-** 659-** http://www.icu-project.org/userguide/posix.html#case_mappings 660-*/ 661-static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 662- const UChar *zInput; /* Pointer to input string */ 663- UChar *zOutput = 0; /* Pointer to output buffer */ 664- int nInput; /* Size of utf-16 input string in bytes */ 665- int nOut; /* Size of output buffer in bytes */ 666- int cnt; 667- int bToUpper; /* True for toupper(), false for tolower() */ 668- UErrorCode status; 669- const char *zLocale = 0; 670- 671- assert(nArg==1 || nArg==2); 672- bToUpper = (sqlite3_user_data(p)!=0); 673- if( nArg==2 ){ 674- zLocale = (const char *)sqlite3_value_text(apArg[1]); 675- } 676- 677- zInput = sqlite3_value_text16(apArg[0]); 678- if( !zInput ){ 679- return; 680- } 681- nOut = nInput = sqlite3_value_bytes16(apArg[0]); 682- if( nOut==0 ){ 683- sqlite3_result_text16(p, "", 0, SQLITE_STATIC); 684- return; 685- } 686- 687- for(cnt=0; cnt<2; cnt++){ 688- UChar *zNew = sqlite3_realloc(zOutput, nOut); 689- if( zNew==0 ){ 690- sqlite3_free(zOutput); 691- sqlite3_result_error_nomem(p); 692- return; 693- } 694- zOutput = zNew; 695- status = U_ZERO_ERROR; 696- if( bToUpper ){ 697- nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 698- }else{ 699- nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 700- } 701- 702- if( U_SUCCESS(status) ){ 703- sqlite3_result_text16(p, zOutput, nOut, xFree); 704- }else if( status==U_BUFFER_OVERFLOW_ERROR ){ 705- assert( cnt==0 ); 706- continue; 707- }else{ 708- icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status); 709- } 710- return; 711- } 712- assert( 0 ); /* Unreachable */ 713-} 714- 715-#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 716- 717-/* 718-** Collation sequence destructor function. 
The pCtx argument points to 719-** a UCollator structure previously allocated using ucol_open(). 720-*/ 721-static void icuCollationDel(void *pCtx){ 722- UCollator *p = (UCollator *)pCtx; 723- ucol_close(p); 724-} 725- 726-/* 727-** Collation sequence comparison function. The pCtx argument points to 728-** a UCollator structure previously allocated using ucol_open(). 729-*/ 730-static int icuCollationColl( 731- void *pCtx, 732- int nLeft, 733- const void *zLeft, 734- int nRight, 735- const void *zRight 736-){ 737- UCollationResult res; 738- UCollator *p = (UCollator *)pCtx; 739- res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2); 740- switch( res ){ 741- case UCOL_LESS: return -1; 742- case UCOL_GREATER: return +1; 743- case UCOL_EQUAL: return 0; 744- } 745- assert(!"Unexpected return value from ucol_strcoll()"); 746- return 0; 747-} 748- 749-/* 750-** Implementation of the scalar function icu_load_collation(). 751-** 752-** This scalar function is used to add ICU collation based collation 753-** types to an SQLite database connection. It is intended to be called 754-** as follows: 755-** 756-** SELECT icu_load_collation(<locale>, <collation-name>); 757-** 758-** Where <locale> is a string containing an ICU locale identifier (i.e. 759-** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the 760-** collation sequence to create. 761-*/ 762-static void icuLoadCollation( 763- sqlite3_context *p, 764- int nArg, 765- sqlite3_value **apArg 766-){ 767- sqlite3 *db = (sqlite3 *)sqlite3_user_data(p); 768- UErrorCode status = U_ZERO_ERROR; 769- const char *zLocale; /* Locale identifier - (eg. "jp_JP") */ 770- const char *zName; /* SQL Collation sequence name (eg. 
"japanese") */ 771- UCollator *pUCollator; /* ICU library collation object */ 772- int rc; /* Return code from sqlite3_create_collation_x() */ 773- 774- assert(nArg==2); 775- (void)nArg; /* Unused parameter */ 776- zLocale = (const char *)sqlite3_value_text(apArg[0]); 777- zName = (const char *)sqlite3_value_text(apArg[1]); 778- 779- if( !zLocale || !zName ){ 780- return; 781- } 782- 783- pUCollator = ucol_open(zLocale, &status); 784- if( !U_SUCCESS(status) ){ 785- icuFunctionError(p, "ucol_open", status); 786- return; 787- } 788- assert(p); 789- 790- rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 791- icuCollationColl, icuCollationDel 792- ); 793- if( rc!=SQLITE_OK ){ 794- ucol_close(pUCollator); 795- sqlite3_result_error(p, "Error registering collation function", -1); 796- } 797-} 798- 799-/* 800-** Register the ICU extension functions with database db. 801-*/ 802-SQLITE_PRIVATE int sqlite3IcuInit(sqlite3 *db){ 803-# define SQLITEICU_EXTRAFLAGS (SQLITE_DETERMINISTIC|SQLITE_INNOCUOUS) 804- static const struct IcuScalar { 805- const char *zName; /* Function name */ 806- unsigned char nArg; /* Number of arguments */ 807- unsigned int enc; /* Optimal text encoding */ 808- unsigned char iContext; /* sqlite3_user_data() context */ 809- void (*xFunc)(sqlite3_context*,int,sqlite3_value**); 810- } scalars[] = { 811- {"icu_load_collation",2,SQLITE_UTF8|SQLITE_DIRECTONLY,1, icuLoadCollation}, 812-#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 813- {"regexp", 2, SQLITE_ANY|SQLITEICU_EXTRAFLAGS, 0, icuRegexpFunc}, 814- {"lower", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 815- {"lower", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 816- {"upper", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 817- {"upper", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 818- {"lower", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 819- {"lower", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 
820- {"upper", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 821- {"upper", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 822- {"like", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 823- {"like", 3, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 824-#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 825- }; 826- int rc = SQLITE_OK; 827- int i; 828- 829- for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){ 830- const struct IcuScalar *p = &scalars[i]; 831- rc = sqlite3_create_function( 832- db, p->zName, p->nArg, p->enc, 833- p->iContext ? (void*)db : (void*)0, 834- p->xFunc, 0, 0 835- ); 836- } 837- 838- return rc; 839-} 840- 841-#if !SQLITE_CORE 842-#ifdef _WIN32 843-__declspec(dllexport) 844-#endif 845-SQLITE_API int sqlite3_icu_init( 846- sqlite3 *db, 847- char **pzErrMsg, 848- const sqlite3_api_routines *pApi 849-){ 850- SQLITE_EXTENSION_INIT2(pApi) 851- return sqlite3IcuInit(db); 852-} 853-#endif 854- 855-#endif 856- 857-/************** End of icu.c *************************************************/ 858-/************** Begin file fts3_icu.c ****************************************/ 859-/* 860-** 2007 June 22 861-** 862-** The author disclaims copyright to this source code. In place of 863-** a legal notice, here is a blessing: 864-** 865-** May you do good and not evil. 866-** May you find forgiveness for yourself and forgive others. 867-** May you share freely, never taking more than you give. 868-** 869-************************************************************************* 870-** This file implements a tokenizer for fts3 based on the ICU library. 
871-*/ 872-/* #include "fts3Int.h" */ 873-#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 874-#ifdef SQLITE_ENABLE_ICU 875- 876-/* #include <assert.h> */ 877-/* #include <string.h> */ 878-/* #include "fts3_tokenizer.h" */ 879- 880-#include <unicode/ubrk.h> 881-/* #include <unicode/ucol.h> */ 882-/* #include <unicode/ustring.h> */ 883-#include <unicode/utf16.h> 884- 885-typedef struct IcuTokenizer IcuTokenizer; 886-typedef struct IcuCursor IcuCursor; 887- 888-struct IcuTokenizer { 889- sqlite3_tokenizer base; 890- char *zLocale; 891-}; 892- 893-struct IcuCursor { 894- sqlite3_tokenizer_cursor base; 895- 896- UBreakIterator *pIter; /* ICU break-iterator object */ 897- int nChar; /* Number of UChar elements in pInput */ 898- UChar *aChar; /* Copy of input using utf-16 encoding */ 899- int *aOffset; /* Offsets of each character in utf-8 input */ 900- 901- int nBuffer; 902- char *zBuffer; 903- 904- int iToken; 905-}; 906- 907-/* 908-** Create a new tokenizer instance. 909-*/ 910-static int icuCreate( 911- int argc, /* Number of entries in argv[] */ 912- const char * const *argv, /* Tokenizer creation arguments */ 913- sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 914-){ 915- IcuTokenizer *p; 916- int n = 0; 917- 918- if( argc>0 ){ 919- n = strlen(argv[0])+1; 920- } 921- p = (IcuTokenizer *)sqlite3_malloc64(sizeof(IcuTokenizer)+n); 922- if( !p ){ 923- return SQLITE_NOMEM; 924- } 925- memset(p, 0, sizeof(IcuTokenizer)); 926- 927- if( n ){ 928- p->zLocale = (char *)&p[1]; 929- memcpy(p->zLocale, argv[0], n); 930- } 931- 932- *ppTokenizer = (sqlite3_tokenizer *)p; 933- 934- return SQLITE_OK; 935-} 936- 937-/* 938-** Destroy a tokenizer 939-*/ 940-static int icuDestroy(sqlite3_tokenizer *pTokenizer){ 941- IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 942- sqlite3_free(p); 943- return SQLITE_OK; 944-} 945- 946-/* 947-** Prepare to begin tokenizing a particular string. The input 948-** string to be tokenized is pInput[0..nBytes-1]. 
A cursor 949-** used to incrementally tokenize this string is returned in 950-** *ppCursor. 951-*/ 952-static int icuOpen( 953- sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 954- const char *zInput, /* Input string */ 955- int nInput, /* Length of zInput in bytes */ 956- sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 957-){ 958- IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 959- IcuCursor *pCsr; 960- 961- const int32_t opt = U_FOLD_CASE_DEFAULT; 962- UErrorCode status = U_ZERO_ERROR; 963- int nChar; 964- 965- UChar32 c; 966- int iInput = 0; 967- int iOut = 0; 968- 969- *ppCursor = 0; 970- 971- if( zInput==0 ){ 972- nInput = 0; 973- zInput = ""; 974- }else if( nInput<0 ){ 975- nInput = strlen(zInput); 976- } 977- nChar = nInput+1; 978- pCsr = (IcuCursor *)sqlite3_malloc64( 979- sizeof(IcuCursor) + /* IcuCursor */ 980- ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ 981- (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ 982- ); 983- if( !pCsr ){ 984- return SQLITE_NOMEM; 985- } 986- memset(pCsr, 0, sizeof(IcuCursor)); 987- pCsr->aChar = (UChar *)&pCsr[1]; 988- pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; 989- 990- pCsr->aOffset[iOut] = iInput; 991- U8_NEXT(zInput, iInput, nInput, c); 992- while( c>0 ){ 993- int isError = 0; 994- c = u_foldCase(c, opt); 995- U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); 996- if( isError ){ 997- sqlite3_free(pCsr); 998- return SQLITE_ERROR; 999- } 1000- pCsr->aOffset[iOut] = iInput; 1001- 1002- if( iInput<nInput ){ 1003- U8_NEXT(zInput, iInput, nInput, c); 1004- }else{ 1005- c = 0; 1006- } 1007- } 1008- 1009- pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); 1010- if( !U_SUCCESS(status) ){ 1011- sqlite3_free(pCsr); 1012- return SQLITE_ERROR; 1013- } 1014- pCsr->nChar = iOut; 1015- 1016- ubrk_first(pCsr->pIter); 1017- *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; 1018- return SQLITE_OK; 1019-} 1020- 1021-/* 1022-** Close a tokenization cursor previously opened by a 
call to icuOpen(). 1023-*/ 1024-static int icuClose(sqlite3_tokenizer_cursor *pCursor){ 1025- IcuCursor *pCsr = (IcuCursor *)pCursor; 1026- ubrk_close(pCsr->pIter); 1027- sqlite3_free(pCsr->zBuffer); 1028- sqlite3_free(pCsr); 1029- return SQLITE_OK; 1030-} 1031- 1032-/* 1033-** Extract the next token from a tokenization cursor. 1034-*/ 1035-static int icuNext( 1036- sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ 1037- const char **ppToken, /* OUT: *ppToken is the token text */ 1038- int *pnBytes, /* OUT: Number of bytes in token */ 1039- int *piStartOffset, /* OUT: Starting offset of token */ 1040- int *piEndOffset, /* OUT: Ending offset of token */ 1041- int *piPosition /* OUT: Position integer of token */ 1042-){ 1043- IcuCursor *pCsr = (IcuCursor *)pCursor; 1044- 1045- int iStart = 0; 1046- int iEnd = 0; 1047- int nByte = 0; 1048- 1049- while( iStart==iEnd ){ 1050- UChar32 c; 1051- 1052- iStart = ubrk_current(pCsr->pIter); 1053- iEnd = ubrk_next(pCsr->pIter); 1054- if( iEnd==UBRK_DONE ){ 1055- return SQLITE_DONE; 1056- } 1057- 1058- while( iStart<iEnd ){ 1059- int iWhite = iStart; 1060- U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); 1061- if( u_isspace(c) ){ 1062- iStart = iWhite; 1063- }else{ 1064- break; 1065- } 1066- } 1067- assert(iStart<=iEnd); 1068- } 1069- 1070- do { 1071- UErrorCode status = U_ZERO_ERROR; 1072- if( nByte ){ 1073- char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); 1074- if( !zNew ){ 1075- return SQLITE_NOMEM; 1076- } 1077- pCsr->zBuffer = zNew; 1078- pCsr->nBuffer = nByte; 1079- } 1080- 1081- u_strToUTF8( 1082- pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ 1083- &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ 1084- &status /* Output success/failure */ 1085- ); 1086- } while( nByte>pCsr->nBuffer ); 1087- 1088- *ppToken = pCsr->zBuffer; 1089- *pnBytes = nByte; 1090- *piStartOffset = pCsr->aOffset[iStart]; 1091- *piEndOffset = pCsr->aOffset[iEnd]; 1092- *piPosition = pCsr->iToken++; 1093- 1094- return 
SQLITE_OK; 1095-} 1096- 1097-/* 1098-** The set of routines that implement the simple tokenizer 1099-*/ 1100-static const sqlite3_tokenizer_module icuTokenizerModule = { 1101- 0, /* iVersion */ 1102- icuCreate, /* xCreate */ 1103- icuDestroy, /* xCreate */ 1104- icuOpen, /* xOpen */ 1105- icuClose, /* xClose */ 1106- icuNext, /* xNext */ 1107- 0, /* xLanguageid */ 1108-}; 1109- 1110-/* 1111-** Set *ppModule to point at the implementation of the ICU tokenizer. 1112-*/ 1113-SQLITE_PRIVATE void sqlite3Fts3IcuTokenizerModule( 1114- sqlite3_tokenizer_module const**ppModule 1115-){ 1116- *ppModule = &icuTokenizerModule; 1117-} 1118- 1119-#endif /* defined(SQLITE_ENABLE_ICU) */ 1120-#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ 1121- 1122-/************** End of fts3_icu.c ********************************************/ 1123 /************** Begin file sqlite3rbu.c **************************************/ 1124 /* 1125 ** 2014 August 30 1126diff --git a/src/sqlite3icu.c b/src/sqlite3icu.c 1127new file mode 100644 1128index 0000000..b5944d5 1129--- /dev/null 1130+++ b/src/sqlite3icu.c 1131@@ -0,0 +1,888 @@ 1132+/****************************************************************************** 1133+** This file is an amalgamation of many separate C source files from SQLite 1134+** version 3.40.1. By combining all the individual C code files into this 1135+** single large file, the entire code can be compiled as a single translation 1136+** unit. This allows many compilers to do optimizations that would not be 1137+** possible if the files were compiled separately. Performance improvements 1138+** of 5% or more are commonly seen when SQLite is compiled as a single 1139+** translation unit. 1140+** 1141+** This file is all you need to compile SQLite. To use SQLite in other 1142+** programs, you need this file and the "sqlite3.h" header file that defines 1143+** the programming interface to the SQLite library. 
(If you do not have 1144+** the "sqlite3.h" header file at hand, you will find a copy embedded within 1145+** the text of this file. Search for "Begin file sqlite3.h" to find the start 1146+** of the embedded sqlite3.h header file.) Additional code files may be needed 1147+** if you want a wrapper to interface SQLite with your choice of programming 1148+** language. The code for the "sqlite3" command-line shell is also in a 1149+** separate file. This file contains only code for the core SQLite library. 1150+*/ 1151+/* 1152+** 2019.09.02-Complete codec logic for encryption and decryption. 1153+** Huawei Technologies Co, Ltd. 1154+*/ 1155+/************** Begin file icu.c *********************************************/ 1156+/* 1157+** 2007 May 6 1158+** 1159+** The author disclaims copyright to this source code. In place of 1160+** a legal notice, here is a blessing: 1161+** 1162+** May you do good and not evil. 1163+** May you find forgiveness for yourself and forgive others. 1164+** May you share freely, never taking more than you give. 1165+** 1166+************************************************************************* 1167+** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $ 1168+** 1169+** This file implements an integration between the ICU library 1170+** ("International Components for Unicode", an open-source library 1171+** for handling unicode data) and SQLite. The integration uses 1172+** ICU to provide the following to SQLite: 1173+** 1174+** * An implementation of the SQL regexp() function (and hence REGEXP 1175+** operator) using the ICU uregex_XX() APIs. 1176+** 1177+** * Implementations of the SQL scalar upper() and lower() functions 1178+** for case mapping. 1179+** 1180+** * Integration of ICU and SQLite collation sequences. 1181+** 1182+** * An implementation of the LIKE operator that uses ICU to 1183+** provide case-independent matching. 
1184+*/ 1185+#include <stdio.h> 1186+#include <stdlib.h> 1187+#include <string.h> 1188+#include <assert.h> 1189+#include <stddef.h> 1190+ 1191+#include "sqlite3icu.h" 1192+#include "sqlite3.h" 1193+ 1194+#if !defined(SQLITE_CORE) \ 1195+ || defined(SQLITE_ENABLE_ICU) \ 1196+ || defined(SQLITE_ENABLE_ICU_COLLATIONS) 1197+ 1198+/* Include ICU headers */ 1199+#include <unicode/utypes.h> 1200+#include <unicode/uregex.h> 1201+#include <unicode/ustring.h> 1202+#include <unicode/ucol.h> 1203+ 1204+#if !defined(SQLITE_CORE) && !defined(SQLITE_OMIT_LOAD_EXTENSION) 1205+ /* This case when the file really is being compiled as a loadable 1206+ ** extension */ 1207+# define SQLITE_EXTENSION_INIT1 const sqlite3_api_routines *sqlite3_api=0; 1208+# define SQLITE_EXTENSION_INIT2(v) sqlite3_api=v; 1209+# define SQLITE_EXTENSION_INIT3 \ 1210+ extern const sqlite3_api_routines *sqlite3_api; 1211+#else 1212+ /* This case when the file is being statically linked into the 1213+ ** application */ 1214+# define SQLITE_EXTENSION_INIT1 /*no-op*/ 1215+# define SQLITE_EXTENSION_INIT2(v) (void)v; /* unused parameter */ 1216+# define SQLITE_EXTENSION_INIT3 /*no-op*/ 1217+#endif 1218+ 1219+/* #include <assert.h> */ 1220+ 1221+#ifndef SQLITE_CORE 1222+/* #include "sqlite3ext.h" */ 1223+ SQLITE_EXTENSION_INIT1 1224+#else 1225+/* #include "sqlite3.h" */ 1226+#endif 1227+ 1228+/* Export the symbols. EXPORT_SYMBOLS is used unconditionally below, so it must always be defined: it expands to nothing unless SQLITE_EXPORT_SYMBOLS is set, and an earlier definition (if any) is kept. */ 1229+#ifndef EXPORT_SYMBOLS 1230+#if defined(SQLITE_EXPORT_SYMBOLS) && defined(__GNUC__) 1231+# define EXPORT_SYMBOLS __attribute__ ((visibility ("default"))) 1232+#elif defined(SQLITE_EXPORT_SYMBOLS) && defined(_MSC_VER) 1233+# define EXPORT_SYMBOLS __declspec(dllexport) 1234+#else 1235+# define EXPORT_SYMBOLS 1236+#endif 1237+#endif 1238+ 1239+EXPORT_SYMBOLS SQLITE_API int sqlite3IcuInit(sqlite3 *db); 1240+#ifdef SQLITE_ENABLE_ICU 1241+EXPORT_SYMBOLS SQLITE_API void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule); 1242+#endif 1243+/* 1244+** This function is called when an ICU function called from within 1245+** the implementation of an SQL 
scalar function returns an error. 1246+** 1247+** The scalar function context passed as the first argument is 1248+** loaded with an error message based on the following two args. 1249+*/ 1250+static void icuFunctionError( 1251+ sqlite3_context *pCtx, /* SQLite scalar function context */ 1252+ const char *zName, /* Name of ICU function that failed */ 1253+ UErrorCode e /* Error code returned by ICU function */ 1254+){ 1255+ char zBuf[128]; 1256+ sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e)); 1257+ zBuf[127] = '\0'; 1258+ sqlite3_result_error(pCtx, zBuf, -1); 1259+} 1260+ 1261+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 1262+ 1263+/* 1264+** Maximum length (in bytes) of the pattern in a LIKE or GLOB 1265+** operator. 1266+*/ 1267+#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH 1268+# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000 1269+#endif 1270+ 1271+/* 1272+** Version of sqlite3_free() that is always a function, never a macro. 1273+*/ 1274+static void xFree(void *p){ 1275+ sqlite3_free(p); 1276+} 1277+ 1278+/* 1279+** This lookup table is used to help decode the first byte of 1280+** a multi-byte UTF8 character. It is copied here from SQLite source 1281+** code file utf8.c. 
1282+*/ 1283+static const unsigned char icuUtf8Trans1[] = { 1284+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1285+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1286+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 1287+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 1288+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1289+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1290+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 1291+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, 1292+}; 1293+ 1294+#define SQLITE_ICU_READ_UTF8(zIn, c) \ 1295+ c = *(zIn++); \ 1296+ if( c>=0xc0 ){ \ 1297+ c = icuUtf8Trans1[c-0xc0]; \ 1298+ while( (*zIn & 0xc0)==0x80 ){ \ 1299+ c = (c<<6) + (0x3f & *(zIn++)); \ 1300+ } \ 1301+ } 1302+ 1303+#define SQLITE_ICU_SKIP_UTF8(zIn) \ 1304+ assert( *zIn ); \ 1305+ if( *(zIn++)>=0xc0 ){ \ 1306+ while( (*zIn & 0xc0)==0x80 ){zIn++;} \ 1307+ } 1308+ 1309+ 1310+/* 1311+** Compare two UTF-8 strings for equality where the first string is 1312+** a "LIKE" expression. Return true (1) if they are the same and 1313+** false (0) if they are different. 1314+*/ 1315+static int icuLikeCompare( 1316+ const uint8_t *zPattern, /* LIKE pattern */ 1317+ const uint8_t *zString, /* The UTF-8 string to compare against */ 1318+ const UChar32 uEsc /* The escape character */ 1319+){ 1320+ static const uint32_t MATCH_ONE = (uint32_t)'_'; 1321+ static const uint32_t MATCH_ALL = (uint32_t)'%'; 1322+ 1323+ int prevEscape = 0; /* True if the previous character was uEsc */ 1324+ 1325+ while( 1 ){ 1326+ 1327+ /* Read (and consume) the next character from the input pattern. */ 1328+ uint32_t uPattern; 1329+ SQLITE_ICU_READ_UTF8(zPattern, uPattern); 1330+ if( uPattern==0 ) break; 1331+ 1332+ /* There are now 4 possibilities: 1333+ ** 1334+ ** 1. uPattern is an unescaped match-all character "%", 1335+ ** 2. uPattern is an unescaped match-one character "_", 1336+ ** 3. uPattern is an unescaped escape character, or 1337+ ** 4. 
uPattern is to be handled as an ordinary character 1338+ */ 1339+ if( uPattern==MATCH_ALL && !prevEscape && uPattern!=(uint32_t)uEsc ){ 1340+ /* Case 1. */ 1341+ uint8_t c; 1342+ 1343+ /* Skip any MATCH_ALL or MATCH_ONE characters that follow a 1344+ ** MATCH_ALL. For each MATCH_ONE, skip one character in the 1345+ ** test string. 1346+ */ 1347+ while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){ 1348+ if( c==MATCH_ONE ){ 1349+ if( *zString==0 ) return 0; 1350+ SQLITE_ICU_SKIP_UTF8(zString); 1351+ } 1352+ zPattern++; 1353+ } 1354+ 1355+ if( *zPattern==0 ) return 1; 1356+ 1357+ while( *zString ){ 1358+ if( icuLikeCompare(zPattern, zString, uEsc) ){ 1359+ return 1; 1360+ } 1361+ SQLITE_ICU_SKIP_UTF8(zString); 1362+ } 1363+ return 0; 1364+ 1365+ }else if( uPattern==MATCH_ONE && !prevEscape && uPattern!=(uint32_t)uEsc ){ 1366+ /* Case 2. */ 1367+ if( *zString==0 ) return 0; 1368+ SQLITE_ICU_SKIP_UTF8(zString); 1369+ 1370+ }else if( uPattern==(uint32_t)uEsc && !prevEscape ){ 1371+ /* Case 3. */ 1372+ prevEscape = 1; 1373+ 1374+ }else{ 1375+ /* Case 4. */ 1376+ uint32_t uString; 1377+ SQLITE_ICU_READ_UTF8(zString, uString); 1378+ uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT); 1379+ uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT); 1380+ if( uString!=uPattern ){ 1381+ return 0; 1382+ } 1383+ prevEscape = 0; 1384+ } 1385+ } 1386+ 1387+ return *zString==0; 1388+} 1389+ 1390+/* 1391+** Implementation of the like() SQL function. This function implements 1392+** the build-in LIKE operator. The first argument to the function is the 1393+** pattern and the second argument is the string. So, the SQL statements: 1394+** 1395+** A LIKE B 1396+** 1397+** is implemented as like(B, A). If there is an escape character E, 1398+** 1399+** A LIKE B ESCAPE E 1400+** 1401+** is mapped to like(B, A, E). 
1402+*/ 1403+static void icuLikeFunc( 1404+ sqlite3_context *context, 1405+ int argc, 1406+ sqlite3_value **argv 1407+){ 1408+ const unsigned char *zA = sqlite3_value_text(argv[0]); 1409+ const unsigned char *zB = sqlite3_value_text(argv[1]); 1410+ UChar32 uEsc = 0; 1411+ 1412+ /* Limit the length of the LIKE or GLOB pattern to avoid problems 1413+ ** of deep recursion and N*N behavior in patternCompare(). 1414+ */ 1415+ if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){ 1416+ sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); 1417+ return; 1418+ } 1419+ 1420+ 1421+ if( argc==3 ){ 1422+ /* The escape character string must consist of a single UTF-8 character. 1423+ ** Otherwise, return an error. 1424+ */ 1425+ int nE= sqlite3_value_bytes(argv[2]); 1426+ const unsigned char *zE = sqlite3_value_text(argv[2]); 1427+ int i = 0; 1428+ if( zE==0 ) return; 1429+ U8_NEXT(zE, i, nE, uEsc); 1430+ if( i!=nE){ 1431+ sqlite3_result_error(context, 1432+ "ESCAPE expression must be a single character", -1); 1433+ return; 1434+ } 1435+ } 1436+ 1437+ if( zA && zB ){ 1438+ sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc)); 1439+ } 1440+} 1441+ 1442+/* 1443+** Function to delete compiled regexp objects. Registered as 1444+** a destructor function with sqlite3_set_auxdata(). 1445+*/ 1446+static void icuRegexpDelete(void *p){ 1447+ URegularExpression *pExpr = (URegularExpression *)p; 1448+ uregex_close(pExpr); 1449+} 1450+ 1451+/* 1452+** Implementation of SQLite REGEXP operator. This scalar function takes 1453+** two arguments. The first is a regular expression pattern to compile 1454+** the second is a string to match against that pattern. If either 1455+** argument is an SQL NULL, then NULL Is returned. Otherwise, the result 1456+** is 1 if the string matches the pattern, or 0 otherwise. 
1457+** 1458+** SQLite maps the regexp() function to the regexp() operator such 1459+** that the following two are equivalent: 1460+** 1461+** zString REGEXP zPattern 1462+** regexp(zPattern, zString) 1463+** 1464+** Uses the following ICU regexp APIs: 1465+** 1466+** uregex_open() 1467+** uregex_matches() 1468+** uregex_close() 1469+*/ 1470+static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 1471+ UErrorCode status = U_ZERO_ERROR; 1472+ URegularExpression *pExpr; 1473+ UBool res; 1474+ const UChar *zString = sqlite3_value_text16(apArg[1]); 1475+ 1476+ (void)nArg; /* Unused parameter */ 1477+ 1478+ /* If the left hand side of the regexp operator is NULL, 1479+ ** then the result is also NULL. 1480+ */ 1481+ if( !zString ){ 1482+ return; 1483+ } 1484+ 1485+ pExpr = sqlite3_get_auxdata(p, 0); 1486+ if( !pExpr ){ 1487+ const UChar *zPattern = sqlite3_value_text16(apArg[0]); 1488+ if( !zPattern ){ 1489+ return; 1490+ } 1491+ pExpr = uregex_open(zPattern, -1, 0, 0, &status); 1492+ 1493+ if( U_SUCCESS(status) ){ 1494+ sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete); 1495+ pExpr = sqlite3_get_auxdata(p, 0); 1496+ } 1497+ if( !pExpr ){ 1498+ icuFunctionError(p, "uregex_open", status); 1499+ return; 1500+ } 1501+ } 1502+ 1503+ /* Configure the text that the regular expression operates on. */ 1504+ uregex_setText(pExpr, zString, -1, &status); 1505+ if( !U_SUCCESS(status) ){ 1506+ icuFunctionError(p, "uregex_setText", status); 1507+ return; 1508+ } 1509+ 1510+ /* Attempt the match */ 1511+ res = uregex_matches(pExpr, 0, &status); 1512+ if( !U_SUCCESS(status) ){ 1513+ icuFunctionError(p, "uregex_matches", status); 1514+ return; 1515+ } 1516+ 1517+ /* Set the text that the regular expression operates on to a NULL 1518+ ** pointer. This is not really necessary, but it is tidier than 1519+ ** leaving the regular expression object configured with an invalid 1520+ ** pointer after this function returns. 
1521+ */ 1522+ uregex_setText(pExpr, 0, 0, &status); 1523+ 1524+ /* Return 1 or 0. */ 1525+ sqlite3_result_int(p, res ? 1 : 0); 1526+} 1527+ 1528+/* 1529+** Implementations of scalar functions for case mapping - upper() and 1530+** lower(). Function upper() converts its input to upper-case (ABC). 1531+** Function lower() converts to lower-case (abc). 1532+** 1533+** ICU provides two types of case mapping, "general" case mapping and 1534+** "language specific". Refer to ICU documentation for the differences 1535+** between the two. 1536+** 1537+** To utilise "general" case mapping, the upper() or lower() scalar 1538+** functions are invoked with one argument: 1539+** 1540+** upper('abc') -> 'ABC' 1541+** lower('ABC') -> 'abc' 1542+** 1543+** To access ICU "language specific" case mapping, upper() or lower() 1544+** should be invoked with two arguments. The second argument is the name 1545+** of the locale to use. Passing an empty string ("") or SQL NULL value 1546+** as the second argument is the same as invoking the 1 argument version 1547+** of upper() or lower(). 
1548+** 1549+** lower('I', 'en_us') -> 'i' 1550+** lower('I', 'tr_tr') -> '\u131' (small dotless i) 1551+** 1552+** http://www.icu-project.org/userguide/posix.html#case_mappings 1553+*/ 1554+static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){ 1555+ const UChar *zInput; /* Pointer to input string */ 1556+ UChar *zOutput = 0; /* Pointer to output buffer */ 1557+ int nInput; /* Size of utf-16 input string in bytes */ 1558+ int nOut; /* Size of output buffer in bytes */ 1559+ int cnt; /* Conversion attempt: 0, then 1 after an overflow */ 1560+ int bToUpper; /* True for toupper(), false for tolower() */ 1561+ UErrorCode status; 1562+ const char *zLocale = 0; 1563+ 1564+ assert(nArg==1 || nArg==2); 1565+ bToUpper = (sqlite3_user_data(p)!=0); 1566+ if( nArg==2 ){ 1567+ zLocale = (const char *)sqlite3_value_text(apArg[1]); 1568+ } 1569+ 1570+ zInput = sqlite3_value_text16(apArg[0]); 1571+ if( !zInput ){ 1572+ return; 1573+ } 1574+ nOut = nInput = sqlite3_value_bytes16(apArg[0]); 1575+ if( nOut==0 ){ 1576+ sqlite3_result_text16(p, "", 0, SQLITE_STATIC); 1577+ return; 1578+ } 1579+ 1580+ for(cnt=0; cnt<2; cnt++){ /* Second pass runs only after the first one overflowed */ 1581+ UChar *zNew = sqlite3_realloc(zOutput, nOut); 1582+ if( zNew==0 ){ 1583+ sqlite3_free(zOutput); 1584+ sqlite3_result_error_nomem(p); 1585+ return; 1586+ } 1587+ zOutput = zNew; 1588+ status = U_ZERO_ERROR; 1589+ if( bToUpper ){ 1590+ nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 1591+ }else{ 1592+ nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status); 1593+ } 1594+ 1595+ if( U_SUCCESS(status) ){ 1596+ sqlite3_result_text16(p, zOutput, nOut, xFree); /* xFree now owns zOutput */ 1597+ return; 1598+ } 1599+ if( status!=U_BUFFER_OVERFLOW_ERROR ){ 1600+ icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status); 1601+ break; 1602+ } 1603+ assert( cnt==0 ); /* nOut now holds the required size; retry once */ 1604+ } 1605+ sqlite3_free(zOutput); /* Error path only: the buffer was never handed to SQLite */ 1606+} 1607+ 1608+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 1609+ 1610+/* 1611+** Collation sequence destructor function. 
The pCtx argument points to 1612+** a UCollator structure previously allocated using ucol_open(). 1613+*/ 1614+static void icuCollationDel(void *pCtx){ 1615+ UCollator *p = (UCollator *)pCtx; 1616+ ucol_close(p); 1617+} 1618+ 1619+/* 1620+** Collation sequence comparison function. The pCtx argument points to 1621+** a UCollator structure previously allocated using ucol_open(). 1622+*/ 1623+static int icuCollationColl( 1624+ void *pCtx, 1625+ int nLeft, 1626+ const void *zLeft, 1627+ int nRight, 1628+ const void *zRight 1629+){ 1630+ UCollationResult res; 1631+ UCollator *p = (UCollator *)pCtx; 1632+ res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2); 1633+ switch( res ){ 1634+ case UCOL_LESS: return -1; 1635+ case UCOL_GREATER: return +1; 1636+ case UCOL_EQUAL: return 0; 1637+ } 1638+ assert(!"Unexpected return value from ucol_strcoll()"); 1639+ return 0; 1640+} 1641+ 1642+/* 1643+** Implementation of the scalar function icu_load_collation(). 1644+** 1645+** This scalar function is used to add ICU collation based collation 1646+** types to an SQLite database connection. It is intended to be called 1647+** as follows: 1648+** 1649+** SELECT icu_load_collation(<locale>, <collation-name>); 1650+** 1651+** Where <locale> is a string containing an ICU locale identifier (i.e. 1652+** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the 1653+** collation sequence to create. 1654+*/ 1655+static void icuLoadCollation( 1656+ sqlite3_context *p, 1657+ int nArg, 1658+ sqlite3_value **apArg 1659+){ 1660+ sqlite3 *db = (sqlite3 *)sqlite3_user_data(p); 1661+ UErrorCode status = U_ZERO_ERROR; 1662+ const char *zLocale; /* Locale identifier - (eg. "jp_JP") */ 1663+ const char *zName; /* SQL Collation sequence name (eg. 
"japanese") */ 1664+ UCollator *pUCollator; /* ICU library collation object */ 1665+ int rc; /* Return code from sqlite3_create_collation_x() */ 1666+ 1667+ assert(nArg==2); 1668+ (void)nArg; /* Unused parameter */ 1669+ zLocale = (const char *)sqlite3_value_text(apArg[0]); 1670+ zName = (const char *)sqlite3_value_text(apArg[1]); 1671+ 1672+ if( !zLocale || !zName ){ 1673+ return; 1674+ } 1675+ 1676+ pUCollator = ucol_open(zLocale, &status); 1677+ if( !U_SUCCESS(status) ){ 1678+ icuFunctionError(p, "ucol_open", status); 1679+ return; 1680+ } 1681+ assert(p); 1682+ 1683+ rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 1684+ icuCollationColl, icuCollationDel 1685+ ); 1686+ if( rc!=SQLITE_OK ){ 1687+ ucol_close(pUCollator); 1688+ sqlite3_result_error(p, "Error registering collation function", -1); 1689+ } 1690+} 1691+ 1692+/* 1693+** Register the ICU extension functions with database db. 1694+*/ 1695+EXPORT_SYMBOLS SQLITE_API int sqlite3IcuInit(sqlite3 *db){ 1696+# define SQLITEICU_EXTRAFLAGS (SQLITE_DETERMINISTIC|SQLITE_INNOCUOUS) 1697+ static const struct IcuScalar { 1698+ const char *zName; /* Function name */ 1699+ unsigned char nArg; /* Number of arguments */ 1700+ unsigned int enc; /* Optimal text encoding */ 1701+ unsigned char iContext; /* sqlite3_user_data() context */ 1702+ void (*xFunc)(sqlite3_context*,int,sqlite3_value**); 1703+ } scalars[] = { 1704+ {"icu_load_collation",2,SQLITE_UTF8|SQLITE_DIRECTONLY,1, icuLoadCollation}, 1705+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) 1706+ {"regexp", 2, SQLITE_ANY|SQLITEICU_EXTRAFLAGS, 0, icuRegexpFunc}, 1707+ {"lower", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 1708+ {"lower", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 1709+ {"upper", 1, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 1710+ {"upper", 2, SQLITE_UTF16|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 1711+ {"lower", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 1712+ 
{"lower", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuCaseFunc16}, 1713+ {"upper", 1, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 1714+ {"upper", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 1, icuCaseFunc16}, 1715+ {"like", 2, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 1716+ {"like", 3, SQLITE_UTF8|SQLITEICU_EXTRAFLAGS, 0, icuLikeFunc}, 1717+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) */ 1718+ }; 1719+#ifdef OS_FEATURE 1720+ extern void SetOhosIcuDirectory(); 1721+ SetOhosIcuDirectory(); 1722+#endif 1723+ int rc = SQLITE_OK; 1724+ int i; 1725+ 1726+ for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){ 1727+ const struct IcuScalar *p = &scalars[i]; 1728+ rc = sqlite3_create_function( 1729+ db, p->zName, p->nArg, p->enc, 1730+ p->iContext ? (void*)db : (void*)0, 1731+ p->xFunc, 0, 0 1732+ ); 1733+ } 1734+ 1735+ return rc; 1736+} 1737+ 1738+#if !SQLITE_CORE 1739+#ifdef _WIN32 1740+__declspec(dllexport) 1741+#endif 1742+SQLITE_API int sqlite3_icu_init( 1743+ sqlite3 *db, 1744+ char **pzErrMsg, 1745+ const sqlite3_api_routines *pApi 1746+){ 1747+ SQLITE_EXTENSION_INIT2(pApi) 1748+ return sqlite3IcuInit(db); 1749+} 1750+#endif 1751+ 1752+#endif 1753+ 1754+/************** End of icu.c *************************************************/ 1755+/************** Begin file fts3_icu.c ****************************************/ 1756+/* 1757+** 2007 June 22 1758+** 1759+** The author disclaims copyright to this source code. In place of 1760+** a legal notice, here is a blessing: 1761+** 1762+** May you do good and not evil. 1763+** May you find forgiveness for yourself and forgive others. 1764+** May you share freely, never taking more than you give. 1765+** 1766+************************************************************************* 1767+** This file implements a tokenizer for fts3 based on the ICU library. 
1768+*/ 1769+/* #include "fts3Int.h" */ 1770+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 1771+#ifdef SQLITE_ENABLE_ICU 1772+ 1773+/* #include <assert.h> */ 1774+/* #include <string.h> */ 1775+/* #include "fts3_tokenizer.h" */ 1776+ 1777+#include <unicode/ubrk.h> 1778+/* #include <unicode/ucol.h> */ 1779+/* #include <unicode/ustring.h> */ 1780+#include <unicode/utf16.h> 1781+ 1782+typedef struct IcuTokenizer IcuTokenizer; 1783+typedef struct IcuCursor IcuCursor; 1784+ 1785+struct IcuTokenizer { 1786+ sqlite3_tokenizer base; 1787+ char *zLocale; 1788+}; 1789+ 1790+struct IcuCursor { 1791+ sqlite3_tokenizer_cursor base; 1792+ 1793+ UBreakIterator *pIter; /* ICU break-iterator object */ 1794+ int nChar; /* Number of UChar elements in pInput */ 1795+ UChar *aChar; /* Copy of input using utf-16 encoding */ 1796+ int *aOffset; /* Offsets of each character in utf-8 input */ 1797+ 1798+ int nBuffer; 1799+ char *zBuffer; 1800+ 1801+ int iToken; 1802+}; 1803+ 1804+/* 1805+** Create a new tokenizer instance. 1806+*/ 1807+static int icuCreate( 1808+ int argc, /* Number of entries in argv[] */ 1809+ const char * const *argv, /* Tokenizer creation arguments */ 1810+ sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ 1811+){ 1812+ IcuTokenizer *p; 1813+ int n = 0; 1814+ 1815+ if( argc>0 ){ 1816+ n = strlen(argv[0])+1; 1817+ } 1818+ p = (IcuTokenizer *)sqlite3_malloc64(sizeof(IcuTokenizer)+n); 1819+ if( !p ){ 1820+ return SQLITE_NOMEM; 1821+ } 1822+ memset(p, 0, sizeof(IcuTokenizer)); 1823+ 1824+ if( n ){ 1825+ p->zLocale = (char *)&p[1]; 1826+ memcpy(p->zLocale, argv[0], n); 1827+ } 1828+ 1829+ *ppTokenizer = (sqlite3_tokenizer *)p; 1830+ 1831+ return SQLITE_OK; 1832+} 1833+ 1834+/* 1835+** Destroy a tokenizer 1836+*/ 1837+static int icuDestroy(sqlite3_tokenizer *pTokenizer){ 1838+ IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 1839+ sqlite3_free(p); 1840+ return SQLITE_OK; 1841+} 1842+ 1843+/* 1844+** Prepare to begin tokenizing a particular string. 
The input 1845+** string to be tokenized is pInput[0..nBytes-1]. A cursor 1846+** used to incrementally tokenize this string is returned in 1847+** *ppCursor. 1848+*/ 1849+static int icuOpen( 1850+ sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 1851+ const char *zInput, /* Input string */ 1852+ int nInput, /* Length of zInput in bytes */ 1853+ sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 1854+){ 1855+ IcuTokenizer *p = (IcuTokenizer *)pTokenizer; 1856+ IcuCursor *pCsr; 1857+ 1858+ const int32_t opt = U_FOLD_CASE_DEFAULT; 1859+ UErrorCode status = U_ZERO_ERROR; 1860+ int nChar; 1861+ 1862+ UChar32 c; 1863+ int iInput = 0; 1864+ int iOut = 0; 1865+ 1866+ *ppCursor = 0; 1867+ 1868+ if( zInput==0 ){ 1869+ nInput = 0; 1870+ zInput = ""; 1871+ }else if( nInput<0 ){ 1872+ nInput = strlen(zInput); 1873+ } 1874+ nChar = nInput+1; 1875+ pCsr = (IcuCursor *)sqlite3_malloc64( 1876+ sizeof(IcuCursor) + /* IcuCursor */ 1877+ ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ 1878+ (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ 1879+ ); 1880+ if( !pCsr ){ 1881+ return SQLITE_NOMEM; 1882+ } 1883+ memset(pCsr, 0, sizeof(IcuCursor)); 1884+ pCsr->aChar = (UChar *)&pCsr[1]; 1885+ pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; 1886+ 1887+ pCsr->aOffset[iOut] = iInput; 1888+ U8_NEXT(zInput, iInput, nInput, c); 1889+ while( c>0 ){ 1890+ int isError = 0; 1891+ c = u_foldCase(c, opt); 1892+ U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); 1893+ if( isError ){ 1894+ sqlite3_free(pCsr); 1895+ return SQLITE_ERROR; 1896+ } 1897+ pCsr->aOffset[iOut] = iInput; 1898+ 1899+ if( iInput<nInput ){ 1900+ U8_NEXT(zInput, iInput, nInput, c); 1901+ }else{ 1902+ c = 0; 1903+ } 1904+ } 1905+ 1906+ pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); 1907+ if( !U_SUCCESS(status) ){ 1908+ sqlite3_free(pCsr); 1909+ return SQLITE_ERROR; 1910+ } 1911+ pCsr->nChar = iOut; 1912+ 1913+ ubrk_first(pCsr->pIter); 1914+ *ppCursor = 
(sqlite3_tokenizer_cursor *)pCsr; 1915+ return SQLITE_OK; 1916+} 1917+ 1918+/* 1919+** Close a tokenization cursor previously opened by a call to icuOpen(). 1920+*/ 1921+static int icuClose(sqlite3_tokenizer_cursor *pCursor){ 1922+ IcuCursor *pCsr = (IcuCursor *)pCursor; 1923+ ubrk_close(pCsr->pIter); 1924+ sqlite3_free(pCsr->zBuffer); 1925+ sqlite3_free(pCsr); 1926+ return SQLITE_OK; 1927+} 1928+ 1929+/* 1930+** Extract the next token from a tokenization cursor. Returns SQLITE_OK with the OUT parameters set, SQLITE_DONE when the break iterator is exhausted, or SQLITE_NOMEM if the token buffer cannot be grown. 1931+*/ 1932+static int icuNext( 1933+ sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by icuOpen */ 1934+ const char **ppToken, /* OUT: *ppToken is the token text */ 1935+ int *pnBytes, /* OUT: Number of bytes in token */ 1936+ int *piStartOffset, /* OUT: Starting offset of token */ 1937+ int *piEndOffset, /* OUT: Ending offset of token */ 1938+ int *piPosition /* OUT: Position integer of token */ 1939+){ 1940+ IcuCursor *pCsr = (IcuCursor *)pCursor; 1941+ 1942+ int iStart = 0; 1943+ int iEnd = 0; 1944+ int nByte = 0; 1945+ 1946+ while( iStart==iEnd ){ /* Repeat until a segment that is not entirely whitespace is found */ 1947+ UChar32 c; 1948+ 1949+ iStart = ubrk_current(pCsr->pIter); 1950+ iEnd = ubrk_next(pCsr->pIter); 1951+ if( iEnd==UBRK_DONE ){ 1952+ return SQLITE_DONE; 1953+ } 1954+ 1955+ while( iStart<iEnd ){ /* Advance iStart past leading whitespace */ 1956+ int iWhite = iStart; 1957+ U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); 1958+ if( u_isspace(c) ){ 1959+ iStart = iWhite; 1960+ }else{ 1961+ break; 1962+ } 1963+ } 1964+ assert(iStart<=iEnd); 1965+ } 1966+ 1967+ do { /* Convert the token to UTF-8; if zBuffer is too small, grow it to nByte and retry */ 1968+ UErrorCode status = U_ZERO_ERROR; 1969+ if( nByte ){ 1970+ char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); 1971+ if( !zNew ){ 1972+ return SQLITE_NOMEM; 1973+ } 1974+ pCsr->zBuffer = zNew; 1975+ pCsr->nBuffer = nByte; 1976+ } 1977+ 1978+ u_strToUTF8( 1979+ pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ 1980+ &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ 1981+ &status /* Output success/failure */ 1982+ ); 1983+ } while( nByte>pCsr->nBuffer ); 1984+ 1985+ *ppToken = pCsr->zBuffer; 1986+ *pnBytes = nByte; 1987+ 
*piStartOffset = pCsr->aOffset[iStart]; 1988+ *piEndOffset = pCsr->aOffset[iEnd]; 1989+ *piPosition = pCsr->iToken++; 1990+ 1991+ return SQLITE_OK; 1992+} 1993+ 1994+/* 1995+** The set of routines that implement the ICU tokenizer 1996+*/ 1997+static const sqlite3_tokenizer_module icuTokenizerModule = { 1998+ 0, /* iVersion */ 1999+ icuCreate, /* xCreate */ 2000+ icuDestroy, /* xDestroy */ 2001+ icuOpen, /* xOpen */ 2002+ icuClose, /* xClose */ 2003+ icuNext, /* xNext */ 2004+ 0, /* xLanguageid */ 2005+}; 2006+ 2007+/* 2008+** Set *ppModule to point at the implementation of the ICU tokenizer. 2009+*/ 2010+EXPORT_SYMBOLS SQLITE_API void sqlite3Fts3IcuTokenizerModule( 2011+ sqlite3_tokenizer_module const**ppModule 2012+){ 2013+ *ppModule = &icuTokenizerModule; 2014+} 2015+ 2016+#endif /* defined(SQLITE_ENABLE_ICU) */ 2017+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ 2018+ 2019+/************** End of fts3_icu.c ********************************************/ 2020\ No newline at end of file 2021-- 20222.47.0.windows.2 2023 2024