• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2010-2011, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  *  file name:  dicttrieperf.cpp
7  *  encoding:   US-ASCII
8  *  tab size:   8 (not used)
9  *  indentation:4
10  *
11  *  created on: 2010dec09
12  *  created by: Markus W. Scherer
13  *
14  *  Performance test program for dictionary-type tries.
15  *
16  * Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
17  * (Linux)
18  *  make
19  *  export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
20  *  ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
21  * or
22  *  ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
23  */
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include "unicode/bytestrie.h"
28 #include "unicode/bytestriebuilder.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/ucharstrie.h"
31 #include "unicode/ucharstriebuilder.h"
32 #include "unicode/uperf.h"
33 #include "unicode/utext.h"
34 #include "charstr.h"
35 #include "package.h"
36 #include "toolutil.h"
37 #include "triedict.h"
38 #include "ucbuf.h"  // struct ULine
39 #include "uoptions.h"
40 #include "uvectr32.h"
41 
42 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
43 
44 // Test object.
45 class DictionaryTriePerfTest : public UPerfTest {
46 public:
DictionaryTriePerfTest(int32_t argc,const char * argv[],UErrorCode & status)47     DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status)
48             : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) {
49         if(hasFile()) {
50             getLines(status);
51             for(int32_t i=0; i<numLines; ++i) {
52                 // Skip comment lines (start with a character below 'A').
53                 if(lines[i].name[0]>=0x41) {
54                     ++numTextLines;
55                     // Remove trailing CR LF.
56                     int32_t len=lines[i].len;
57                     UChar c;
58                     while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) {
59                         --len;
60                     }
61                     lines[i].len=len;
62                 }
63             }
64         }
65     }
66 
67     virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
68 
getSourceDir() const69     const char *getSourceDir() const { return sourceDir; }
70 
hasFile() const71     UBool hasFile() const { return ucharBuf!=NULL; }
getCachedLines() const72     const ULine *getCachedLines() const { return lines; }
getNumLines() const73     int32_t getNumLines() const { return numLines; }
74     int32_t numTextLines;  // excluding comment lines
75 };
76 
77 // Performance test function object.
78 // Loads icudt46l.dat (or whatever its current versioned filename)
79 // from the -s or --sourcedir path.
80 class PackageLookup : public UPerfFunction {
81 protected:
PackageLookup(const DictionaryTriePerfTest & perf)82     PackageLookup(const DictionaryTriePerfTest &perf) {
83         IcuToolErrorCode errorCode("PackageLookup()");
84         CharString filename(perf.getSourceDir(), errorCode);
85         int32_t filenameLength=filename.length();
86         if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR &&
87                                filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) {
88             filename.append(U_FILE_SEP_CHAR, errorCode);
89         }
90         filename.append(U_ICUDATA_NAME, errorCode);
91         filename.append(".dat", errorCode);
92         pkg.readPackage(filename.data());
93     }
94 
95 public:
~PackageLookup()96     virtual ~PackageLookup() {}
97 
98     // virtual void call(UErrorCode* pErrorCode) { ... }
99 
getOperationsPerIteration()100     virtual long getOperationsPerIteration() {
101         return pkg.getItemCount();
102     }
103 
104     // virtual long getEventsPerIteration();
105 
106 protected:
107     Package pkg;
108 };
109 
110 struct TOCEntry {
111     int32_t nameOffset, dataOffset;
112 };
113 
114 // Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
simpleBinarySearch(const char * s,const char * names,const TOCEntry * toc,int32_t count)115 static int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
116     int32_t start=0;
117     int32_t limit=count;
118     int32_t lastNumber=limit;
119     for(;;) {
120         int32_t number=(start+limit)/2;
121         if(lastNumber==number) {  // have we moved?
122             return -1;  // not found
123         }
124         lastNumber=number;
125         int32_t cmp=strcmp(s, names+toc[number].nameOffset);
126         if(cmp<0) {
127             limit=number;
128         } else if(cmp>0) {
129             start=number;
130         } else {  // found s
131             return number;
132         }
133     }
134 }
135 
136 class BinarySearchPackageLookup : public PackageLookup {
137 public:
BinarySearchPackageLookup(const DictionaryTriePerfTest & perf)138     BinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
139             : PackageLookup(perf) {
140         IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
141         int32_t count=pkg.getItemCount();
142         toc=new TOCEntry[count];
143         for(int32_t i=0; i<count; ++i) {
144             toc[i].nameOffset=itemNames.length();
145             toc[i].dataOffset=i;  // arbitrary value, see toc comment below
146             // The Package class removes the "icudt46l/" prefix.
147             // We restore that here for a fair performance test.
148             const char *name=pkg.getItem(i)->name;
149             itemNames.append("icudt46l/", errorCode);
150             itemNames.append(name, strlen(name)+1, errorCode);
151         }
152         printf("size of item names: %6ld\n", (long)itemNames.length());
153         printf("size of TOC:        %6ld\n", (long)(count*8));
154         printf("total index size:   %6ld\n", (long)(itemNames.length()+count*8));
155     }
~BinarySearchPackageLookup()156     virtual ~BinarySearchPackageLookup() {
157         delete[] toc;
158     }
159 
call(UErrorCode *)160     virtual void call(UErrorCode * /*pErrorCode*/) {
161         int32_t count=pkg.getItemCount();
162         const char *itemNameChars=itemNames.data();
163         const char *name=itemNameChars;
164         for(int32_t i=0; i<count; ++i) {
165             if(simpleBinarySearch(name, itemNameChars, toc, count)<0) {
166                 fprintf(stderr, "item not found: %s\n", name);
167             }
168             name=strchr(name, 0)+1;
169         }
170     }
171 
172 protected:
173     CharString itemNames;
174     // toc imitates a .dat file's array of UDataOffsetTOCEntry
175     // with nameOffset and dataOffset.
176     // We don't need the dataOffsets, but we want to imitate the real
177     // memory density, to measure equivalent CPU cache usage.
178     TOCEntry *toc;
179 };
180 
181 #ifndef MIN
182 #define MIN(a,b) (((a)<(b)) ? (a) : (b))
183 #endif
184 
185 // Compare strings where we know the shared prefix length,
186 // and advance the prefix length as we find that the strings share even more characters.
strcmpAfterPrefix(const char * s1,const char * s2,int32_t & prefixLength)187 static int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) {
188     int32_t pl=prefixLength;
189     s1+=pl;
190     s2+=pl;
191     int32_t cmp=0;
192     for(;;) {
193         int32_t c1=(uint8_t)*s1++;
194         int32_t c2=(uint8_t)*s2++;
195         cmp=c1-c2;
196         if(cmp!=0 || c1==0) {  // different or done
197             break;
198         }
199         ++pl;  // increment shared same-prefix length
200     }
201     prefixLength=pl;
202     return cmp;
203 }
204 
prefixBinarySearch(const char * s,const char * names,const TOCEntry * toc,int32_t count)205 static int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
206     if(count==0) {
207         return -1;
208     }
209     int32_t start=0;
210     int32_t limit=count;
211     // Remember the shared prefix between s, start and limit,
212     // and don't compare that shared prefix again.
213     // The shared prefix should get longer as we narrow the [start, limit[ range.
214     int32_t startPrefixLength=0;
215     int32_t limitPrefixLength=0;
216     // Prime the prefix lengths so that we don't keep prefixLength at 0 until
217     // both the start and limit indexes have moved.
218     // At the same time, we find if s is one of the start and (limit-1) names,
219     // and if not, exclude them from the actual binary search.
220     if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) {
221         return 0;
222     }
223     ++start;
224     --limit;
225     if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) {
226         return limit;
227     }
228     while(start<limit) {
229         int32_t i=(start+limit)/2;
230         int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
231         int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength);
232         if(cmp<0) {
233             limit=i;
234             limitPrefixLength=prefixLength;
235         } else if(cmp==0) {
236             return i;
237         } else {
238             start=i+1;
239             startPrefixLength=prefixLength;
240         }
241     }
242     return -1;
243 }
244 
245 class PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup {
246 public:
PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest & perf)247     PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
248             : BinarySearchPackageLookup(perf) {}
249 
call(UErrorCode *)250     virtual void call(UErrorCode * /*pErrorCode*/) {
251         int32_t count=pkg.getItemCount();
252         const char *itemNameChars=itemNames.data();
253         const char *name=itemNameChars;
254         for(int32_t i=0; i<count; ++i) {
255             if(prefixBinarySearch(name, itemNameChars, toc, count)<0) {
256                 fprintf(stderr, "item not found: %s\n", name);
257             }
258             name=strchr(name, 0)+1;
259         }
260     }
261 };
262 
bytesTrieLookup(const char * s,const char * nameTrieBytes)263 static int32_t bytesTrieLookup(const char *s, const char *nameTrieBytes) {
264     BytesTrie trie(nameTrieBytes);
265     if(USTRINGTRIE_HAS_VALUE(trie.next(s, -1))) {
266         return trie.getValue();
267     } else {
268         return -1;
269     }
270 }
271 
272 class BytesTriePackageLookup : public PackageLookup {
273 public:
BytesTriePackageLookup(const DictionaryTriePerfTest & perf)274     BytesTriePackageLookup(const DictionaryTriePerfTest &perf)
275             : PackageLookup(perf) {
276         IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
277         builder=new BytesTrieBuilder(errorCode);
278         int32_t count=pkg.getItemCount();
279         for(int32_t i=0; i<count; ++i) {
280             // The Package class removes the "icudt46l/" prefix.
281             // We restore that here for a fair performance test.
282             // We store all full names so that we do not have to reconstruct them
283             // in the call() function.
284             const char *name=pkg.getItem(i)->name;
285             int32_t offset=itemNames.length();
286             itemNames.append("icudt46l/", errorCode);
287             itemNames.append(name, -1, errorCode);
288             // As value, set the data item index.
289             // In a real implementation, we would use that to get the
290             // start and limit offset of the data item.
291             StringPiece fullName(itemNames.toStringPiece());
292             fullName.remove_prefix(offset);
293             builder->add(fullName, i, errorCode);
294             // NUL-terminate the name for call() to find the next one.
295             itemNames.append(0, errorCode);
296         }
297         int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
298         printf("size of BytesTrie:   %6ld\n", (long)length);
299         // count+1: +1 for the last-item limit offset which we should have always had
300         printf("size of dataOffsets:%6ld\n", (long)((count+1)*4));
301         printf("total index size:   %6ld\n", (long)(length+(count+1)*4));
302     }
~BytesTriePackageLookup()303     virtual ~BytesTriePackageLookup() {
304         delete builder;
305     }
306 
call(UErrorCode * pErrorCode)307     virtual void call(UErrorCode *pErrorCode) {
308         int32_t count=pkg.getItemCount();
309         const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data();
310         const char *name=itemNames.data();
311         for(int32_t i=0; i<count; ++i) {
312             if(bytesTrieLookup(name, nameTrieBytes)<0) {
313                 fprintf(stderr, "item not found: %s\n", name);
314             }
315             name=strchr(name, 0)+1;
316         }
317     }
318 
319 protected:
320     BytesTrieBuilder *builder;
321     CharString itemNames;
322 };
323 
324 // Performance test function object.
325 // Each subclass loads a dictionary text file
326 // from the -s or --sourcedir path plus -f or --file-name.
327 // For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
328 class DictLookup : public UPerfFunction {
329 public:
DictLookup(const DictionaryTriePerfTest & perfTest)330     DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {}
331 
getOperationsPerIteration()332     virtual long getOperationsPerIteration() {
333         return perf.numTextLines;
334     }
335 
336 protected:
337     const DictionaryTriePerfTest &perf;
338 };
339 
340 class CompactTrieDictLookup : public DictLookup {
341 public:
CompactTrieDictLookup(const DictionaryTriePerfTest & perfTest)342     CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest)
343             : DictLookup(perfTest), ctd(NULL) {
344         IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
345         // U+0E1C is the median code unit, from
346         // the UCharsTrie root node (split-branch node) for thaidict.txt.
347         MutableTrieDictionary builder(0xe1c, errorCode);
348         const ULine *lines=perf.getCachedLines();
349         int32_t numLines=perf.getNumLines();
350         for(int32_t i=0; i<numLines; ++i) {
351             // Skip comment lines (start with a character below 'A').
352             if(lines[i].name[0]<0x41) {
353                 continue;
354             }
355             builder.addWord(lines[i].name, lines[i].len, errorCode);
356         }
357         ctd=new CompactTrieDictionary(builder, errorCode);
358         int32_t length=(int32_t)ctd->dataSize();
359         printf("size of CompactTrieDict:    %6ld bytes\n", (long)length);
360     }
361 
~CompactTrieDictLookup()362     virtual ~CompactTrieDictLookup() {
363         delete ctd;
364     }
365 
call(UErrorCode * pErrorCode)366     virtual void call(UErrorCode *pErrorCode) {
367         UText text=UTEXT_INITIALIZER;
368         int32_t lengths[20];
369         const ULine *lines=perf.getCachedLines();
370         int32_t numLines=perf.getNumLines();
371         for(int32_t i=0; i<numLines; ++i) {
372             // Skip comment lines (start with a character below 'A').
373             if(lines[i].name[0]<0x41) {
374                 continue;
375             }
376             utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
377             int32_t count;
378             ctd->matches(&text, lines[i].len,
379                          lengths, count, LENGTHOF(lengths));
380             if(count==0 || lengths[count-1]!=lines[i].len) {
381                 fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
382             }
383         }
384     }
385 
386 protected:
387     CompactTrieDictionary *ctd;
388 };
389 
390 // Closely imitate CompactTrieDictionary::matches().
391 // Note: CompactTrieDictionary::matches() is part of its trie implementation,
392 // and while it loops over the text, it knows the current state.
393 // By contrast, this implementation uses UCharsTrie API functions that have to
394 // check the trie state each time and load/store state in the object.
395 // (Whether it hasNext() and whether it is in the middle of a linear-match node.)
396 static int32_t
ucharsTrieMatches(UCharsTrie & trie,UText * text,int32_t textLimit,int32_t * lengths,int & count,int limit)397 ucharsTrieMatches(UCharsTrie &trie,
398                   UText *text, int32_t textLimit,
399                   int32_t *lengths, int &count, int limit ) {
400     UChar32 c=utext_next32(text);
401     // Notes:
402     // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
403     // b) It also ignores non-BMP code points by casting to UChar!
404     if(c<0) {
405         return 0;
406     }
407     // Should be firstForCodePoint() but CompactTrieDictionary
408     // handles only code units.
409     UStringTrieResult result=trie.first(c);
410     int32_t numChars=1;
411     count=0;
412     for(;;) {
413         if(USTRINGTRIE_HAS_VALUE(result)) {
414             if(count<limit) {
415                 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
416                 lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
417             }
418             if(result==USTRINGTRIE_FINAL_VALUE) {
419                 break;
420             }
421         } else if(result==USTRINGTRIE_NO_MATCH) {
422             break;
423         }
424         if(numChars>=textLimit) {
425             // Note: Why do we have both a text limit and a UText that knows its length?
426             break;
427         }
428         UChar32 c=utext_next32(text);
429         // Notes:
430         // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
431         // b) It also ignores non-BMP code points by casting to UChar!
432         if(c<0) {
433             break;
434         }
435         ++numChars;
436         // Should be nextForCodePoint() but CompactTrieDictionary
437         // handles only code units.
438         result=trie.next(c);
439     }
440 #if 0
441     // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
442     // after the longest prefix match and returns the number of characters
443     // that were matched.
444     if(index!=lastMatch) {
445         utext_setNativeIndex(text, lastMatch);
446     }
447     return lastMatch-start;
448     // However, it does not do either of these, so I am not trying to
449     // imitate it (or its docs) 100%.
450 #endif
451     return numChars;
452 }
453 
454 class UCharsTrieDictLookup : public DictLookup {
455 public:
UCharsTrieDictLookup(const DictionaryTriePerfTest & perfTest)456     UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest)
457             : DictLookup(perfTest), trie(NULL) {
458         IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
459         builder=new UCharsTrieBuilder(errorCode);
460         const ULine *lines=perf.getCachedLines();
461         int32_t numLines=perf.getNumLines();
462         for(int32_t i=0; i<numLines; ++i) {
463             // Skip comment lines (start with a character below 'A').
464             if(lines[i].name[0]<0x41) {
465                 continue;
466             }
467             builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
468         }
469         UnicodeString trieUChars;
470         int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length();
471         printf("size of UCharsTrie:          %6ld bytes\n", (long)length*2);
472         trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
473     }
474 
~UCharsTrieDictLookup()475     virtual ~UCharsTrieDictLookup() {
476         delete builder;
477         delete trie;
478     }
479 
480 protected:
481     UCharsTrieBuilder *builder;
482     UCharsTrie *trie;
483 };
484 
485 class UCharsTrieDictMatches : public UCharsTrieDictLookup {
486 public:
UCharsTrieDictMatches(const DictionaryTriePerfTest & perfTest)487     UCharsTrieDictMatches(const DictionaryTriePerfTest &perfTest)
488             : UCharsTrieDictLookup(perfTest) {}
489 
call(UErrorCode * pErrorCode)490     virtual void call(UErrorCode *pErrorCode) {
491         UText text=UTEXT_INITIALIZER;
492         int32_t lengths[20];
493         const ULine *lines=perf.getCachedLines();
494         int32_t numLines=perf.getNumLines();
495         for(int32_t i=0; i<numLines; ++i) {
496             // Skip comment lines (start with a character below 'A').
497             if(lines[i].name[0]<0x41) {
498                 continue;
499             }
500             utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
501             int32_t count=0;
502             ucharsTrieMatches(*trie, &text, lines[i].len,
503                               lengths, count, LENGTHOF(lengths));
504             if(count==0 || lengths[count-1]!=lines[i].len) {
505                 fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
506             }
507         }
508     }
509 };
510 
511 class UCharsTrieDictContains : public UCharsTrieDictLookup {
512 public:
UCharsTrieDictContains(const DictionaryTriePerfTest & perfTest)513     UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest)
514             : UCharsTrieDictLookup(perfTest) {}
515 
call(UErrorCode *)516     virtual void call(UErrorCode * /*pErrorCode*/) {
517         const ULine *lines=perf.getCachedLines();
518         int32_t numLines=perf.getNumLines();
519         for(int32_t i=0; i<numLines; ++i) {
520             // Skip comment lines (which start with a character below 'A').
521             if(lines[i].name[0]<0x41) {
522                 continue;
523             }
524             if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) {
525                 fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
526             }
527         }
528     }
529 };
530 
thaiCharToByte(UChar32 c)531 static inline int32_t thaiCharToByte(UChar32 c) {
532     if(0xe00<=c && c<=0xefe) {
533         return c&0xff;
534     } else if(c==0x2e) {
535         return 0xff;
536     } else {
537         return -1;
538     }
539 }
540 
thaiWordToBytes(const UChar * s,int32_t length,CharString & str,UErrorCode & errorCode)541 static UBool thaiWordToBytes(const UChar *s, int32_t length,
542                              CharString &str, UErrorCode &errorCode) {
543     for(int32_t i=0; i<length; ++i) {
544         UChar c=s[i];
545         int32_t b=thaiCharToByte(c);
546         if(b>=0) {
547             str.append((char)b, errorCode);
548         } else {
549             fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c);
550             return FALSE;
551         }
552     }
553     return TRUE;
554 }
555 
556 class BytesTrieDictLookup : public DictLookup {
557 public:
BytesTrieDictLookup(const DictionaryTriePerfTest & perfTest)558     BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest)
559             : DictLookup(perfTest), trie(NULL), noDict(FALSE) {
560         IcuToolErrorCode errorCode("BytesTrieDictLookup()");
561         builder=new BytesTrieBuilder(errorCode);
562         CharString str;
563         const ULine *lines=perf.getCachedLines();
564         int32_t numLines=perf.getNumLines();
565         for(int32_t i=0; i<numLines; ++i) {
566             // Skip comment lines (start with a character below 'A').
567             if(lines[i].name[0]<0x41) {
568                 continue;
569             }
570             if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) {
571                 fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i);
572                 noDict=TRUE;
573                 break;
574             }
575             builder->add(str.toStringPiece(), 0, errorCode);
576         }
577         if(!noDict) {
578             int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
579             printf("size of BytesTrie:           %6ld bytes\n", (long)length);
580             trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
581         }
582     }
583 
~BytesTrieDictLookup()584     virtual ~BytesTrieDictLookup() {
585         delete builder;
586         delete trie;
587     }
588 
589 protected:
590     BytesTrieBuilder *builder;
591     BytesTrie *trie;
592     UBool noDict;
593 };
594 
595 static int32_t
bytesTrieMatches(BytesTrie & trie,UText * text,int32_t textLimit,int32_t * lengths,int & count,int limit)596 bytesTrieMatches(BytesTrie &trie,
597                  UText *text, int32_t textLimit,
598                  int32_t *lengths, int &count, int limit ) {
599     UChar32 c=utext_next32(text);
600     if(c<0) {
601         return 0;
602     }
603     UStringTrieResult result=trie.first(thaiCharToByte(c));
604     int32_t numChars=1;
605     count=0;
606     for(;;) {
607         if(USTRINGTRIE_HAS_VALUE(result)) {
608             if(count<limit) {
609                 // lengths[count++]=(int32_t)utext_getNativeIndex(text);
610                 lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
611             }
612             if(result==USTRINGTRIE_FINAL_VALUE) {
613                 break;
614             }
615         } else if(result==USTRINGTRIE_NO_MATCH) {
616             break;
617         }
618         if(numChars>=textLimit) {
619             break;
620         }
621         UChar32 c=utext_next32(text);
622         if(c<0) {
623             break;
624         }
625         ++numChars;
626         result=trie.next(thaiCharToByte(c));
627     }
628     return numChars;
629 }
630 
631 class BytesTrieDictMatches : public BytesTrieDictLookup {
632 public:
BytesTrieDictMatches(const DictionaryTriePerfTest & perfTest)633     BytesTrieDictMatches(const DictionaryTriePerfTest &perfTest)
634             : BytesTrieDictLookup(perfTest) {}
635 
call(UErrorCode * pErrorCode)636     virtual void call(UErrorCode *pErrorCode) {
637         if(noDict) {
638             return;
639         }
640         UText text=UTEXT_INITIALIZER;
641         int32_t lengths[20];
642         const ULine *lines=perf.getCachedLines();
643         int32_t numLines=perf.getNumLines();
644         for(int32_t i=0; i<numLines; ++i) {
645             // Skip comment lines (start with a character below 'A').
646             if(lines[i].name[0]<0x41) {
647                 continue;
648             }
649             utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
650             int32_t count=0;
651             bytesTrieMatches(*trie, &text, lines[i].len,
652                              lengths, count, LENGTHOF(lengths));
653             if(count==0 || lengths[count-1]!=lines[i].len) {
654                 fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
655             }
656         }
657     }
658 };
659 
660 class BytesTrieDictContains : public BytesTrieDictLookup {
661 public:
BytesTrieDictContains(const DictionaryTriePerfTest & perfTest)662     BytesTrieDictContains(const DictionaryTriePerfTest &perfTest)
663             : BytesTrieDictLookup(perfTest) {}
664 
call(UErrorCode *)665     virtual void call(UErrorCode * /*pErrorCode*/) {
666         if(noDict) {
667             return;
668         }
669         const ULine *lines=perf.getCachedLines();
670         int32_t numLines=perf.getNumLines();
671         for(int32_t i=0; i<numLines; ++i) {
672             const UChar *line=lines[i].name;
673             // Skip comment lines (start with a character below 'A').
674             if(line[0]<0x41) {
675                 continue;
676             }
677             UStringTrieResult result=trie->first(thaiCharToByte(line[0]));
678             int32_t lineLength=lines[i].len;
679             for(int32_t j=1; j<lineLength; ++j) {
680                 if(!USTRINGTRIE_HAS_NEXT(result)) {
681                     fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
682                     break;
683                 }
684                 result=trie->next(thaiCharToByte(line[j]));
685             }
686             if(!USTRINGTRIE_HAS_VALUE(result)) {
687                 fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
688             }
689         }
690     }
691 };
692 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)693 UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
694                                                       const char *&name, char * /*par*/) {
695     if(hasFile()) {
696         switch(index) {
697         case 0:
698             name="compacttriematches";
699             if(exec) {
700                 return new CompactTrieDictLookup(*this);
701             }
702             break;
703         case 1:
704             name="ucharstriematches";
705             if(exec) {
706                 return new UCharsTrieDictMatches(*this);
707             }
708             break;
709         case 2:
710             name="ucharstriecontains";
711             if(exec) {
712                 return new UCharsTrieDictContains(*this);
713             }
714             break;
715         case 3:
716             name="bytestriematches";
717             if(exec) {
718                 return new BytesTrieDictMatches(*this);
719             }
720             break;
721         case 4:
722             name="bytestriecontains";
723             if(exec) {
724                 return new BytesTrieDictContains(*this);
725             }
726             break;
727         default:
728             name="";
729             break;
730         }
731     } else {
732         if(index==0 && exec) {
733             puts("Running BytesTrie perf tests on the .dat package file from the --sourcedir.\n"
734                  "For UCharsTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
735         }
736         switch(index) {
737         case 0:
738             name="simplebinarysearch";
739             if(exec) {
740                 return new BinarySearchPackageLookup(*this);
741             }
742             break;
743         case 1:
744             name="prefixbinarysearch";
745             if(exec) {
746                 return new PrefixBinarySearchPackageLookup(*this);
747             }
748             break;
749         case 2:
750             name="bytestrie";
751             if(exec) {
752                 return new BytesTriePackageLookup(*this);
753             }
754             break;
755         default:
756             name="";
757             break;
758         }
759     }
760     return NULL;
761 }
762 
main(int argc,const char * argv[])763 int main(int argc, const char *argv[]) {
764     IcuToolErrorCode errorCode("dicttrieperf main()");
765     DictionaryTriePerfTest test(argc, argv, errorCode);
766     if(errorCode.isFailure()) {
767         fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName());
768         test.usage();
769         return errorCode.reset();
770     }
771     if(!test.run()) {
772         fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n");
773         return -1;
774     }
775     return 0;
776 }
777