---
layout: default
title: Collation Examples
nav_order: 7
parent: Collation
---
<!--
© 2020 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html
-->

# Collation Examples
{: .no_toc }

## Contents
{: .no_toc .text-delta }

1. TOC
{:toc}

---

## Simple Collation Sample Customization

The following program demonstrates how to compare strings and create sort keys
with the collator for a given locale.

In **C:**

```c
#include <stdio.h>
#include <memory.h>
#include <string.h>
#include "unicode/ustring.h"
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ucol.h"

#define MAXBUFFERSIZE 100
#define BIGBUFFERSIZE 5000

/**
 * Compares two sample strings with the collator for the given locale:
 * first at the default strength, then at primary strength, and finally
 * by comparing their sort keys.
 *
 * @param locale ICU locale ID that is passed to ucol_open().
 * @param status In/out ICU error code. Nothing is done if it is NULL or
 *               already indicates a failure on entry.
 * @return true if every comparison produced the expected result,
 *         false otherwise (a message is written to stderr).
 */
UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
{
    UChar             source[MAXBUFFERSIZE];
    UChar             target[MAXBUFFERSIZE];
    UCollationResult  result = UCOL_EQUAL;
    uint8_t           sourceKeyArray[MAXBUFFERSIZE];
    uint8_t           targetKeyArray[MAXBUFFERSIZE];
    int32_t           sourceKeyOut = 0,
                      targetKeyOut = 0;
    int               keyOrder = 0;
    UCollator        *myCollator = NULL;

    if (status == NULL || U_FAILURE(*status))
    {
        return false;
    }
    u_uastrcpy(source, "This is a test.");
    u_uastrcpy(target, "THIS IS A TEST.");
    myCollator = ucol_open(locale, status);
    if (U_FAILURE(*status)) {
        /* Report the error with the locale ID and the ICU error name.
         * (A UChar display name cannot be printed with %s, and ICU functions
         * do nothing when they are handed a status that already failed.) */
        fprintf(stderr,
                "Failed to create the collator for : \"%s\" (%s)\n",
                locale != NULL ? locale : "(default)", u_errorName(*status));
        return false;
    }
    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
    /* result is UCOL_LESS: the strings differ only in case, which is a
     * tertiary difference, and lowercase sorts before uppercase by default. */
    if (result != UCOL_LESS)
    {
        fprintf(stderr,
                "Comparing two strings with only tertiary (case) differences in C failed.\n");
        ucol_close(myCollator);
        return false;
    }
    /* To compare them with just primary differences */
    ucol_setStrength(myCollator, UCOL_PRIMARY);
    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
    /* result is UCOL_EQUAL: case is ignored at primary strength */
    if (result != UCOL_EQUAL)
    {
        fprintf(stderr,
                "Comparing two strings with no differences in C failed.\n");
        ucol_close(myCollator);
        return false;
    }

    /* Now, do the same comparison with keys */
    sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);
    targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);
    /* ucol_getSortKey() returns the size the key needs: 0 signals an internal
     * error and a value above the capacity means the key did not fit. */
    if (sourceKeyOut <= 0 || sourceKeyOut > MAXBUFFERSIZE ||
        targetKeyOut <= 0 || targetKeyOut > MAXBUFFERSIZE)
    {
        fprintf(stderr,
                "Creating sort keys in C failed.\n");
        ucol_close(myCollator);
        return false;
    }
    /* Sort keys are zero-terminated byte strings, so strcmp() can compare them. */
    keyOrder = strcmp((const char *)sourceKeyArray, (const char *)targetKeyArray);
    if (keyOrder != 0)
    {
        fprintf(stderr,
                "Comparing two strings with sort keys in C failed.\n");
        ucol_close(myCollator);
        return false;
    }
    ucol_close(myCollator);
    return true;
}
```

In **C++:**

```c++
#include <stdio.h>
#include <memory>
#include <string>
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"

using icu::CollationKey;
using icu::Collator;
using icu::Locale;
using icu::UnicodeString;

/**
 * Compares two sample strings with the collator for the given locale:
 * first at the default strength, then at primary strength, and finally
 * by comparing their collation keys.
 *
 * @param locale Locale for which the collator is created.
 * @param status In/out ICU error code. Nothing is done if it already
 *               indicates a failure on entry.
 * @return true if every comparison produced the expected result,
 *         false otherwise (a message is written to stderr).
 */
UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
{
    UnicodeString     dispName;
    UnicodeString     source("This is a test.");
    UnicodeString     target("THIS IS A TEST.");
    UCollationResult  result = UCOL_EQUAL;
    CollationKey      sourceKey;
    CollationKey      targetKey;
    if (U_FAILURE(status))
    {
        return false;
    }
    // The smart pointer deletes the collator on every return path.
    std::unique_ptr<Collator> myCollator(Collator::createInstance(locale, status));
    if (U_FAILURE(status)) {
        /* Report the error with display name; a UnicodeString must be
         * converted before it can be handed to a printf-style function. */
        std::string dispNameUTF8;
        locale.getDisplayName(dispName);
        dispName.toUTF8String(dispNameUTF8);
        fprintf(stderr,
                "Failed to create the collator for : \"%s\"\n", dispNameUTF8.c_str());
        return false;
    }
    result = myCollator->compare(source, target, status);
    /* result is UCOL_LESS: the strings differ only in case, which is a
     * tertiary difference, and lowercase sorts before uppercase by default. */
    if (U_FAILURE(status) || result != UCOL_LESS)
    {
        fprintf(stderr,
                "Comparing two strings with only tertiary (case) differences in C++ failed.\n");
        return false;
    }
    /* To compare them with just primary differences */
    myCollator->setStrength(Collator::PRIMARY);
    result = myCollator->compare(source, target, status);
    /* result is UCOL_EQUAL: case is ignored at primary strength */
    if (U_FAILURE(status) || result != UCOL_EQUAL)
    {
        fprintf(stderr,
                "Comparing two strings with no differences in C++ failed.\n");
        return false;
    }
    /* Now, do the same comparison with keys */
    myCollator->getCollationKey(source, sourceKey, status);
    myCollator->getCollationKey(target, targetKey, status);
    if (U_FAILURE(status))
    {
        fprintf(stderr,
                "Creating collation keys in C++ failed.\n");
        return false;
    }
    result = sourceKey.compareTo(targetKey, status);
    if (U_FAILURE(status) || result != UCOL_EQUAL)
    {
        fprintf(stderr,
                "Comparing two strings with sort keys in C++ failed.\n");
        return false;
    }
    return true;
}
```

### Main Function

```c++
extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);

/* Runs the C++ sample and then the C sample for en_US.
 * Returns 0 if both succeeded and 1 if either failed. */
int main()
{
    int exitCode = 0;
    UErrorCode status = U_ZERO_ERROR;
    fprintf(stdout, "\n");
    if (collateWithLocaleInCPP(Locale("en", "US"), status) != true)
    {
        fprintf(stderr,
                "Collate with locale in C++ failed.\n");
        exitCode = 1;
    } else
    {
        fprintf(stdout, "Collate with Locale C++ example worked!!\n");
    }
    /* Reset the error code so that the C sample starts from a clean state. */
    status = U_ZERO_ERROR;
    fprintf(stdout, "\n");
    if (collateWithLocaleInC("en_US", &status) != true)
    {
        fprintf(stderr,
                "Collate with locale in C failed.\n");
        exitCode = 1;
    } else
    {
        fprintf(stdout, "Collate with Locale C example worked!!\n");
    }
    return exitCode;
}
```

In **Java:**

```java
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.CollationKey;
import java.util.Locale;

/**
 * Compares two sample strings with a locale-specific collator, directly and
 * through collation keys.
 */
public class CollateExample
{

    public static void main(String[] args)
    {
        CollateExample example = new CollateExample();
        try {
            if (!example.collateWithLocale(Locale.US)) {
                System.err.println("Collate with locale example failed.");
            }
            else {
                System.out.println("Collate with Locale example worked!!");
            }
        } catch (Exception e) {
            System.err.println("Collating with locale failed");
            e.printStackTrace();
        }
    }

    /**
     * Compares the two sample strings at the default strength, at primary
     * strength, and by their collation keys.
     *
     * @param locale locale for which the collator is created
     * @return true if every comparison produced the expected result
     */
    public boolean collateWithLocale(Locale locale) throws Exception
    {
        String source = "This is a test.";
        String target = "THIS IS A TEST.";
        Collator myCollator = Collator.getInstance(locale);

        int result = myCollator.compare(source, target);
        // result is negative: the strings differ only in case, which is a
        // tertiary difference, and lowercase sorts before uppercase by default
        if (result >= 0) {
            System.err.println(
                "Comparing two strings with only tertiary (case) differences failed.");
            return false;
        }
        // To compare them with just primary differences
        myCollator.setStrength(Collator.PRIMARY);
        result = myCollator.compare(source, target);
        // result is 0: case is ignored at primary strength
        if (result != 0) {
            System.err.println(
                "Comparing two strings with no differences failed.");
            return false;
        }
        // Now, do the same comparison with keys
        CollationKey sourceKey = myCollator.getCollationKey(source);
        CollationKey targetKey = myCollator.getCollationKey(target);
        result = sourceKey.compareTo(targetKey);
        if (result != 0) {
            System.err.println("Comparing two strings with sort keys failed.");
            return false;
        }
        return true;
    }
}
```

## Language-sensitive searching

String searching is a well-researched area, and there are algorithms that can
optimize the searching process.
Perhaps the best is the Boyer-Moore method. For a
full description of this concept, see Laura Werner's text searching article
(<http://icu-project.org/docs/papers/efficient_text_searching_in_java.html>).

However, implementing collation-based search with the Boyer-Moore method
while getting correct results is very tricky, and ICU no longer uses this method
(as of ICU4C 4.0 and ICU4J 53).

Please see the [String Search Service](./string-search) chapter.

## Using large buffers to manage sort keys

A good solution for the problem of not knowing the sort key size in advance is
to allocate a large buffer and store all the sort keys there, while keeping a
list of indexes or pointers to that buffer.

The following sample code takes an array of `UChar` pointers and an array of
key indexes. It allocates and fills a buffer with sort keys, stores the offset
of each key in the index array, and reports the maximum size of a sort key.
Once you have done this for your strings, you just need to allocate fields of
the maximum size and copy your sort keys from the buffer to the fields.

```c++
#include <stdlib.h>
#include "unicode/ucol.h"

/**
 * Generates the sort keys of all strings in `source` back to back in one
 * heap buffer that grows as needed.
 *
 * @param coll       Collator used to generate the sort keys.
 * @param source     Array of `sourceSize` NUL-terminated UChar strings.
 * @param keys       Output array of `sourceSize` entries; keys[i] receives
 *                   the offset of the i-th sort key inside *buffer.
 * @param sourceSize Number of strings in `source`.
 * @param buffer     Receives the malloc()-allocated buffer holding all keys.
 *                   The caller releases it with free(). It is set to NULL
 *                   when the function fails.
 * @param maxSize    Receives the size in bytes of the largest sort key.
 * @param status     In/out ICU error code. Nothing is done if it is NULL or
 *                   already indicates a failure on entry.
 * @return The number of bytes of *buffer that are in use, or 0 on failure.
 */
uint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
                            uint32_t sourceSize, uint8_t **buffer,
                            uint32_t *maxSize, UErrorCode *status)
{
    if(status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(coll == NULL || buffer == NULL || maxSize == NULL ||
       (sourceSize > 0 && (source == NULL || keys == NULL))) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    uint32_t bufferSize = 16384;
    const uint32_t increment = 16384;
    uint32_t currentOffset = 0;
    *maxSize = 0;

    *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
    /* Test the allocation itself (*buffer), not the address of the out parameter. */
    if(*buffer == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    for(uint32_t i = 0; i < sourceSize; i++) {
        keys[i] = currentOffset;
        /* ucol_getSortKey() returns the size the key needs, even if that is
         * more than the remaining capacity, and 0 on an internal error. */
        int32_t keySize = ucol_getSortKey(coll, source[i], -1,
                                          *buffer+currentOffset,
                                          (int32_t)(bufferSize-currentOffset));
        if(keySize > 0 && (uint32_t)keySize > bufferSize-currentOffset) {
            /* Grow in steps of `increment` until this key is guaranteed to fit;
             * one step alone is not enough for a very long key. */
            uint32_t newSize = bufferSize;
            while(newSize-currentOffset < (uint32_t)keySize) {
                newSize += increment;
            }
            /* Keep the old pointer until realloc() succeeds so that it can
             * still be freed when the reallocation fails. */
            uint8_t *grown = (uint8_t *)realloc(*buffer, newSize);
            if(grown == NULL) {
                free(*buffer);
                *buffer = NULL;
                *maxSize = 0;
                *status = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            *buffer = grown;
            bufferSize = newSize;
            keySize = ucol_getSortKey(coll, source[i], -1,
                                      *buffer+currentOffset,
                                      (int32_t)(bufferSize-currentOffset));
        }
        if(keySize <= 0) {
            /* The collator could not produce a key for this string. */
            free(*buffer);
            *buffer = NULL;
            *maxSize = 0;
            *status = U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        /* here you can hook code that does something interesting with the keySize -
         * remembers the maximum or similar...
         */
        if((uint32_t)keySize > *maxSize) {
            *maxSize = (uint32_t)keySize;
        }
        currentOffset += (uint32_t)keySize;
    }

    return currentOffset;
}
```