---
layout: default
title: Collation Examples
nav_order: 7
parent: Collation
---

# Collation Examples
{: .no_toc }

## Contents
{: .no_toc .text-delta }

1. TOC
{:toc}

---

## Simple Collation Sample Customization

The following program demonstrates how to compare strings and create sort keys
for a given locale.

In **C:**

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "unicode/ustring.h"
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ucol.h"

#define MAXBUFFERSIZE 100
#define BIGBUFFERSIZE 5000

/*
 * Opens a collator for `locale` and checks that "This is a test." and
 * "THIS IS A TEST.":
 *   - compare as UCOL_LESS at the default strength,
 *   - compare as equal at primary strength,
 *   - produce identical sort keys at primary strength.
 *
 * locale: ICU locale ID, for example "en_US".
 * status: in/out ICU error code; nothing is done if it already holds a failure.
 *
 * Returns TRUE when every check passes, FALSE otherwise. The collator is
 * closed on every path once it has been opened.
 */
UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
{
    UChar source[MAXBUFFERSIZE];
    UChar target[MAXBUFFERSIZE];
    UCollationResult result = UCOL_EQUAL;
    uint8_t sourceKeyArray[MAXBUFFERSIZE];
    uint8_t targetKeyArray[MAXBUFFERSIZE];
    int32_t sourceKeyOut = 0, targetKeyOut = 0;
    int keyOrder = 0;
    UCollator *myCollator = NULL;

    if (U_FAILURE(*status))
    {
        return FALSE;
    }

    u_uastrcpy(source, "This is a test.");
    u_uastrcpy(target, "THIS IS A TEST.");

    myCollator = ucol_open(locale, status);
    if (U_FAILURE(*status))
    {
        /* Report the locale ID and the ICU error name. ICU functions return
         * immediately when handed a failed status, so no further ICU call is
         * made with *status here. */
        fprintf(stderr, "Failed to create the collator for : \"%s\" (%s)\n",
                locale, u_errorName(*status));
        return FALSE;
    }

    result = ucol_strcoll(myCollator, source, u_strlen(source),
                          target, u_strlen(target));
    /* result is UCOL_LESS: the strings differ only in case, which is a
     * tertiary difference, and lowercase sorts before uppercase by default */
    if (result != UCOL_LESS)
    {
        fprintf(stderr,
                "Comparing two strings with only tertiary differences in C failed.\n");
        ucol_close(myCollator);
        return FALSE;
    }

    /* To compare them with just primary differences */
    ucol_setStrength(myCollator, UCOL_PRIMARY);
    result = ucol_strcoll(myCollator, source, u_strlen(source),
                          target, u_strlen(target));
    /* result is UCOL_EQUAL */
    if (result != UCOL_EQUAL)
    {
        fprintf(stderr, "Comparing two strings with no differences in C failed.\n");
        ucol_close(myCollator);
        return FALSE;
    }

    /* Now, do the same comparison with keys */
    sourceKeyOut = ucol_getSortKey(myCollator, source, -1,
                                   sourceKeyArray, MAXBUFFERSIZE);
    targetKeyOut = ucol_getSortKey(myCollator, target, -1,
                                   targetKeyArray, MAXBUFFERSIZE);
    /* ucol_getSortKey returns the size needed for the whole key (0 on an
     * internal error); a value larger than the buffer means the key was
     * not fully stored and must not be compared. */
    if (sourceKeyOut <= 0 || sourceKeyOut > MAXBUFFERSIZE ||
        targetKeyOut <= 0 || targetKeyOut > MAXBUFFERSIZE)
    {
        fprintf(stderr, "Creating sort keys in C failed.\n");
        ucol_close(myCollator);
        return FALSE;
    }

    /* Sort keys are zero-terminated byte strings, so strcmp can order them. */
    keyOrder = strcmp((const char *)sourceKeyArray, (const char *)targetKeyArray);
    if (keyOrder != 0)
    {
        fprintf(stderr, "Comparing two strings with sort keys in C failed.\n");
        ucol_close(myCollator);
        return FALSE;
    }

    ucol_close(myCollator);
    return TRUE;
}
```

In **C++:**

```c++
#include <stdio.h>
#include <string>
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "unicode/locid.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"
#include "unicode/localpointer.h"

/*
 * C++ counterpart of collateWithLocaleInC: creates a Collator for `locale`
 * and checks that "This is a test." and "THIS IS A TEST." compare as
 * UCOL_LESS at the default strength, compare as equal at primary strength,
 * and produce equal collation keys at primary strength.
 *
 * locale: the locale to collate for.
 * status: in/out ICU error code; nothing is done if it already holds a failure.
 *
 * Returns TRUE when every check passes, FALSE otherwise. The collator is
 * owned by a LocalPointer, so it is deleted on every return path.
 */
UBool collateWithLocaleInCPP(const icu::Locale& locale, UErrorCode& status)
{
    icu::UnicodeString source("This is a test.");
    icu::UnicodeString target("THIS IS A TEST.");
    UCollationResult result = UCOL_EQUAL;
    icu::CollationKey sourceKey;
    icu::CollationKey targetKey;

    if (U_FAILURE(status))
    {
        return FALSE;
    }

    icu::LocalPointer<icu::Collator> myCollator(
        icu::Collator::createInstance(locale, status));
    if (U_FAILURE(status))
    {
        /* Report the error with the display name, converted to UTF-8 so that
         * it can be printed with %s */
        icu::UnicodeString dispName;
        std::string dispNameUtf8;
        locale.getDisplayName(dispName);
        dispName.toUTF8String(dispNameUtf8);
        fprintf(stderr, "Failed to create the collator for : \"%s\"\n",
                dispNameUtf8.c_str());
        return FALSE;
    }

    result = myCollator->compare(source, target, status);
    /* result is UCOL_LESS: the strings differ only in case, which is a
     * tertiary difference, and lowercase sorts before uppercase by default */
    if (U_FAILURE(status) || result != UCOL_LESS)
    {
        fprintf(stderr,
                "Comparing two strings with only tertiary differences in C++ failed.\n");
        return FALSE;
    }

    /* To compare them with just primary differences */
    myCollator->setStrength(icu::Collator::PRIMARY);
    result = myCollator->compare(source, target, status);
    /* result is UCOL_EQUAL */
    if (U_FAILURE(status) || result != UCOL_EQUAL)
    {
        fprintf(stderr, "Comparing two strings with no differences in C++ failed.\n");
        return FALSE;
    }

    /* Now, do the same comparison with keys */
    myCollator->getCollationKey(source, sourceKey, status);
    myCollator->getCollationKey(target, targetKey, status);
    if (U_FAILURE(status))
    {
        fprintf(stderr, "Creating collation keys in C++ failed.\n");
        return FALSE;
    }

    result = sourceKey.compareTo(targetKey, status);
    if (U_FAILURE(status) || result != UCOL_EQUAL)
    {
        fprintf(stderr, "Comparing two strings with sort keys in C++ failed.\n");
        return FALSE;
    }

    return TRUE;
}
```

### Main Function

```c++
extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);

/* Runs the C++ example and then the C example for the en_US locale. */
int main()
{
    UErrorCode status = U_ZERO_ERROR;
    fprintf(stdout, "\n");
    if (collateWithLocaleInCPP(icu::Locale("en", "US"), status) != TRUE)
    {
        fprintf(stderr, "Collate with locale in C++ failed.\n");
    }
    else
    {
        fprintf(stdout, "Collate with Locale C++ example worked!!\n");
    }

    /* Reset the error code so that the C example starts from a clean state. */
    status = U_ZERO_ERROR;
    fprintf(stdout, "\n");
    if (collateWithLocaleInC("en_US", &status) != TRUE)
    {
        fprintf(stderr, "Collate with locale in C failed.\n");
    }
    else
    {
        fprintf(stdout, "Collate with Locale C example worked!!\n");
    }
    return 0;
}
```

In **Java:**

```java
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.CollationKey;
import java.util.Locale;

public class CollateExample
{
    public static void main(String arg[])
    {
        CollateExample example = new CollateExample();
        try {
            if (!example.collateWithLocale(Locale.US)) {
                System.err.println("Collate with locale example failed.");
            }
            else {
                System.out.println("Collate with Locale example worked!!");
            }
        } catch (Exception e) {
            System.err.println("Collating with locale failed");
            e.printStackTrace();
        }
    }

    /**
     * Checks that "This is a test." and "THIS IS A TEST." compare as
     * less-than at the default strength, compare as equal at primary
     * strength, and produce equal collation keys at primary strength.
     *
     * @param locale the locale to collate for
     * @return true when every check passes, false otherwise
     */
    public boolean collateWithLocale(Locale locale) throws Exception
    {
        String source = "This is a test.";
        String target = "THIS IS A TEST.";
        Collator myCollator = Collator.getInstance(locale);

        int result = myCollator.compare(source, target);
        // result is negative: the strings differ only in case, which is a
        // tertiary difference, and lowercase sorts before uppercase by default
        if (result >= 0) {
            System.err.println(
                "Comparing two strings with only tertiary differences failed.");
            return false;
        }

        // To compare them with just primary differences
        myCollator.setStrength(Collator.PRIMARY);
        result = myCollator.compare(source, target);
        // result is 0
        if (result != 0) {
            System.err.println(
                "Comparing two strings with no differences failed.");
            return false;
        }

        // Now, do the same comparison with keys
        CollationKey sourceKey = myCollator.getCollationKey(source);
        CollationKey targetKey = myCollator.getCollationKey(target);
        result = sourceKey.compareTo(targetKey);
        if (result != 0) {
            System.err.println("Comparing two strings with sort keys failed.");
            return false;
        }
        return true;
    }
}
```

## Language-sensitive searching

String searching is a well-researched area, and there are algorithms that can
optimize the searching process. Perhaps the best is the Boyer-Moore method. For
a full description of this concept, please see Laura Werner's text searching
article.

However, implementing collation-based search with the Boyer-Moore method while
getting correct results is very tricky, and ICU no longer uses this method.

Please see the [String Search Service](./string-search) chapter.

## Using large buffers to manage sort keys

A good solution for the problem of not knowing the sort key size in advance is
to allocate a large buffer and store all the sort keys there, while keeping a
list of indexes or pointers to that buffer.
The following sample code takes an array of `UChar` string pointers and an
array that receives the offset of each sort key. It allocates a buffer, fills
it with the sort keys, reports the maximum sort key size through `maxSize`, and
returns the total number of bytes used. Once you have done this for your
strings, you just need to allocate fields of the maximum size and copy the sort
keys from the buffer into the fields.

```c++
/*
 * Generates the sort keys for `sourceSize` strings into one contiguous,
 * malloc-allocated buffer.
 *
 * coll:       collator used to generate the sort keys.
 * source:     array of `sourceSize` NUL-terminated UChar strings.
 * keys:       output array of `sourceSize` entries; keys[i] receives the
 *             offset of the sort key for source[i] inside *buffer.
 * sourceSize: number of strings in `source`.
 * buffer:     output; receives the allocated buffer. The caller releases it
 *             with free(). It is set to NULL when allocation fails.
 * maxSize:    output; receives the size of the largest sort key.
 * status:     in/out ICU error code; nothing is done if it is NULL or already
 *             holds a failure. Set to U_MEMORY_ALLOCATION_ERROR when the
 *             buffer cannot be allocated or grown.
 *
 * Returns the total number of bytes used in *buffer, or 0 on failure.
 */
uint32_t
fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, uint32_t sourceSize,
                   uint8_t **buffer, uint32_t *maxSize, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }

    uint32_t bufferSize = 16384;
    const uint32_t increment = 16384;
    uint32_t currentOffset = 0;
    uint32_t keySize = 0;
    uint32_t i = 0;

    *maxSize = 0;
    *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
    /* Test the allocation itself (*buffer), not the out-parameter. */
    if (*buffer == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }

    for (i = 0; i < sourceSize; i++) {
        keys[i] = currentOffset;
        keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                            *buffer + currentOffset,
                                            (int32_t)(bufferSize - currentOffset));
        if (keySize > bufferSize - currentOffset) {
            /* The key did not fit. Grow by as many increments as it takes for
             * this key to fit; a single increment may not be enough. */
            uint32_t newSize = bufferSize;
            while (keySize > newSize - currentOffset) {
                newSize += increment;
            }

            /* Keep the old pointer until realloc succeeds so that it can be
             * released on failure instead of being leaked. */
            uint8_t *grown = (uint8_t *)realloc(*buffer, newSize);
            if (grown == NULL) {
                free(*buffer);
                *buffer = NULL;
                *status = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }
            *buffer = grown;
            bufferSize = newSize;

            keySize = (uint32_t)ucol_getSortKey(coll, source[i], -1,
                                                *buffer + currentOffset,
                                                (int32_t)(bufferSize - currentOffset));
        }

        /* here you can hook code that does something interesting with the
         * keySize - remembers the maximum or similar... */
        if (keySize > *maxSize) {
            *maxSize = keySize;
        }
        currentOffset += keySize;
    }

    return currentOffset;
}
```