• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1---
2layout: default
3title: Collation Examples
4nav_order: 7
5parent: Collation
6---
7<!--
8© 2020 and later: Unicode, Inc. and others.
9License & terms of use: http://www.unicode.org/copyright.html
10-->
11
12# Collation Examples
13{: .no_toc }
14
15## Contents
16{: .no_toc .text-delta }
17
181. TOC
19{:toc}
20
21---
22
23## Simple Collation Sample Customization
24
The following programs demonstrate how to compare strings and create sort keys
with a collator opened for a given locale (`en_US` in these examples).
27
28In **C:**
29
30```c
31#include <stdio.h>
32#include <memory.h>
33#include <string.h>
34#include "unicode/ustring.h"
35#include "unicode/utypes.h"
36#include "unicode/uloc.h"
37#include "unicode/ucol.h"
38
39#define MAXBUFFERSIZE 100
40#define BIGBUFFERSIZE 5000
41
42UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
43{
44    UChar         dispName    [MAXBUFFERSIZE];
45    int32_t       bufferLen   = 0;
46    UChar         source            [MAXBUFFERSIZE];
47    UChar         target            [MAXBUFFERSIZE];
48    UCollationResult result   = UCOL_EQUAL;
49    uint8_t             sourceKeyArray    [MAXBUFFERSIZE];
50    uint8_t             targetKeyArray    [MAXBUFFERSIZE];
51    int32_t       sourceKeyOut      = 0,
52                targetKeyOut = 0;
53    UCollator     *myCollator = 0;
54    if (U_FAILURE(*status))
55    {
56        return false;
57    }
58    u_uastrcpy(source, "This is a test.");
59    u_uastrcpy(target, "THIS IS A TEST.");
60    myCollator = ucol_open(locale, status);
61    if (U_FAILURE(*status)){
62        bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status);
63        /*Report the error with display name... */
64        fprintf(stderr,
65        "Failed to create the collator for : \"%s\"\n", dispName);
66        return false;
67    }
68    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
69    /* result is 1, secondary differences only for ignorable space characters*/
70    if (result != UCOL_LESS)
71    {
72        fprintf(stderr,
73        "Comparing two strings with only secondary differences in C failed.\n");
74        return false;
75    }
76    /* To compare them with just primary differences */
77    ucol_setStrength(myCollator, UCOL_PRIMARY);
78    result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
79    /* result is 0 */
80    if (result != 0)
81    {
82        fprintf(stderr,
83        "Comparing two strings with no differences in C failed.\n");
84        return false;
85    }
86
87    /* Now, do the same comparison with keys */
88    sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);
89    targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);
90    result = 0;
91    result = strcmp(sourceKeyArray, targetKeyArray);
92    if (result != 0)
93    {
94        fprintf(stderr,
95        "Comparing two strings with sort keys in C failed.\n");
96        return false;
97    }
98    ucol_close(myCollator);
99    return true;
100}
101```
102
103In **C++:**
104
105```c++
#include <stdio.h>
#include <string>
#include "unicode/unistr.h"
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"
114UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
115{
116    UnicodeString dispName;
117    UnicodeString source("This is a test.");
118    UnicodeString target("THIS IS A TEST.");
119    Collator::EComparisonResult result    = Collator::EQUAL;
120    CollationKey sourceKey;
121    CollationKey targetKey;
122    Collator      *myCollator = 0;
123    if (U_FAILURE(status))
124    {
125        return false;
126    }
127    myCollator = Collator::createInstance(locale, status);
128    if (U_FAILURE(status)){
129        locale.getDisplayName(dispName);
130        /*Report the error with display name... */
131        fprintf(stderr,
132        "%s: Failed to create the collator for : \"%s\"\n", dispName);
133        return false;
134    }
135    result = myCollator->compare(source, target);
136    /* result is 1, secondary differences only for ignorable space characters*/
137    if (result != UCOL_LESS)
138    {
139        fprintf(stderr,
140        "Comparing two strings with only secondary differences in C failed.\n");
141        return false;
142    }
143    /* To compare them with just primary differences */
144    myCollator->setStrength(Collator::PRIMARY);
145    result = myCollator->compare(source, target);
146    /* result is 0 */
147    if (result != 0)
148    {
149        fprintf(stderr,
150        "Comparing two strings with no differences in C failed.\n");
151        return false;
152    }
153    /* Now, do the same comparison with keys */
154    myCollator->getCollationKey(source, sourceKey, status);
155    myCollator->getCollationKey(target, targetKey, status);
156    result = Collator::EQUAL;
157
158    result = sourceKey.compareTo(targetKey);
159    if (result != 0)
160    {
161        fprintf(stderr,
162        "%s: Comparing two strings with sort keys in C failed.\n");
163        return false;
164    }
165    delete myCollator;
166    return true;
167}
168```
169
170### Main Function
171
172```c++
173extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
174int main()
175{
176   UErrorCode status = U_ZERO_ERROR;
177   fprintf(stdout, "\n");
178   if (collateWithLocaleInCPP(Locale("en", "US"), status) != true)
179   {
180        fprintf(stderr,
181        "Collate with locale in C++ failed.\n");
182   } else
183   {
184       fprintf(stdout, "Collate with Locale C++ example worked!!\n");
185   }
186   status = U_ZERO_ERROR;
187   fprintf(stdout, "\n");
188   if (collateWithLocaleInC("en_US", &status) != true)
189   {
190        fprintf(stderr,
191        "%s: Collate with locale in C failed.\n");
192   } else
193   {
194       fprintf(stdout, "Collate with Locale C example worked!!\n");
195   }
196   return 0;
197}
198```
199
200In **Java:**
201
202```java
203import com.ibm.icu.text.Collator;
204import com.ibm.icu.text.CollationElementIterator;
205import com.ibm.icu.text.CollationKey;
206import java.util.Locale;
207
208public class CollateExample
209{
210
211    public static void main(String arg[])
212    {
213        CollateExample example = new CollateExample();
214        try {
215            if (!example.collateWithLocale(Locale.US)) {
216                System.err.println("Collate with locale example failed.");
217            }
218            else {
219                System.out.println("Collate with Locale example worked!!");
220            }
221        } catch (Exception e) {
222            System.err.println("Collating with locale failed");
223            e.printStackTrace();
224        }
225    }
226
227    public boolean collateWithLocale(Locale locale) throws Exception
228    {
229        String source = "This is a test.";
230        String target = "THIS IS A TEST.";
231        Collator myCollator = Collator.getInstance(locale);
232
233        int result = myCollator.compare(source, target);
234        // result is 1, secondary differences only for ignorable space characters
235        if (result >= 0) {
236            System.err.println(
237                "Comparing two strings with only secondary differences failed.");
238            return false;
239        }
240        // To compare them with just primary differences
241        myCollator.setStrength(Collator.PRIMARY);
242        result = myCollator.compare(source, target);
243        // result is 0
244        if (result != 0) {
245            System.err.println(
246                           "Comparing two strings with no differences failed.");
247            return false;
248        }
249        // Now, do the same comparison with keys
250        CollationKey sourceKey = myCollator.getCollationKey(source);
251        CollationKey targetKey = myCollator.getCollationKey(target);
252        result = sourceKey.compareTo(targetKey);
253        if (result != 0) {
254            System.err.println("Comparing two strings with sort keys failed.");
255            return false;
256        }
257        return true;
258    }
259}
260```
261
262## Language-sensitive searching
263
String searching is a well-researched area, and there are algorithms that can
optimize the searching process. Perhaps the best is the Boyer-Moore method. For a
full description of this method, please see Laura Werner's text searching article
(<http://icu-project.org/docs/papers/efficient_text_searching_in_java.html>).
269
270However, implementing collation-based search with the Boyer-Moore method
271while getting correct results is very tricky, and ICU no longer uses this method
272(as of ICU4C 4.0 and ICU4J 53).
273
274Please see the [String Search Service](./string-search) chapter.
275
276## Using large buffers to manage sort keys
277
278A good solution for the problem of not knowing the sort key size in advance is
279to allocate a large buffer and store all the sort keys there, while keeping a
280list of indexes or pointers to that buffer.
281
The following sample code takes an array of `UChar` pointers (the strings) and an
array of key indexes. It allocates a buffer, fills it with the sort keys, and
records the offset of each key in the index array; it also reports the maximum
size of a sort key. Once you have done this for your strings, you just need to
allocate fields of that maximum size and copy your sort keys from the buffer to
the fields.
287
288```c++
289uint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
290                            uint32_t sourceSize, uint8_t **buffer,
291                            uint32_t *maxSize, UErrorCode *status)
292{
293  if(status == NULL || U_FAILURE(*status)) {
294    return 0;
295  }
296
297  uint32_t bufferSize = 16384;
298  uint32_t increment = 16384;
299  uint32_t currentOffset = 0;
300  uint32_t keySize = 0;
301  uint32_t i = 0;
302  *maxSize = 0;
303
304  *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));
305  if(buffer == NULL) {
306    *status = U_MEMORY_ALLOCATION_ERROR;
307    return 0;
308  }
309
310  for(i = 0; i < sourceSize; i++) {
311    keys[i] = currentOffset;
312    keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
313    if(keySize > bufferSize-currentOffset) {
314      *buffer = (uint8_t *)realloc(*buffer, bufferSize+increment);
315      if(buffer == NULL) {
316        *status = U_MEMORY_ALLOCATION_ERROR;
317        return 0;
318      }
319      bufferSize += increment;
320      keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);
321    }
322    /* here you can hook code that does something interesting with the keySize -
323     * remembers the maximum or similar...
324     */
325    if(keySize > *maxSize) {
326      *maxSize = keySize;
327    }
328    currentOffset += keySize;
329  }
330
331  return currentOffset;
332}
333```
334