• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  *
4  *   Copyright (C) 2003-2004, International Business Machines
5  *   Corporation and others.  All Rights Reserved.
6  *
7  *******************************************************************************
8  *   file name:  uidna.h
9  *   encoding:   US-ASCII
10  *   tab size:   8 (not used)
11  *   indentation:4
12  *
13  *   created on: 2003feb1
14  *   created by: Ram Viswanadha
15  */
16 
17 #ifndef __UIDNA_H__
18 #define __UIDNA_H__
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_IDNA
23 
24 #include "unicode/parseerr.h"
25 
26 /**
27  *\file
28  * UIDNA API implements the IDNA protocol as defined in the IDNA RFC
29  * (http://www.ietf.org/rfc/rfc3490.txt).
30  * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
31  * containing non-ASCII code points are required to be processed by
32  * ToASCII operation before passing it to resolver libraries. Domain names
33  * that are obtained from resolver libraries are required to be processed by
34  * ToUnicode operation before displaying the domain name to the user.
35  * IDNA requires that implementations process input strings with Nameprep
36  * (http://www.ietf.org/rfc/rfc3491.txt),
37  * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
38  * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
39  * Implementations of IDNA MUST fully implement Nameprep and Punycode;
40  * neither Nameprep nor Punycode are optional.
41  * The input and output of ToASCII and ToUnicode operations are Unicode
42  * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
43  * multiple times to an input string will yield the same result as applying the operation
44  * once.
45  * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
46  * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
47  *
48  */
49 
50 #ifndef U_HIDE_DRAFT_API
51 /* NOTE(review): the constants below are tagged @stable yet sit inside a U_HIDE_DRAFT_API guard - confirm the guard is intended. */
52 /**
53  * Default options: prohibit processing of unassigned code points in the input
54  * and do not check whether the input conforms to STD-3 ASCII rules.
55  *
56  * @see  uidna_toASCII uidna_toUnicode
57  * @stable ICU 2.6
58  */
59 #define UIDNA_DEFAULT          0x0000
60 /**
61  * Option to allow processing of unassigned code points in the input.
62  *
63  * @see  uidna_toASCII uidna_toUnicode
64  * @stable ICU 2.6
65  */
66 #define UIDNA_ALLOW_UNASSIGNED 0x0001
67 /**
68  * Option to check whether the input conforms to STD-3 ASCII rules.
69  *
70  * @see  uidna_toASCII uidna_toUnicode
71  * @stable ICU 2.6
72  */
73 #define UIDNA_USE_STD3_RULES   0x0002
74 
75 #endif /*U_HIDE_DRAFT_API*/
76 
77 /**
78  * This function implements the ToASCII operation as defined in the IDNA RFC.
79  * This operation is done on <b>single labels</b> before sending them to something that expects
80  * ASCII names. A label is an individual part of a domain name. Labels are usually
81  * separated by dots; e.g., "www.example.com" is composed of 3 labels:
82  * "www", "example", and "com".
83  *
84  *
85  * @param src               Input UChar array containing label in Unicode.
86  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
87  * @param dest              Output UChar array with ASCII (ACE encoded) label.
88  * @param destCapacity      Size of dest.
89  * @param options           A bit set of options:
90  *
91  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
92  *                              and do not use STD3 ASCII rules.
93  *                              If unassigned code points are found the operation fails with
94  *                              U_UNASSIGNED_ERROR error code.
95  *
96  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
97  *                              If this option is set, the unassigned code points in the input
98  *                              are treated as normal Unicode code points.
99  *
100  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
101  *                              If this option is set and the input does not satisfy STD3 rules,
102  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
103  *
104  * @param parseError        Pointer to UParseError struct to receive information on position
105  *                          of error if an error is encountered. Can be NULL.
106  * @param status            ICU in/out error code parameter.
107  *                          U_INVALID_CHAR_FOUND if src contains
108  *                          unmatched single surrogates.
109  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
110  *                          too many code points.
111  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
112  * @return                  Number of ASCII characters converted.
113  * @stable ICU 2.6
114  */
115 U_STABLE int32_t U_EXPORT2
116 uidna_toASCII(const UChar* src, int32_t srcLength,
117               UChar* dest, int32_t destCapacity,
118               int32_t options,
119               UParseError* parseError,
120               UErrorCode* status);
121 
122 
123 /**
124  * This function implements the ToUnicode operation as defined in the IDNA RFC.
125  * This operation is done on <b>single labels</b> before sending them to something that expects
126  * Unicode names. A label is an individual part of a domain name. Labels are usually
127  * separated by dots; e.g., "www.example.com" is composed of 3 labels:
128  * "www", "example", and "com".
129  *
130  * @param src               Input UChar array containing ASCII (ACE encoded) label.
131  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
132  * @param dest              Output UChar array containing converted Unicode equivalent of label.
133  * @param destCapacity      Size of dest.
134  * @param options           A bit set of options:
135  *
136  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
137  *                              and do not use STD3 ASCII rules.
138  *                              If unassigned code points are found the operation fails with
139  *                              U_UNASSIGNED_ERROR error code.
140  *
141  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
142  *                              If this option is set, the unassigned code points in the input
143  *                              are treated as normal Unicode code points. <b>Note:</b> This option is
144  *                              required on the toUnicode operation because the RFC mandates
145  *                              verification of decoded ACE input by applying toASCII and comparing
146  *                              its output with the source.
147  *
148  *
149  *
150  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
151  *                              If this option is set and the input does not satisfy STD3 rules,
152  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
153  *
154  * @param parseError        Pointer to UParseError struct to receive information on position
155  *                          of error if an error is encountered. Can be NULL.
156  * @param status            ICU in/out error code parameter.
157  *                          U_INVALID_CHAR_FOUND if src contains
158  *                          unmatched single surrogates.
159  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
160  *                          too many code points.
161  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
162  * @return                  Number of Unicode characters converted.
163  * @stable ICU 2.6
164  */
165 U_STABLE int32_t U_EXPORT2
166 uidna_toUnicode(const UChar* src, int32_t srcLength,
167                 UChar* dest, int32_t destCapacity,
168                 int32_t options,
169                 UParseError* parseError,
170                 UErrorCode* status);
171 
172 
173 /**
174  * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
175  * This operation is done on complete domain names, e.g., "www.example.com".
176  * It is important to note that this operation can fail. If it fails, then the input
177  * domain name cannot be used as an Internationalized Domain Name and the application
178  * should have methods defined to deal with the failure.
179  *
180  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
181  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
182  * and then convert. This function does not offer that level of granularity. The options once
183  * set will apply to all labels in the domain name.
184  *
185  * @param src               Input UChar array containing IDN in Unicode.
186  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
187  * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
188  * @param destCapacity      Size of dest.
189  * @param options           A bit set of options:
190  *
191  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
192  *                              and do not use STD3 ASCII rules.
193  *                              If unassigned code points are found the operation fails with
194  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
195  *
196  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
197  *                              If this option is set, the unassigned code points in the input
198  *                              are treated as normal Unicode code points.
199  *
200  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
201  *                              If this option is set and the input does not satisfy STD3 rules,
202  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
203  *
204  * @param parseError        Pointer to UParseError struct to receive information on position
205  *                          of error if an error is encountered. Can be NULL.
206  * @param status            ICU in/out error code parameter.
207  *                          U_INVALID_CHAR_FOUND if src contains
208  *                          unmatched single surrogates.
209  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
210  *                          too many code points.
211  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
212  * @return                  Number of ASCII characters converted.
213  * @stable ICU 2.6
214  */
215 U_STABLE int32_t U_EXPORT2
216 uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
217                    UChar* dest, int32_t destCapacity,
218                    int32_t options,
219                    UParseError* parseError,
220                    UErrorCode* status);
221 
222 /**
223  * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
224  * This operation is done on complete domain names, e.g., "www.example.com".
225  *
226  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
227  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
228  * and then convert. This function does not offer that level of granularity. The options once
229  * set will apply to all labels in the domain name.
230  *
231  * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
232  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
233  * @param dest              Output UChar array containing Unicode equivalent of source IDN.
234  * @param destCapacity      Size of dest.
235  * @param options           A bit set of options:
236  *
237  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
238  *                              and do not use STD3 ASCII rules.
239  *                              If unassigned code points are found the operation fails with
240  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
241  *
242  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
243  *                              If this option is set, the unassigned code points in the input
244  *                              are treated as normal Unicode code points.
245  *
246  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
247  *                              If this option is set and the input does not satisfy STD3 rules,
248  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
249  *
250  * @param parseError        Pointer to UParseError struct to receive information on position
251  *                          of error if an error is encountered. Can be NULL.
252  * @param status            ICU in/out error code parameter.
253  *                          U_INVALID_CHAR_FOUND if src contains
254  *                          unmatched single surrogates.
255  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
256  *                          too many code points.
257  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
258  * @return                  Number of Unicode characters converted.
259  * @stable ICU 2.6
260  */
261 U_STABLE int32_t U_EXPORT2
262 uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
263                      UChar* dest, int32_t destCapacity,
264                      int32_t options,
265                      UParseError* parseError,
266                      UErrorCode* status);
267 
268 /**
269  * Compare two IDN strings for equivalence.
270  * This function splits the domain names into labels and compares them.
271  * According to IDN RFC, whenever two labels are compared, they are
272  * considered equal if and only if their ASCII forms (obtained by
273  * applying toASCII) match using a case-insensitive ASCII comparison.
274  * Two domain names are considered a match if and only if all labels
275  * match regardless of whether label separators match.
276  *
277  * @param s1                First source string.
278  * @param length1           Length of first source string, or -1 if NUL-terminated.
279  *
280  * @param s2                Second source string.
281  * @param length2           Length of second source string, or -1 if NUL-terminated.
282  * @param options           A bit set of options:
283  *
284  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
285  *                              and do not use STD3 ASCII rules.
286  *                              If unassigned code points are found the operation fails with
287  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
288  *
289  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
290  *                              If this option is set, the unassigned code points in the input
291  *                              are treated as normal Unicode code points.
292  *
293  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
294  *                              If this option is set and the input does not satisfy STD3 rules,
295  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
296  *
297  * @param status            ICU error code in/out parameter.
298  *                          Must fulfill U_SUCCESS before the function call.
299  * @return <0 or 0 or >0 as usual for string comparisons
300  * @stable ICU 2.6
301  */
302 U_STABLE int32_t U_EXPORT2
303 uidna_compare(  const UChar *s1, int32_t length1,
304                 const UChar *s2, int32_t length2,
305                 int32_t options,
306                 UErrorCode* status);
307 
308 #endif /* #if !UCONFIG_NO_IDNA */
309 
310 #endif
311