• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *******************************************************************************
3  *
4  *   Copyright (C) 2003-2010, International Business Machines
5  *   Corporation and others.  All Rights Reserved.
6  *
7  *******************************************************************************
8  *   file name:  uidna.h
9  *   encoding:   US-ASCII
10  *   tab size:   8 (not used)
11  *   indentation:4
12  *
13  *   created on: 2003feb1
14  *   created by: Ram Viswanadha
15  */
16 
17 #ifndef __UIDNA_H__
18 #define __UIDNA_H__
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_IDNA
23 
24 #include "unicode/parseerr.h"
25 
26 /**
27  * \file
28  * \brief C API: Internationalized Domain Names in Applications Transformation
29  *
30  * UIDNA API implements the IDNA protocol as defined in the IDNA RFC
31  * (http://www.ietf.org/rfc/rfc3490.txt).
32  * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
33  * containing non-ASCII code points are required to be processed by
34  * ToASCII operation before passing it to resolver libraries. Domain names
35  * that are obtained from resolver libraries are required to be processed by
36  * ToUnicode operation before displaying the domain name to the user.
37  * IDNA requires that implementations process input strings with Nameprep
38  * (http://www.ietf.org/rfc/rfc3491.txt),
39  * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
40  * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
41  * Implementations of IDNA MUST fully implement Nameprep and Punycode;
42  * neither Nameprep nor Punycode are optional.
43  * The input and output of ToASCII and ToUnicode operations are Unicode
44  * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
45  * multiple times to an input string will yield the same result as applying the operation
46  * once.
47  * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
48  * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
49  *
50  */
51 
52 /**
53  * Default options value (no option bits set): prohibit processing of unassigned
54  * code points in the input and do not check whether the input conforms to STD3 ASCII rules.
55  *
56  * @see  uidna_toASCII uidna_toUnicode
57  * @stable ICU 2.6
58  */
59 #define UIDNA_DEFAULT          0x0000
60 /**
61  * Option bit to allow processing of unassigned code points in the input.
62  *
63  * @see  uidna_toASCII uidna_toUnicode
64  * @stable ICU 2.6
65  */
66 #define UIDNA_ALLOW_UNASSIGNED 0x0001
67 /**
68  * Option bit to check whether the input conforms to STD3 ASCII rules.
69  *
70  * @see  uidna_toASCII uidna_toUnicode
71  * @stable ICU 2.6
72  */
73 #define UIDNA_USE_STD3_RULES   0x0002
74 
75 /**
76  * This function implements the ToASCII operation as defined in the IDNA RFC.
77  * This operation is done on <b>single labels</b> before sending them to something that expects
78  * ASCII names. A label is an individual part of a domain name. Labels are usually
79  * separated by dots; e.g., "www.example.com" is composed of 3 labels: "www", "example", and "com".
80  *
81  *
82  * @param src               Input UChar array containing label in Unicode.
83  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
84  * @param dest              Output UChar array with ASCII (ACE encoded) label.
85  * @param destCapacity      Size of dest.
86  * @param options           A bit set of options:
87  *
88  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
89  *                              and do not use STD3 ASCII rules.
90  *                              If unassigned code points are found the operation fails with
91  *                              U_UNASSIGNED_ERROR error code.
92  *
93  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
94  *                              If this option is set, the unassigned code points in the input
95  *                              are treated as normal Unicode code points.
96  *
97  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
98  *                              If this option is set and the input does not satisfy STD3 rules,
99  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
100  *
101  * @param parseError        Pointer to UParseError struct to receive information on position
102  *                          of error if an error is encountered. Can be NULL.
103  * @param status            ICU in/out error code parameter.
104  *                          U_INVALID_CHAR_FOUND if src contains
105  *                          unmatched single surrogates.
106  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
107  *                          too many code points.
108  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
109  * @return The length of the result string, both on success and in case of a buffer overflow;
110  *         in the latter case it will be greater than destCapacity.
111  * @stable ICU 2.6
112  */
113 U_STABLE int32_t U_EXPORT2
114 uidna_toASCII(const UChar* src, int32_t srcLength,
115               UChar* dest, int32_t destCapacity,
116               int32_t options,
117               UParseError* parseError,
118               UErrorCode* status);
119 
120 
121 /**
122  * This function implements the ToUnicode operation as defined in the IDNA RFC.
123  * This operation is done on <b>single labels</b> before sending them to something that expects
124  * Unicode names. A label is an individual part of a domain name. Labels are usually
125  * separated by dots; e.g., "www.example.com" is composed of 3 labels: "www", "example", and "com".
126  *
127  * @param src               Input UChar array containing ASCII (ACE encoded) label.
128  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
129  * @param dest              Output UChar array containing the converted Unicode equivalent of the label.
130  * @param destCapacity      Size of dest.
131  * @param options           A bit set of options:
132  *
133  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
134  *                              and do not use STD3 ASCII rules.
135  *                              If unassigned code points are found the operation fails with
136  *                              U_UNASSIGNED_ERROR error code.
137  *
138  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
139  *                              If this option is set, the unassigned code points in the input
140  *                              are treated as normal Unicode code points. <b> Note: </b> This option is
141  *                              required on toUnicode operation because the RFC mandates
142  *                              verification of decoded ACE input by applying toASCII and comparing
143  *                              its output with the source.
144  *
145  *
146  *
147  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
148  *                              If this option is set and the input does not satisfy STD3 rules,
149  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
150  *
151  * @param parseError        Pointer to UParseError struct to receive information on position
152  *                          of error if an error is encountered. Can be NULL.
153  * @param status            ICU in/out error code parameter.
154  *                          U_INVALID_CHAR_FOUND if src contains
155  *                          unmatched single surrogates.
156  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
157  *                          too many code points.
158  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
159  * @return The length of the result string, both on success and in case of a buffer overflow;
160  *         in the latter case it will be greater than destCapacity.
161  * @stable ICU 2.6
162  */
163 U_STABLE int32_t U_EXPORT2
164 uidna_toUnicode(const UChar* src, int32_t srcLength,
165                 UChar* dest, int32_t destCapacity,
166                 int32_t options,
167                 UParseError* parseError,
168                 UErrorCode* status);
169 
170 
171 /**
172  * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
173  * This operation is done on complete domain names, e.g., "www.example.com".
174  * It is important to note that this operation can fail. If it fails, then the input
175  * domain name cannot be used as an Internationalized Domain Name and the application
176  * should have methods defined to deal with the failure.
177  *
178  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
179  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
180  * and then convert. This function does not offer that level of granularity. The options once
181  * set will apply to all labels in the domain name.
182  *
183  * @param src               Input UChar array containing IDN in Unicode.
184  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
185  * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
186  * @param destCapacity      Size of dest.
187  * @param options           A bit set of options:
188  *
189  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
190  *                              and do not use STD3 ASCII rules.
191  *                              If unassigned code points are found the operation fails with
192  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
193  *
194  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
195  *                              If this option is set, the unassigned code points in the input
196  *                              are treated as normal Unicode code points.
197  *
198  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
199  *                              If this option is set and the input does not satisfy STD3 rules,
200  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
201  *
202  * @param parseError        Pointer to UParseError struct to receive information on position
203  *                          of error if an error is encountered. Can be NULL.
204  * @param status            ICU in/out error code parameter.
205  *                          U_INVALID_CHAR_FOUND if src contains
206  *                          unmatched single surrogates.
207  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
208  *                          too many code points.
209  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
210  * @return The length of the result string, both on success and in case of a buffer overflow;
211  *         in the latter case it will be greater than destCapacity.
212  * @stable ICU 2.6
213  */
214 U_STABLE int32_t U_EXPORT2
215 uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
216                    UChar* dest, int32_t destCapacity,
217                    int32_t options,
218                    UParseError* parseError,
219                    UErrorCode* status);
220 
221 /**
222  * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
223  * This operation is done on complete domain names, e.g., "www.example.com".
224  *
225  * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
226  * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
227  * and then convert. This function does not offer that level of granularity. The options once
228  * set will apply to all labels in the domain name.
229  *
230  * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
231  * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
232  * @param dest              Output UChar array containing the Unicode equivalent of the source IDN.
233  * @param destCapacity      Size of dest.
234  * @param options           A bit set of options:
235  *
236  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
237  *                              and do not use STD3 ASCII rules.
238  *                              If unassigned code points are found the operation fails with
239  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
240  *
241  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
242  *                              If this option is set, the unassigned code points in the input
243  *                              are treated as normal Unicode code points.
244  *
245  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
246  *                              If this option is set and the input does not satisfy STD3 rules,
247  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
248  *
249  * @param parseError        Pointer to UParseError struct to receive information on position
250  *                          of error if an error is encountered. Can be NULL.
251  * @param status            ICU in/out error code parameter.
252  *                          U_INVALID_CHAR_FOUND if src contains
253  *                          unmatched single surrogates.
254  *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
255  *                          too many code points.
256  *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough.
257  * @return The length of the result string, both on success and in case of a buffer overflow;
258  *         in the latter case it will be greater than destCapacity.
259  * @stable ICU 2.6
260  */
261 U_STABLE int32_t U_EXPORT2
262 uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
263                      UChar* dest, int32_t destCapacity,
264                      int32_t options,
265                      UParseError* parseError,
266                      UErrorCode* status);
267 
268 /**
269  * Compare two IDN strings for equivalence.
270  * This function splits the domain names into labels and compares them.
271  * According to the IDNA RFC, whenever two labels are compared, they are
272  * considered equal if and only if their ASCII forms (obtained by
273  * applying toASCII) match using a case-insensitive ASCII comparison.
274  * Two domain names are considered a match if and only if all labels
275  * match regardless of whether label separators match.
276  *
277  * @param s1                First source string.
278  * @param length1           Length of first source string, or -1 if NUL-terminated.
279  *
280  * @param s2                Second source string.
281  * @param length2           Length of second source string, or -1 if NUL-terminated.
282  * @param options           A bit set of options:
283  *
284  *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
285  *                              and do not use STD3 ASCII rules.
286  *                              If unassigned code points are found the operation fails with
287  *                              U_UNASSIGNED_CODE_POINT_FOUND error code.
288  *
289  *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations.
290  *                              If this option is set, the unassigned code points in the input
291  *                              are treated as normal Unicode code points.
292  *
293  *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions.
294  *                              If this option is set and the input does not satisfy STD3 rules,
295  *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR.
296  *
297  * @param status            ICU error code in/out parameter.
298  *                          Must fulfill U_SUCCESS before the function call.
299  * @return <0 or 0 or >0 as usual for string comparisons.
300  * @stable ICU 2.6
301  */
302 U_STABLE int32_t U_EXPORT2
303 uidna_compare(  const UChar *s1, int32_t length1,
304                 const UChar *s2, int32_t length2,
305                 int32_t options,
306                 UErrorCode* status);
307 
308 #endif /* #if !UCONFIG_NO_IDNA */
309 
310 #endif
311