• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2006, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  *   file name:  ucsdet.h
7  *   encoding:   US-ASCII
8  *   indentation:4
9  *
10  *   created on: 2005Aug04
11  *   created by: Andy Heninger
12  *
13  *   ICU Character Set Detection, API for C
14  *
15  *   Draft version 18 Oct 2005
16  *
17  */
18 
19 #ifndef __UCSDET_H
20 #define __UCSDET_H
21 
22 #include "unicode/utypes.h"
23 
24 #if !UCONFIG_NO_CONVERSION
25 #include "unicode/uenum.h"
26 
27 /**
28  * \file
29  * \brief C API: Charset Detection API
30  *
31  * This API provides a facility for detecting the
32  * charset or encoding of character data in an unknown text format.
33  * The input data can be from an array of bytes.
34  * <p>
35  * Character set detection is at best an imprecise operation.  The detection
36  * process will attempt to identify the charset that best matches the characteristics
37  * of the byte data, but the process is partly statistical in nature, and
38  * the results cannot be guaranteed to always be correct.
39  * <p>
40  * For best accuracy in charset detection, the input data should be primarily
41  * in a single language, and a minimum of a few hundred bytes worth of plain text
42  * in the language are needed.  The detection process will attempt to
43  * ignore html or xml style markup that could otherwise obscure the content.
44  */
45 
46 
47 struct UCharsetDetector;
48 /**
49   * Opaque structure representing a charset detector.
50   * @draft ICU 3.6
51   */
52 typedef struct UCharsetDetector UCharsetDetector;
53 
54 struct UCharsetMatch;
55 /**
56   *  Opaque structure representing a match that was identified
57   *  from a charset detection operation.
58   *  @draft ICU 3.6
59   */
60 typedef struct UCharsetMatch UCharsetMatch;
61 
62 /**
63   *  Open a charset detector.
64   *
65   *  @param status Any error conditions occurring during the open
66   *                operation are reported back in this variable.
67   *  @return the newly opened charset detector; release it with ucsdet_close().
68   *  @draft ICU 3.6
69   */
70 U_DRAFT UCharsetDetector * U_EXPORT2
71 ucsdet_open(UErrorCode   *status);
72 
73 /**
74   * Close a charset detector.  All storage and any other resources
75   *   owned by this charset detector will be released.  Failure to
76   *   close a charset detector when finished with it can result in
77   *   memory leaks in the application.
78   *
79   *  @param ucsd  The charset detector to be closed.
80   *  @draft ICU 3.6
81   */
82 U_DRAFT void U_EXPORT2
83 ucsdet_close(UCharsetDetector *ucsd);
84 
85 /**
86   * Set the input byte data whose charset is to be detected.
87   *
88   * Ownership of the input text byte array remains with the caller.
89   * The input string must not be altered or deleted until the charset
90   * detector is either closed or reset to refer to different input text.
91   *
92   * @param ucsd   the charset detector to be used.
93   * @param textIn the input text of unknown encoding.
94   * @param len    the length of the input text, or -1 if the text
95   *               is NUL terminated.
96   * @param status any error conditions are reported back in this variable.
97   *
98   * @draft ICU 3.6
99   */
100 U_DRAFT void U_EXPORT2
101 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
102 
103 
104 /** Set the declared encoding for charset detection.
105  *  The declared encoding of an input text is an encoding obtained
106  *  by the user from an http header or xml declaration or similar source that
107  *  can be provided as an additional hint to the charset detector.
108  *
109  *  How and whether the declared encoding will be used during the
110  *  detection process is TBD.
111  *
112  * @param ucsd      the charset detector to be used.
113  * @param encoding  an encoding for the current data obtained from
114  *                  a header or declaration or other source outside
115  *                  of the byte data itself.
116  * @param length    the length of the encoding name, or -1 if the name string
117  *                  is NUL terminated.
118  * @param status    any error conditions are reported back in this variable.
119  *
120  * @draft ICU 3.6
121  */
122 U_DRAFT void U_EXPORT2
123 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
124 
125 
126 /**
127  * Return the charset that best matches the supplied input data.
128  *
129  * Note though, that because the detection
130  * only looks at the start of the input data,
131  * there is a possibility that the returned charset will fail to handle
132  * the full set of input data.
133  * <p>
134  * The returned UCharsetMatch object is owned by the UCharsetDetector.
135  * It will remain valid until the detector input is reset, or until
136  * the detector is closed.
137  * <p>
138  * The function will fail if
139  *  <ul>
140  *    <li>no charset appears to match the data.</li>
141  *    <li>no input text has been provided</li>
142  *  </ul>
143  *
144  * @param ucsd      the charset detector to be used.
145  * @param status    any error conditions are reported back in this variable.
146  * @return          a UCharsetMatch  representing the best matching charset,
147  *                  or NULL if no charset matches the byte data.
148  *
149  * @draft ICU 3.6
150  */
151 U_DRAFT const UCharsetMatch * U_EXPORT2
152 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
153 
154 
155 /**
156  *  Find all charset matches that appear to be consistent with the input,
157  *  returning an array of results.  The results are ordered with the
158  *  best quality match first.
159  *
160  *  Because the detection only looks at a limited amount of the
161  *  input byte data, some of the returned charsets may fail to handle
162  *  all of the input data.
163  *  <p>
164  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
165  *  They will remain valid until the detector is closed or modified.
166  *
167  * <p>
168  * Returns an error if
169  *  <ul>
170  *    <li>no charsets appear to match the input data.</li>
171  *    <li>no input text has been provided</li>
172  *  </ul>
173  *
174  * @param ucsd          the charset detector to be used.
175  * @param matchesFound  pointer to a variable that will be set to the
176  *                      number of charsets identified that are consistent with
177  *                      the input data.  Output only.
178  * @param status        any error conditions are reported back in this variable.
179  * @return              A pointer to an array of pointers to UCharsetMatch objects.
180  *                      This array, and the UCharsetMatch instances to which it refers,
181  *                      are owned by the UCharsetDetector, and will remain valid until
182  *                      the detector is closed or modified.
183  * @draft ICU 3.6
184  */
185 U_DRAFT const UCharsetMatch ** U_EXPORT2
186 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
187 
188 
189 
190 /**
191  *  Get the name of the charset represented by a UCharsetMatch.
192  *
193  *  The storage for the returned name string is owned by the
194  *  UCharsetMatch, and will remain valid while the UCharsetMatch
195  *  is valid.
196  *
197  *  The name returned is suitable for use with the ICU conversion APIs.
198  *
199  *  @param ucsm    The charset match object.
200  *  @param status  Any error conditions are reported back in this variable.
201  *  @return        The name of the matching charset.
202  *
203  *  @draft ICU 3.6
204  */
205 U_DRAFT const char * U_EXPORT2
206 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
207 
208 /**
209  *  Get a confidence number for the quality of the match of the byte
210  *  data with the charset.  Confidence numbers range from zero to 100,
211  *  with 100 representing complete confidence and zero representing
212  *  no confidence.
213  *
214  *  The confidence values are somewhat arbitrary.  They define an
215  *  ordering within the results for any single detection operation
216  *  but are not generally comparable between the results for different input.
217  *
218  *  A confidence value of ten does have a general meaning - it is used
219  *  for charsets that can represent the input data, but for which there
220  *  is no other indication that suggests that the charset is the correct one.
221  *  Pure 7 bit ASCII data, for example, is compatible with a
222  *  great many charsets, most of which will appear as possible matches
223  *  with a confidence of 10.
224  *
225  *  @param ucsm    The charset match object.
226  *  @param status  Any error conditions are reported back in this variable.
227  *  @return        A confidence number for the charset match.
228  *
229  *  @draft ICU 3.6
230  */
231 U_DRAFT int32_t U_EXPORT2
232 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
233 
234 /**
235  *  Get the RFC 3066 code for the language of the input data.
236  *
237  *  The Charset Detection service is intended primarily for detecting
238  *  charsets, not language.  For some, but not all, charsets, a language is
239  *  identified as a byproduct of the detection process, and that is what
240  *  is returned by this function.
241  *
242  *  CAUTION:
243  *    1.  Language information is not available for input data encoded in
244  *        all charsets. In particular, no language is identified
245  *        for UTF-8 input data.
246  *
247  *    2.  Closely related languages may sometimes be confused.
248  *
249  *  If more accurate language detection is required, a linguistic
250  *  analysis package should be used.
251  *
252  *  The storage for the returned name string is owned by the
253  *  UCharsetMatch, and will remain valid while the UCharsetMatch
254  *  is valid.
255  *
256  *  @param ucsm    The charset match object.
257  *  @param status  Any error conditions are reported back in this variable.
258  *  @return        The RFC 3066 code for the language of the input data, or
259  *                 an empty string if the language could not be determined.
260  *
261  *  @draft ICU 3.6
262  */
263 U_DRAFT const char * U_EXPORT2
264 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
265 
266 
267 /**
268   *  Get the entire input text as a UChar string, placing it into
269   *  a caller-supplied buffer.  A terminating
270   *  NUL character will be appended to the buffer if space is available.
271   *
272   *  The number of UChars in the output string, not including the terminating
273   *  NUL, is returned.
274   *
275   *  If the supplied buffer is smaller than required to hold the output,
276   *  the contents of the buffer are undefined.  The full output string length
277   *  (in UChars) is returned as always, and can be used to allocate a buffer
278   *  of the correct size.
279   *
280   *
281   * @param ucsm    The charset match object.
282   * @param buf     A UChar buffer to be filled with the converted text data.
283   * @param cap     The capacity of the buffer in UChars.
284   * @param status  Any error conditions are reported back in this variable.
285   * @return        The number of UChars in the output string, not counting the terminating NUL.
286   *
287   * @draft ICU 3.6
288   */
289 U_DRAFT  int32_t U_EXPORT2
290 ucsdet_getUChars(const UCharsetMatch *ucsm,
291                  UChar *buf, int32_t cap, UErrorCode *status);
292 
293 
294 
295 /**
296   *  Get an iterator over the set of all detectable charsets -
297   *  over the charsets that are known to the charset detection
298   *  service.
299   *
300   *  The returned UEnumeration provides access to the names of
301   *  the charsets.
302   *
303   *  The state of the Charset detector that is passed in does not
304   *  affect the result of this function, but requiring a valid, open
305   *  charset detector as a parameter ensures that the charset detection
306   *  service has been safely initialized and that the required detection
307   *  data is available.
308   *
309   *  @param ucsd a Charset detector.
310   *  @param status  Any error conditions are reported back in this variable.
311   *  @return an iterator providing access to the detectable charset names.
312   *  @draft ICU 3.6
313   */
314 
315 U_DRAFT  UEnumeration * U_EXPORT2
316 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
317 
318 
319 /**
320   *  Test whether input filtering is enabled for this charset detector.
321   *  Input filtering removes text that appears to be HTML or xml
322   *  markup from the input before applying the code page detection
323   *  heuristics.
324   *
325   *  @param ucsd  The charset detector to check.
326   *  @return TRUE if filtering is enabled.
327   *  @draft ICU 3.6
328   */
329 U_DRAFT  UBool U_EXPORT2
330 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
331 
332 
333 /**
334  * Enable filtering of input text. If filtering is enabled,
335  * text within angle brackets ("<" and ">") will be removed
336  * before detection, which will remove most HTML or xml markup.
337  *
338  * @param ucsd   the charset detector to be modified.
339  * @param filter TRUE to enable input text filtering.
340  * @return The previous setting.
341  *
342  * @draft ICU 3.6
343  */
344 U_DRAFT  UBool U_EXPORT2
345 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
346 
347 #endif   /* !UCONFIG_NO_CONVERSION */
348 #endif   /* __UCSDET_H */
349 
350 
351