• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2010, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  *   file name:  ucsdet.h
7  *   encoding:   US-ASCII
8  *   indentation:4
9  *
10  *   created on: 2005Aug04
11  *   created by: Andy Heninger
12  *
13  *   ICU Character Set Detection, API for C
14  *
15  *   Draft version 18 Oct 2005
16  *
17  */
18 
19 #ifndef __UCSDET_H
20 #define __UCSDET_H
21 
22 #include "unicode/utypes.h"
23 
24 #if !UCONFIG_NO_CONVERSION
25 
26 #include "unicode/localpointer.h"
27 #include "unicode/uenum.h"
28 
29 /**
30  * \file
31  * \brief C API: Charset Detection API
32  *
33  * This API provides a facility for detecting the
34  * charset or encoding of character data in an unknown text format.
35  * The input data can be from an array of bytes.
36  * <p>
37  * Character set detection is at best an imprecise operation.  The detection
38  * process will attempt to identify the charset that best matches the characteristics
39  * of the byte data, but the process is partly statistical in nature, and
40  * the results can not be guaranteed to always be correct.
41  * <p>
42  * For best accuracy in charset detection, the input data should be primarily
43  * in a single language, and a minimum of a few hundred bytes worth of plain text
44  * in the language are needed.  The detection process will attempt to
45  * ignore html or xml style markup that could otherwise obscure the content.
46  */
47 
48 
49 struct UCharsetDetector;
50 /**
51   * Opaque structure representing a charset detector.
52   * @stable ICU 3.6
53   */
54 typedef struct UCharsetDetector UCharsetDetector;
55 
56 struct UCharsetMatch;
57 /**
58   *  Opaque structure representing a match that was identified
59   *  from a charset detection operation.
60   *  @stable ICU 3.6
61   */
62 typedef struct UCharsetMatch UCharsetMatch;
63 
64 /**
65   *  Open a charset detector.
66   *
67   *  @param status Any error conditions occurring during the open
68   *                operation are reported back in this variable.
69   *  @return the newly opened charset detector; release it with ucsdet_close().
70   *  @stable ICU 3.6
71   */
72 U_STABLE UCharsetDetector * U_EXPORT2
73 ucsdet_open(UErrorCode   *status);
74 
75 /**
76   * Close a charset detector.  All storage and any other resources
77   *   owned by this charset detector will be released.  Failure to
78   *   close a charset detector when finished with it can result in
79   *   memory leaks in the application.
80   *
81   *  @param ucsd  The charset detector to be closed.
82   *  @stable ICU 3.6
83   */
84 U_STABLE void U_EXPORT2
85 ucsdet_close(UCharsetDetector *ucsd);
86 
87 #if U_SHOW_CPLUSPLUS_API
88 
89 U_NAMESPACE_BEGIN
90 
91 /**
92  * \class LocalUCharsetDetectorPointer
93  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
94  * For most methods see the LocalPointerBase base class.
95  *
96  * @see LocalPointerBase
97  * @see LocalPointer
98  * @stable ICU 4.4
99  */
100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
101 
102 U_NAMESPACE_END
103 
104 #endif
105 
106 /**
107   * Set the input byte data whose charset is to be detected.
108   *
109   * Ownership of the input text byte array remains with the caller.
110   * The input string must not be altered or deleted until the charset
111   * detector is either closed or reset to refer to different input text.
112   *
113   * @param ucsd   the charset detector to be used.
114   * @param textIn the input text of unknown encoding.
115   * @param len    the length of the input text, or -1 if the text
116   *               is NUL terminated.
117   * @param status any error conditions are reported back in this variable.
118   *
119   * @stable ICU 3.6
120   */
121 U_STABLE void U_EXPORT2
122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
123 
124 
125 /** Set the declared encoding for charset detection.
126  *  The declared encoding of an input text is an encoding obtained
127  *  by the user from an http header or xml declaration or similar source that
128  *  can be provided as an additional hint to the charset detector.
129  *
130  *  How and whether the declared encoding will be used during the
131  *  detection process is TBD.
132  *
133  * @param ucsd      the charset detector to be used.
134  * @param encoding  an encoding for the current data obtained from
135  *                  a header or declaration or other source outside
136  *                  of the byte data itself.
137  * @param length    the length of the encoding name, or -1 if the name string
138  *                  is NUL terminated.
139  * @param status    any error conditions are reported back in this variable.
140  *
141  * @stable ICU 3.6
142  */
143 U_STABLE void U_EXPORT2
144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
145 
146 
147 /**
148  * Return the charset that best matches the supplied input data.
149  *
150  * Note though, that because the detection
151  * only looks at the start of the input data,
152  * there is a possibility that the returned charset will fail to handle
153  * the full set of input data.
154  * <p>
155  * The returned UCharsetMatch object is owned by the UCharsetDetector.
156  * It will remain valid until the detector input is reset, or until
157  * the detector is closed.
158  * <p>
159  * The function will fail if
160  *  <ul>
161  *    <li>no charset appears to match the data.</li>
162  *    <li>no input text has been provided</li>
163  *  </ul>
164  *
165  * @param ucsd      the charset detector to be used.
166  * @param status    any error conditions are reported back in this variable.
167  * @return          a UCharsetMatch  representing the best matching charset,
168  *                  or NULL if no charset matches the byte data.
169  *
170  * @stable ICU 3.6
171  */
172 U_STABLE const UCharsetMatch * U_EXPORT2
173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
174 
175 
176 /**
177  *  Find all charset matches that appear to be consistent with the input,
178  *  returning an array of results.  The results are ordered with the
179  *  best quality match first.
180  *
181  *  Because the detection only looks at a limited amount of the
182  *  input byte data, some of the returned charsets may fail to handle
183  *  all of the input data.
184  *  <p>
185  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
186  *  They will remain valid until the detector is closed or modified.
187  *
188  * <p>
189  * The function will fail if
190  *  <ul>
191  *    <li>no charsets appear to match the input data.</li>
192  *    <li>no input text has been provided</li>
193  *  </ul>
194  *
195  * @param ucsd          the charset detector to be used.
196  * @param matchesFound  pointer to a variable that will be set to the
197  *                      number of charsets identified that are consistent with
198  *                      the input data.  Output only.
199  * @param status        any error conditions are reported back in this variable.
200  * @return              A pointer to an array of pointers to UCharsetMatch objects.
201  *                      This array, and the UCharsetMatch instances to which it refers,
202  *                      are owned by the UCharsetDetector, and will remain valid until
203  *                      the detector is closed or modified.
204  * @stable ICU 3.6
205  */
206 U_STABLE const UCharsetMatch ** U_EXPORT2
207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
208 
209 
210 
211 /**
212  *  Get the name of the charset represented by a UCharsetMatch.
213  *
214  *  The storage for the returned name string is owned by the
215  *  UCharsetMatch, and will remain valid while the UCharsetMatch
216  *  is valid.
217  *
218  *  The name returned is suitable for use with the ICU conversion APIs.
219  *
220  *  @param ucsm    The charset match object.
221  *  @param status  Any error conditions are reported back in this variable.
222  *  @return        The name of the matching charset.
223  *
224  *  @stable ICU 3.6
225  */
226 U_STABLE const char * U_EXPORT2
227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
228 
229 /**
230  *  Get a confidence number for the quality of the match of the byte
231  *  data with the charset.  Confidence numbers range from zero to 100,
232  *  with 100 representing complete confidence and zero representing
233  *  no confidence.
234  *
235  *  The confidence values are somewhat arbitrary.  They define an
236  *  ordering within the results for any single detection operation
237  *  but are not generally comparable between the results for different input.
238  *
239  *  A confidence value of ten does have a general meaning - it is used
240  *  for charsets that can represent the input data, but for which there
241  *  is no other indication that suggests that the charset is the correct one.
242  *  Pure 7 bit ASCII data, for example, is compatible with a
243  *  great many charsets, most of which will appear as possible matches
244  *  with a confidence of 10.
245  *
246  *  @param ucsm    The charset match object.
247  *  @param status  Any error conditions are reported back in this variable.
248  *  @return        A confidence number for the charset match.
249  *
250  *  @stable ICU 3.6
251  */
252 U_STABLE int32_t U_EXPORT2
253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
254 
255 /**
256  *  Get the RFC 3066 code for the language of the input data.
257  *
258  *  The Charset Detection service is intended primarily for detecting
259  *  charsets, not language.  For some, but not all, charsets, a language is
260  *  identified as a byproduct of the detection process, and that is what
261  *  is returned by this function.
262  *
263  *  CAUTION:
264  *    1.  Language information is not available for input data encoded in
265  *        all charsets. In particular, no language is identified
266  *        for UTF-8 input data.
267  *
268  *    2.  Closely related languages may sometimes be confused.
269  *
270  *  If more accurate language detection is required, a linguistic
271  *  analysis package should be used.
272  *
273  *  The storage for the returned name string is owned by the
274  *  UCharsetMatch, and will remain valid while the UCharsetMatch
275  *  is valid.
276  *
277  *  @param ucsm    The charset match object.
278  *  @param status  Any error conditions are reported back in this variable.
279  *  @return        The RFC 3066 code for the language of the input data, or
280  *                 an empty string if the language could not be determined.
281  *
282  *  @stable ICU 3.6
283  */
284 U_STABLE const char * U_EXPORT2
285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
286 
287 
288 /**
289   *  Get the entire input text as a UChar string, placing it into
290   *  a caller-supplied buffer.  A terminating
291   *  NUL character will be appended to the buffer if space is available.
292   *
293   *  The number of UChars in the output string, not including the terminating
294   *  NUL, is returned.
295   *
296   *  If the supplied buffer is smaller than required to hold the output,
297   *  the contents of the buffer are undefined.  The full output string length
298   *  (in UChars) is returned as always, and can be used to allocate a buffer
299   *  of the correct size.
300   *
301   *
302   * @param ucsm    The charset match object.
303   * @param buf     A UChar buffer to be filled with the converted text data.
304   * @param cap     The capacity of the buffer in UChars.
305   * @param status  Any error conditions are reported back in this variable.
306   * @return        The number of UChars in the output string.
307   *
308   * @stable ICU 3.6
309   */
310 U_STABLE  int32_t U_EXPORT2
311 ucsdet_getUChars(const UCharsetMatch *ucsm,
312                  UChar *buf, int32_t cap, UErrorCode *status);
313 
314 
315 
316 /**
317   *  Get an iterator over the set of all detectable charsets -
318   *  over the charsets that are known to the charset detection
319   *  service.
320   *
321   *  The returned UEnumeration provides access to the names of
322   *  the charsets.
323   *
324   *  The state of the Charset detector that is passed in does not
325   *  affect the result of this function, but requiring a valid, open
326   *  charset detector as a parameter ensures that the charset detection
327   *  service has been safely initialized and that the required detection
328   *  data is available.
329   *
330   *  @param ucsd a Charset detector.
331   *  @param status  Any error conditions are reported back in this variable.
332   *  @return an iterator providing access to the detectable charset names.
333   *  @stable ICU 3.6
334   */
335 U_STABLE  UEnumeration * U_EXPORT2
336 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
337 
338 
339 /**
340   *  Test whether input filtering is enabled for this charset detector.
341   *  Input filtering removes text that appears to be HTML or xml
342   *  markup from the input before applying the code page detection
343   *  heuristics.
344   *
345   *  @param ucsd  The charset detector to check.
346   *  @return TRUE if filtering is enabled.
347   *  @stable ICU 3.6
348   */
349 U_STABLE  UBool U_EXPORT2
350 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
351 
352 
353 /**
354  * Enable filtering of input text. If filtering is enabled,
355  * text within angle brackets ("<" and ">") will be removed
356  * before detection, which will remove most HTML or xml markup.
357  *
358  * @param ucsd   the charset detector to be modified.
359  * @param filter <code>true</code> to enable input text filtering.
360  * @return The previous setting.
361  *
362  * @stable ICU 3.6
363  */
364 U_STABLE  UBool U_EXPORT2
365 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
366 
367 #endif
368 #endif   /* __UCSDET_H */
369 
370 
371