• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2013, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  *   file name:  ucsdet.h
9  *   encoding:   UTF-8
10  *   indentation:4
11  *
12  *   created on: 2005Aug04
13  *   created by: Andy Heninger
14  *
15  *   ICU Character Set Detection, API for C
16  *
17  *   Draft version 18 Oct 2005
18  *
19  */
20 
21 #ifndef __UCSDET_H
22 #define __UCSDET_H
23 
24 #include "unicode/utypes.h"
25 
26 #if !UCONFIG_NO_CONVERSION
27 
28 #include "unicode/uenum.h"
29 
30 #if U_SHOW_CPLUSPLUS_API
31 #include "unicode/localpointer.h"
32 #endif   // U_SHOW_CPLUSPLUS_API
33 
34 /**
35  * \file
36  * \brief C API: Charset Detection API
37  *
38  * This API provides a facility for detecting the
39  * charset or encoding of character data in an unknown text format.
40  * The input data can be from an array of bytes.
41  * <p>
42  * Character set detection is at best an imprecise operation.  The detection
43  * process will attempt to identify the charset that best matches the characteristics
44  * of the byte data, but the process is partly statistical in nature, and
45  * the results cannot be guaranteed to always be correct.
46  * <p>
47  * For best accuracy in charset detection, the input data should be primarily
48  * in a single language, and a minimum of a few hundred bytes worth of plain text
49  * in the language are needed.  The detection process will attempt to
50  * ignore HTML- or XML-style markup that could otherwise obscure the content.
51  * <p>
52  * An alternative to the ICU Charset Detector is the
53  * Compact Encoding Detector, https://github.com/google/compact_enc_det.
54  * It often gives more accurate results, especially with short input samples.
55  */
56 
57 
58 struct UCharsetDetector;
59 /**
60   * Opaque structure representing a charset detector.
61   * @stable ICU 3.6
62   */
63 typedef struct UCharsetDetector UCharsetDetector;
64 
65 struct UCharsetMatch;
66 /**
67   *  Opaque structure representing a match that was identified from a
68   *  charset detection operation.  It is owned by the UCharsetDetector.
69   *  @stable ICU 3.6
70   */
71 typedef struct UCharsetMatch UCharsetMatch;
72 
73 /**
74   *  Open a charset detector.  Release it with ucsdet_close() when finished.
75   *
76   *  @param status Any error conditions occurring during the open
77   *                operation are reported back in this variable.
78   *  @return the newly opened charset detector.
79   *  @stable ICU 3.6
80   */
81 U_CAPI UCharsetDetector * U_EXPORT2
82 ucsdet_open(UErrorCode   *status);
83 
84 /**
85   * Close a charset detector.  All storage and other resources owned by
86   *   the detector are released, so UCharsetMatch objects obtained from it
87   *   must not be used afterwards.  Failure to close a charset detector
88   *   when finished with it can result in memory leaks in the application.
89   *
90   *  @param ucsd  The charset detector to be closed.
91   *  @stable ICU 3.6
92   */
93 U_CAPI void U_EXPORT2
94 ucsdet_close(UCharsetDetector *ucsd);
95 
96 #if U_SHOW_CPLUSPLUS_API
97 
98 U_NAMESPACE_BEGIN
99 
100 /**
101  * \class LocalUCharsetDetectorPointer
102  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
103  * For most methods see the LocalPointerBase base class.
104  * Declared only when U_SHOW_CPLUSPLUS_API is enabled.
105  * @see LocalPointerBase
106  * @see LocalPointer
107  * @stable ICU 4.4
108  */
109 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
110 
111 U_NAMESPACE_END
112 
113 #endif
114 
115 /**
116   * Set the input byte data whose charset is to be detected.
117   *
118   * Ownership of the input text byte array remains with the caller.
119   * The input string must not be altered or deleted until the charset
120   * detector is either closed or reset to refer to different input text.
121   *
122   * @param ucsd   the charset detector to be used.
123   * @param textIn the input text of unknown encoding.
124   * @param len    the length of the input text, or -1 if the text
125   *               is NUL terminated.
126   * @param status any error conditions are reported back in this variable.
127   *
128   * @stable ICU 3.6
129   */
130 U_CAPI void U_EXPORT2
131 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
132 
133 
134 /** Set the declared encoding for charset detection.
135  *  The declared encoding of an input text is an encoding obtained
136  *  by the user from an HTTP header or XML declaration or similar source that
137  *  can be provided as an additional hint to the charset detector.
138  *
139  *  How and whether the declared encoding will be used during the
140  *  detection process is still to be determined (TBD).
141  *
142  * @param ucsd      the charset detector to be used.
143  * @param encoding  the name of an encoding for the current data, obtained
144  *                  from a header or declaration or other source outside
145  *                  of the byte data itself.
146  * @param length    the length of the encoding name, or -1 if the name string
147  *                  is NUL terminated.
148  * @param status    any error conditions are reported back in this variable.
149  *
150  * @stable ICU 3.6
151  */
152 U_CAPI void U_EXPORT2
153 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
154 
155 
156 /**
157  * Return the charset that best matches the supplied input data.
158  *
159  * Note, though, that because the detection
160  * only looks at the start of the input data,
161  * there is a possibility that the returned charset will fail to handle
162  * the full set of input data.
163  * <p>
164  * The returned UCharsetMatch object is owned by the UCharsetDetector.
165  * It will remain valid until the detector input is reset, or until
166  * the detector is closed.
167  * <p>
168  * The function will fail if
169  *  <ul>
170  *    <li>no charset appears to match the data, or</li>
171  *    <li>no input text has been provided.</li>
172  *  </ul>
173  *
174  * @param ucsd      the charset detector to be used.
175  * @param status    any error conditions are reported back in this variable.
176  * @return          a UCharsetMatch representing the best matching charset,
177  *                  or NULL if no charset matches the byte data.
178  *
179  * @stable ICU 3.6
180  */
181 U_CAPI const UCharsetMatch * U_EXPORT2
182 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
183 
184 
185 /**
186  *  Find all charset matches that appear to be consistent with the input,
187  *  returning an array of results.  The results are ordered with the
188  *  best quality match first.
189  *
190  *  Because the detection only looks at a limited amount of the
191  *  input byte data, some of the returned charsets may fail to handle
192  *  all of the input data.
193  *  <p>
194  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
195  *  They will remain valid until the detector is closed or modified.
196  *
197  * <p>
198  * An error is returned if
199  *  <ul>
200  *    <li>no charsets appear to match the input data, or</li>
201  *    <li>no input text has been provided.</li>
202  *  </ul>
203  *
204  * @param ucsd          the charset detector to be used.
205  * @param matchesFound  pointer to a variable that will be set to the
206  *                      number of charsets identified that are consistent with
207  *                      the input data.  Output only.
208  * @param status        any error conditions are reported back in this variable.
209  * @return              A pointer to an array of pointers to UCharsetMatch objects.
210  *                      This array, and the UCharsetMatch instances to which it refers,
211  *                      are owned by the UCharsetDetector, and will remain valid until
212  *                      the detector is closed or modified.
213  * @stable ICU 3.6
214  */
215 U_CAPI const UCharsetMatch ** U_EXPORT2
216 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
217 
218 
219 
220 /**
221  *  Get the name of the charset represented by a UCharsetMatch.
222  *
223  *  The storage for the returned name string is owned by the
224  *  UCharsetMatch, and will remain valid while the UCharsetMatch
225  *  is valid; the caller must not free it.
226  *
227  *  The name returned is suitable for use with the ICU conversion APIs.
228  *
229  *  @param ucsm    The charset match object.
230  *  @param status  Any error conditions are reported back in this variable.
231  *  @return        The name of the matching charset.
232  *
233  *  @stable ICU 3.6
234  */
235 U_CAPI const char * U_EXPORT2
236 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
237 
238 /**
239  *  Get a confidence number for the quality of the match of the byte
240  *  data with the charset.  Confidence numbers range from zero to 100,
241  *  with 100 representing complete confidence and zero representing
242  *  no confidence.
243  *
244  *  The confidence values are somewhat arbitrary.  They define an
245  *  ordering within the results for any single detection operation
246  *  but are not generally comparable between the results for different input.
247  *
248  *  A confidence value of ten does have a general meaning - it is used
249  *  for charsets that can represent the input data, but for which there
250  *  is no other indication that suggests that the charset is the correct one.
251  *  Pure 7 bit ASCII data, for example, is compatible with a
252  *  great many charsets, most of which will appear as possible matches
253  *  with a confidence of 10.
254  *
255  *  @param ucsm    The charset match object.
256  *  @param status  Any error conditions are reported back in this variable.
257  *  @return        A confidence number for the charset match.
258  *
259  *  @stable ICU 3.6
260  */
261 U_CAPI int32_t U_EXPORT2
262 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
263 
264 /**
265  *  Get the RFC 3066 code for the language of the input data.
266  *
267  *  The Charset Detection service is intended primarily for detecting
268  *  charsets, not language.  For some, but not all, charsets, a language is
269  *  identified as a byproduct of the detection process, and that is what
270  *  is returned by this function.
271  *
272  *  CAUTION:
273  *    1.  Language information is not available for input data encoded in
274  *        all charsets. In particular, no language is identified
275  *        for UTF-8 input data.
276  *
277  *    2.  Closely related languages may sometimes be confused.
278  *
279  *  If more accurate language detection is required, a linguistic
280  *  analysis package should be used.
281  *
282  *  The storage for the returned language code string is owned by the
283  *  UCharsetMatch, and will remain valid while the UCharsetMatch
284  *  is valid.
285  *
286  *  @param ucsm    The charset match object.
287  *  @param status  Any error conditions are reported back in this variable.
288  *  @return        The RFC 3066 code for the language of the input data, or
289  *                 an empty string if the language could not be determined.
290  *
291  *  @stable ICU 3.6
292  */
293 U_CAPI const char * U_EXPORT2
294 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
295 
296 
297 /**
298   *  Get the entire input text as a UChar string, placing it into
299   *  a caller-supplied buffer.  A terminating
300   *  NUL character will be appended to the buffer if space is available.
301   *
302   *  The number of UChars in the output string, not including the terminating
303   *  NUL, is returned.
304   *
305   *  If the supplied buffer is smaller than required to hold the output,
306   *  the contents of the buffer are undefined.  The full output string length
307   *  (in UChars) is returned as always, and can be used to allocate a buffer
308   *  of the correct size.
309   *
310   *
311   * @param ucsm    The charset match object.
312   * @param buf     A UChar buffer to be filled with the converted text data.
313   * @param cap     The capacity of the buffer in UChars.
314   * @param status  Any error conditions are reported back in this variable.
315   * @return        The number of UChars in the output string, excluding the NUL.
316   *
317   * @stable ICU 3.6
318   */
319 U_CAPI  int32_t U_EXPORT2
320 ucsdet_getUChars(const UCharsetMatch *ucsm,
321                  UChar *buf, int32_t cap, UErrorCode *status);
322 
323 
324 
325 /**
326   *  Get an iterator over the set of all detectable charsets -
327   *  over the charsets that are known to the charset detection
328   *  service.
329   *
330   *  The returned UEnumeration provides access to the names of
331   *  the charsets.
332   *
333   *  <p>
334   *  The state of the Charset detector that is passed in does not
335   *  affect the result of this function, but requiring a valid, open
336   *  charset detector as a parameter ensures that the charset detection
337   *  service has been safely initialized and that the required detection
338   *  data is available.
339   *
340   *  <p>
341   *  <b>Note:</b> Multiple different charset encodings in the same family may use
342   *  a single shared name in this implementation. For example, the enumeration
343   *  returned here includes "ISO-8859-1" (ISO Latin 1), but not "windows-1252"
344   *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
345   *  when the input data matches Latin 1 code points together with any code points
346   *  that are only available in "windows-1252".
347   *
348   *  @param ucsd a Charset detector.
349   *  @param status  Any error conditions are reported back in this variable.
350   *  @return an iterator providing access to the detectable charset names.
351   *  @stable ICU 3.6
352   */
353 U_CAPI  UEnumeration * U_EXPORT2
354 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
355 
356 /**
357   *  Test whether input filtering is enabled for this charset detector.
358   *  Input filtering removes text that appears to be HTML or XML
359   *  markup from the input before applying the code page detection
360   *  heuristics.
361   *
362   *  @param ucsd  The charset detector to check.
363   *  @return true if filtering is enabled; see ucsdet_enableInputFilter().
364   *  @stable ICU 3.6
365   */
366 
367 U_CAPI  UBool U_EXPORT2
368 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
369 
370 
371 /**
372  * Enable filtering of input text. If filtering is enabled,
373  * text within angle brackets ("<" and ">") will be removed
374  * before detection, which will remove most HTML or XML markup.
375  *
376  * @param ucsd   the charset detector to be modified.
377  * @param filter <code>true</code> to enable input text filtering.
378  * @return The previous setting of the input filter.
379  *
380  * @stable ICU 3.6
381  */
382 U_CAPI  UBool U_EXPORT2
383 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
384 
385 #ifndef U_HIDE_INTERNAL_API
386 /**
387   *  Get an iterator over the set of detectable charsets -
388   *  over the charsets that are enabled by the specified charset detector.
389   *
390   *  The returned UEnumeration provides access to the names of
391   *  the charsets.
392   *
393   *  @param ucsd a Charset detector.
394   *  @param status  Any error conditions are reported back in this variable.
395   *  @return an iterator providing access to the names of the charsets
396   *  enabled by the specified charset detector.
397   *  @internal
398   */
399 U_CAPI UEnumeration * U_EXPORT2
400 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
401 
402 /**
403   * Enable or disable an individual charset encoding.
404   * The name of the charset encoding must be one of the names returned by
405   * ucsdet_getAllDetectableCharsets().
406   *
407   * @param ucsd a Charset detector.
408   * @param encoding the name of the charset encoding.
409   * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
410   *   charset encoding.
411   * @param status receives the return status. When the name of the charset encoding
412   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
413   * @internal
414   */
415 U_CAPI void U_EXPORT2
416 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
417 #endif  /* U_HIDE_INTERNAL_API */
418 
419 #endif
420 #endif   /* __UCSDET_H */
421 
422 
423