// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucsdet.h
 *   encoding:   UTF-8
 *   indentation:4
 *
 *   created on: 2005Aug04
 *   created by: Andy Heninger
 *
 *   ICU Character Set Detection, API for C
 *
 *   Draft version 18 Oct 2005
 *
 */

#ifndef __UCSDET_H
#define __UCSDET_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "unicode/uenum.h"

#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif   // U_SHOW_CPLUSPLUS_API

/**
 * \file
 * \brief C API: Charset Detection API
 *
 * This API provides a facility for detecting the
 * charset or encoding of character data in an unknown text format.
 * The input data can be from an array of bytes.
 * <p>
 * Character set detection is at best an imprecise operation.  The detection
 * process will attempt to identify the charset that best matches the characteristics
 * of the byte data, but the process is partly statistical in nature, and
 * the results cannot be guaranteed to always be correct.
 * <p>
 * For best accuracy in charset detection, the input data should be primarily
 * in a single language, and a minimum of a few hundred bytes worth of plain text
 * in the language are needed.  The detection process will attempt to
 * ignore html or xml style markup that could otherwise obscure the content.
 * <p>
 * An alternative to the ICU Charset Detector is the
 * Compact Encoding Detector, https://github.com/google/compact_enc_det.
 * It often gives more accurate results, especially with short input samples.
 */


struct UCharsetDetector;
/**
  * Structure representing a charset detector.
  * @stable ICU 3.6
  */
typedef struct UCharsetDetector UCharsetDetector;

struct UCharsetMatch;
/**
  *  Opaque structure representing a match that was identified
  *  from a charset detection operation.
  *  @stable ICU 3.6
  */
typedef struct UCharsetMatch UCharsetMatch;

/**
  *  Open a charset detector.
  *
  *  @param status Any error conditions occurring during the open
  *                operation are reported back in this variable.
  *  @return the newly opened charset detector.
  *  @stable ICU 3.6
  */
U_CAPI UCharsetDetector * U_EXPORT2
ucsdet_open(UErrorCode *status);

/**
  * Close a charset detector.  All storage and any other resources
  *   owned by this charset detector will be released.  Failure to
  *   close a charset detector when finished with it can result in
  *   memory leaks in the application.
  *
  *  @param ucsd  The charset detector to be closed.
  *  @stable ICU 3.6
  */
U_CAPI void U_EXPORT2
ucsdet_close(UCharsetDetector *ucsd);

#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUCharsetDetectorPointer
 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);

U_NAMESPACE_END

#endif   // U_SHOW_CPLUSPLUS_API

/**
  * Set the input byte data whose charset is to be detected.
  *
  * Ownership of the input text byte array remains with the caller.
  * The input string must not be altered or deleted until the charset
  * detector is either closed or reset to refer to different input text.
  *
  * @param ucsd   the charset detector to be used.
  * @param textIn the input text of unknown encoding.
  * @param len    the length of the input text, or -1 if the text
  *               is NUL terminated.
  * @param status any error conditions are reported back in this variable.
  *
  * @stable ICU 3.6
  */
U_CAPI void U_EXPORT2
ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);


/**
 * Set the declared encoding for charset detection.
 *  The declared encoding of an input text is an encoding obtained
 *  by the user from an http header or xml declaration or similar source that
 *  can be provided as an additional hint to the charset detector.
 *
 *  How and whether the declared encoding will be used during the
 *  detection process is TBD.
 *
 * @param ucsd      the charset detector to be used.
 * @param encoding  an encoding for the current data obtained from
 *                  a header or declaration or other source outside
 *                  of the byte data itself.
 * @param length    the length of the encoding name, or -1 if the name string
 *                  is NUL terminated.
 * @param status    any error conditions are reported back in this variable.
 *
 * @stable ICU 3.6
 */
U_CAPI void U_EXPORT2
ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);


/**
 * Return the charset that best matches the supplied input data.
 *
 * Note though, that because the detection
 * only looks at the start of the input data,
 * there is a possibility that the returned charset will fail to handle
 * the full set of input data.
 * <p>
 * The returned UCharsetMatch object is owned by the UCharsetDetector.
 * It will remain valid until the detector input is reset, or until
 * the detector is closed.
 * <p>
 * The function will fail if
 *  <ul>
 *    <li>no charset appears to match the data.</li>
 *    <li>no input text has been provided</li>
 *  </ul>
 *
 * @param ucsd      the charset detector to be used.
 * @param status    any error conditions are reported back in this variable.
 * @return          a UCharsetMatch representing the best matching charset,
 *                  or NULL if no charset matches the byte data.
 *
 * @stable ICU 3.6
 */
U_CAPI const UCharsetMatch * U_EXPORT2
ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);


/**
 *  Find all charset matches that appear to be consistent with the input,
 *  returning an array of results.  The results are ordered with the
 *  best quality match first.
 *
 *  Because the detection only looks at a limited amount of the
 *  input byte data, some of the returned charsets may fail to handle
 *  all of the input data.
 *  <p>
 *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
 *  They will remain valid until the detector is closed or modified.
 *
 * <p>
 * Return an error if
 *  <ul>
 *    <li>no charsets appear to match the input data.</li>
 *    <li>no input text has been provided</li>
 *  </ul>
 *
 * @param ucsd          the charset detector to be used.
 * @param matchesFound  pointer to a variable that will be set to the
 *                      number of charsets identified that are consistent with
 *                      the input data.  Output only.
 * @param status        any error conditions are reported back in this variable.
 * @return              A pointer to an array of pointers to UCharsetMatch objects.
 *                      This array, and the UCharsetMatch instances to which it refers,
 *                      are owned by the UCharsetDetector, and will remain valid until
 *                      the detector is closed or modified.
 * @stable ICU 3.6
 */
U_CAPI const UCharsetMatch ** U_EXPORT2
ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);



/**
 *  Get the name of the charset represented by a UCharsetMatch.
 *
 *  The storage for the returned name string is owned by the
 *  UCharsetMatch, and will remain valid while the UCharsetMatch
 *  is valid.
 *
 *  The name returned is suitable for use with the ICU conversion APIs.
 *
 *  @param ucsm    The charset match object.
 *  @param status  Any error conditions are reported back in this variable.
 *  @return        The name of the matching charset.
 *
 *  @stable ICU 3.6
 */
U_CAPI const char * U_EXPORT2
ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);

/**
 *  Get a confidence number for the quality of the match of the byte
 *  data with the charset.  Confidence numbers range from zero to 100,
 *  with 100 representing complete confidence and zero representing
 *  no confidence.
 *
 *  The confidence values are somewhat arbitrary.  They define an
 *  ordering within the results for any single detection operation
 *  but are not generally comparable between the results for different input.
 *
 *  A confidence value of ten does have a general meaning - it is used
 *  for charsets that can represent the input data, but for which there
 *  is no other indication that suggests that the charset is the correct one.
 *  Pure 7 bit ASCII data, for example, is compatible with a
 *  great many charsets, most of which will appear as possible matches
 *  with a confidence of 10.
 *
 *  @param ucsm    The charset match object.
 *  @param status  Any error conditions are reported back in this variable.
 *  @return        A confidence number for the charset match.
 *
 *  @stable ICU 3.6
 */
U_CAPI int32_t U_EXPORT2
ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);

/**
 *  Get the RFC 3066 code for the language of the input data.
 *
 *  The Charset Detection service is intended primarily for detecting
 *  charsets, not language.  For some, but not all, charsets, a language is
 *  identified as a byproduct of the detection process, and that is what
 *  is returned by this function.
 *
 *  CAUTION:
 *    1.  Language information is not available for input data encoded in
 *        all charsets. In particular, no language is identified
 *        for UTF-8 input data.
 *
 *    2.  Closely related languages may sometimes be confused.
 *
 *  If more accurate language detection is required, a linguistic
 *  analysis package should be used.
 *
 *  The storage for the returned name string is owned by the
 *  UCharsetMatch, and will remain valid while the UCharsetMatch
 *  is valid.
 *
 *  @param ucsm    The charset match object.
 *  @param status  Any error conditions are reported back in this variable.
 *  @return        The RFC 3066 code for the language of the input data, or
 *                 an empty string if the language could not be determined.
 *
 *  @stable ICU 3.6
 */
U_CAPI const char * U_EXPORT2
ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);


/**
  *  Get the entire input text as a UChar string, placing it into
  *  a caller-supplied buffer.  A terminating
  *  NUL character will be appended to the buffer if space is available.
  *
  *  The number of UChars in the output string, not including the terminating
  *  NUL, is returned.
  *
  *  If the supplied buffer is smaller than required to hold the output,
  *  the contents of the buffer are undefined.  The full output string length
  *  (in UChars) is returned as always, and can be used to allocate a buffer
  *  of the correct size.
  *
  *
  * @param ucsm    The charset match object.
  * @param buf     A UChar buffer to be filled with the converted text data.
  * @param cap     The capacity of the buffer in UChars.
  * @param status  Any error conditions are reported back in this variable.
  * @return        The number of UChars in the output string.
  *
  * @stable ICU 3.6
  */
U_CAPI int32_t U_EXPORT2
ucsdet_getUChars(const UCharsetMatch *ucsm,
                 UChar *buf, int32_t cap, UErrorCode *status);



/**
  *  Get an iterator over the set of all detectable charsets -
  *  over the charsets that are known to the charset detection
  *  service.
  *
  *  The returned UEnumeration provides access to the names of
  *  the charsets.
  *
  *  <p>
  *  The state of the Charset detector that is passed in does not
  *  affect the result of this function, but requiring a valid, open
  *  charset detector as a parameter ensures that the charset detection
  *  service has been safely initialized and that the required detection
  *  data is available.
  *
  *  <p>
  *  <b>Note:</b> Multiple different charset encodings in the same family may use
  *  a single shared name in this implementation. For example, this function returns
  *  an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
  *  (Windows Latin 1). However, actual detection result could be "windows-1252"
  *  when the input data matches Latin 1 code points with any points only available
  *  in "windows-1252".
  *
  *  @param ucsd a Charset detector.
  *  @param status  Any error conditions are reported back in this variable.
  *  @return an iterator providing access to the detectable charset names.
  *  @stable ICU 3.6
  */
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);

/**
  *  Test whether input filtering is enabled for this charset detector.
  *  Input filtering removes text that appears to be HTML or xml
  *  markup from the input before applying the code page detection
  *  heuristics.
  *
  *  @param ucsd  The charset detector to check.
  *  @return true if filtering is enabled.
  *  @stable ICU 3.6
  */
U_CAPI UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);


/**
 * Enable filtering of input text. If filtering is enabled,
 * text within angle brackets ("<" and ">") will be removed
 * before detection, which will remove most HTML or xml markup.
 *
 * @param ucsd   the charset detector to be modified.
 * @param filter <code>true</code> to enable input text filtering.
 * @return The previous setting.
 *
 * @stable ICU 3.6
 */
U_CAPI UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);

#ifndef U_HIDE_INTERNAL_API
/**
  *  Get an iterator over the set of detectable charsets -
  *  over the charsets that are enabled by the specified charset detector.
  *
  *  The returned UEnumeration provides access to the names of
  *  the charsets.
  *
  *  @param ucsd a Charset detector.
  *  @param status  Any error conditions are reported back in this variable.
  *  @return an iterator providing access to the detectable charset names by
  *  the specified charset detector.
  *  @internal
  */
U_CAPI UEnumeration * U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);

/**
  * Enable or disable individual charset encoding.
  * A name of charset encoding must be included in the names returned by
  * {@link #ucsdet_getAllDetectableCharsets()}.
  *
  * @param ucsd a Charset detector.
  * @param encoding the name of the charset encoding.
  * @param enabled <code>true</code> to enable, or <code>false</code> to disable the
  *   charset encoding.
  * @param status receives the return status. When the name of charset encoding
  *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
  * @internal
  */
U_CAPI void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
#endif  /* U_HIDE_INTERNAL_API */

#endif  /* !UCONFIG_NO_CONVERSION */
#endif  /* __UCSDET_H */