• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2006, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "csrutf8.h"
13 
14 U_NAMESPACE_BEGIN
15 
~CharsetRecog_UTF8()16 CharsetRecog_UTF8::~CharsetRecog_UTF8()
17 {
18     // nothing to do
19 }
20 
getName() const21 const char *CharsetRecog_UTF8::getName() const
22 {
23     return "UTF-8";
24 }
25 
match(InputText * det)26 int32_t CharsetRecog_UTF8::match(InputText* det) {
27     bool hasBOM = FALSE;
28     int32_t numValid = 0;
29     int32_t numInvalid = 0;
30     const uint8_t *input = det->fRawInput;
31     int32_t i;
32     int32_t trailBytes = 0;
33     int32_t confidence;
34 
35     if (det->fRawLength >= 3 &&
36         input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
37             hasBOM = TRUE;
38         }
39 
40         // Scan for multi-byte sequences
41         for (i=0; i < det->fRawLength; i += 1) {
42             int32_t b = input[i];
43 
44             if ((b & 0x80) == 0) {
45                 continue;   // ASCII
46             }
47 
48             // Hi bit on char found.  Figure out how long the sequence should be
49             if ((b & 0x0E0) == 0x0C0) {
50                 trailBytes = 1;
51             } else if ((b & 0x0F0) == 0x0E0) {
52                 trailBytes = 2;
53             } else if ((b & 0x0F8) == 0xF0) {
54                 trailBytes = 3;
55             } else {
56                 numInvalid += 1;
57 
58                 if (numInvalid > 5) {
59                     break;
60                 }
61 
62                 trailBytes = 0;
63             }
64 
65             // Verify that we've got the right number of trail bytes in the sequence
66             for (;;) {
67                 i += 1;
68 
69                 if (i >= det->fRawLength) {
70                     break;
71                 }
72 
73                 b = input[i];
74 
75                 if ((b & 0xC0) != 0x080) {
76                     numInvalid += 1;
77                     break;
78                 }
79 
80                 if (--trailBytes == 0) {
81                     numValid += 1;
82                     break;
83                 }
84             }
85 
86         }
87 
88         // Cook up some sort of confidence score, based on presense of a BOM
89         //    and the existence of valid and/or invalid multi-byte sequences.
90         confidence = 0;
91         if (hasBOM && numInvalid == 0) {
92             confidence = 100;
93         } else if (hasBOM && numValid > numInvalid*10) {
94             confidence = 80;
95         } else if (numValid > 3 && numInvalid == 0) {
96             confidence = 100;
97         } else if (numValid > 0 && numInvalid == 0) {
98             confidence = 80;
99         } else if (numValid == 0 && numInvalid == 0) {
100             // Plain ASCII.
101             confidence = 10;
102         } else if (numValid > numInvalid*10) {
103             // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
104             confidence = 25;
105         }
106 
107         return confidence;
108 }
109 
110 U_NAMESPACE_END
111 #endif
112