• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2008, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "csrutf8.h"
13 
14 U_NAMESPACE_BEGIN
15 
~CharsetRecog_UTF8()16 CharsetRecog_UTF8::~CharsetRecog_UTF8()
17 {
18     // nothing to do
19 }
20 
getName() const21 const char *CharsetRecog_UTF8::getName() const
22 {
23     return "UTF-8";
24 }
25 
match(InputText * det)26 int32_t CharsetRecog_UTF8::match(InputText* det) {
27     bool hasBOM = FALSE;
28     int32_t numValid = 0;
29     int32_t numInvalid = 0;
30     const uint8_t *input = det->fRawInput;
31     int32_t i;
32     int32_t trailBytes = 0;
33     int32_t confidence;
34 
35     if (det->fRawLength >= 3 &&
36         input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
37             hasBOM = TRUE;
38     }
39 
40     // Scan for multi-byte sequences
41     for (i=0; i < det->fRawLength; i += 1) {
42         int32_t b = input[i];
43 
44         if ((b & 0x80) == 0) {
45             continue;   // ASCII
46         }
47 
48         // Hi bit on char found.  Figure out how long the sequence should be
49         if ((b & 0x0E0) == 0x0C0) {
50             trailBytes = 1;
51         } else if ((b & 0x0F0) == 0x0E0) {
52             trailBytes = 2;
53         } else if ((b & 0x0F8) == 0xF0) {
54             trailBytes = 3;
55         } else {
56             numInvalid += 1;
57 
58             if (numInvalid > 5) {
59                 break;
60             }
61 
62             trailBytes = 0;
63         }
64 
65         // Verify that we've got the right number of trail bytes in the sequence
66         for (;;) {
67             i += 1;
68 
69             if (i >= det->fRawLength) {
70                 break;
71             }
72 
73             b = input[i];
74 
75             if ((b & 0xC0) != 0x080) {
76                 numInvalid += 1;
77                 break;
78             }
79 
80             if (--trailBytes == 0) {
81                 numValid += 1;
82                 break;
83             }
84         }
85 
86     }
87 
88     // Cook up some sort of confidence score, based on presense of a BOM
89     //    and the existence of valid and/or invalid multi-byte sequences.
90     confidence = 0;
91     if (hasBOM && numInvalid == 0) {
92         confidence = 100;
93     } else if (hasBOM && numValid > numInvalid*10) {
94         confidence = 80;
95     } else if (numValid > 3 && numInvalid == 0) {
96         confidence = 100;
97     } else if (numValid > 0 && numInvalid == 0) {
98         confidence = 80;
99     } else if (numValid == 0 && numInvalid == 0) {
100         // Plain ASCII.
101         confidence = 10;
102     } else if (numValid > numInvalid*10) {
103         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
104         confidence = 25;
105     }
106 
107     return confidence;
108 }
109 
110 U_NAMESPACE_END
111 #endif
112