• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2006, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "csrucode.h"
13 
14 U_NAMESPACE_BEGIN
15 
~CharsetRecog_Unicode()16 CharsetRecog_Unicode::~CharsetRecog_Unicode()
17 {
18     // nothing to do
19 }
20 
~CharsetRecog_UTF_16_BE()21 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
22 {
23     // nothing to do
24 }
25 
getName() const26 const char *CharsetRecog_UTF_16_BE::getName() const
27 {
28     return "UTF-16BE";
29 }
30 
match(InputText * textIn)31 int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn)
32 {
33     const uint8_t *input = textIn->fRawInput;
34     int32_t length = textIn->fRawLength;
35 
36     if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) {
37         return 100;
38     }
39 
40     // TODO: Do some statastics to check for unsigned UTF-16BE
41     return 0;
42 }
43 
~CharsetRecog_UTF_16_LE()44 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
45 {
46     // nothing to do
47 }
48 
getName() const49 const char *CharsetRecog_UTF_16_LE::getName() const
50 {
51     return "UTF-16LE";
52 }
53 
match(InputText * textIn)54 int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn)
55 {
56     const uint8_t *input = textIn->fRawInput;
57     int32_t length = textIn->fRawLength;
58 
59     if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
60         return 100;
61     }
62 
63     // TODO: Do some statastics to check for unsigned UTF-16LE
64     return 0;
65 }
66 
~CharsetRecog_UTF_32()67 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
68 {
69     // nothing to do
70 }
71 
match(InputText * textIn)72 int32_t CharsetRecog_UTF_32::match(InputText* textIn)
73 {
74     const uint8_t *input = textIn->fRawInput;
75     int32_t limit = (textIn->fRawLength / 4) * 4;
76     int32_t numValid = 0;
77     int32_t numInvalid = 0;
78     bool hasBOM = FALSE;
79     int32_t confidence = 0;
80 
81     if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
82         hasBOM = TRUE;
83     }
84 
85     for(int32_t i = 0; i < limit; i += 4) {
86         int32_t ch = getChar(input, i);
87 
88         if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
89             numInvalid += 1;
90         } else {
91             numValid += 1;
92         }
93     }
94 
95 
96     // Cook up some sort of confidence score, based on presense of a BOM
97     //    and the existence of valid and/or invalid multi-byte sequences.
98     if (hasBOM && numInvalid==0) {
99         confidence = 100;
100     } else if (hasBOM && numValid > numInvalid*10) {
101         confidence = 80;
102     } else if (numValid > 3 && numInvalid == 0) {
103         confidence = 100;
104     } else if (numValid > 0 && numInvalid == 0) {
105         confidence = 80;
106     } else if (numValid > numInvalid*10) {
107         // Probably corruput UTF-32BE data.  Valid sequences aren't likely by chance.
108         confidence = 25;
109     }
110 
111     return confidence;
112 }
113 
~CharsetRecog_UTF_32_BE()114 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
115 {
116     // nothing to do
117 }
118 
getName() const119 const char *CharsetRecog_UTF_32_BE::getName() const
120 {
121     return "UTF-32BE";
122 }
123 
getChar(const uint8_t * input,int32_t index) const124 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
125 {
126     return input[index + 0] << 24 | input[index + 1] << 16 |
127            input[index + 2] <<  8 | input[index + 3];
128 }
129 
~CharsetRecog_UTF_32_LE()130 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
131 {
132     // nothing to do
133 }
134 
getName() const135 const char *CharsetRecog_UTF_32_LE::getName() const
136 {
137     return "UTF-32LE";
138 }
139 
getChar(const uint8_t * input,int32_t index) const140 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
141 {
142     return input[index + 3] << 24 | input[index + 2] << 16 |
143            input[index + 1] <<  8 | input[index + 0];
144 }
145 
146 U_NAMESPACE_END
147 #endif
148 
149