1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "inputext.h"
13
14 #include "cmemory.h"
15 #include "cstring.h"
16
17 #include <string.h>
18
19 U_NAMESPACE_BEGIN
20
// Capacity, in bytes, of the working buffer that holds the (possibly
// markup-stripped) input examined by the detectors.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions such as (cond ? a : b) expand correctly.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate / release raw storage for `count` objects of `type` through the
// uprv_ allocator. No constructors or destructors are run. The NEW_ARRAY
// expansion is fully parenthesized so it can be used inside larger expressions.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27
InputText(UErrorCode & status)28 InputText::InputText(UErrorCode &status)
29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
30 // removed if appropriate.
31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
32 // Value is percent, not absolute.
33 fDeclaredEncoding(0),
34 fRawInput(0),
35 fRawLength(0)
36 {
37 if (fInputBytes == NULL || fByteStats == NULL) {
38 status = U_MEMORY_ALLOCATION_ERROR;
39 }
40 }
41
~InputText()42 InputText::~InputText()
43 {
44 DELETE_ARRAY(fDeclaredEncoding);
45 DELETE_ARRAY(fByteStats);
46 DELETE_ARRAY(fInputBytes);
47 }
48
setText(const char * in,int32_t len)49 void InputText::setText(const char *in, int32_t len)
50 {
51 fInputLen = 0;
52 fC1Bytes = FALSE;
53 fRawInput = (const uint8_t *) in;
54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55 }
56
setDeclaredEncoding(const char * encoding,int32_t len)57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58 {
59 if(encoding) {
60 if (len == -1) {
61 len = (int32_t)uprv_strlen(encoding);
62 }
63
64 len += 1; // to make place for the \0 at the end.
65 uprv_free(fDeclaredEncoding);
66 fDeclaredEncoding = NEW_ARRAY(char, len);
67 uprv_strncpy(fDeclaredEncoding, encoding, len);
68 }
69 }
70
isSet() const71 UBool InputText::isSet() const
72 {
73 return fRawInput != NULL;
74 }
75
76 /**
77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
78 * it by removing what appears to be html markup.
79 *
80 * @internal
81 */
MungeInput(UBool fStripTags)82 void InputText::MungeInput(UBool fStripTags) {
83 int srci = 0;
84 int dsti = 0;
85 uint8_t b;
86 bool inMarkup = FALSE;
87 int32_t openTags = 0;
88 int32_t badTags = 0;
89
90 //
91 // html / xml markup stripping.
92 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93 // discard everything within < brackets >
94 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
95 // guess as to whether the input was actually marked up at all.
96 // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97 if (fStripTags) {
98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99 b = fRawInput[srci];
100
101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102 if (inMarkup) {
103 badTags += 1;
104 }
105
106 inMarkup = TRUE;
107 openTags += 1;
108 }
109
110 if (! inMarkup) {
111 fInputBytes[dsti++] = b;
112 }
113
114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115 inMarkup = FALSE;
116 }
117 }
118
119 fInputLen = dsti;
120 }
121
122 //
123 // If it looks like this input wasn't marked up, or if it looks like it's
124 // essentially nothing but markup abandon the markup stripping.
125 // Detection will have to work on the unstripped input.
126 //
127 if (openTags<5 || openTags/5 < badTags ||
128 (fInputLen < 100 && fRawLength>600))
129 {
130 int32_t limit = fRawLength;
131
132 if (limit > BUFFER_SIZE) {
133 limit = BUFFER_SIZE;
134 }
135
136 for (srci=0; srci<limit; srci++) {
137 fInputBytes[srci] = fRawInput[srci];
138 }
139
140 fInputLen = srci;
141 }
142
143 //
144 // Tally up the byte occurence statistics.
145 // These are available for use by the various detectors.
146 //
147
148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149
150 for (srci = 0; srci < fInputLen; srci += 1) {
151 fByteStats[fInputBytes[srci]] += 1;
152 }
153
154 for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155 if (fByteStats[i] != 0) {
156 fC1Bytes = TRUE;
157 break;
158 }
159 }
160 }
161
162 U_NAMESPACE_END
163 #endif
164
165