• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
6 #define ENCODINGS_PUBLIC_ENCODINGS_H_
7 
8 // This interface defines the Encoding enum and various functions that
9 // depend only on Encoding values.
10 
11 // A hash-function for Encoding, hash<Encoding>, is defined in
12 // i18n/encodings/public/encodings-hash.h
13 
14 // On some Windows projects, UNICODE may be defined as a macro, which would
15 // prevent the UNICODE value of the Encoding enum from compiling. Undefining
16 // it here is a quick fix that does not break any existing projects. The enum
17 // value may someday be renamed to something more specific and non-colliding,
18 // but this involves careful testing of changes in many other projects.
19 #undef UNICODE
20 
21 // NOTE: The Encoding enum must always start at 0. This assumption has
22 // been made and used.
23 
24 #ifndef SWIG
25 
26 #include "encodings/proto/encodings.pb.h"
27 
28 // We must have this for compatibility.
29 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
30 //using namespace i18n::encodings;
31 
32 #else
33 
34 // Special proto SWIG workaround header file.
35 #include "i18n/encodings/internal/encodings_proto_wrapper.h"
36 
37 #endif
38 
39 const int kNumEncodings = NUM_ENCODINGS;  // Number of encodings; NUM_ENCODINGS presumably comes from the encodings header included above (TODO confirm).
40 
41 // Aliases for some of the popular encodings. These are preprocessor macros.
42 // TODO(jrm): Make these static const Encoding values instead of macros.
43 #define LATIN1           ISO_8859_1
44 #define LATIN2           ISO_8859_2
45 #define LATIN3           ISO_8859_3
46 #define LATIN4           ISO_8859_4
47 #define CYRILLIC         ISO_8859_5
48 #define ARABIC_ENCODING  ISO_8859_6     // suffixed to avoid the same name as the language
49 #define GREEK_ENCODING   ISO_8859_7     // suffixed to avoid the same name as the language
50 #define HEBREW_ENCODING  ISO_8859_8     // suffixed to avoid the same name as the language
51 #define LATIN5           ISO_8859_9
52 #define LATIN6           ISO_8859_10
53 #define KOREAN_HANGUL    KOREAN_EUC_KR
54 
55 // The default Encoding (LATIN1).
56 Encoding default_encoding();
57 
58 
59 
60 // *************************************************************
61 // Encoding predicates
62 //   IsValidEncoding()
63 //   IsEncEncCompatible
64 //   IsSupersetOfAscii7Bit
65 //   Is8BitEncoding
66 //   IsCJKEncoding
67 //   IsHebrewEncoding
68 //   IsRightToLeftEncoding
69 //   IsLogicalRightToLeftEncoding
70 //   IsVisualRightToLeftEncoding
71 //   IsIso2022Encoding
72 //   IsIso2022JpOrVariant
73 //   IsShiftJisOrVariant
74 //   IsJapaneseCellPhoneCarrierSpecificEncoding
75 // *************************************************************
76 
77 // IsValidEncoding
78 // ===================================
79 //
80 // Function to check if the input encoding enum value is within range.
81 //
82 
83 bool IsValidEncoding(Encoding enc);
84 
85 //
86 // IsEncEncCompatible
87 // ------------------
88 //
89 // This function is to determine whether or not converting from the
90 // first encoding to the second requires any changes to the underlying
91 // text (e.g.  ASCII_7BIT is a subset of UTF8).
92 //
93 // TODO(someone more familiar with i18n): the current implementation
94 // is likely incomplete.  It would be good to consider the full matrix
95 // of all pairs of encodings and to fish out all compatible pairs.
96 //
97 bool IsEncEncCompatible(const Encoding from, const Encoding to);
98 
99 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
100 // encoding represent the same characters as they do in ISO_8859_1.
101 
102 // WARNING: This function does not currently return true for all encodings that
103 // are supersets of Ascii 7-bit.
104 bool IsSupersetOfAscii7Bit(Encoding e);
105 
106 // To be an 8-bit encoding means that there are at most 256 symbols.
107 // Each byte determines a new character; there are no multi-byte sequences.
108 
109 // WARNING: This function does not currently return true for all encodings that
110 // are 8-bit encodings.
111 bool Is8BitEncoding(Encoding e);
112 
113 // IsCJKEncoding
114 // -------------
115 //
116 // This function returns true if the encoding is either Chinese
117 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
118 // considered a CJK encoding.
119 bool IsCJKEncoding(Encoding e);
120 
121 // IsHebrewEncoding
122 // -------------
123 //
124 // This function returns true if the encoding is a Hebrew specific
125 // encoding (not UTF8, etc).
126 bool IsHebrewEncoding(Encoding e);
127 
128 // IsRightToLeftEncoding
129 // ---------------------
130 //
131 // Returns true if the encoding is a right-to-left encoding.
132 //
133 // Note that the name of this function is somewhat misleading. There is nothing
134 // "right to left" about these encodings. They merely contain code points for
135 // characters in RTL languages such as Hebrew and Arabic. But this is also
136 // true for UTF-8.
137 //
138 // TODO(benjy): Get rid of this function. The only special-case we
139 // should need to worry about are visual encodings. Anything we
140 // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
141 bool IsRightToLeftEncoding(Encoding enc);
142 
143 // IsLogicalRightToLeftEncoding
144 // ----------------------------
145 //
146 // Returns true if the encoding is a logical right-to-left encoding.
147 // Logical right-to-left encodings are those that the browser renders
148 // right-to-left and applies the BiDi algorithm to. Therefore the characters
149 // appear in reading order in the file, and indexing, snippet generation etc.
150 // should all just work with no special processing.
151 //
152 // TODO(benjy): Get rid of this function. The only special-case we
153 // should need to worry about are visual encodings.
154 bool IsLogicalRightToLeftEncoding(Encoding enc);
155 
156 // IsVisualRightToLeftEncoding
157 // ---------------------------
158 //
159 // Returns true if the encoding is a visual right-to-left encoding.
160 // Visual right-to-left encodings are those that the browser renders
161 // left-to-right and does not apply the BiDi algorithm to. Therefore each
162 // line appears in reverse order in the file, lines are manually wrapped
163 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
164 // the prehistoric days when browsers couldn't render right-to-left, but
165 // unfortunately some visual pages persist to this day. These documents require
166 // special processing so that we don't index or snippet them with each line
167 // reversed.
168 bool IsVisualRightToLeftEncoding(Encoding enc);
169 
170 // IsIso2022Encoding
171 // -----------------
172 //
173 // Returns true if the encoding is a kind of ISO 2022 such as
174 // ISO-2022-JP.
175 bool IsIso2022Encoding(Encoding enc);
176 
177 // IsIso2022JpOrVariant
178 // --------------------
179 //
180 // Returns true if the encoding is ISO-2022-JP or a variant such as
181 // KDDI's ISO-2022-JP.
182 bool IsIso2022JpOrVariant(Encoding enc);
183 
184 // IsShiftJisOrVariant
185 // --------------------
186 //
187 // Returns true if the encoding is Shift_JIS or a variant such as
188 // KDDI's Shift_JIS.
189 bool IsShiftJisOrVariant(Encoding enc);
190 
191 // IsJapaneseCellPhoneCarrierSpecificEncoding
192 // ------------------------------------------
193 //
194 // Returns true if the encoding is specific to a Japanese cell phone
195 // carrier, such as KDDI_SHIFT_JIS.
196 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
197 
198 
199 
200 // *************************************************************
201 // ENCODING NAMES
202 //
203 // This interface defines a standard name for each valid encoding, and
204 // a standard name for invalid encodings. (Some names use all upper
205 // case, but others use mixed case.)
206 //
207 //   EncodingName() [Encoding to name]
208 //   MimeEncodingName() [Encoding to name]
209 //   EncodingFromName() [name to Encoding]
210 //   EncodingNameAliasToEncoding() [name to Encoding]
211 //   default_encoding_name()
212 //   invalid_encoding_name()
213 // *************************************************************
214 
215 // EncodingName
216 // ------------
217 //
218 // Given the encoding, returns its standard name.
219 // Return invalid_encoding_name() if the encoding is invalid.
220 //
221 const char* EncodingName(Encoding enc);
222 
223 //
224 // MimeEncodingName
225 // ----------------
226 //
227 // Return the "preferred MIME name" of an encoding.
228 //
229 // This name is suitable for using in HTTP headers, HTML tags,
230 // and as the "charset" parameter of a MIME Content-Type.
231 const char* MimeEncodingName(Encoding enc);
232 
233 
234 // The maximum length of an encoding name. NOTE(review): not stated whether this counts the terminating NUL - confirm.
235 const int kMaxEncodingNameSize = 50;
236 
237 // The standard name of the default encoding.
238 const char* default_encoding_name();
239 
240 // The name used for an invalid encoding.
241 const char* invalid_encoding_name();
242 
243 // EncodingFromName
244 // ----------------
245 //
246 // If enc_name matches the standard name of an Encoding, using a
247 // case-insensitive comparison, set *encoding to that Encoding and
248 // return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
249 // return false.
250 //
251 // REQUIRES: encoding must not be NULL.
252 //
253 bool EncodingFromName(const char* enc_name, Encoding *encoding);
254 
255 //
256 // EncodingNameAliasToEncoding
257 // ---------------------------
258 //
259 // If enc_name matches the standard name or an alias of an Encoding,
260 // using a case-insensitive comparison, return that
261 // Encoding. Otherwise, return UNKNOWN_ENCODING.
262 //
263 // Aliases include most MIME encoding names (e.g., "ISO-8859-7" for
264 // ISO_8859_7), alternate names (e.g., "cyrillic" for ISO_8859_5) and
265 // common variations with hyphens and underscores (e.g., "koi8-u" and
266 // "koi8u" for RUSSIAN_KOI8_R).
267 
268 Encoding EncodingNameAliasToEncoding(const char *enc_name);
269 
270 
271 // *************************************************************
272 // Miscellany
273 // *************************************************************
274 
275 // PreferredWebOutputEncoding
276 // --------------------------
277 //
278 // Some multi-byte encodings use byte values that coincide with the
279 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
280 // can misinterpret these, as indicated in an external XSS report from
281 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
282 // also use UTF8 instead of encodings that we don't support in our
283 // output, and we generally try to be conservative in what we send out.
284 // Where the client asks for single- or double-byte encodings that are
285 // not as common, we substitute a more common single- or double-byte
286 // encoding, if there is one, thereby preserving the client's intent
287 // to use less space than UTF-8. This also means that characters
288 // outside the destination set will be converted to HTML NCRs (&#NNN;)
289 // if requested.
290 Encoding PreferredWebOutputEncoding(Encoding enc);
291 
292 
293 // InitEncodings
294 // -------------
295 //
296 // Ensures the encodings module has been initialized.  Normally this happens
297 // during InitGoogle, but this allows access for scripts that don't
298 // support InitGoogle.
299 void InitEncodings();
300 
301 #endif  // ENCODINGS_PUBLIC_ENCODINGS_H_
302