1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_ 6 #define ENCODINGS_PUBLIC_ENCODINGS_H_ 7 8 // This interface defines the Encoding enum and various functions that 9 // depend only on Encoding values. 10 11 // A hash-function for Encoding, hash<Encoding>, is defined in 12 // i18n/encodings/public/encodings-hash.h 13 14 // On some Windows projects, UNICODE may be defined, which would prevent the 15 // Encoding enum below from compiling. Note that this is a quick fix that does 16 // not break any existing projects. The UNICODE enum may someday be changed 17 // to something more specific and non-colliding, but this involves careful 18 // testing of changes in many other projects. 19 #undef UNICODE 20 21 // NOTE: The Encoding enum must always start at 0. This assumption has 22 // been made and used. 23 24 #ifndef SWIG 25 26 #include "encodings/proto/encodings.pb.h" 27 28 // We must have this for compatibility. 29 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 30 //using namespace i18n::encodings; 31 32 #else 33 34 // Special proto SWIG workaround header file. 35 #include "i18n/encodings/internal/encodings_proto_wrapper.h" 36 37 #endif 38 39 const int kNumEncodings = NUM_ENCODINGS; 40 41 // some of the popular encoding aliases 42 // TODO(jrm) Make these static const Encoding values instead of macros. 43 #define LATIN1 ISO_8859_1 44 #define LATIN2 ISO_8859_2 45 #define LATIN3 ISO_8859_3 46 #define LATIN4 ISO_8859_4 47 #define CYRILLIC ISO_8859_5 48 #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language 49 #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language 50 #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language 51 #define LATIN5 ISO_8859_9 52 #define LATIN6 ISO_8859_10 53 #define KOREAN_HANGUL KOREAN_EUC_KR 54 55 // The default Encoding (LATIN1). 56 Encoding default_encoding(); 57 58 59 60 // ************************************************************* 61 // Encoding predicates 62 // IsValidEncoding() 63 // IsEncEncCompatible 64 // IsSupersetOfAscii7Bit 65 // Is8BitEncoding 66 // IsCJKEncoding 67 // IsHebrewEncoding 68 // IsRightToLeftEncoding 69 // IsLogicalRightToLeftEncoding 70 // IsVisualRightToLeftEncoding 71 // IsIso2022Encoding 72 // IsIso2022JpOrVariant 73 // IsShiftJisOrVariant 74 // IsJapaneseCellPhoneCarrierSpecificEncoding 75 // ************************************************************* 76 77 // IsValidEncoding 78 // =================================== 79 // 80 // Function to check if the input language enum is within range. 81 // 82 83 bool IsValidEncoding(Encoding enc); 84 85 // 86 // IsEncEncCompatible 87 // ------------------ 88 // 89 // This function is to determine whether or not converting from the 90 // first encoding to the second requires any changes to the underlying 91 // text (e.g. ASCII_7BIT is a subset of UTF8). 92 // 93 // TODO(someone more familiar with i18n): the current implementation 94 // is likely incomplete. It would be good to consider the full matrix 95 // of all pairs of encodings and to fish out all compatible pairs. 96 // 97 bool IsEncEncCompatible(const Encoding from, const Encoding to); 98 99 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given 100 // encoding represent the same characters as they do in ISO_8859_1. 101 102 // WARNING: This function does not currently return true for all encodings that 103 // are supersets of Ascii 7-bit. 104 bool IsSupersetOfAscii7Bit(Encoding e); 105 106 // To be an 8-bit encoding means that there are fewer than 256 symbols. 107 // Each byte determines a new character; there are no multi-byte sequences. 108 109 // WARNING: This function does not currently return true for all encodings that 110 // are 8-bit encodings. 111 bool Is8BitEncoding(Encoding e); 112 113 // IsCJKEncoding 114 // ------------- 115 // 116 // This function returns true if the encoding is either Chinese 117 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not 118 // considered a CJK encoding. 119 bool IsCJKEncoding(Encoding e); 120 121 // IsHebrewEncoding 122 // ------------- 123 // 124 // This function returns true if the encoding is a Hebrew specific 125 // encoding (not UTF8, etc). 126 bool IsHebrewEncoding(Encoding e); 127 128 // IsRightToLeftEncoding 129 // --------------------- 130 // 131 // Returns true if the encoding is a right-to-left encoding. 132 // 133 // Note that the name of this function is somewhat misleading. There is nothing 134 // "right to left" about these encodings. They merely contain code points for 135 // characters in RTL languages such as Hebrew and Arabic. But this is also 136 // true for UTF-8. 137 // 138 // TODO(benjy): Get rid of this function. The only special-case we 139 // should need to worry about are visual encodings. Anything we 140 // need to do for all 'RTL' encodings we need to do for UTF-8 as well. 141 bool IsRightToLeftEncoding(Encoding enc); 142 143 // IsLogicalRightToLeftEncoding 144 // ---------------------------- 145 // 146 // Returns true if the encoding is a logical right-to-left encoding. 147 // Logical right-to-left encodings are those that the browser renders 148 // right-to-left and applies the BiDi algorithm to. Therefore the characters 149 // appear in reading order in the file, and indexing, snippet generation etc. 150 // should all just work with no special processing. 151 // 152 // TODO(benjy): Get rid of this function. The only special-case we 153 // should need to worry about are visual encodings. 154 bool IsLogicalRightToLeftEncoding(Encoding enc); 155 156 // IsVisualRightToLeftEncoding 157 // --------------------------- 158 // 159 // Returns true if the encoding is a visual right-to-left encoding. 160 // Visual right-to-left encodings are those that the browser renders 161 // left-to-right and does not apply the BiDi algorithm to. Therefore each 162 // line appears in reverse order in the file, lines are manually wrapped 163 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of 164 // the prehistoric days when browsers couldn't render right-to-left, but 165 // unfortunately some visual pages persist to this day. These documents require 166 // special processing so that we don't index or snippet them with each line 167 // reversed. 168 bool IsVisualRightToLeftEncoding(Encoding enc); 169 170 // IsIso2022Encoding 171 // ----------------- 172 // 173 // Returns true if the encoding is a kind of ISO 2022 such as 174 // ISO-2022-JP. 175 bool IsIso2022Encoding(Encoding enc); 176 177 // IsIso2022JpOrVariant 178 // -------------------- 179 // 180 // Returns true if the encoding is ISO-2022-JP or a variant such as 181 // KDDI's ISO-2022-JP. 182 bool IsIso2022JpOrVariant(Encoding enc); 183 184 // IsShiftJisOrVariant 185 // -------------------- 186 // 187 // Returns true if the encoding is Shift_JIS or a variant such as 188 // KDDI's Shift_JIS. 189 bool IsShiftJisOrVariant(Encoding enc); 190 191 // IsJapanesCellPhoneCarrierSpecificEncoding 192 // ----------------------------------------- 193 // 194 // Returns true if it's Japanese cell phone carrier specific encoding 195 // such as KDDI_SHIFT_JIS. 196 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc); 197 198 199 200 // ************************************************************* 201 // ENCODING NAMES 202 // 203 // This interface defines a standard name for each valid encoding, and 204 // a standard name for invalid encodings. (Some names use all upper 205 // case, but others use mixed case.) 206 // 207 // EncodingName() [Encoding to name] 208 // MimeEncodingName() [Encoding to name] 209 // EncodingFromName() [name to Encoding] 210 // EncodingNameAliasToEncoding() [name to Encoding] 211 // default_encoding_name() 212 // invalid_encoding_name() 213 // ************************************************************* 214 215 // EncodingName 216 // ------------ 217 // 218 // Given the encoding, returns its standard name. 219 // Return invalid_encoding_name() if the encoding is invalid. 220 // 221 const char* EncodingName(Encoding enc); 222 223 // 224 // MimeEncodingName 225 // ---------------- 226 // 227 // Return the "preferred MIME name" of an encoding. 228 // 229 // This name is suitable for using in HTTP headers, HTML tags, 230 // and as the "charset" parameter of a MIME Content-Type. 231 const char* MimeEncodingName(Encoding enc); 232 233 234 // The maximum length of an encoding name 235 const int kMaxEncodingNameSize = 50; 236 237 // The standard name of the default encoding. 238 const char* default_encoding_name(); 239 240 // The name used for an invalid encoding. 241 const char* invalid_encoding_name(); 242 243 // EncodingFromName 244 // ---------------- 245 // 246 // If enc_name matches the standard name of an Encoding, using a 247 // case-insensitive comparison, set *encoding to that Encoding and 248 // return true. Otherwise set *encoding to UNKNOWN_ENCODING and 249 // return false. 250 // 251 // REQUIRES: encoding must not be NULL. 252 // 253 bool EncodingFromName(const char* enc_name, Encoding *encoding); 254 255 // 256 // EncodingNameAliasToEncoding 257 // --------------------------- 258 // 259 // If enc_name matches the standard name or an alias of an Encoding, 260 // using a case-insensitive comparison, return that 261 // Encoding. Otherwise, return UNKNOWN_ENCODING. 262 // 263 // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for 264 // GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and 265 // common variations with hyphens and underscores (e.g., "koi8-u" and 266 // "koi8u" for RUSSIAN_KOI8_R). 267 268 Encoding EncodingNameAliasToEncoding(const char *enc_name); 269 270 271 // ************************************************************* 272 // Miscellany 273 // ************************************************************* 274 275 // PreferredWebOutputEncoding 276 // -------------------------- 277 // 278 // Some multi-byte encodings use byte values that coincide with the 279 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE 280 // can misinterpret these, as indicated in an external XSS report from 281 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We 282 // also use UTF8 instead of encodings that we don't support in our 283 // output, and we generally try to be conservative in what we send out. 284 // Where the client asks for single- or double-byte encodings that are 285 // not as common, we substitute a more common single- or double-byte 286 // encoding, if there is one, thereby preserving the client's intent 287 // to use less space than UTF-8. This also means that characters 288 // outside the destination set will be converted to HTML NCRs (&#NNN;) 289 // if requested. 290 Encoding PreferredWebOutputEncoding(Encoding enc); 291 292 293 // InitEncodings 294 // ------------- 295 // 296 // Ensures the encodings module has been initialized. Normally this happens 297 // during InitGoogle, but this allows access for scripts that don't 298 // support InitGoogle. 299 void InitEncodings(); 300 301 #endif // ENCODINGS_PUBLIC_ENCODINGS_H_ 302