• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
#define UTIL_ENCODINGS_ENCODINGS_PB_H_

// Encoding identifies a character encoding, or an encoding-like pseudo-value
// such as UNKNOWN_ENCODING or BINARYENC.
//
// Every enumerator has an explicit value. The values run contiguously from 0
// through NUM_ENCODINGS - 1; NUM_ENCODINGS itself is a count, not an encoding.
// When adding an encoding, append it just before NUM_ENCODINGS and bump
// NUM_ENCODINGS so the range stays contiguous.
//
// NOTE(review): the "_PB_" in the include guard suggests these numbers mirror
// a protocol buffer definition -- confirm before renumbering anything.
enum Encoding {
  ISO_8859_1           =  0,  // Teragram ASCII
  ISO_8859_2           =  1,  // Teragram Latin2
  ISO_8859_3           =  2,  // in BasisTech but not in Teragram
  ISO_8859_4           =  3,  // Teragram Latin4
  ISO_8859_5           =  4,  // Teragram ISO-8859-5
  ISO_8859_6           =  5,  // Teragram Arabic
  ISO_8859_7           =  6,  // Teragram Greek
  ISO_8859_8           =  7,  // Teragram Hebrew
  ISO_8859_9           =  8,  // in BasisTech but not in Teragram
  ISO_8859_10          =  9,  // in BasisTech but not in Teragram
  JAPANESE_EUC_JP      = 10,  // Teragram EUC_JP
  JAPANESE_SHIFT_JIS   = 11,  // Teragram SJS
  JAPANESE_JIS         = 12,  // Teragram JIS
  CHINESE_BIG5         = 13,  // Teragram BIG5
  CHINESE_GB           = 14,  // Teragram GB
  CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
                              // CNS11643EUC, before that Teragram EUC-CN(!)
                              // See //i18n/basistech/basistech_encodings.h
  KOREAN_EUC_KR        = 16,  // Teragram KSC
  UNICODE              = 17,  // Teragram Unicode
  CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was Basis Tech
                              // CNS11643EUC, before that Teragram EUC.
  CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was Basis Tech
                              // CNS11643EUC, before that Teragram CNS.
  CHINESE_BIG5_CP950   = 20,  // Teragram BIG5_CP950
  JAPANESE_CP932       = 21,  // Teragram CP932
  UTF8                 = 22,
  UNKNOWN_ENCODING     = 23,
  ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
                              // Should be present only in the crawler
                              // and in the repository,
                              // *never* as a result of Document::encoding().
  RUSSIAN_KOI8_R       = 25,  // Teragram KOI8R
  RUSSIAN_CP1251       = 26,  // Teragram CP1251

  //----------------------------------------------------------
  // These are _not_ output from teragram. Instead, they are as
  // detected in the headers of usenet articles.
  MSFT_CP1252          = 27,  // CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
                              // Misnamed, this is _not_ KOI8-RU but KOI8-U.
                              // KOI8-U is used much more often than KOI8-RU.
  MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
  ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
  //----------------------------------------------------------

  //----------------------------------------------------------
  // These are in BasisTech but not in Teragram. They are
  // needed for new interface languages. Now detected by
  // research langid
  MSFT_CP1254          = 31,  // used for Turkish
  MSFT_CP1257          = 32,  // used in Baltic countries
  //----------------------------------------------------------

  //----------------------------------------------------------
  // New encodings detected by Teragram
  ISO_8859_11          = 33,  // aka TIS-620, used for Thai
  MSFT_CP874           = 34,  // used for Thai
  MSFT_CP1256          = 35,  // used for Arabic

  //----------------------------------------------------------
  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
  MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
  ISO_8859_8_I         = 37,  // Iso Hebrew Logical
  HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
  //----------------------------------------------------------

  //----------------------------------------------------------
  // Detected by research langid
  CZECH_CP852          = 39,
  CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253          = 41,  // used for Greek
  RUSSIAN_CP866        = 42,
  //----------------------------------------------------------

  //----------------------------------------------------------
  // Handled by iconv in glibc
  ISO_8859_13          = 43,
  ISO_2022_KR          = 44,
  GBK                  = 45,
  GB18030              = 46,
  BIG5_HKSCS           = 47,
  ISO_2022_CN          = 48,

  //-----------------------------------------------------------
  // Detected by xin liu's detector
  // Handled by transcoder
  // (Indic encodings)

  TSCII                = 49,
  TAMIL_MONO           = 50,
  TAMIL_BI             = 51,
  JAGRAN               = 52,


  MACINTOSH_ROMAN      = 53,
  UTF7                 = 54,
  BHASKAR              = 55,  // Indic encoding - Devanagari
  HTCHANAKYA           = 56,  // Indic encoding - Devanagari

  //-----------------------------------------------------------
  // These allow a single place (inputconverter and outputconverter)
  // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
  // bulk conversions, with interchange-valid checking on input and
  // fallback if needed on output.
  UTF16BE              = 57,  // big-endian UTF-16
  UTF16LE              = 58,  // little-endian UTF-16
  UTF32BE              = 59,  // big-endian UTF-32
  UTF32LE              = 60,  // little-endian UTF-32
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // An encoding that means "This is not text, but it may have some
  // simple ASCII text embedded". Intended input conversion (not yet
  // implemented) is to keep strings of >=4 seven-bit ASCII characters
  // (follow each kept string with an ASCII space), delete the rest of
  // the bytes. This will pick up and allow indexing of e.g. captions
  // in JPEGs. No output conversion needed.
  BINARYENC            = 61,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
  HZ_GB_2312           = 62,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Some external vendors make the common input error of
  // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
  UTF8UTF8             = 63,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Handled by transcoder for tamil language specific font
  // encodings without the support for detection at present.
  TAM_ELANGO           = 64,  // Elango - Tamil
  TAM_LTTMBARANI       = 65,  // Barani - Tamil
  TAM_SHREE            = 66,  // Shree - Tamil
  TAM_TBOOMIS          = 67,  // TBoomis - Tamil
  TAM_TMNEWS           = 68,  // TMNews - Tamil
  TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Shift_JIS variants used by Japanese cell phone carriers.
  KDDI_SHIFT_JIS       = 70,
  DOCOMO_SHIFT_JIS     = 71,
  SOFTBANK_SHIFT_JIS   = 72,
  // ISO-2022-JP variants used by KDDI and SoftBank.
  KDDI_ISO_2022_JP     = 73,
  SOFTBANK_ISO_2022_JP = 74,
  //-----------------------------------------------------------

  NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
                              // valid Encoding enum, it is only used to
                              // indicate the total number of Encodings.
};

#endif  // UTIL_ENCODINGS_ENCODINGS_PB_H_