• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2013 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* Transformations on dictionary words. */
8 
9 #ifndef BROTLI_DEC_TRANSFORM_H_
10 #define BROTLI_DEC_TRANSFORM_H_
11 
12 #include <brotli/types.h>
13 #include "./port.h"
14 
15 #if defined(__cplusplus) || defined(c_plusplus)
16 extern "C" {
17 #endif
18 
19 enum WordTransformType {
20   kIdentity = 0,
21   kOmitLast1 = 1,
22   kOmitLast2 = 2,
23   kOmitLast3 = 3,
24   kOmitLast4 = 4,
25   kOmitLast5 = 5,
26   kOmitLast6 = 6,
27   kOmitLast7 = 7,
28   kOmitLast8 = 8,
29   kOmitLast9 = 9,
30   kUppercaseFirst = 10,
31   kUppercaseAll = 11,
32   kOmitFirst1 = 12,
33   kOmitFirst2 = 13,
34   kOmitFirst3 = 14,
35   kOmitFirst4 = 15,
36   kOmitFirst5 = 16,
37   kOmitFirst6 = 17,
38   kOmitFirst7 = 18,
39   kOmitFirst8 = 19,
40   kOmitFirst9 = 20
41 };
42 
43 typedef struct {
44   const uint8_t prefix_id;
45   const uint8_t transform;
46   const uint8_t suffix_id;
47 } Transform;
48 
49 static const char kPrefixSuffix[208] =
50     "\0 \0, \0 of the \0 of \0s \0.\0 and \0 in \0\"\0 to \0\">\0\n\0. \0]\0"
51     " for \0 a \0 that \0\'\0 with \0 from \0 by \0(\0. The \0 on \0 as \0"
52     " is \0ing \0\n\t\0:\0ed \0=\"\0 at \0ly \0,\0=\'\0.com/\0. This \0"
53     " not \0er \0al \0ful \0ive \0less \0est \0ize \0\xc2\xa0\0ous ";
54 
55 enum {
56   /* EMPTY = ""
57      SP = " "
58      DQUOT = "\""
59      SQUOT = "'"
60      CLOSEBR = "]"
61      OPEN = "("
62      SLASH = "/"
63      NBSP = non-breaking space "\0xc2\xa0"
64   */
65   kPFix_EMPTY = 0,
66   kPFix_SP = 1,
67   kPFix_COMMASP = 3,
68   kPFix_SPofSPtheSP = 6,
69   kPFix_SPtheSP = 9,
70   kPFix_eSP = 12,
71   kPFix_SPofSP = 15,
72   kPFix_sSP = 20,
73   kPFix_DOT = 23,
74   kPFix_SPandSP = 25,
75   kPFix_SPinSP = 31,
76   kPFix_DQUOT = 36,
77   kPFix_SPtoSP = 38,
78   kPFix_DQUOTGT = 43,
79   kPFix_NEWLINE = 46,
80   kPFix_DOTSP = 48,
81   kPFix_CLOSEBR = 51,
82   kPFix_SPforSP = 53,
83   kPFix_SPaSP = 59,
84   kPFix_SPthatSP = 63,
85   kPFix_SQUOT = 70,
86   kPFix_SPwithSP = 72,
87   kPFix_SPfromSP = 79,
88   kPFix_SPbySP = 86,
89   kPFix_OPEN = 91,
90   kPFix_DOTSPTheSP = 93,
91   kPFix_SPonSP = 100,
92   kPFix_SPasSP = 105,
93   kPFix_SPisSP = 110,
94   kPFix_ingSP = 115,
95   kPFix_NEWLINETAB = 120,
96   kPFix_COLON = 123,
97   kPFix_edSP = 125,
98   kPFix_EQDQUOT = 129,
99   kPFix_SPatSP = 132,
100   kPFix_lySP = 137,
101   kPFix_COMMA = 141,
102   kPFix_EQSQUOT = 143,
103   kPFix_DOTcomSLASH = 146,
104   kPFix_DOTSPThisSP = 152,
105   kPFix_SPnotSP = 160,
106   kPFix_erSP = 166,
107   kPFix_alSP = 170,
108   kPFix_fulSP = 174,
109   kPFix_iveSP = 179,
110   kPFix_lessSP = 184,
111   kPFix_estSP = 190,
112   kPFix_izeSP = 195,
113   kPFix_NBSP = 200,
114   kPFix_ousSP = 203
115 };
116 
117 static const Transform kTransforms[] = {
118   { kPFix_EMPTY, kIdentity, kPFix_EMPTY },
119   { kPFix_EMPTY, kIdentity, kPFix_SP },
120   { kPFix_SP, kIdentity, kPFix_SP },
121   { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY },
122   { kPFix_EMPTY, kUppercaseFirst, kPFix_SP },
123   { kPFix_EMPTY, kIdentity, kPFix_SPtheSP },
124   { kPFix_SP, kIdentity, kPFix_EMPTY },
125   { kPFix_sSP, kIdentity, kPFix_SP },
126   { kPFix_EMPTY, kIdentity, kPFix_SPofSP },
127   { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY },
128   { kPFix_EMPTY, kIdentity, kPFix_SPandSP },
129   { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY },
130   { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY },
131   { kPFix_COMMASP, kIdentity, kPFix_SP },
132   { kPFix_EMPTY, kIdentity, kPFix_COMMASP },
133   { kPFix_SP, kUppercaseFirst, kPFix_SP },
134   { kPFix_EMPTY, kIdentity, kPFix_SPinSP },
135   { kPFix_EMPTY, kIdentity, kPFix_SPtoSP },
136   { kPFix_eSP, kIdentity, kPFix_SP },
137   { kPFix_EMPTY, kIdentity, kPFix_DQUOT },
138   { kPFix_EMPTY, kIdentity, kPFix_DOT },
139   { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT },
140   { kPFix_EMPTY, kIdentity, kPFix_NEWLINE },
141   { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY },
142   { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR },
143   { kPFix_EMPTY, kIdentity, kPFix_SPforSP },
144   { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY },
145   { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY },
146   { kPFix_EMPTY, kIdentity, kPFix_SPaSP },
147   { kPFix_EMPTY, kIdentity, kPFix_SPthatSP },
148   { kPFix_SP, kUppercaseFirst, kPFix_EMPTY },
149   { kPFix_EMPTY, kIdentity, kPFix_DOTSP },
150   { kPFix_DOT, kIdentity, kPFix_EMPTY },
151   { kPFix_SP, kIdentity, kPFix_COMMASP },
152   { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY },
153   { kPFix_EMPTY, kIdentity, kPFix_SPwithSP },
154   { kPFix_EMPTY, kIdentity, kPFix_SQUOT },
155   { kPFix_EMPTY, kIdentity, kPFix_SPfromSP },
156   { kPFix_EMPTY, kIdentity, kPFix_SPbySP },
157   { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY },
158   { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY },
159   { kPFix_SPtheSP, kIdentity, kPFix_EMPTY },
160   { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY },
161   { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP },
162   { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY },
163   { kPFix_EMPTY, kIdentity, kPFix_SPonSP },
164   { kPFix_EMPTY, kIdentity, kPFix_SPasSP },
165   { kPFix_EMPTY, kIdentity, kPFix_SPisSP },
166   { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY },
167   { kPFix_EMPTY, kOmitLast1, kPFix_ingSP },
168   { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB },
169   { kPFix_EMPTY, kIdentity, kPFix_COLON },
170   { kPFix_SP, kIdentity, kPFix_DOTSP },
171   { kPFix_EMPTY, kIdentity, kPFix_edSP },
172   { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY },
173   { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY },
174   { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY },
175   { kPFix_EMPTY, kIdentity, kPFix_OPEN },
176   { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP },
177   { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY },
178   { kPFix_EMPTY, kIdentity, kPFix_SPatSP },
179   { kPFix_EMPTY, kIdentity, kPFix_lySP },
180   { kPFix_SPtheSP, kIdentity, kPFix_SPofSP },
181   { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY },
182   { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY },
183   { kPFix_SP, kUppercaseFirst, kPFix_COMMASP },
184   { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT },
185   { kPFix_DOT, kIdentity, kPFix_OPEN },
186   { kPFix_EMPTY, kUppercaseAll, kPFix_SP },
187   { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT },
188   { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT },
189   { kPFix_SP, kIdentity, kPFix_DOT },
190   { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY },
191   { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP },
192   { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT },
193   { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP },
194   { kPFix_EMPTY, kIdentity, kPFix_COMMA },
195   { kPFix_DOT, kIdentity, kPFix_SP },
196   { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN },
197   { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT },
198   { kPFix_EMPTY, kIdentity, kPFix_SPnotSP },
199   { kPFix_SP, kIdentity, kPFix_EQDQUOT },
200   { kPFix_EMPTY, kIdentity, kPFix_erSP },
201   { kPFix_SP, kUppercaseAll, kPFix_SP },
202   { kPFix_EMPTY, kIdentity, kPFix_alSP },
203   { kPFix_SP, kUppercaseAll, kPFix_EMPTY },
204   { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT },
205   { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT },
206   { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP },
207   { kPFix_SP, kIdentity, kPFix_OPEN },
208   { kPFix_EMPTY, kIdentity, kPFix_fulSP },
209   { kPFix_SP, kUppercaseFirst, kPFix_DOTSP },
210   { kPFix_EMPTY, kIdentity, kPFix_iveSP },
211   { kPFix_EMPTY, kIdentity, kPFix_lessSP },
212   { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT },
213   { kPFix_EMPTY, kIdentity, kPFix_estSP },
214   { kPFix_SP, kUppercaseFirst, kPFix_DOT },
215   { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT },
216   { kPFix_SP, kIdentity, kPFix_EQSQUOT },
217   { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA },
218   { kPFix_EMPTY, kIdentity, kPFix_izeSP },
219   { kPFix_EMPTY, kUppercaseAll, kPFix_DOT },
220   { kPFix_NBSP, kIdentity, kPFix_EMPTY },
221   { kPFix_SP, kIdentity, kPFix_COMMA },
222   { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT },
223   { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT },
224   { kPFix_EMPTY, kIdentity, kPFix_ousSP },
225   { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP },
226   { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT },
227   { kPFix_SP, kUppercaseFirst, kPFix_COMMA },
228   { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT },
229   { kPFix_SP, kUppercaseAll, kPFix_COMMASP },
230   { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA },
231   { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN },
232   { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP },
233   { kPFix_SP, kUppercaseAll, kPFix_DOT },
234   { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT },
235   { kPFix_SP, kUppercaseAll, kPFix_DOTSP },
236   { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT },
237   { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT },
238   { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT },
239 };
240 
241 static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
242 
ToUpperCase(uint8_t * p)243 static int ToUpperCase(uint8_t* p) {
244   if (p[0] < 0xc0) {
245     if (p[0] >= 'a' && p[0] <= 'z') {
246       p[0] ^= 32;
247     }
248     return 1;
249   }
250   /* An overly simplified uppercasing model for UTF-8. */
251   if (p[0] < 0xe0) {
252     p[1] ^= 32;
253     return 2;
254   }
255   /* An arbitrary transform for three byte characters. */
256   p[2] ^= 5;
257   return 3;
258 }
259 
TransformDictionaryWord(uint8_t * dst,const uint8_t * word,int len,int transform)260 static BROTLI_NOINLINE int TransformDictionaryWord(
261     uint8_t* dst, const uint8_t* word, int len, int transform) {
262   int idx = 0;
263   {
264     const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id];
265     while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
266   }
267   {
268     const int t = kTransforms[transform].transform;
269     int i = 0;
270     int skip = t - (kOmitFirst1 - 1);
271     if (skip > 0) {
272       word += skip;
273       len -= skip;
274     } else if (t <= kOmitLast9) {
275       len -= t;
276     }
277     while (i < len) { dst[idx++] = word[i++]; }
278     if (t == kUppercaseFirst) {
279       ToUpperCase(&dst[idx - len]);
280     } else if (t == kUppercaseAll) {
281       uint8_t* uppercase = &dst[idx - len];
282       while (len > 0) {
283         int step = ToUpperCase(uppercase);
284         uppercase += step;
285         len -= step;
286       }
287     }
288   }
289   {
290     const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id];
291     while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
292     return idx;
293   }
294 }
295 
296 #if defined(__cplusplus) || defined(c_plusplus)
297 }  /* extern "C" */
298 #endif
299 
300 #endif  /* BROTLI_DEC_TRANSFORM_H_ */
301