• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
6 #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
7 
8 #if !defined(CLD_WINDOWS)
9 
10 #include "util/utf8/utf8statetable.h"
11 
12 #else
13 
14 #include "encodings/compact_lang_det/win/cld_basictypes.h"
15 
16 // These four-byte entries compactly encode how many bytes 0..255 to delete
17 // in making a string replacement, how many bytes to add 0..255, and the offset
18 // 0..64k-1 of the replacement string in remap_string.
19 struct RemapEntry {
20   uint8 delete_bytes;
21   uint8 add_bytes;
22   uint16 bytes_offset;
23 };
24 
// Exit codes used by the one-byte state tables.
//
// kExitDstSpaceFull is only ever produced by executable code; every other
// code is stored directly in a one-byte state-table entry. So that an exit
// code can be told apart from a next-state entry, the codes occupy a single
// contiguous range whose top value is kExitNone (255).
//
// NOTE(review): the earlier wording called the entries "signed", but the
// one-byte state_table is declared as uint8 and the values 240..255 only fit
// an unsigned byte.
typedef enum {
  kExitDstSpaceFull     = 239,  // never stored in a table
  kExitIllegalStructure = 240,
  kExitOK               = 241,
  kExitReject           = 242,
  kExitReplace1         = 243,
  kExitReplace2         = 244,
  kExitReplace3         = 245,
  kExitReplace21        = 246,
  kExitReplace31        = 247,
  kExitReplace32        = 248,
  kExitReplaceOffset1   = 249,
  kExitReplaceOffset2   = 250,
  kExitReplace1S0       = 251,
  kExitSpecial          = 252,
  kExitDoAgain          = 253,
  kExitRejectAlt        = 254,
  kExitNone             = 255
} ExitReason;
48 
// Exit codes with the same names and ordering as ExitReason, renumbered as
// one contiguous run of large negative values ending at kExitNone_2.
//
// NOTE(review): presumably these are the codes stored in the signed
// two-byte entries of UTF8StateMachineObj_2 tables -- confirm against the
// table generator. kExitDstSpaceFull_2 (-32769) lies outside the 16-bit
// signed range, so it cannot be held in such an entry.
typedef enum {
  kExitDstSpaceFull_2     = -32769,
  kExitIllegalStructure_2 = -32768,
  kExitOK_2               = -32767,
  kExitReject_2           = -32766,
  kExitReplace1_2         = -32765,
  kExitReplace2_2         = -32764,
  kExitReplace3_2         = -32763,
  kExitReplace21_2        = -32762,
  kExitReplace31_2        = -32761,
  kExitReplace32_2        = -32760,
  kExitReplaceOffset1_2   = -32759,
  kExitReplaceOffset2_2   = -32758,
  kExitReplace1S0_2       = -32757,
  kExitSpecial_2          = -32756,
  kExitDoAgain_2          = -32755,
  kExitRejectAlt_2        = -32754,
  kExitNone_2             = -32753
} ExitReason_2;
68 
69 // This struct represents one entire state table. The three initialized byte
70 // areas are state_table, remap_base, and remap_string. state0 and state0_size
71 // give the byte offset and length within state_table of the initial state --
72 // table lookups are expected to start and end in this state, but for
73 // truncated UTF-8 strings, may end in a different state. These allow a quick
74 // test for that condition. entry_shift is 8 for tables subscripted by a full
75 // byte value and 6 for space-optimized tables subscripted by only six
76 // significant bits in UTF-8 continuation bytes.
77 typedef struct {
78   const uint32 state0;
79   const uint32 state0_size;
80   const uint32 total_size;
81   const int max_expand;
82   const int entry_shift;
83   const int bytes_per_entry;
84   const uint32 losub;
85   const uint32 hiadd;
86   const uint8* state_table;
87   const RemapEntry* remap_base;
88   const uint8* remap_string;
89   const uint8* fast_state;
90 } UTF8StateMachineObj;
91 
92 // Near-duplicate declaration for tables with two-byte entries
93 typedef struct {
94   const uint32 state0;
95   const uint32 state0_size;
96   const uint32 total_size;
97   const int max_expand;
98   const int entry_shift;
99   const int bytes_per_entry;
100   const uint32 losub;
101   const uint32 hiadd;
102   const signed short* state_table;
103   const RemapEntry* remap_base;
104   const uint8* remap_string;
105   const uint8* fast_state;
106 } UTF8StateMachineObj_2;
107 
108 
109 typedef UTF8StateMachineObj UTF8PropObj;
110 typedef UTF8StateMachineObj UTF8ScanObj;
111 typedef UTF8StateMachineObj_2 UTF8PropObj_2;
112 
113 
114 // Look up property of one UTF-8 character and advance over it
115 // Return 0 if input length is zero
116 // Return 0 and advance one byte if input is ill-formed
117 uint8 UTF8GenericProperty(const UTF8PropObj* st,
118                           const uint8** src,
119                           int* srclen);
120 
121 // BigOneByte versions are needed for tables > 240 states, but most
122 // won't need the TwoByte versions.
123 
124 // Look up property of one UTF-8 character and advance over it
125 // Return 0 if input length is zero
126 // Return 0 and advance one byte if input is ill-formed
127 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
128                           const uint8** src,
129                           int* srclen);
130 
131 // Scan a UTF-8 stringpiece based on a state table.
132 // Always scan complete UTF-8 characters
133 // Set number of bytes scanned. Return reason for exiting
134 int UTF8GenericScan(const UTF8ScanObj* st,
135                     const uint8* str,
136                     const int len,
137                     int* bytes_consumed);
138 
139 #endif
140 
141 #endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
142