1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 6 #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 7 8 #if !defined(CLD_WINDOWS) 9 10 #include "util/utf8/utf8statetable.h" 11 12 #else 13 14 #include "encodings/compact_lang_det/win/cld_basictypes.h" 15 16 // These four-byte entries compactly encode how many bytes 0..255 to delete 17 // in making a string replacement, how many bytes to add 0..255, and the offset 18 // 0..64k-1 of the replacement string in remap_string. 19 struct RemapEntry { 20 uint8 delete_bytes; 21 uint8 add_bytes; 22 uint16 bytes_offset; 23 }; 24 25 // Exit type codes for state tables. All but the first get stuffed into 26 // signed one-byte entries. The first is only generated by executable code. 27 // To distinguish from next-state entries, these must be contiguous and 28 // all <= kExitNone 29 typedef enum { 30 kExitDstSpaceFull = 239, 31 kExitIllegalStructure, // 240 32 kExitOK, // 241 33 kExitReject, // ... 34 kExitReplace1, 35 kExitReplace2, 36 kExitReplace3, 37 kExitReplace21, 38 kExitReplace31, 39 kExitReplace32, 40 kExitReplaceOffset1, 41 kExitReplaceOffset2, 42 kExitReplace1S0, 43 kExitSpecial, 44 kExitDoAgain, 45 kExitRejectAlt, 46 kExitNone // 255 47 } ExitReason; 48 49 typedef enum { 50 kExitDstSpaceFull_2 = -32769, 51 kExitIllegalStructure_2, // -32768 52 kExitOK_2, // -32767 53 kExitReject_2, // ... 54 kExitReplace1_2, 55 kExitReplace2_2, 56 kExitReplace3_2, 57 kExitReplace21_2, 58 kExitReplace31_2, 59 kExitReplace32_2, 60 kExitReplaceOffset1_2, 61 kExitReplaceOffset2_2, 62 kExitReplace1S0_2, 63 kExitSpecial_2, 64 kExitDoAgain_2, 65 kExitRejectAlt_2, 66 kExitNone_2 // -32753 67 } ExitReason_2; 68 69 // This struct represents one entire state table. The three initialized byte 70 // areas are state_table, remap_base, and remap_string. state0 and state0_size 71 // give the byte offset and length within state_table of the initial state -- 72 // table lookups are expected to start and end in this state, but for 73 // truncated UTF-8 strings, may end in a different state. These allow a quick 74 // test for that condition. entry_shift is 8 for tables subscripted by a full 75 // byte value and 6 for space-optimized tables subscripted by only six 76 // significant bits in UTF-8 continuation bytes. 77 typedef struct { 78 const uint32 state0; 79 const uint32 state0_size; 80 const uint32 total_size; 81 const int max_expand; 82 const int entry_shift; 83 const int bytes_per_entry; 84 const uint32 losub; 85 const uint32 hiadd; 86 const uint8* state_table; 87 const RemapEntry* remap_base; 88 const uint8* remap_string; 89 const uint8* fast_state; 90 } UTF8StateMachineObj; 91 92 // Near-duplicate declaration for tables with two-byte entries 93 typedef struct { 94 const uint32 state0; 95 const uint32 state0_size; 96 const uint32 total_size; 97 const int max_expand; 98 const int entry_shift; 99 const int bytes_per_entry; 100 const uint32 losub; 101 const uint32 hiadd; 102 const signed short* state_table; 103 const RemapEntry* remap_base; 104 const uint8* remap_string; 105 const uint8* fast_state; 106 } UTF8StateMachineObj_2; 107 108 109 typedef UTF8StateMachineObj UTF8PropObj; 110 typedef UTF8StateMachineObj UTF8ScanObj; 111 typedef UTF8StateMachineObj_2 UTF8PropObj_2; 112 113 114 // Look up property of one UTF-8 character and advance over it 115 // Return 0 if input length is zero 116 // Return 0 and advance one byte if input is ill-formed 117 uint8 UTF8GenericProperty(const UTF8PropObj* st, 118 const uint8** src, 119 int* srclen); 120 121 // BigOneByte versions are needed for tables > 240 states, but most 122 // won't need the TwoByte versions. 123 124 // Look up property of one UTF-8 character and advance over it 125 // Return 0 if input length is zero 126 // Return 0 and advance one byte if input is ill-formed 127 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st, 128 const uint8** src, 129 int* srclen); 130 131 // Scan a UTF-8 stringpiece based on a state table. 132 // Always scan complete UTF-8 characters 133 // Set number of bytes scanned. Return reason for exiting 134 int UTF8GenericScan(const UTF8ScanObj* st, 135 const uint8* str, 136 const int len, 137 int* bytes_consumed); 138 139 #endif 140 141 #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_ 142