1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package norm 6 7// This file contains Form-specific logic and wrappers for data in tables.go. 8 9// Rune info is stored in a separate trie per composing form. A composing form 10// and its corresponding decomposing form share the same trie. Each trie maps 11// a rune to a uint16. The values take two forms. For v >= 0x8000: 12// bits 13// 15: 1 (inverse of NFD_QC bit of qcInfo) 14// 13..7: qcInfo (see below). isYesD is always true (no decompostion). 15// 6..0: ccc (compressed CCC value). 16// For v < 0x8000, the respective rune has a decomposition and v is an index 17// into a byte array of UTF-8 decomposition sequences and additional info and 18// has the form: 19// <header> <decomp_byte>* [<tccc> [<lccc>]] 20// The header contains the number of bytes in the decomposition (excluding this 21// length byte). The two most significant bits of this length byte correspond 22// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1. 23// The byte sequence is followed by a trailing and leading CCC if the values 24// for these are not zero. The value of v determines which ccc are appended 25// to the sequences. For v < firstCCC, there are none, for v >= firstCCC, 26// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC 27// there is an additional leading ccc. The value of tccc itself is the 28// trailing CCC shifted left 2 bits. The two least-significant bits of tccc 29// are the number of trailing non-starters. 30 31const ( 32 qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo 33 headerLenMask = 0x3F // extract the length value from the header byte 34 headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte 35) 36 37// Properties provides access to normalization properties of a rune. 38type Properties struct { 39 pos uint8 // start position in reorderBuffer; used in composition.go 40 size uint8 // length of UTF-8 encoding of this rune 41 ccc uint8 // leading canonical combining class (ccc if not decomposition) 42 tccc uint8 // trailing canonical combining class (ccc if not decomposition) 43 nLead uint8 // number of leading non-starters. 44 flags qcInfo // quick check flags 45 index uint16 46} 47 48// functions dispatchable per form 49type lookupFunc func(b input, i int) Properties 50 51// formInfo holds Form-specific functions and tables. 52type formInfo struct { 53 form Form 54 composing, compatibility bool // form type 55 info lookupFunc 56 nextMain iterFunc 57} 58 59var formTable = []*formInfo{{ 60 form: NFC, 61 composing: true, 62 compatibility: false, 63 info: lookupInfoNFC, 64 nextMain: nextComposed, 65}, { 66 form: NFD, 67 composing: false, 68 compatibility: false, 69 info: lookupInfoNFC, 70 nextMain: nextDecomposed, 71}, { 72 form: NFKC, 73 composing: true, 74 compatibility: true, 75 info: lookupInfoNFKC, 76 nextMain: nextComposed, 77}, { 78 form: NFKD, 79 composing: false, 80 compatibility: true, 81 info: lookupInfoNFKC, 82 nextMain: nextDecomposed, 83}} 84 85// We do not distinguish between boundaries for NFC, NFD, etc. to avoid 86// unexpected behavior for the user. For example, in NFD, there is a boundary 87// after 'a'. However, 'a' might combine with modifiers, so from the application's 88// perspective it is not a good boundary. We will therefore always use the 89// boundaries for the combining variants. 90 91// BoundaryBefore returns true if this rune starts a new segment and 92// cannot combine with any rune on the left. 93func (p Properties) BoundaryBefore() bool { 94 if p.ccc == 0 && !p.combinesBackward() { 95 return true 96 } 97 // We assume that the CCC of the first character in a decomposition 98 // is always non-zero if different from info.ccc and that we can return 99 // false at this point. This is verified by maketables. 100 return false 101} 102 103// BoundaryAfter returns true if runes cannot combine with or otherwise 104// interact with this or previous runes. 105func (p Properties) BoundaryAfter() bool { 106 // TODO: loosen these conditions. 107 return p.isInert() 108} 109 110// We pack quick check data in 4 bits: 111// 5: Combines forward (0 == false, 1 == true) 112// 4..3: NFC_QC Yes(00), No (10), or Maybe (11) 113// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition. 114// 1..0: Number of trailing non-starters. 115// 116// When all 4 bits are zero, the character is inert, meaning it is never 117// influenced by normalization. 118type qcInfo uint8 119 120func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } 121func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } 122 123func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } 124func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe 125func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD 126 127func (p Properties) isInert() bool { 128 return p.flags&qcInfoMask == 0 && p.ccc == 0 129} 130 131func (p Properties) multiSegment() bool { 132 return p.index >= firstMulti && p.index < endMulti 133} 134 135func (p Properties) nLeadingNonStarters() uint8 { 136 return p.nLead 137} 138 139func (p Properties) nTrailingNonStarters() uint8 { 140 return uint8(p.flags & 0x03) 141} 142 143// Decomposition returns the decomposition for the underlying rune 144// or nil if there is none. 145func (p Properties) Decomposition() []byte { 146 // TODO: create the decomposition for Hangul? 147 if p.index == 0 { 148 return nil 149 } 150 i := p.index 151 n := decomps[i] & headerLenMask 152 i++ 153 return decomps[i : i+uint16(n)] 154} 155 156// Size returns the length of UTF-8 encoding of the rune. 157func (p Properties) Size() int { 158 return int(p.size) 159} 160 161// CCC returns the canonical combining class of the underlying rune. 162func (p Properties) CCC() uint8 { 163 if p.index >= firstCCCZeroExcept { 164 return 0 165 } 166 return ccc[p.ccc] 167} 168 169// LeadCCC returns the CCC of the first rune in the decomposition. 170// If there is no decomposition, LeadCCC equals CCC. 171func (p Properties) LeadCCC() uint8 { 172 return ccc[p.ccc] 173} 174 175// TrailCCC returns the CCC of the last rune in the decomposition. 176// If there is no decomposition, TrailCCC equals CCC. 177func (p Properties) TrailCCC() uint8 { 178 return ccc[p.tccc] 179} 180 181// Recomposition 182// We use 32-bit keys instead of 64-bit for the two codepoint keys. 183// This clips off the bits of three entries, but we know this will not 184// result in a collision. In the unlikely event that changes to 185// UnicodeData.txt introduce collisions, the compiler will catch it. 186// Note that the recomposition map for NFC and NFKC are identical. 187 188// combine returns the combined rune or 0 if it doesn't exist. 189func combine(a, b rune) rune { 190 key := uint32(uint16(a))<<16 + uint32(uint16(b)) 191 return recompMap[key] 192} 193 194func lookupInfoNFC(b input, i int) Properties { 195 v, sz := b.charinfoNFC(i) 196 return compInfo(v, sz) 197} 198 199func lookupInfoNFKC(b input, i int) Properties { 200 v, sz := b.charinfoNFKC(i) 201 return compInfo(v, sz) 202} 203 204// Properties returns properties for the first rune in s. 205func (f Form) Properties(s []byte) Properties { 206 if f == NFC || f == NFD { 207 return compInfo(nfcData.lookup(s)) 208 } 209 return compInfo(nfkcData.lookup(s)) 210} 211 212// PropertiesString returns properties for the first rune in s. 213func (f Form) PropertiesString(s string) Properties { 214 if f == NFC || f == NFD { 215 return compInfo(nfcData.lookupString(s)) 216 } 217 return compInfo(nfkcData.lookupString(s)) 218} 219 220// compInfo converts the information contained in v and sz 221// to a Properties. See the comment at the top of the file 222// for more information on the format. 223func compInfo(v uint16, sz int) Properties { 224 if v == 0 { 225 return Properties{size: uint8(sz)} 226 } else if v >= 0x8000 { 227 p := Properties{ 228 size: uint8(sz), 229 ccc: uint8(v), 230 tccc: uint8(v), 231 flags: qcInfo(v >> 8), 232 } 233 if p.ccc > 0 || p.combinesBackward() { 234 p.nLead = uint8(p.flags & 0x3) 235 } 236 return p 237 } 238 // has decomposition 239 h := decomps[v] 240 f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 241 p := Properties{size: uint8(sz), flags: f, index: v} 242 if v >= firstCCC { 243 v += uint16(h&headerLenMask) + 1 244 c := decomps[v] 245 p.tccc = c >> 2 246 p.flags |= qcInfo(c & 0x3) 247 if v >= firstLeadingCCC { 248 p.nLead = c & 0x3 249 if v >= firstStarterWithNLead { 250 // We were tricked. Remove the decomposition. 251 p.flags &= 0x03 252 p.index = 0 253 return p 254 } 255 p.ccc = decomps[v+1] 256 } 257 } 258 return p 259} 260