1// Copyright 2017 The Wuffs Authors. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package token 16 17import ( 18 "errors" 19 "fmt" 20) 21 22const ( 23 maxID = 1048575 24 maxLine = 1048575 25 maxTokenSize = 1023 26) 27 28func Unescape(s string) (unescaped string, ok bool) { 29 if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' { 30 return "", false 31 } 32 return s[1 : len(s)-1], true 33} 34 35type Map struct { 36 byName map[string]ID 37 byID []string 38} 39 40func (m *Map) Insert(name string) (ID, error) { 41 if name == "" { 42 return 0, nil 43 } 44 if id, ok := builtInsByName[name]; ok { 45 return id, nil 46 } 47 if m.byName == nil { 48 m.byName = map[string]ID{} 49 } 50 if id, ok := m.byName[name]; ok { 51 return id, nil 52 } 53 54 id := nBuiltInIDs + ID(len(m.byID)) 55 if id > maxID { 56 return 0, errors.New("token: too many distinct tokens") 57 } 58 m.byName[name] = id 59 m.byID = append(m.byID, name) 60 return id, nil 61} 62 63func (m *Map) ByName(name string) ID { 64 if id, ok := builtInsByName[name]; ok { 65 return id 66 } 67 if m.byName != nil { 68 return m.byName[name] 69 } 70 return 0 71} 72 73func (m *Map) ByID(x ID) string { 74 if x < nBuiltInIDs { 75 return builtInsByID[x] 76 } 77 x -= nBuiltInIDs 78 if uint(x) < uint(len(m.byID)) { 79 return m.byID[x] 80 } 81 return "" 82} 83 84func alpha(c byte) bool { 85 return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || (c == '_') 86} 87 88func alphaNumeric(c byte) bool { 89 return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || (c == '_') || ('0' <= c && c <= '9') 90} 91 92func hexaNumeric(c byte) bool { 93 return ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f') || ('0' <= c && c <= '9') 94} 95 96func numeric(c byte) bool { 97 return ('0' <= c && c <= '9') 98} 99 100func hasPrefix(a []byte, s string) bool { 101 if len(s) == 0 { 102 return true 103 } 104 if len(a) < len(s) { 105 return false 106 } 107 a = a[:len(s)] 108 for i := range a { 109 if a[i] != s[i] { 110 return false 111 } 112 } 113 return true 114} 115 116func Tokenize(m *Map, filename string, src []byte) (tokens []Token, comments []string, retErr error) { 117 line := uint32(1) 118loop: 119 for i := 0; i < len(src); { 120 c := src[i] 121 122 if c <= ' ' { 123 if c == '\n' { 124 if len(tokens) > 0 && tokens[len(tokens)-1].ID.IsImplicitSemicolon(m) { 125 tokens = append(tokens, Token{IDSemicolon, line}) 126 } 127 if line == maxLine { 128 return nil, nil, fmt.Errorf("token: too many lines in %q", filename) 129 } 130 line++ 131 } 132 i++ 133 continue 134 } 135 136 // TODO: recognize escapes such as `\t`, `\"` and `\\`. For now, we 137 // assume that strings don't contain control bytes or backslashes. 138 // Neither should be necessary to parse `use "foo/bar"` lines. 139 if c == '"' { 140 j := i + 1 141 for ; j < len(src); j++ { 142 c = src[j] 143 if c == '"' { 144 j++ 145 break 146 } 147 if c == '\\' { 148 return nil, nil, fmt.Errorf("token: backslash in string at %s:%d", filename, line) 149 } 150 if c == '\n' { 151 return nil, nil, fmt.Errorf("token: expected final '\"' in string at %s:%d", filename, line) 152 } 153 if c < ' ' { 154 return nil, nil, fmt.Errorf("token: control character in string at %s:%d", filename, line) 155 } 156 // The -1 is because we still haven't seen the final '"'. 157 if j-i == maxTokenSize-1 { 158 return nil, nil, fmt.Errorf("token: string too long at %s:%d", filename, line) 159 } 160 } 161 id, err := m.Insert(string(src[i:j])) 162 if err != nil { 163 return nil, nil, err 164 } 165 tokens = append(tokens, Token{id, line}) 166 i = j 167 continue 168 } 169 170 if alpha(c) { 171 j := i + 1 172 for ; j < len(src) && alphaNumeric(src[j]); j++ { 173 if j-i == maxTokenSize { 174 return nil, nil, fmt.Errorf("token: identifier too long at %s:%d", filename, line) 175 } 176 } 177 id, err := m.Insert(string(src[i:j])) 178 if err != nil { 179 return nil, nil, err 180 } 181 tokens = append(tokens, Token{id, line}) 182 i = j 183 continue 184 } 185 186 if numeric(c) { 187 // TODO: 0b11 binary numbers. 188 // 189 // TODO: allow underscores like 0b1000_0000_1111? 190 j, isDigit := i+1, numeric 191 if c == '0' && j < len(src) { 192 if next := src[j]; next == 'x' || next == 'X' { 193 j, isDigit = j+1, hexaNumeric 194 } else if numeric(next) { 195 return nil, nil, fmt.Errorf("token: legacy octal syntax at %s:%d", filename, line) 196 } 197 } 198 for ; j < len(src) && isDigit(src[j]); j++ { 199 if j-i == maxTokenSize { 200 return nil, nil, fmt.Errorf("token: constant too long at %s:%d", filename, line) 201 } 202 } 203 id, err := m.Insert(string(src[i:j])) 204 if err != nil { 205 return nil, nil, err 206 } 207 tokens = append(tokens, Token{id, line}) 208 i = j 209 continue 210 } 211 212 if c == '/' && i+1 < len(src) && src[i+1] == '/' { 213 h := i 214 i += 2 215 for ; i < len(src) && src[i] != '\n'; i++ { 216 } 217 for uint32(len(comments)) < line { 218 comments = append(comments, "") 219 } 220 comments = append(comments, string(src[h:i])) 221 continue 222 } 223 224 if id := squiggles[c]; id != 0 { 225 i++ 226 tokens = append(tokens, Token{id, line}) 227 continue 228 } 229 for _, x := range lexers[c] { 230 if hasPrefix(src[i+1:], x.suffix) { 231 i += len(x.suffix) + 1 232 tokens = append(tokens, Token{x.id, line}) 233 continue loop 234 } 235 } 236 237 msg := "" 238 if c <= 0x7F { 239 msg = fmt.Sprintf("byte '\\x%02X' (%q)", c, c) 240 } else { 241 msg = fmt.Sprintf("non-ASCII byte '\\x%02X'", c) 242 } 243 return nil, nil, fmt.Errorf("token: unrecognized %s at %s:%d", msg, filename, line) 244 } 245 return tokens, comments, nil 246} 247