• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2017 The Wuffs Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package token
16
17import (
18	"errors"
19	"fmt"
20)
21
// Limits enforced by this package. Each one is a power of two, minus one.
const (
	// maxID is the largest ID value that Map.Insert will assign.
	maxID = 1<<20 - 1
	// maxLine is the largest line number that Tokenize will count up to.
	maxLine = 1<<20 - 1
	// maxTokenSize is the largest size, in bytes, of a single token that
	// Tokenize accepts.
	maxTokenSize = 1<<10 - 1
)
27
// Unescape strips the enclosing double quotes from s.
//
// It returns ok == false (and an empty string) unless s is at least two bytes
// long and both starts and ends with a '"' byte. The bytes between the quotes
// are returned as is: no escape sequences are interpreted.
func Unescape(s string) (unescaped string, ok bool) {
	const quote = '"'
	last := len(s) - 1
	if last < 1 {
		return "", false
	}
	if s[0] != quote || s[last] != quote {
		return "", false
	}
	return s[1:last], true
}
34
// Map is a two-way mapping between token names and IDs.
//
// Built-in names are resolved through the package-level builtInsByName and
// builtInsByID tables. The Map itself stores only the additional names passed
// to Insert. The zero Map is ready to use: Insert allocates byName on demand.
type Map struct {
	// byName maps each inserted name to the ID that Insert assigned to it.
	byName map[string]ID
	// byID holds the inserted names in insertion order. The name at index i
	// was assigned the ID (nBuiltInIDs + i).
	byID   []string
}
39
40func (m *Map) Insert(name string) (ID, error) {
41	if name == "" {
42		return 0, nil
43	}
44	if id, ok := builtInsByName[name]; ok {
45		return id, nil
46	}
47	if m.byName == nil {
48		m.byName = map[string]ID{}
49	}
50	if id, ok := m.byName[name]; ok {
51		return id, nil
52	}
53
54	id := nBuiltInIDs + ID(len(m.byID))
55	if id > maxID {
56		return 0, errors.New("token: too many distinct tokens")
57	}
58	m.byName[name] = id
59	m.byID = append(m.byID, name)
60	return id, nil
61}
62
63func (m *Map) ByName(name string) ID {
64	if id, ok := builtInsByName[name]; ok {
65		return id
66	}
67	if m.byName != nil {
68		return m.byName[name]
69	}
70	return 0
71}
72
73func (m *Map) ByID(x ID) string {
74	if x < nBuiltInIDs {
75		return builtInsByID[x]
76	}
77	x -= nBuiltInIDs
78	if uint(x) < uint(len(m.byID)) {
79		return m.byID[x]
80	}
81	return ""
82}
83
// alpha reports whether c is an ASCII letter or an underscore.
func alpha(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return c == '_'
}
87
// alphaNumeric reports whether c is an ASCII letter, an ASCII decimal digit or
// an underscore.
func alphaNumeric(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	}
	return c == '_'
}
91
// hexaNumeric reports whether c is an ASCII hexadecimal digit, in either upper
// or lower case.
func hexaNumeric(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	}
	return 'A' <= c && c <= 'F'
}
95
// numeric reports whether c is an ASCII decimal digit.
func numeric(c byte) bool {
	if c < '0' {
		return false
	}
	return c <= '9'
}
99
// hasPrefix reports whether the byte slice a starts with the bytes of s. The
// empty string is a prefix of every a, including a nil or empty one.
func hasPrefix(a []byte, s string) bool {
	n := len(s)
	if n > len(a) {
		return false
	}
	// Compare the leading n bytes of a against s in one step.
	return string(a[:n]) == s
}
115
116func Tokenize(m *Map, filename string, src []byte) (tokens []Token, comments []string, retErr error) {
117	line := uint32(1)
118loop:
119	for i := 0; i < len(src); {
120		c := src[i]
121
122		if c <= ' ' {
123			if c == '\n' {
124				if len(tokens) > 0 && tokens[len(tokens)-1].ID.IsImplicitSemicolon(m) {
125					tokens = append(tokens, Token{IDSemicolon, line})
126				}
127				if line == maxLine {
128					return nil, nil, fmt.Errorf("token: too many lines in %q", filename)
129				}
130				line++
131			}
132			i++
133			continue
134		}
135
136		// TODO: recognize escapes such as `\t`, `\"` and `\\`. For now, we
137		// assume that strings don't contain control bytes or backslashes.
138		// Neither should be necessary to parse `use "foo/bar"` lines.
139		if c == '"' {
140			j := i + 1
141			for ; j < len(src); j++ {
142				c = src[j]
143				if c == '"' {
144					j++
145					break
146				}
147				if c == '\\' {
148					return nil, nil, fmt.Errorf("token: backslash in string at %s:%d", filename, line)
149				}
150				if c == '\n' {
151					return nil, nil, fmt.Errorf("token: expected final '\"' in string at %s:%d", filename, line)
152				}
153				if c < ' ' {
154					return nil, nil, fmt.Errorf("token: control character in string at %s:%d", filename, line)
155				}
156				// The -1 is because we still haven't seen the final '"'.
157				if j-i == maxTokenSize-1 {
158					return nil, nil, fmt.Errorf("token: string too long at %s:%d", filename, line)
159				}
160			}
161			id, err := m.Insert(string(src[i:j]))
162			if err != nil {
163				return nil, nil, err
164			}
165			tokens = append(tokens, Token{id, line})
166			i = j
167			continue
168		}
169
170		if alpha(c) {
171			j := i + 1
172			for ; j < len(src) && alphaNumeric(src[j]); j++ {
173				if j-i == maxTokenSize {
174					return nil, nil, fmt.Errorf("token: identifier too long at %s:%d", filename, line)
175				}
176			}
177			id, err := m.Insert(string(src[i:j]))
178			if err != nil {
179				return nil, nil, err
180			}
181			tokens = append(tokens, Token{id, line})
182			i = j
183			continue
184		}
185
186		if numeric(c) {
187			// TODO: 0b11 binary numbers.
188			//
189			// TODO: allow underscores like 0b1000_0000_1111?
190			j, isDigit := i+1, numeric
191			if c == '0' && j < len(src) {
192				if next := src[j]; next == 'x' || next == 'X' {
193					j, isDigit = j+1, hexaNumeric
194				} else if numeric(next) {
195					return nil, nil, fmt.Errorf("token: legacy octal syntax at %s:%d", filename, line)
196				}
197			}
198			for ; j < len(src) && isDigit(src[j]); j++ {
199				if j-i == maxTokenSize {
200					return nil, nil, fmt.Errorf("token: constant too long at %s:%d", filename, line)
201				}
202			}
203			id, err := m.Insert(string(src[i:j]))
204			if err != nil {
205				return nil, nil, err
206			}
207			tokens = append(tokens, Token{id, line})
208			i = j
209			continue
210		}
211
212		if c == '/' && i+1 < len(src) && src[i+1] == '/' {
213			h := i
214			i += 2
215			for ; i < len(src) && src[i] != '\n'; i++ {
216			}
217			for uint32(len(comments)) < line {
218				comments = append(comments, "")
219			}
220			comments = append(comments, string(src[h:i]))
221			continue
222		}
223
224		if id := squiggles[c]; id != 0 {
225			i++
226			tokens = append(tokens, Token{id, line})
227			continue
228		}
229		for _, x := range lexers[c] {
230			if hasPrefix(src[i+1:], x.suffix) {
231				i += len(x.suffix) + 1
232				tokens = append(tokens, Token{x.id, line})
233				continue loop
234			}
235		}
236
237		msg := ""
238		if c <= 0x7F {
239			msg = fmt.Sprintf("byte '\\x%02X' (%q)", c, c)
240		} else {
241			msg = fmt.Sprintf("non-ASCII byte '\\x%02X'", c)
242		}
243		return nil, nil, fmt.Errorf("token: unrecognized %s at %s:%d", msg, filename, line)
244	}
245	return tokens, comments, nil
246}
247