• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package norm
6
7import (
8	"fmt"
9	"unicode/utf8"
10)
11
12// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
13// sequence of starter and non-starter runes for the purpose of normalization.
14const MaxSegmentSize = maxByteBufferSize
15
16// An Iter iterates over a string or byte slice, while normalizing it
17// to a given Form.
18type Iter struct {
19	rb     reorderBuffer
20	buf    [maxByteBufferSize]byte
21	info   Properties // first character saved from previous iteration
22	next   iterFunc   // implementation of next depends on form
23	asciiF iterFunc
24
25	p        int    // current position in input source
26	multiSeg []byte // remainder of multi-segment decomposition
27}
28
29type iterFunc func(*Iter) []byte
30
31// Init initializes i to iterate over src after normalizing it to Form f.
32func (i *Iter) Init(f Form, src []byte) {
33	i.p = 0
34	if len(src) == 0 {
35		i.setDone()
36		i.rb.nsrc = 0
37		return
38	}
39	i.multiSeg = nil
40	i.rb.init(f, src)
41	i.next = i.rb.f.nextMain
42	i.asciiF = nextASCIIBytes
43	i.info = i.rb.f.info(i.rb.src, i.p)
44	i.rb.ss.first(i.info)
45}
46
47// InitString initializes i to iterate over src after normalizing it to Form f.
48func (i *Iter) InitString(f Form, src string) {
49	i.p = 0
50	if len(src) == 0 {
51		i.setDone()
52		i.rb.nsrc = 0
53		return
54	}
55	i.multiSeg = nil
56	i.rb.initString(f, src)
57	i.next = i.rb.f.nextMain
58	i.asciiF = nextASCIIString
59	i.info = i.rb.f.info(i.rb.src, i.p)
60	i.rb.ss.first(i.info)
61}
62
63// Seek sets the segment to be returned by the next call to Next to start
64// at position p.  It is the responsibility of the caller to set p to the
65// start of a segment.
66func (i *Iter) Seek(offset int64, whence int) (int64, error) {
67	var abs int64
68	switch whence {
69	case 0:
70		abs = offset
71	case 1:
72		abs = int64(i.p) + offset
73	case 2:
74		abs = int64(i.rb.nsrc) + offset
75	default:
76		return 0, fmt.Errorf("norm: invalid whence")
77	}
78	if abs < 0 {
79		return 0, fmt.Errorf("norm: negative position")
80	}
81	if int(abs) >= i.rb.nsrc {
82		i.setDone()
83		return int64(i.p), nil
84	}
85	i.p = int(abs)
86	i.multiSeg = nil
87	i.next = i.rb.f.nextMain
88	i.info = i.rb.f.info(i.rb.src, i.p)
89	i.rb.ss.first(i.info)
90	return abs, nil
91}
92
93// returnSlice returns a slice of the underlying input type as a byte slice.
94// If the underlying is of type []byte, it will simply return a slice.
95// If the underlying is of type string, it will copy the slice to the buffer
96// and return that.
97func (i *Iter) returnSlice(a, b int) []byte {
98	if i.rb.src.bytes == nil {
99		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
100	}
101	return i.rb.src.bytes[a:b]
102}
103
104// Pos returns the byte position at which the next call to Next will commence processing.
105func (i *Iter) Pos() int {
106	return i.p
107}
108
109func (i *Iter) setDone() {
110	i.next = nextDone
111	i.p = i.rb.nsrc
112}
113
114// Done returns true if there is no more input to process.
115func (i *Iter) Done() bool {
116	return i.p >= i.rb.nsrc
117}
118
119// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
120// For any input a and b for which f(a) == f(b), subsequent calls
121// to Next will return the same segments.
122// Modifying runes are grouped together with the preceding starter, if such a starter exists.
123// Although not guaranteed, n will typically be the smallest possible n.
124func (i *Iter) Next() []byte {
125	return i.next(i)
126}
127
128func nextASCIIBytes(i *Iter) []byte {
129	p := i.p + 1
130	if p >= i.rb.nsrc {
131		i.setDone()
132		return i.rb.src.bytes[i.p:p]
133	}
134	if i.rb.src.bytes[p] < utf8.RuneSelf {
135		p0 := i.p
136		i.p = p
137		return i.rb.src.bytes[p0:p]
138	}
139	i.info = i.rb.f.info(i.rb.src, i.p)
140	i.next = i.rb.f.nextMain
141	return i.next(i)
142}
143
144func nextASCIIString(i *Iter) []byte {
145	p := i.p + 1
146	if p >= i.rb.nsrc {
147		i.buf[0] = i.rb.src.str[i.p]
148		i.setDone()
149		return i.buf[:1]
150	}
151	if i.rb.src.str[p] < utf8.RuneSelf {
152		i.buf[0] = i.rb.src.str[i.p]
153		i.p = p
154		return i.buf[:1]
155	}
156	i.info = i.rb.f.info(i.rb.src, i.p)
157	i.next = i.rb.f.nextMain
158	return i.next(i)
159}
160
161func nextHangul(i *Iter) []byte {
162	p := i.p
163	next := p + hangulUTF8Size
164	if next >= i.rb.nsrc {
165		i.setDone()
166	} else if i.rb.src.hangul(next) == 0 {
167		i.rb.ss.next(i.info)
168		i.info = i.rb.f.info(i.rb.src, i.p)
169		i.next = i.rb.f.nextMain
170		return i.next(i)
171	}
172	i.p = next
173	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
174}
175
176func nextDone(i *Iter) []byte {
177	return nil
178}
179
180// nextMulti is used for iterating over multi-segment decompositions
181// for decomposing normal forms.
182func nextMulti(i *Iter) []byte {
183	j := 0
184	d := i.multiSeg
185	// skip first rune
186	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
187	}
188	for j < len(d) {
189		info := i.rb.f.info(input{bytes: d}, j)
190		if info.BoundaryBefore() {
191			i.multiSeg = d[j:]
192			return d[:j]
193		}
194		j += int(info.size)
195	}
196	// treat last segment as normal decomposition
197	i.next = i.rb.f.nextMain
198	return i.next(i)
199}
200
201// nextMultiNorm is used for iterating over multi-segment decompositions
202// for composing normal forms.
203func nextMultiNorm(i *Iter) []byte {
204	j := 0
205	d := i.multiSeg
206	for j < len(d) {
207		info := i.rb.f.info(input{bytes: d}, j)
208		if info.BoundaryBefore() {
209			i.rb.compose()
210			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
211			i.rb.insertUnsafe(input{bytes: d}, j, info)
212			i.multiSeg = d[j+int(info.size):]
213			return seg
214		}
215		i.rb.insertUnsafe(input{bytes: d}, j, info)
216		j += int(info.size)
217	}
218	i.multiSeg = nil
219	i.next = nextComposed
220	return doNormComposed(i)
221}
222
223// nextDecomposed is the implementation of Next for forms NFD and NFKD.
224func nextDecomposed(i *Iter) (next []byte) {
225	outp := 0
226	inCopyStart, outCopyStart := i.p, 0
227	for {
228		if sz := int(i.info.size); sz <= 1 {
229			i.rb.ss = 0
230			p := i.p
231			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
232			if i.p >= i.rb.nsrc {
233				i.setDone()
234				return i.returnSlice(p, i.p)
235			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
236				i.next = i.asciiF
237				return i.returnSlice(p, i.p)
238			}
239			outp++
240		} else if d := i.info.Decomposition(); d != nil {
241			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
242			// Case 1: there is a leftover to copy.  In this case the decomposition
243			// must begin with a modifier and should always be appended.
244			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
245			p := outp + len(d)
246			if outp > 0 {
247				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
248				// TODO: this condition should not be possible, but we leave it
249				// in for defensive purposes.
250				if p > len(i.buf) {
251					return i.buf[:outp]
252				}
253			} else if i.info.multiSegment() {
254				// outp must be 0 as multi-segment decompositions always
255				// start a new segment.
256				if i.multiSeg == nil {
257					i.multiSeg = d
258					i.next = nextMulti
259					return nextMulti(i)
260				}
261				// We are in the last segment.  Treat as normal decomposition.
262				d = i.multiSeg
263				i.multiSeg = nil
264				p = len(d)
265			}
266			prevCC := i.info.tccc
267			if i.p += sz; i.p >= i.rb.nsrc {
268				i.setDone()
269				i.info = Properties{} // Force BoundaryBefore to succeed.
270			} else {
271				i.info = i.rb.f.info(i.rb.src, i.p)
272			}
273			switch i.rb.ss.next(i.info) {
274			case ssOverflow:
275				i.next = nextCGJDecompose
276				fallthrough
277			case ssStarter:
278				if outp > 0 {
279					copy(i.buf[outp:], d)
280					return i.buf[:p]
281				}
282				return d
283			}
284			copy(i.buf[outp:], d)
285			outp = p
286			inCopyStart, outCopyStart = i.p, outp
287			if i.info.ccc < prevCC {
288				goto doNorm
289			}
290			continue
291		} else if r := i.rb.src.hangul(i.p); r != 0 {
292			outp = decomposeHangul(i.buf[:], r)
293			i.p += hangulUTF8Size
294			inCopyStart, outCopyStart = i.p, outp
295			if i.p >= i.rb.nsrc {
296				i.setDone()
297				break
298			} else if i.rb.src.hangul(i.p) != 0 {
299				i.next = nextHangul
300				return i.buf[:outp]
301			}
302		} else {
303			p := outp + sz
304			if p > len(i.buf) {
305				break
306			}
307			outp = p
308			i.p += sz
309		}
310		if i.p >= i.rb.nsrc {
311			i.setDone()
312			break
313		}
314		prevCC := i.info.tccc
315		i.info = i.rb.f.info(i.rb.src, i.p)
316		if v := i.rb.ss.next(i.info); v == ssStarter {
317			break
318		} else if v == ssOverflow {
319			i.next = nextCGJDecompose
320			break
321		}
322		if i.info.ccc < prevCC {
323			goto doNorm
324		}
325	}
326	if outCopyStart == 0 {
327		return i.returnSlice(inCopyStart, i.p)
328	} else if inCopyStart < i.p {
329		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
330	}
331	return i.buf[:outp]
332doNorm:
333	// Insert what we have decomposed so far in the reorderBuffer.
334	// As we will only reorder, there will always be enough room.
335	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
336	i.rb.insertDecomposed(i.buf[0:outp])
337	return doNormDecomposed(i)
338}
339
340func doNormDecomposed(i *Iter) []byte {
341	for {
342		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
343		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
344			i.setDone()
345			break
346		}
347		i.info = i.rb.f.info(i.rb.src, i.p)
348		if i.info.ccc == 0 {
349			break
350		}
351		if s := i.rb.ss.next(i.info); s == ssOverflow {
352			i.next = nextCGJDecompose
353			break
354		}
355	}
356	// new segment or too many combining characters: exit normalization
357	return i.buf[:i.rb.flushCopy(i.buf[:])]
358}
359
360func nextCGJDecompose(i *Iter) []byte {
361	i.rb.ss = 0
362	i.rb.insertCGJ()
363	i.next = nextDecomposed
364	i.rb.ss.first(i.info)
365	buf := doNormDecomposed(i)
366	return buf
367}
368
369// nextComposed is the implementation of Next for forms NFC and NFKC.
370func nextComposed(i *Iter) []byte {
371	outp, startp := 0, i.p
372	var prevCC uint8
373	for {
374		if !i.info.isYesC() {
375			goto doNorm
376		}
377		prevCC = i.info.tccc
378		sz := int(i.info.size)
379		if sz == 0 {
380			sz = 1 // illegal rune: copy byte-by-byte
381		}
382		p := outp + sz
383		if p > len(i.buf) {
384			break
385		}
386		outp = p
387		i.p += sz
388		if i.p >= i.rb.nsrc {
389			i.setDone()
390			break
391		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
392			i.rb.ss = 0
393			i.next = i.asciiF
394			break
395		}
396		i.info = i.rb.f.info(i.rb.src, i.p)
397		if v := i.rb.ss.next(i.info); v == ssStarter {
398			break
399		} else if v == ssOverflow {
400			i.next = nextCGJCompose
401			break
402		}
403		if i.info.ccc < prevCC {
404			goto doNorm
405		}
406	}
407	return i.returnSlice(startp, i.p)
408doNorm:
409	// reset to start position
410	i.p = startp
411	i.info = i.rb.f.info(i.rb.src, i.p)
412	i.rb.ss.first(i.info)
413	if i.info.multiSegment() {
414		d := i.info.Decomposition()
415		info := i.rb.f.info(input{bytes: d}, 0)
416		i.rb.insertUnsafe(input{bytes: d}, 0, info)
417		i.multiSeg = d[int(info.size):]
418		i.next = nextMultiNorm
419		return nextMultiNorm(i)
420	}
421	i.rb.ss.first(i.info)
422	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
423	return doNormComposed(i)
424}
425
426func doNormComposed(i *Iter) []byte {
427	// First rune should already be inserted.
428	for {
429		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
430			i.setDone()
431			break
432		}
433		i.info = i.rb.f.info(i.rb.src, i.p)
434		if s := i.rb.ss.next(i.info); s == ssStarter {
435			break
436		} else if s == ssOverflow {
437			i.next = nextCGJCompose
438			break
439		}
440		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
441	}
442	i.rb.compose()
443	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
444	return seg
445}
446
447func nextCGJCompose(i *Iter) []byte {
448	i.rb.ss = 0 // instead of first
449	i.rb.insertCGJ()
450	i.next = nextComposed
451	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
452	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
453	// If we ever change that, insert a check here.
454	i.rb.ss.first(i.info)
455	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
456	return doNormComposed(i)
457}
458