// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tokenizer converts a text into a stream of tokens.
package tokenizer

import (
	"bytes"
	"fmt"
	"hash/crc32"
	"sort"
	"unicode"
	"unicode/utf8"
)

// token is a non-whitespace sequence (i.e., a word or a punctuation mark) in
// the original string. It is not meant for use outside of this package.
type token struct {
	Text   string
	Offset int
}

// Tokens is a list of token objects.
type Tokens []*token

// newToken creates a new token object with an invalid (negative) offset,
// which will be set before the token is used.
func newToken() *token {
	return &token{Offset: -1}
}

// Tokenize converts a string into a stream of tokens.
func Tokenize(s string) (toks Tokens) {
	tok := newToken()
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		switch {
		case unicode.IsSpace(r):
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
		case unicode.IsPunct(r):
			if tok.Offset >= 0 {
				toks = append(toks, tok)
				tok = newToken()
			}
			toks = append(toks, &token{
				Text:   string(r),
				Offset: i,
			})
		default:
			if tok.Offset == -1 {
				tok.Offset = i
			}
			tok.Text += string(r)
		}
		i += size
	}
	if tok.Offset != -1 {
		// Add any remaining token that wasn't yet included in the list.
		toks = append(toks, tok)
	}
	return toks
}

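// exampleTokenize is an illustrative sketch, not part of the original
// package: it shows the token stream Tokenize produces. Whitespace never
// yields a token, and each punctuation rune becomes its own single-rune
// token at its byte offset.
func exampleTokenize() {
	for _, tok := range Tokenize("Hello, world") {
		fmt.Printf("%q at byte offset %d\n", tok.Text, tok.Offset)
	}
	// Prints:
	//	"Hello" at byte offset 0
	//	"," at byte offset 5
	//	"world" at byte offset 7
}
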
// GenerateHashes generates hashes for substrings of "size" tokens. The
// "stringifyTokens" call is expensive, so we don't hash every possible
// substring: the window advances by size/2 tokens at a time, skipping the
// smaller intermediate substrings.
func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) {
	if size == 0 {
		return nil, nil
	}

	var css []uint32
	var tr TokenRanges
	for offset := 0; offset+size <= len(t); offset += size / 2 {
		var b bytes.Buffer
		t.stringifyTokens(&b, offset, size)
		cs := crc32.ChecksumIEEE(b.Bytes())
		css = append(css, cs)
		tr = append(tr, &TokenRange{offset, offset + size})
		h.add(cs, offset, offset+size)
		if size <= 1 {
			// A step of size/2 would be zero; avoid an infinite loop.
			break
		}
	}

	return css, tr
}

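// exampleGenerateHashes is a hypothetical sketch, not part of the original
// source: it shows how GenerateHashes slides a window of "size" tokens in
// steps of size/2, so consecutive windows overlap by half.
func exampleGenerateHashes() {
	toks := Tokenize("a b c d e f") // 6 tokens
	h := make(Hash)
	hashes, ranges := toks.GenerateHashes(h, 4)
	// With 6 tokens and size 4, windows start at offsets 0 and 2, so we
	// get two checksums covering tokens [0, 4) and [2, 6).
	fmt.Println(len(hashes), ranges) // 2 [[0, 4) [2, 6)]
}
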
// stringifyTokens serializes a sublist of tokens into a bytes buffer,
// separating consecutive tokens with a single space.
func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) {
	for j := offset; j < offset+size; j++ {
		if j != offset {
			b.WriteRune(' ')
		}
		b.WriteString(t[j].Text)
	}
}

// TokenRange indicates the range of tokens that map to a particular
// checksum. The range is half-open: [Start, End).
type TokenRange struct {
	Start int
	End   int
}

func (t *TokenRange) String() string {
	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
}

// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small, but unfortunately isn't
// zero, so we use this instead of assuming that checksums are unique.
type TokenRanges []*TokenRange

func (t TokenRanges) Len() int           { return len(t) }
func (t TokenRanges) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start }

// CombineUnique merges two lists of token ranges, returning a sorted list
// with exact duplicates removed.
func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges {
	if len(other) == 0 {
		return t
	}
	if len(t) == 0 {
		return other
	}

	// Copy both lists into a fresh slice so that sorting cannot reorder
	// the backing array of the receiver or of the argument.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)
	sort.Sort(cu)

	res := TokenRanges{cu[0]}
	for prev, i := cu[0], 1; i < len(cu); i++ {
		if prev.Start != cu[i].Start || prev.End != cu[i].End {
			res = append(res, cu[i])
			prev = cu[i]
		}
	}
	return res
}

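// exampleCombineUnique is a hypothetical sketch, not part of the original
// source: merging two TokenRanges yields a list sorted by Start in which
// exact duplicates appear only once.
func exampleCombineUnique() {
	a := TokenRanges{{Start: 0, End: 4}, {Start: 2, End: 6}}
	b := TokenRanges{{Start: 2, End: 6}, {Start: 4, End: 8}}
	fmt.Println(a.CombineUnique(b)) // [[0, 4) [2, 6) [4, 8)]
}
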
// Hash is a map from the checksum of a section of text to the token ranges
// covering that text.
type Hash map[uint32]TokenRanges

// add associates the half-open token range [start, end) with a checksum.
func (h Hash) add(checksum uint32, start, end int) {
	ntr := &TokenRange{Start: start, End: end}
	if r, ok := h[checksum]; ok {
		for _, tr := range r {
			if tr.Start == ntr.Start && tr.End == ntr.End {
				// The token range already exists at this
				// checksum. No need to re-add it.
				return
			}
		}
	}
	h[checksum] = append(h[checksum], ntr)
}

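// exampleHashAdd is a hypothetical sketch, not part of the original source:
// it shows why Hash maps a checksum to a list of ranges. Distinct token
// ranges can collide on the same CRC-32 value, and add keeps every distinct
// range while ignoring exact repeats.
func exampleHashAdd() {
	h := make(Hash)
	h.add(0xdeadbeef, 0, 4)
	h.add(0xdeadbeef, 0, 4) // exact repeat: ignored
	h.add(0xdeadbeef, 2, 6) // same checksum, different range: kept
	fmt.Println(h[0xdeadbeef]) // [[0, 4) [2, 6)]
}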