1// Copyright 2017 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// Package tokenizer converts a text into a stream of tokens. 16package tokenizer 17 18import ( 19 "bytes" 20 "fmt" 21 "hash/crc32" 22 "sort" 23 "unicode" 24 "unicode/utf8" 25) 26 27// Token is a non-whitespace sequence (i.e., word or punctuation) in the 28// original string. This is not meant for use outside of this package. 29type token struct { 30 Text string 31 Offset int 32} 33 34// Tokens is a list of Token objects. 35type Tokens []*token 36 37// newToken creates a new token object with an invalid (negative) offset, which 38// will be set before the token's used. 39func newToken() *token { 40 return &token{Offset: -1} 41} 42 43// Tokenize converts a string into a stream of tokens. 44func Tokenize(s string) (toks Tokens) { 45 tok := newToken() 46 for i := 0; i < len(s); { 47 r, size := utf8.DecodeRuneInString(s[i:]) 48 switch { 49 case unicode.IsSpace(r): 50 if tok.Offset >= 0 { 51 toks = append(toks, tok) 52 tok = newToken() 53 } 54 case unicode.IsPunct(r): 55 if tok.Offset >= 0 { 56 toks = append(toks, tok) 57 tok = newToken() 58 } 59 toks = append(toks, &token{ 60 Text: string(r), 61 Offset: i, 62 }) 63 default: 64 if tok.Offset == -1 { 65 tok.Offset = i 66 } 67 tok.Text += string(r) 68 } 69 i += size 70 } 71 if tok.Offset != -1 { 72 // Add any remaining token that wasn't yet included in the list. 73 toks = append(toks, tok) 74 } 75 return toks 76} 77 78// GenerateHashes generates hashes for "size" length substrings. The 79// "stringifyTokens" call takes a long time to run, so not all substrings have 80// hashes, i.e. we skip some of the smaller substrings. 81func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) { 82 if size == 0 { 83 return nil, nil 84 } 85 86 var css []uint32 87 var tr TokenRanges 88 for offset := 0; offset+size <= len(t); offset += size / 2 { 89 var b bytes.Buffer 90 t.stringifyTokens(&b, offset, size) 91 cs := crc32.ChecksumIEEE(b.Bytes()) 92 css = append(css, cs) 93 tr = append(tr, &TokenRange{offset, offset + size}) 94 h.add(cs, offset, offset+size) 95 if size <= 1 { 96 break 97 } 98 } 99 100 return css, tr 101} 102 103// stringifyTokens serializes a sublist of tokens into a bytes buffer. 104func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) { 105 for j := offset; j < offset+size; j++ { 106 if j != offset { 107 b.WriteRune(' ') 108 } 109 b.WriteString(t[j].Text) 110 } 111} 112 113// TokenRange indicates the range of tokens that map to a particular checksum. 114type TokenRange struct { 115 Start int 116 End int 117} 118 119func (t *TokenRange) String() string { 120 return fmt.Sprintf("[%v, %v)", t.Start, t.End) 121} 122 123// TokenRanges is a list of TokenRange objects. The chance that two different 124// strings map to the same checksum is very small, but unfortunately isn't 125// zero, so we use this instead of making the assumption that they will all be 126// unique. 127type TokenRanges []*TokenRange 128 129func (t TokenRanges) Len() int { return len(t) } 130func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] } 131func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start } 132 133// CombineUnique returns the combination of both token ranges with no duplicates. 134func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges { 135 if len(other) == 0 { 136 return t 137 } 138 if len(t) == 0 { 139 return other 140 } 141 142 cu := append(t, other...) 143 sort.Sort(cu) 144 145 if len(cu) == 0 { 146 return nil 147 } 148 149 res := TokenRanges{cu[0]} 150 for prev, i := cu[0], 1; i < len(cu); i++ { 151 if prev.Start != cu[i].Start || prev.End != cu[i].End { 152 res = append(res, cu[i]) 153 prev = cu[i] 154 } 155 } 156 return res 157} 158 159// Hash is a map of the hashes of a section of text to the token range covering that text. 160type Hash map[uint32]TokenRanges 161 162// add associates a token range, [start, end], to a checksum. 163func (h Hash) add(checksum uint32, start, end int) { 164 ntr := &TokenRange{Start: start, End: end} 165 if r, ok := h[checksum]; ok { 166 for _, tr := range r { 167 if tr.Start == ntr.Start && tr.End == ntr.End { 168 // The token range already exists at this 169 // checksum. No need to re-add it. 170 return 171 } 172 } 173 } 174 h[checksum] = append(h[checksum], ntr) 175} 176