// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"html"
	"io"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

var eol = "\n"

// header reports whether in looks like a list marker or section heading,
// such as "1.", "1.2.3", "a)", or "ii:".
func header(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] {
			if e != ')' {
				return true
			}
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, marker := range strings.Split(allListMarkers, " ") {
		l[marker] = true
	}
	return l
}()

// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream will never return an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// an error.
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
	const bufSize = 1024
	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
	// in the buffer to ensure we never run out of bytes trying to finish
	// constructing a rune. These leftover 4 bytes will be copied to the start of
	// the buffer before additional bytes are read.
	tgt := bufSize - 4

	rbuf := make([]byte, bufSize)
	obuf := make([]byte, 0)
	linebuf := make([]tokenID, 0)
	idx := 0
	line := 1 // 1-based count
	deferredEOL := false
	deferredWord := false
	// The tokenizer uses a local dictionary to conserve memory while analyzing
	// the input doc and to avoid polluting the global dictionary.
	ld := newDictionary()

	var doc indexedDocument

	isEOF := func(in error) bool {
		return in == io.EOF || in == io.ErrUnexpectedEOF
	}

	// Read out the stream in chunks
	for {
		// Fill up the buffer with bytes to extract runes from.
		// idx is offset to hold any bytes left over from previous reads.
		n, err := io.ReadFull(src, rbuf[idx:])
		if isEOF(err) {
			// There are no more bytes to read, so we must now consume all bytes
			// remaining in the buffer.
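			// idx bytes were carried over from the previous read and n fresh bytes
			// were just read, so the valid data ends at rbuf[idx+n].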
			tgt = idx + n
		} else if err != nil {
			return nil, err
		}

		for idx = 0; idx < tgt; {
			r, n := utf8.DecodeRune(rbuf[idx:])
			idx += n

			if r == '\n' {
				// Deal with the end of a line.

				// If we are in a word (len(obuf) > 0) and the last rune is a '-',
				// strike that rune and keep accumulating.
				// Otherwise we treat it like a space and flush the word.

				if len(obuf) > 0 {
					if obuf[len(obuf)-1] == '-' {
						obuf = obuf[0 : len(obuf)-1]
						deferredEOL = true
						continue
					}

					// Append the word fragment to the line buffer
					linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
				}

				// If there is something in the line to process, do so now
				if len(linebuf) > 0 {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					obuf = nil
				}
				if !normalize {
					tokID := dict.getIndex(eol)
					if tokID == unknownIndex {
						tokID = dict.add(eol)
					}
					doc.Tokens = append(doc.Tokens, indexedToken{
						ID:   tokID,
						Line: line})
				}
				line++
				continue
			}

			if len(obuf) == 0 {
				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
					// A number or word character starts an interesting word.
					// Now we slurp up all non-space runes and aggregate them as
					// a single word.

					// Buffer the initial token, normalizing to lower case if needed
					if normalize {
						r = unicode.ToLower(r)
					}
					obuf = utf8.AppendRune(obuf, r)
				}
				continue
			}

			// At this point, len(obuf) > 0 and we are accumulating more runes
			// to complete a word.
			if unicode.IsSpace(r) {
				// If we have a deferred EOL, we need to pick up a non-space character
				// to resume the hyphenated word, so we just consume spaces until that
				// happens.
				if deferredEOL {
					continue
				}

				// This is a space between word characters, so we assemble the word as a
				// token and flush it out.
				idx -= n

				linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
				if deferredWord {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
					linebuf = nil
					deferredWord = false
					// Increment the line count now so the remainder token is credited
					// to the previous line number.
					line++
				}
				obuf = make([]byte, 0)
				continue
			}

			if deferredEOL {
				deferredEOL = false
				deferredWord = true
			}
			// Perform token mappings for punctuation to emulate
			// normalizePunctuation. This returns a string, and each rune needs to be
			// injected.
			if rep, found := punctuationMappings[r]; found {
				for _, t := range rep {
					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
				}
				continue
			}

			// If it's not punctuation, lowercase and buffer the rune.
			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
		}

		// Break out if we have consumed all read bytes
		if isEOF(err) {
			break
		}

		// Copy the unconsumed bytes at the end of the buffer to the start
		// of the buffer so the next read appends after them.
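		// Because the scan loop stops 4 bytes short of the end of rbuf, at most
		// 4 unconsumed bytes remain to be carried over here.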
		n = copy(rbuf, rbuf[idx:])
		idx = n
	}

	// Process the remaining bytes in the buffer
	if len(obuf) > 0 {
		linebuf = append(linebuf, flushBuf(len(linebuf), obuf, normalize, ld))
	}
	if len(linebuf) > 0 {
		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict, linebuf)
	}

	doc.dict = dict
	doc.generateFrequencies()
	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
	doc.Norm = doc.normalized()
	return &doc, nil
}

// appendToDoc stringifies the line buffer and appends the resulting tokens to
// doc. If the line matches one of the ignorableTexts patterns, a Copyright
// match is recorded instead.
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool, linebuf []tokenID) {
	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
	if tokens != nil {
		doc.Tokens = append(doc.Tokens, tokens...)
	} else if m != nil {
		doc.Matches = append(doc.Matches, m)
	}
}

// stringifyLineBuf resolves the tokens of a line against the local dictionary,
// returning either the cleaned-up indexed tokens for the line or a Copyright
// match if the line is ignorable.
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
	if len(in) == 0 {
		return nil, nil
	}
	var sb strings.Builder
	for i, r := range in {
		out := ld.getWord(r)
		if out == "" {
			continue
		}
		sb.WriteString(out)
		if i < len(in)-1 {
			sb.WriteByte(' ')
		}
	}

	out := sb.String()

	for _, re := range ignorableTexts {
		if re.MatchString(out) {
			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
		}
	}

	var tokens []indexedToken
	for i, r := range in {
		txt := cleanupToken(i, ld.getWord(r), normalize)
		if txt != "" {
			var tokID tokenID
			if updateDict {
				tokID = dict.add(txt)
			} else {
				tokID = dict.getIndex(txt)
			}
			tokens = append(tokens, indexedToken{
				Line: line,
				ID:   tokID,
			})
		}
	}

	return tokens, nil
}

// normalizeToken performs some preprocessing on the token. It differs from
// cleanupToken in that fixups here are not an exact match on the token.
// Normalizing URLs from https to http is an example of a fix applied here.
func normalizeToken(in string) string {
	return strings.ReplaceAll(in, "https", "http")
}

// flushBuf converts the accumulated rune buffer into a normalized token,
// registers it in the local dictionary, and returns its token ID.
func flushBuf(pos int, obuf []byte, normalizeWord bool, ld *dictionary) tokenID {
	// Clean up the contents of the rune buffer. HTML escape sequences can occur
	// anywhere in the string, not just the beginning, so always attempt to
	// unescape the word's content.
	token := string(obuf)
	token = html.UnescapeString(token)

	clean := normalizeToken(token)

	return ld.add(clean)
}

// cleanupToken strips a token down to the characters that are significant for
// matching, discarding list-marker headers at the start of a line and
// optionally applying word normalization.
func cleanupToken(pos int, in string, normalizeWord bool) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if pos == 0 && header(in) {
		return ""
	}

	if !unicode.IsLetter(r) {
		if unicode.IsDigit(r) {
			// Based on analysis of the license corpus, the characters that are
			// significant are numbers, periods, and dashes. Anything else can be
			// safely discarded, and helps avoid matching failures due to inconsistent
			// whitespacing and formatting.
			for _, c := range in {
				if unicode.IsDigit(c) || c == '.' || c == '-' {
					out.WriteRune(c)
				}
			}

			// Numbers should not end in a '.' since that doesn't indicate a version
			// number, but usually the end of a line.
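			// Trim repeatedly so a token like "1.2.." reduces to "1.2".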
			res := out.String()
			for strings.HasSuffix(res, ".") {
				res = res[0 : len(res)-1]
			}
			return res
		}
	}

	// Remove internal hyphenation or URL constructs to better normalize strings
	// for matching.
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}

	tok := out.String()
	if !normalizeWord {
		return tok
	}

	if iw, ok := interchangeableWords[tok]; ok && normalizeWord {
		return iw
	}
	return tok
}

// interchangeableWords maps alternate spellings (mostly British English) to a
// single canonical form so that equivalent license texts tokenize identically.
var interchangeableWords = map[string]string{
	"analyse":         "analyze",
	"artefact":        "artifact",
	"authorisation":   "authorization",
	"authorised":      "authorized",
	"calibre":         "caliber",
	"cancelled":       "canceled",
	"capitalisations": "capitalizations",
	"catalogue":       "catalog",
	"categorise":      "categorize",
	"centre":          "center",
	"emphasised":      "emphasized",
	"favour":          "favor",
	"favourite":       "favorite",
	"fulfil":          "fulfill",
	"fulfilment":      "fulfillment",
	"https":           "http",
	"initialise":      "initialize",
	"judgment":        "judgement",
	"labelling":       "labeling",
	"labour":          "labor",
	"licence":         "license",
	"maximise":        "maximize",
	"modelled":        "modeled",
	"modelling":       "modeling",
	"offence":         "offense",
	"optimise":        "optimize",
	"organisation":    "organization",
	"organise":        "organize",
	"practise":        "practice",
	"programme":       "program",
	"realise":         "realize",
	"recognise":       "recognize",
	"signalling":      "signaling",
	"utilisation":     "utilization",
	"whilst":          "while",
	"wilful":          "wilfull",
	// TODO: These three need tokenizer magic
	"non commercial": "noncommercial",
	"per cent":       "percent",
	"sub license":    "sublicense",
}

// punctuationMappings expands or normalizes punctuation runes encountered
// inside a word, emulating normalizePunctuation.
var punctuationMappings = map[rune]string{
	'-': "-",
	'‒': "-",
	'–': "-",
	'—': "-",
	'‐': "-",
	'©': "(c)",
	'§': "(s)",
	'¤': "(s)",
	'·': " ",
	'*': " ",
}
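
// tokenizeText is an illustrative sketch (not part of the original API)
// showing how the tokenizer above is typically driven: a caller supplies a
// shared dictionary and lets tokenizeStream populate it while indexing the
// text.
func tokenizeText(text string, dict *dictionary) (*indexedDocument, error) {
	// normalize=true lowercases tokens and applies interchangeableWords;
	// updateDict=true lets previously unseen tokens be added to dict.
	return tokenizeStream(strings.NewReader(text), true, dict, true)
}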