// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package classifier

import (
	"html"
	"io"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// eol is the token text used to represent an end-of-line when newlines are
// preserved (normalize == false).
var eol = "\n"

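// header reports whether in looks like a list-item or section marker: a
// known list marker followed by '.' or ':', or a pattern of digits and
// periods ending in '.', ':', or ')'. Illustrative results (hypothetical
// inputs, inferred from the logic below):
//
//	header("a.")     // true: list marker plus '.'
//	header("a)")     // false: ')' only closes digit-and-period markers
//	header("1.2.3.") // true: digits and periods before the final '.'
//	header("foo.")   // false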
func header(in string) bool {
	if len(in) == 0 {
		return false
	}
	p, e := in[:len(in)-1], in[len(in)-1]
	switch e {
	case '.', ':', ')':
		if listMarker[p] {
			if e != ')' {
				return true
			}
		}
		// Check for patterns like 1.2.3
		for _, r := range p {
			if unicode.IsDigit(r) || r == '.' {
				continue
			}
			return false
		}
		return true
	}
	return false
}

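// listMarker is the set of alphabetic and roman-numeral prefixes that header
// accepts as list markers.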
var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv"
	l := map[string]bool{}
	for _, marker := range strings.Split(allListMarkers, " ") {
		l[marker] = true
	}
	return l
}()

// ignorableTexts is a list of regexps matching lines, such as copyright
// notices, that we can remove from the input to get a cleaner match.
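//
// Illustrative matches (hypothetical inputs): the first pattern matches a
// line like "Copyright (c) 2020 Example Corp", and the last matches
// ISO-style dates such as "2020-01-02".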
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`),
	regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`),
	regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}

// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream never returns an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, it is safe to assume that tokenizeStream will not return
// one either.
func tokenizeStream(src io.Reader, normalize bool, dict *dictionary, updateDict bool) (*indexedDocument, error) {
	const bufSize = 1024
	// The longest UTF-8 encoded rune is 4 bytes, so we keep enough leftover bytes
	// in the buffer to ensure we never run out of bytes trying to finish
	// constructing a rune. These leftover 4 bytes will be copied to the start of
	// the buffer before additional bytes are read.
	tgt := bufSize - 4

	rbuf := make([]byte, bufSize)
	obuf := make([]byte, 0)
	linebuf := make([]tokenID, 0)
	idx := 0
	line := 1 // 1-based line count
	deferredEOL := false
	deferredWord := false
	// The tokenizer uses a local dictionary while analyzing the input doc to
	// conserve memory and avoid polluting the global dictionary.
	ld := newDictionary()

	var doc indexedDocument

	isEOF := func(in error) bool {
		return in == io.EOF || in == io.ErrUnexpectedEOF
	}

	// Read out the stream in chunks
	for {
		// Fill up the buffer with bytes to extract runes from.
		// idx is offset to hold any bytes left over from previous reads.
		n, err := io.ReadFull(src, rbuf[idx:])
		if isEOF(err) {
			// There are no more bytes to read, so we must now consume all bytes in the
			// buffer.
			tgt = idx + n
		} else if err != nil {
			return nil, err
		}

		for idx = 0; idx < tgt; {
			r, n := utf8.DecodeRune(rbuf[idx:])
			idx += n

			if r == '\n' {
				// Deal with the end of a line.
				//
				// If we are in a word (len(obuf) > 0) and the last rune is a
				// hyphen, strike that rune and keep accumulating: the word was
				// split across lines. Otherwise, we treat the newline like a
				// space and flush the word.
				if len(obuf) > 0 {
					if obuf[len(obuf)-1] == '-' {
						obuf = obuf[0 : len(obuf)-1]
						deferredEOL = true
						continue
					}

					// Append the word fragment to the line buffer
					linebuf = append(linebuf, flushBuf(obuf, ld))
				}

				// If there is something in the line to process, do so now
				if len(linebuf) > 0 {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict)
					linebuf = nil
					obuf = nil
				}
				if !normalize {
					tokID := dict.getIndex(eol)
					if tokID == unknownIndex {
						tokID = dict.add(eol)
					}
					doc.Tokens = append(doc.Tokens, indexedToken{
						ID:   tokID,
						Line: line})
				}
				line++
				continue
			}

			if len(obuf) == 0 {
				if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '&' || r == '(' {
					// A number or word character starts an interesting word;
					// we then slurp up all non-space runes and aggregate them
					// as a single word.

					// Buffer the initial rune, normalizing to lower case if needed
					if normalize {
						r = unicode.ToLower(r)
					}
					obuf = utf8.AppendRune(obuf, r)
				}
				continue
			}

			// At this point, len(obuf) > 0 and we are accumulating more runes
			// to complete a word.
			if unicode.IsSpace(r) {
				// If we have a deferred EOL, we need to pick up a non-space character
				// to resume the hyphenated word, so we just consume spaces until that
				// happens.
				if deferredEOL {
					continue
				}

				// This is a space between word characters, so we assemble the word as a
				// token and flush it out.
				idx -= n

				linebuf = append(linebuf, flushBuf(obuf, ld))
				if deferredWord {
					appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict)
					linebuf = nil
					deferredWord = false
					// Increment the line count now to account for the newline
					// that was deferred when the hyphen was struck; the
					// reassembled word has already been credited to the line
					// on which it started.
					line++
				}
				obuf = make([]byte, 0)
				continue
			}

			if deferredEOL {
				deferredEOL = false
				deferredWord = true
			}
			// Perform token mappings for punctuation to emulate
			// normalizePunctuation. The mapping produces a string, so each of
			// its runes needs to be injected individually.
			if rep, found := punctuationMappings[r]; found {
				for _, t := range rep {
					obuf = utf8.AppendRune(obuf, unicode.ToLower(t))
				}
				continue
			}

			// If it's not punctuation, lowercase and buffer the rune.
			obuf = utf8.AppendRune(obuf, unicode.ToLower(r))
		}

		// Break out if we have consumed all read bytes
		if isEOF(err) {
			break
		}

		// Copy the unconsumed bytes at the end of the buffer to the start
		// of the buffer so the next read appends after them.
		n = copy(rbuf, rbuf[idx:])
		idx = n
	}

	// Process the remaining bytes in the buffer
	if len(obuf) > 0 {
		linebuf = append(linebuf, flushBuf(obuf, ld))
	}
	if len(linebuf) > 0 {
		appendToDoc(&doc, dict, line, linebuf, ld, normalize, updateDict)
	}

	doc.dict = dict
	doc.generateFrequencies()
	doc.runes = diffWordsToRunes(&doc, 0, doc.size())
	doc.Norm = doc.normalized()
	return &doc, nil
}
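
// tokenizeStreamExample is a minimal usage sketch, not part of the package's
// API: the function name and sample input are hypothetical, for illustration
// only. It tokenizes an in-memory string against a fresh dictionary;
// normalize lowercases and canonicalizes tokens, and updateDict lets unseen
// tokens extend the dictionary.
func tokenizeStreamExample() (*indexedDocument, error) {
	d := newDictionary()
	r := strings.NewReader("Copyright (c) 2020 Example Corp\nsub-\nlicense text\n")
	return tokenizeStream(r, true, d, true)
}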

// appendToDoc stringifies the line buffer and appends the result to the
// document: either the line's cleaned-up tokens or, for an ignorable line
// such as a copyright notice, the Match it produced.
func appendToDoc(doc *indexedDocument, dict *dictionary, line int, linebuf []tokenID, ld *dictionary, normalize bool, updateDict bool) {
	tokens, m := stringifyLineBuf(dict, line, linebuf, ld, normalize, updateDict)
	if tokens != nil {
		doc.Tokens = append(doc.Tokens, tokens...)
	} else if m != nil {
		doc.Matches = append(doc.Matches, m)
	}
}

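// stringifyLineBuf reconstructs the text of a line from its token IDs in the
// local dictionary ld. If the line matches one of the ignorableTexts, it
// returns a Copyright Match instead of tokens; otherwise it returns the
// cleaned-up tokens resolved against (and optionally added to) the global
// dictionary.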
func stringifyLineBuf(dict *dictionary, line int, in []tokenID, ld *dictionary, normalize bool, updateDict bool) ([]indexedToken, *Match) {
	if len(in) == 0 {
		return nil, nil
	}
	var sb strings.Builder
	for i, r := range in {
		out := ld.getWord(r)
		if out == "" {
			continue
		}
		sb.WriteString(out)
		if i < len(in)-1 {
			sb.WriteByte(' ')
		}
	}

	out := sb.String()

	for _, re := range ignorableTexts {
		if re.MatchString(out) {
			return nil, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: line, EndLine: line}
		}
	}

	var tokens []indexedToken
	for i, r := range in {
		txt := cleanupToken(i, ld.getWord(r), normalize)
		if txt != "" {
			var tokID tokenID
			if updateDict {
				tokID = dict.add(txt)
			} else {
				tokID = dict.getIndex(txt)
			}
			tokens = append(tokens, indexedToken{
				Line: line,
				ID:   tokID,
			})
		}
	}

	return tokens, nil
}

// normalizeToken performs some preprocessing on the token. It differs from
// cleanupToken in that the fixups here are not exact matches on the whole
// token. Normalizing URLs from https to http is an example of a fix applied
// here.
func normalizeToken(in string) string {
	return strings.ReplaceAll(in, "https", "http")
}

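// flushBuf converts the accumulated rune buffer into a normalized token and
// interns it in the local dictionary, returning the token's ID.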
func flushBuf(obuf []byte, ld *dictionary) tokenID {
	// clean up the contents of the rune buffer
	token := string(obuf)
	// escape sequences can occur anywhere in the string, not just the beginning,
	// so always attempt to unescape the word's content.
	token = html.UnescapeString(token)

	clean := normalizeToken(token)

	return ld.add(clean)
}

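// cleanupToken canonicalizes a single token: a header token at the start of a
// line is dropped, numeric tokens keep only digits, periods, and dashes, and
// all other tokens keep only their letters, optionally mapped through
// interchangeableWords. Illustrative results (hypothetical inputs):
//
//	cleanupToken(0, "ii:", false)         // "" (header at line start)
//	cleanupToken(1, "1.2.3.", false)      // "1.2.3" (trailing '.' stripped)
//	cleanupToken(1, "sub-license", false) // "sublicense" (letters only)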
func cleanupToken(pos int, in string, normalizeWord bool) string {
	r, _ := utf8.DecodeRuneInString(in)
	var out strings.Builder
	if pos == 0 && header(in) {
		return ""
	}

	if !unicode.IsLetter(r) {
		if unicode.IsDigit(r) {
			// Based on analysis of the license corpus, the characters that are
			// significant are numbers, periods, and dashes. Anything else can be
			// safely discarded, and helps avoid matching failures due to inconsistent
			// whitespacing and formatting.
			for _, c := range in {
				if unicode.IsDigit(c) || c == '.' || c == '-' {
					out.WriteRune(c)
				}
			}

			// Numbers should not end in a '.' since that usually indicates the
			// end of a sentence rather than a version number.
			res := out.String()
			for strings.HasSuffix(res, ".") {
				res = res[0 : len(res)-1]
			}
			return res
		}
	}

	// Remove internal hyphenation or URL constructs to better normalize
	// strings for matching.
	for _, c := range in {
		if unicode.IsLetter(c) {
			out.WriteRune(c)
		}
	}

	tok := out.String()
	if !normalizeWord {
		return tok
	}

	if iw, ok := interchangeableWords[tok]; ok {
		return iw
	}
	return tok
}

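// interchangeableWords maps spelling variants, mostly British forms, to a
// single canonical form so that equivalent license texts tokenize
// identically.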
var interchangeableWords = map[string]string{
	"analyse":         "analyze",
	"artefact":        "artifact",
	"authorisation":   "authorization",
	"authorised":      "authorized",
	"calibre":         "caliber",
	"cancelled":       "canceled",
	"capitalisations": "capitalizations",
	"catalogue":       "catalog",
	"categorise":      "categorize",
	"centre":          "center",
	"emphasised":      "emphasized",
	"favour":          "favor",
	"favourite":       "favorite",
	"fulfil":          "fulfill",
	"fulfilment":      "fulfillment",
	"https":           "http",
	"initialise":      "initialize",
	"judgment":        "judgement",
	"labelling":       "labeling",
	"labour":          "labor",
	"licence":         "license",
	"maximise":        "maximize",
	"modelled":        "modeled",
	"modelling":       "modeling",
	"offence":         "offense",
	"optimise":        "optimize",
	"organisation":    "organization",
	"organise":        "organize",
	"practise":        "practice",
	"programme":       "program",
	"realise":         "realize",
	"recognise":       "recognize",
	"signalling":      "signaling",
	"utilisation":     "utilization",
	"whilst":          "while",
	"wilful":          "willful",
	// TODO: These three need tokenizer magic
	"non commercial": "noncommercial",
	"per cent":       "percent",
	"sub license":    "sublicense",
}

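// punctuationMappings folds typographic punctuation into a canonical ASCII
// form: the various Unicode dashes become '-', '©' becomes "(c)", '§' and
// '¤' become "(s)", and separator marks become spaces.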
var punctuationMappings = map[rune]string{
	'-': "-",
	'‒': "-",
	'–': "-",
	'—': "-",
	'‐': "-",
	'©': "(c)",
	'§': "(s)",
	'¤': "(s)",
	'·': " ",
	'*': " ",
}