// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package classifier provides the implementation of the v2 license classifier.
package classifier

import (
	"bytes"
	"fmt"
	"os"
	"strings"
)

type tokenID int // type to ensure safety when manipulating token identifiers.

// token provides detailed information about a single textual token in the document.
type token struct {
	Text     string // normalized text of the token
	Line     int    // line position of this token in the source
	Previous string // for the first token in a line, any previous text.
}

type indexedToken struct {
	Line int     // line position of this token in the source
	ID   tokenID // identifier of the text in the dictionary
}

type indexedDocument struct {
	Norm    string          // The normalized token sequence
	Tokens  []indexedToken  // ordered tokens of the document
	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
	f       *frequencyTable // frequencies computed for this document
	dict    *dictionary     // The corpus dictionary for this document
	s       *searchSet      // The searchset for this document
	runes   []rune
}

func (d *indexedDocument) generateSearchSet(q int) {
	d.s = newSearchSet(d, q)
}

func (d *indexedDocument) size() int {
	return len(d.Tokens)
}

// normalized returns a string of the normalized tokens concatenated with a
// single space. This is used by the diff algorithm.
// TODO: it'd be more efficient to have the diff algorithm work with the raw tokens directly
// and avoid these ephemeral allocations.
func (d *indexedDocument) normalized() string {
	var w strings.Builder
	for i, t := range d.Tokens {
		w.WriteString(d.dict.getWord(t.ID))
		if (i + 1) != d.size() {
			w.WriteString(" ")
		}
	}
	return w.String()
}
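
// exampleNormalized is an illustrative sketch of the round trip described
// above (the words interned here are hypothetical, not taken from a real
// license): three tokens interned into a fresh dictionary come back from
// normalized() as a single space-joined string with no trailing space.
func exampleNormalized() string {
	dict := newDictionary()
	var toks []indexedToken
	for _, w := range []string{"permission", "is", "granted"} {
		toks = append(toks, indexedToken{Line: 1, ID: dict.add(w)})
	}
	doc := &indexedDocument{Tokens: toks, dict: dict}
	return doc.normalized() // "permission is granted"
}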

func computeQ(threshold float64) int {
	// q is the lower bound for token runs (q-grams) that must exist
	// in content that can be recognized at the specified threshold.
	// Imagine a document with 100 tokens and a threshold of 80%. In the
	// worst case, the 20 errors are evenly distributed so as to create the
	// shortest possible token runs: a repeating sequence of 4 good tokens
	// and 1 errored token, occurring 20 times. This function returns that
	// minimum run length, with a floor of 1 (a threshold below 50% would
	// otherwise produce a run length of 0, which is meaningless).
	if threshold == 1.0 {
		return 10 // avoid divide by 0
	}

	return max(1, int((threshold)/(1.0-threshold)))
}
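
// A few illustrative values for computeQ, derived from the formula above:
// an 80% threshold tolerates one error every 4 tokens, a 90% threshold one
// every 9, and thresholds at or below 50% collapse to the floor of 1.
//
//	computeQ(0.8) // 4
//	computeQ(0.9) // 9
//	computeQ(0.5) // 1
//	computeQ(1.0) // 10 (special-cased to avoid dividing by zero)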

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
	// will never return an error so it's okay to ignore the return value in this
	// case.
	doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
	c.addDocument(category, name, variant, doc)
}
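
// exampleAddCorpus sketches typical corpus population, assuming a *Classifier
// has already been constructed elsewhere; the category/name/variant values
// and the license text are hypothetical placeholders.
func exampleAddCorpus(c *Classifier) {
	// Each corpus entry is keyed by category, name, and variant, so the same
	// license can be registered in multiple forms (e.g. full text and header).
	full := []byte("Licensed under the Apache License, Version 2.0 (the \"License\"); ...")
	header := []byte("Licensed under the Apache License, Version 2.0 ...")
	c.AddContent("License", "Apache-2.0", "license", full)
	c.AddContent("License", "Apache-2.0", "header", header)
}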

// addDocument takes a textual document and incorporates it into the classifier for matching.
func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
	// For documents that are part of the corpus, we add them to the dictionary and
	// compute their associated search data eagerly so they are ready for matching against
	// candidates.
	indexName := c.generateDocName(category, name, variant)
	id.generateSearchSet(c.q)
	id.s.origin = indexName
	c.docs[indexName] = id
}

// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
	doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
	return doc
}
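
// exampleIndexTarget sketches the target side of the flow, assuming the
// corpus has already been populated: scanned text is indexed against the
// existing dictionary without adding new words to it, so unrecognized words
// do not grow the corpus dictionary.
func exampleIndexTarget(c *Classifier, scanned []byte) int {
	target := c.createTargetIndexedDocument(scanned)
	return target.size() // number of normalized tokens in the target document
}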

func (c *Classifier) generateDocName(category, name, variant string) string {
	return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant)
}

func (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument {
	return c.docs[c.generateDocName(category, name, variant)]
}
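
// exampleLookupCorpusDoc sketches retrieval by the same hypothetical
// category/name/variant triple used in exampleAddCorpus above; the key is
// the string produced by generateDocName, e.g. "License/Apache-2.0/header"
// on platforms where os.PathSeparator is '/'.
func exampleLookupCorpusDoc(c *Classifier) *indexedDocument {
	return c.getIndexedDocument("License", "Apache-2.0", "header")
}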

// dictionary is used to intern all the token words encountered in the text corpus.
// words and indices form an inverse mapping relationship. It is just a convenience type
// over a pair of correlated maps.
type dictionary struct {
	words   map[tokenID]string
	indices map[string]tokenID
}

func newDictionary() *dictionary {
	return &dictionary{
		words:   make(map[tokenID]string),
		indices: make(map[string]tokenID),
	}
}

// add returns the token ID for the provided word, inserting the word into the
// dictionary first if it is not already present.
func (d *dictionary) add(word string) tokenID {
	if idx := d.getIndex(word); idx != unknownIndex {
		return idx
	}
	// token IDs start from 1; 0 is reserved for the invalid ID
	idx := tokenID(len(d.words) + 1)
	d.words[idx] = word
	d.indices[word] = idx
	return idx
}

var unknownWord = "UNKNOWN"
var unknownIndex = tokenID(0)

// getIndex returns the index of the supplied word, or 0 if the word is not in the dictionary.
func (d *dictionary) getIndex(word string) tokenID {
	if idx, found := d.indices[word]; found {
		return idx
	}
	return unknownIndex
}

// getWord returns the word associated with the index.
func (d *dictionary) getWord(index tokenID) string {
	if word, found := d.words[index]; found {
		return word
	}
	return unknownWord
}
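
// exampleDictionaryRoundTrip sketches the interning behavior described above
// (the words used are hypothetical): adding a word twice yields the same ID,
// looking up a word that was never added yields unknownIndex, and asking for
// an ID that was never assigned yields unknownWord.
func exampleDictionaryRoundTrip() {
	d := newDictionary()
	first := d.add("permission")  // new word, assigned the next free ID
	second := d.add("permission") // already interned, same ID returned
	fmt.Println(first == second)            // true
	fmt.Println(d.getWord(first))           // "permission"
	fmt.Println(d.getIndex("redistribute")) // 0 (unknownIndex)
	fmt.Println(d.getWord(tokenID(9999)))   // "UNKNOWN" (unknownWord)
}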