// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package classifier provides the implementation of the v2 license classifier.
package classifier

import (
	"bytes"
	"fmt"
	"os"
	"strings"
)

type tokenID int // type to ensure safety when manipulating token identifiers

// token provides detailed information about a single textual token in the document.
type token struct {
	Text     string // normalized text of the token
	Line     int    // line position of this token in the source
	Previous string // for the first token in a line, any previous text
}

type indexedToken struct {
	Line int     // line position of this token in the source
	ID   tokenID // identifier of the text in the dictionary
}

type indexedDocument struct {
	Norm    string          // the normalized token sequence
	Tokens  []indexedToken  // ordered tokens of the document
	Matches Matches         // matches identified while processing the original, untokenized text via regexp matching
	f       *frequencyTable // frequencies computed for this document
	dict    *dictionary     // the corpus dictionary for this document
	s       *searchSet      // the search set for this document
	runes   []rune
}

func (d *indexedDocument) generateSearchSet(q int) {
	d.s = newSearchSet(d, q)
}

func (d *indexedDocument) size() int {
	return len(d.Tokens)
}

// normalized returns a string of the normalized tokens concatenated with a
// single space. This is used by the diff algorithm.
// TODO: it'd be more efficient to have the diff algorithm work with the raw
// tokens directly and avoid these ephemeral allocations.
func (d *indexedDocument) normalized() string {
	var w strings.Builder
	for i, t := range d.Tokens {
		w.WriteString(d.dict.getWord(t.ID))
		if (i + 1) != d.size() {
			w.WriteString(" ")
		}
	}
	return w.String()
}

func computeQ(threshold float64) int {
	// q is the lower bound for token runs (q-grams) that must exist in
	// content that can be recognized at the specified threshold.
	// Imagine a document with 100 tokens and a threshold of 80%. In the
	// worst case, the 20 errors are evenly distributed to create the
	// shortest possible token runs: a repeating sequence of 4 good tokens
	// and 1 errored token, occurring 20 times. This function returns that
	// minimum run length, clamping to at least 1 (a threshold below 50%
	// would otherwise generate a run of length 0, which is meaningless).
	if threshold == 1.0 {
		return 10 // avoid divide by 0
	}

	return max(1, int(threshold/(1.0-threshold)))
}

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}
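
// computeQExample is an illustrative sketch, not part of the original file:
// it exercises the worked reasoning in computeQ's comment above. The printed
// values assume the float64 division truncates as expected for these inputs.
func computeQExample() {
	fmt.Println(computeQ(0.8)) // 4: the 100-token example, runs of 4 good tokens
	fmt.Println(computeQ(0.9)) // 9: one error per ten tokens leaves runs of 9
	fmt.Println(computeQ(0.5)) // 1: clamped to the minimum meaningful run
	fmt.Println(computeQ(1.0)) // 10: special-cased to avoid division by zero
}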

// AddContent incorporates the provided textual content into the classifier
// for matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
	// will never return an error, so it's okay to ignore the return value in
	// this case.
	doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
	c.addDocument(category, name, variant, doc)
}

// addDocument takes a textual document and incorporates it into the classifier
// for matching.
func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
	// For documents that are part of the corpus, we add them to the dictionary
	// and compute their associated search data eagerly so they are ready for
	// matching against candidates.
	indexName := c.generateDocName(category, name, variant)
	id.generateSearchSet(c.q)
	id.s.origin = indexName
	c.docs[indexName] = id
}

// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets,
// not for populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
	doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
	return doc
}

func (c *Classifier) generateDocName(category, name, variant string) string {
	return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant)
}

func (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument {
	return c.docs[c.generateDocName(category, name, variant)]
}

// dictionary is used to intern all the token words encountered in the text
// corpus. words and indices form an inverse mapping relationship. It is just
// a convenience type over a pair of correlated maps.
type dictionary struct {
	words   map[tokenID]string
	indices map[string]tokenID
}

func newDictionary() *dictionary {
	return &dictionary{
		words:   make(map[tokenID]string),
		indices: make(map[string]tokenID),
	}
}

// add inserts the provided word into the dictionary if it does not already exist.
func (d *dictionary) add(word string) tokenID {
	if idx := d.getIndex(word); idx != unknownIndex {
		return idx
	}
	// Token IDs start from 1; 0 is reserved for the invalid ID.
	idx := tokenID(len(d.words) + 1)
	d.words[idx] = word
	d.indices[word] = idx
	return idx
}

var unknownWord = "UNKNOWN"
var unknownIndex = tokenID(0)

// getIndex returns the index of the supplied word, or 0 if the word is not in
// the dictionary.
func (d *dictionary) getIndex(word string) tokenID {
	if idx, found := d.indices[word]; found {
		return idx
	}
	return unknownIndex
}

// getWord returns the word associated with the index.
func (d *dictionary) getWord(index tokenID) string {
	if word, found := d.words[index]; found {
		return word
	}
	return unknownWord
}
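
// dictionaryExample is an illustrative sketch, not part of the original file:
// it demonstrates the dictionary's interning behavior. add is idempotent, IDs
// are assigned from 1 upward, and misses return the sentinel values
// unknownIndex and unknownWord.
func dictionaryExample() {
	d := newDictionary()
	a := d.add("license") // first word interned as tokenID(1)
	b := d.add("license") // repeated add returns the same ID

	fmt.Println(a == b)                                // true
	fmt.Println(d.getWord(a))                          // "license"
	fmt.Println(d.getIndex("missing") == unknownIndex) // true: word never interned
	fmt.Println(d.getWord(tokenID(42)))                // "UNKNOWN": ID not in dictionary
}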