// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Select test data comes from
// The Project Gutenberg eBook of The humour of Ireland, by D. J., (David James), (1866-1917) O'Donoghue

package stringclassifier

import (
	"reflect"
	"regexp"
	"sort"
	"testing"

	"github.com/sergi/go-diff/diffmatchpatch"
)

// Text fixtures shared by the tests in this file. The raw string contents are
// significant: the offsets and confidence bounds asserted by the tests were
// recorded against these exact texts, so do not reflow or "tidy" them.
var (
	// gettysburg is a reference text registered with the classifier under
	// the key "gettysburg".
	gettysburg = `Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.`
	// modifiedGettysburg is gettysburg with "a new nation" reworded to
	// "a nation that was new and improved".
	modifiedGettysburg = `Four score and seven years ago our fathers brought forth on this continent, a nation that was new and improved, conceived in Liberty, and dedicated to the proposition that all men are created equal.`
	// gettysburgExtraWord is gettysburg with "Foobar" appended directly
	// after the final period (no separating space).
	gettysburgExtraWord = `Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.Foobar`
	// declaration is a reference text registered with the classifier under
	// the key "declaration".
	declaration = `When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation.`
	// loremipsum is a reference text registered with the classifier under
	// the key "loremipsum". It contains an embedded newline.
	loremipsum = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Nulla varius enim mattis, rhoncus lectus id, aliquet sem. Phasellus eget ex in dolor feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim vulputate, tempus leo commodo, accumsan nulla.`
	// modifiedLorem is loremipsum with a few words removed, moved or added.
	modifiedLorem = `Lorem ipsum dolor amet, consectetur adipiscing elit. Nulla varius enim mattis, lectus id, aliquet rhoncus sem. Phasellus eget ex in dolor feugiat ultricies. Etiam interdum sit amet sit nisl in placerat. Sed vitae enim vulputate, tempus leo commodo, accumsan nulla.`
	// lessModifiedLorem is loremipsum with only the word "sem" removed.
	lessModifiedLorem = `Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla varius enim mattis, rhoncus lectus id, aliquet. Phasellus eget ex in dolor feugiat ultricies. Etiam interdum sit amet nisl in placerat. Sed vitae enim vulputate, tempus leo commodo, accumsan nulla.`
	// humourOfIreland, fellowInTheGoatSkin and oldCrowYoungCrow are prose
	// passages that are never registered with a classifier; the tests
	// concatenate them around the texts that are expected to match.
	humourOfIreland = `As a rule, Irish poets have not extracted a pessimistic philosophy from liquor; they are “elevated,” not depressed, and do not deem it essential to the production of a poem that its author should be a cynic or an evil prophet. One of the best attributes of Irish poetry is its constant expression of the natural emotions. Previous to the close of the seventeenth[xvi] century, it is said, drunkenness was not suggested by the poets as common in Ireland—the popularity of Bacchanalian songs since that date seems to prove that the vice soon became a virtue. Maginn is the noisiest of modern revellers, and easily roars the others down. `
	fellowInTheGoatSkin = `There was a poor widow living down there near the Iron Forge when the country was all covered with forests, and you might walk on the tops of trees from Carnew to the Lady’s Island, and she had one boy. She was very poor, as I said before, and was not able to buy clothes for her son. So when she was going out she fixed him snug and combustible in the ash-pit, and piled the warm ashes about him. 
The boy knew no better, and was as happy as the day was long; and he was happier still when a neighbour[10] gave his mother a kid to keep him company when herself was abroad. The kid and the lad played like two may-boys; and when she was old enough to give milk, wasn’t it a godsend to the little family? You won’t prevent the boy from growing up into a young man, but not a screed of clothes had he then no more than when he was a gorsoon. `
	oldCrowYoungCrow = `There was an old crow teaching a young crow one day, and he said to him, “Now, my son,” says he, “listen to the advice I’m going to give you. If you see a person coming near you and stooping, mind yourself, and be on your keeping; he’s stooping for a stone to throw at you.” “But tell me,” says the young crow, “what should I do if he had a stone already down in his pocket?” “Musha, go ’long out of that,” says the old crow, “you’ve learned enough; the devil another learning I’m able to give you.” `
	// nullifiable consists only of punctuation, underscores and spaces.
	nullifiable = `[[ , _ , _ , _ ? _ : _ ? _ : _ ? _ : _ ] } `
	// nonWords matches one or more consecutive POSIX punctuation characters.
	nonWords = regexp.MustCompile("[[:punct:]]+")
)

// removeNonWords removes non-words from the string, replacing them with empty
// string. (This is meant to exercise tokenization problems.)
func removeNonWords(s string) string { return nonWords.ReplaceAllString(s, "") } func TestClassify_NearestMatch(t *testing.T) { c := New(DefaultConfidenceThreshold, FlattenWhitespace) c.AddValue("gettysburg", gettysburg) c.AddValue("declaration", declaration) c.AddValue("loremipsum", loremipsum) tests := []struct { description string input string // input string to match name string // name of expected nearest match minConf float64 // the lowest confidence accepted for the match maxConf float64 // the highest confidence we expect for this match }{ { description: "Full Declaration", input: declaration, name: "declaration", minConf: 1.0, maxConf: 1.0, }, { description: "Modified Lorem", input: modifiedLorem, name: "loremipsum", minConf: 0.90, maxConf: 0.91, }, { description: "Modified Gettysburg", input: modifiedGettysburg, name: "gettysburg", minConf: 0.86, maxConf: 0.87, }, } for _, tt := range tests { m := c.NearestMatch(tt.input) if got, want := m.Name, tt.name; got != want { t.Errorf("NearestMatch(%q) = %q, want %q", tt.description, got, want) } if got, want := m.Confidence, tt.minConf; got < want { t.Errorf("NearestMatch(%q) returned confidence %v, want minimum of %v", tt.description, got, want) } if got, want := m.Confidence, tt.maxConf; got > want { t.Errorf("NearestMatch(%q) = %v, want maxiumum of %v", tt.description, got, want) } } } type result struct { key string // key of expected nearest match offset int // offset of match in unknown string // The confidence values are retrieved by simply running the classifier // and noting the output. A value greater than the "max" is fine and // the tests can be adjusted to account for it. A value less than "min" // should be carefully scrutinzed before adjusting the tests. 
minConf float64 // the lowest confidence accepted for the match maxConf float64 // the highest confidence we expect for this match } func TestClassify_MultipleMatch(t *testing.T) { c := New(DefaultConfidenceThreshold, FlattenWhitespace) c.AddValue("gettysburg", gettysburg) c.AddValue("declaration", declaration) c.AddValue("declaration-close", declaration[:len(declaration)/2-1]+"_"+declaration[len(declaration)/2:]) c.AddValue("loremipsum", loremipsum) cNormalize := New(DefaultConfidenceThreshold, FlattenWhitespace, removeNonWords) cNormalize.AddValue("gettysburg", gettysburg) tests := []struct { description string c *Classifier input string // input string to match want []result }{ { description: "Exact text match", c: c, input: fellowInTheGoatSkin + declaration + humourOfIreland, want: []result{ { key: "declaration", offset: 845, minConf: 1.0, maxConf: 1.0, }, }, }, { description: "Partial text match", c: c, input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland, want: []result{ { key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91, }, }, }, { description: "Two partial matches", c: c, input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + modifiedGettysburg + oldCrowYoungCrow, want: []result{ { key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91, }, { key: "gettysburg", offset: 1750, minConf: 0.86, maxConf: 0.87, }, }, }, { description: "Partial matches of similar text", c: c, input: fellowInTheGoatSkin + modifiedLorem + humourOfIreland + lessModifiedLorem + oldCrowYoungCrow, want: []result{ { key: "loremipsum", offset: 1750, minConf: 0.98, maxConf: 0.99, }, { key: "loremipsum", offset: 845, minConf: 0.90, maxConf: 0.91, }, }, }, { description: "Nullifiable text", c: c, input: nullifiable, want: nil, }, { description: "No match", c: c, input: fellowInTheGoatSkin + humourOfIreland, want: nil, }, { description: "Exact text match, with extra word and non-word normalizer", c: cNormalize, input: fellowInTheGoatSkin + gettysburgExtraWord + 
humourOfIreland, want: []result{ { key: "gettysburg", offset: 825, minConf: 1.0, maxConf: 1.0, }, }, }, } for _, tt := range tests { matches := tt.c.MultipleMatch(tt.input) if len(matches) != len(tt.want) { t.Errorf("MultipleMatch(%q) not enough matches = %v, want %v", tt.description, len(matches), len(tt.want)) } for i := 0; i < len(matches); i++ { m := matches[i] w := tt.want[i] if got, want := m.Name, w.key; got != want { t.Errorf("MultipleMatch(%q) = %q, want %q", tt.description, got, want) } if got, want := m.Confidence, w.minConf; got < want { t.Errorf("MultipleMatch(%q) %q = %v, want minimum of %v", tt.description, w.key, got, want) } if got, want := m.Confidence, w.maxConf; got > want { t.Errorf("MultipleMatch(%q) %q = %v, want maximum of %v", tt.description, w.key, got, want) } if got, want := m.Offset, w.offset; got != want { t.Errorf("MultipleMatch(%q) %q = %v, want offset of %v", tt.description, w.key, got, want) } } } } func TestClassify_DiffRatio(t *testing.T) { tests := []struct { x, y string want float64 }{ {"", "", 1.0}, {"a", "b", 1.0}, {"", "abc", 0}, {"ab", "c", 0.5}, {"a", "bc", 0.5}, {"a", "bcde", 0.25}, } for _, tt := range tests { if got, want := diffRatio(tt.x, tt.y), tt.want; got != want { t.Errorf("diffRatio(%q, %q) = %f, want %f", tt.x, tt.y, got, want) } } } func TestClassify_Matches(t *testing.T) { tests := []struct { description string matches Matches want Matches }{ { description: "Different names, same confidences, same offset", matches: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, }, want: Matches{ &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, }, }, { description: "Same names, different confidences, same offset", matches: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, &Match{ Name: "b", Confidence: 0.90, Offset: 0, }, }, want: Matches{ &Match{ Name: "b", Confidence: 0.90, Offset: 0, }, &Match{ Name: 
"b", Confidence: 0.42, Offset: 0, }, }, }, { description: "Same names, same confidences, different offsets", matches: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 42, }, &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, }, want: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, &Match{ Name: "b", Confidence: 0.42, Offset: 42, }, }, }, { description: "Different names, different confidences, same offset", matches: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, &Match{ Name: "a", Confidence: 0.90, Offset: 0, }, }, want: Matches{ &Match{ Name: "a", Confidence: 0.90, Offset: 0, }, &Match{ Name: "b", Confidence: 0.42, Offset: 0, }, }, }, { description: "Different names, same confidences, different offset", matches: Matches{ &Match{ Name: "b", Confidence: 0.42, Offset: 37, }, &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, }, want: Matches{ &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, &Match{ Name: "b", Confidence: 0.42, Offset: 37, }, }, }, { description: "Different names, different confidences, different offset", matches: Matches{ &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, &Match{ Name: "b", Confidence: 0.90, Offset: 37, }, }, want: Matches{ &Match{ Name: "b", Confidence: 0.90, Offset: 37, }, &Match{ Name: "a", Confidence: 0.42, Offset: 0, }, }, }, } for _, tt := range tests { sort.Sort(tt.matches) if !reflect.DeepEqual(tt.matches, tt.want) { for _, x := range tt.matches { t.Errorf("got: %v", x) } for _, x := range tt.want { t.Errorf("want: %v", x) } t.Errorf("MatchesSort(%q) = %v, want %v", tt.description, tt.matches, tt.want) } } } func TestClassify_DiffRangeEnd(t *testing.T) { dmp := diffmatchpatch.New() tests := []struct { description string unknown string known string end int }{ { description: "identical", unknown: declaration, known: declaration, end: 1, }, { description: "lorem", unknown: lessModifiedLorem, known: loremipsum, end: 3, }, { description: "gettysburg", unknown: modifiedGettysburg, known: gettysburg, 
end: 19, }, } for _, tt := range tests { diffs := dmp.DiffMain(tt.unknown, tt.known, true) if e := diffRangeEnd(tt.known, diffs); e != tt.end { t.Errorf("DiffRangeEnd(%q) = end %v, want %v", tt.description, e, tt.end) } } } func BenchmarkClassifier(b *testing.B) { c := New(DefaultConfidenceThreshold, FlattenWhitespace) c.AddValue("gettysburg", gettysburg) c.AddValue("declaration", declaration) c.AddValue("loremipsum", loremipsum) b.ResetTimer() for i := 0; i < b.N; i++ { c.NearestMatch(modifiedLorem) } }