• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //! This library implements string similarity metrics.
2 
3 #![forbid(unsafe_code)]
4 
5 use std::char;
6 use std::cmp::{max, min};
7 use std::collections::HashMap;
8 use std::error::Error;
9 use std::fmt::{self, Display, Formatter};
10 use std::hash::Hash;
11 use std::str::Chars;
12 
13 #[derive(Debug, PartialEq)]
14 pub enum StrSimError {
15     DifferentLengthArgs,
16 }
17 
18 impl Display for StrSimError {
fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error>19     fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> {
20         let text = match self {
21             StrSimError::DifferentLengthArgs => "Differing length arguments provided",
22         };
23 
24         write!(fmt, "{}", text)
25     }
26 }
27 
28 impl Error for StrSimError {}
29 
30 pub type HammingResult = Result<usize, StrSimError>;
31 
32 /// Calculates the number of positions in the two sequences where the elements
33 /// differ. Returns an error if the sequences have different lengths.
generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult where Iter1: IntoIterator<Item=Elem1>, Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>34 pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult
35     where Iter1: IntoIterator<Item=Elem1>,
36           Iter2: IntoIterator<Item=Elem2>,
37           Elem1: PartialEq<Elem2> {
38     let (mut ita, mut itb) = (a.into_iter(), b.into_iter());
39     let mut count = 0;
40     loop {
41         match (ita.next(), itb.next()){
42             (Some(x), Some(y)) => if x != y { count += 1 },
43             (None, None) => return Ok(count),
44             _ => return Err(StrSimError::DifferentLengthArgs),
45         }
46     }
47 }
48 
49 /// Calculates the number of positions in the two strings where the characters
50 /// differ. Returns an error if the strings have different lengths.
51 ///
52 /// ```
53 /// use strsim::{hamming, StrSimError::DifferentLengthArgs};
54 ///
55 /// assert_eq!(Ok(3), hamming("hamming", "hammers"));
56 ///
57 /// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham"));
58 /// ```
hamming(a: &str, b: &str) -> HammingResult59 pub fn hamming(a: &str, b: &str) -> HammingResult {
60     generic_hamming(a.chars(), b.chars())
61 }
62 
63 /// Calculates the Jaro similarity between two sequences. The returned value
64 /// is between 0.0 and 1.0 (higher value means more similar).
generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 where &'a Iter1: IntoIterator<Item=Elem1>, &'b Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>65 pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
66     where &'a Iter1: IntoIterator<Item=Elem1>,
67           &'b Iter2: IntoIterator<Item=Elem2>,
68           Elem1: PartialEq<Elem2> {
69     let a_len = a.into_iter().count();
70     let b_len = b.into_iter().count();
71 
72     // The check for lengths of one here is to prevent integer overflow when
73     // calculating the search range.
74     if a_len == 0 && b_len == 0 {
75         return 1.0;
76     } else if a_len == 0 || b_len == 0 {
77         return 0.0;
78     } else if a_len == 1 && b_len == 1 {
79         return if a.into_iter().eq(b.into_iter()) { 1.0} else { 0.0 };
80     }
81 
82     let search_range = (max(a_len, b_len) / 2) - 1;
83 
84     let mut b_consumed = Vec::with_capacity(b_len);
85     for _ in 0..b_len {
86         b_consumed.push(false);
87     }
88     let mut matches = 0.0;
89 
90     let mut transpositions = 0.0;
91     let mut b_match_index = 0;
92 
93     for (i, a_elem) in a.into_iter().enumerate() {
94         let min_bound =
95             // prevent integer wrapping
96             if i > search_range {
97                 max(0, i - search_range)
98             } else {
99                 0
100             };
101 
102         let max_bound = min(b_len - 1, i + search_range);
103 
104         if min_bound > max_bound {
105             continue;
106         }
107 
108         for (j, b_elem) in b.into_iter().enumerate() {
109             if min_bound <= j && j <= max_bound && a_elem == b_elem &&
110                 !b_consumed[j] {
111                 b_consumed[j] = true;
112                 matches += 1.0;
113 
114                 if j < b_match_index {
115                     transpositions += 1.0;
116                 }
117                 b_match_index = j;
118 
119                 break;
120             }
121         }
122     }
123 
124     if matches == 0.0 {
125         0.0
126     } else {
127         (1.0 / 3.0) * ((matches / a_len as f64) +
128             (matches / b_len as f64) +
129             ((matches - transpositions) / matches))
130     }
131 }
132 
133 struct StringWrapper<'a>(&'a str);
134 
135 impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
136     type Item = char;
137     type IntoIter = Chars<'b>;
138 
into_iter(self) -> Self::IntoIter139     fn into_iter(self) -> Self::IntoIter {
140         self.0.chars()
141     }
142 }
143 
144 /// Calculates the Jaro similarity between two strings. The returned value
145 /// is between 0.0 and 1.0 (higher value means more similar).
146 ///
147 /// ```
148 /// use strsim::jaro;
149 ///
150 /// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
151 ///         0.001);
152 /// ```
jaro(a: &str, b: &str) -> f64153 pub fn jaro(a: &str, b: &str) -> f64 {
154     generic_jaro(&StringWrapper(a), &StringWrapper(b))
155 }
156 
157 /// Like Jaro but gives a boost to sequences that have a common prefix.
generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 where &'a Iter1: IntoIterator<Item=Elem1>, &'b Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>158 pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
159     where &'a Iter1: IntoIterator<Item=Elem1>,
160           &'b Iter2: IntoIterator<Item=Elem2>,
161           Elem1: PartialEq<Elem2> {
162     let jaro_distance = generic_jaro(a, b);
163 
164     // Don't limit the length of the common prefix
165     let prefix_length = a.into_iter()
166         .zip(b.into_iter())
167         .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem)
168         .count();
169 
170     let jaro_winkler_distance =
171         jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));
172 
173     if jaro_winkler_distance <= 1.0 {
174         jaro_winkler_distance
175     } else {
176         1.0
177     }
178 }
179 
180 /// Like Jaro but gives a boost to strings that have a common prefix.
181 ///
182 /// ```
183 /// use strsim::jaro_winkler;
184 ///
185 /// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
186 ///         0.001);
187 /// ```
jaro_winkler(a: &str, b: &str) -> f64188 pub fn jaro_winkler(a: &str, b: &str) -> f64 {
189     generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
190 }
191 
192 /// Calculates the minimum number of insertions, deletions, and substitutions
193 /// required to change one sequence into the other.
194 ///
195 /// ```
196 /// use strsim::generic_levenshtein;
197 ///
198 /// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6]));
199 /// ```
generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize where &'a Iter1: IntoIterator<Item=Elem1>, &'b Iter2: IntoIterator<Item=Elem2>, Elem1: PartialEq<Elem2>200 pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
201     where &'a Iter1: IntoIterator<Item=Elem1>,
202           &'b Iter2: IntoIterator<Item=Elem2>,
203           Elem1: PartialEq<Elem2> {
204     let b_len = b.into_iter().count();
205 
206     if a.into_iter().next().is_none() { return b_len; }
207 
208     let mut cache: Vec<usize> = (1..b_len+1).collect();
209 
210     let mut result = 0;
211 
212     for (i, a_elem) in a.into_iter().enumerate() {
213         result = i + 1;
214         let mut distance_b = i;
215 
216         for (j, b_elem) in b.into_iter().enumerate() {
217             let cost = if a_elem == b_elem { 0usize } else { 1usize };
218             let distance_a = distance_b + cost;
219             distance_b = cache[j];
220             result = min(result + 1, min(distance_a, distance_b + 1));
221             cache[j] = result;
222         }
223     }
224 
225     result
226 }
227 
228 /// Calculates the minimum number of insertions, deletions, and substitutions
229 /// required to change one string into the other.
230 ///
231 /// ```
232 /// use strsim::levenshtein;
233 ///
234 /// assert_eq!(3, levenshtein("kitten", "sitting"));
235 /// ```
levenshtein(a: &str, b: &str) -> usize236 pub fn levenshtein(a: &str, b: &str) -> usize {
237     generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
238 }
239 
240 /// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
241 /// 1.0 (inclusive), where 1.0 means the strings are the same.
242 ///
243 /// ```
244 /// use strsim::normalized_levenshtein;
245 ///
246 /// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
247 /// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
248 /// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
249 /// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
250 /// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
251 /// ```
normalized_levenshtein(a: &str, b: &str) -> f64252 pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
253     if a.is_empty() && b.is_empty() {
254         return 1.0;
255     }
256     1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
257 }
258 
259 /// Like Levenshtein but allows for adjacent transpositions. Each substring can
260 /// only be edited once.
261 ///
262 /// ```
263 /// use strsim::osa_distance;
264 ///
265 /// assert_eq!(3, osa_distance("ab", "bca"));
266 /// ```
osa_distance(a: &str, b: &str) -> usize267 pub fn osa_distance(a: &str, b: &str) -> usize {
268     let a_len = a.chars().count();
269     let b_len = b.chars().count();
270     if a == b { return 0; }
271     else if a_len == 0 { return b_len; }
272     else if b_len == 0 { return a_len; }
273 
274     let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
275     let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
276     let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
277 
278     let mut prev_a_char = char::MAX;
279     let mut prev_b_char = char::MAX;
280 
281     for i in 0..(b_len + 1) {
282         prev_two_distances.push(i);
283         prev_distances.push(i);
284         curr_distances.push(0);
285     }
286 
287     for (i, a_char) in a.chars().enumerate() {
288         curr_distances[0] = i + 1;
289 
290         for (j, b_char) in b.chars().enumerate() {
291             let cost = if a_char == b_char { 0 } else { 1 };
292             curr_distances[j + 1] = min(curr_distances[j] + 1,
293                                         min(prev_distances[j + 1] + 1,
294                                             prev_distances[j] + cost));
295             if i > 0 && j > 0 && a_char != b_char &&
296                 a_char == prev_b_char && b_char == prev_a_char {
297                 curr_distances[j + 1] = min(curr_distances[j + 1],
298                                             prev_two_distances[j - 1] + 1);
299             }
300 
301             prev_b_char = b_char;
302         }
303 
304         prev_two_distances.clone_from(&prev_distances);
305         prev_distances.clone_from(&curr_distances);
306         prev_a_char = a_char;
307     }
308 
309     curr_distances[b_len]
310 
311 }
312 
313 /* Returns the final index for a value in a single vector that represents a fixed
314    2d grid */
flat_index(i: usize, j: usize, width: usize) -> usize315 fn flat_index(i: usize, j: usize, width: usize) -> usize {
316     j * width + i
317 }
318 
319 /// Like optimal string alignment, but substrings can be edited an unlimited
320 /// number of times, and the triangle inequality holds.
321 ///
322 /// ```
323 /// use strsim::generic_damerau_levenshtein;
324 ///
325 /// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1]));
326 /// ```
generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize where Elem: Eq + Hash + Clone327 pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
328     where Elem: Eq + Hash + Clone {
329     let a_len = a_elems.len();
330     let b_len = b_elems.len();
331 
332     if a_len == 0 { return b_len; }
333     if b_len == 0 { return a_len; }
334 
335     let width = a_len + 2;
336     let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
337     let max_distance = a_len + b_len;
338     distances[0] = max_distance;
339 
340     for i in 0..(a_len + 1) {
341         distances[flat_index(i + 1, 0, width)] = max_distance;
342         distances[flat_index(i + 1, 1, width)] = i;
343     }
344 
345     for j in 0..(b_len + 1) {
346         distances[flat_index(0, j + 1, width)] = max_distance;
347         distances[flat_index(1, j + 1, width)] = j;
348     }
349 
350     let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);
351 
352     for i in 1..(a_len + 1) {
353         let mut db = 0;
354 
355         for j in 1..(b_len + 1) {
356             let k = match elems.get(&b_elems[j - 1]) {
357                 Some(&value) => value,
358                 None => 0
359             };
360 
361             let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
362             let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
363             let transposition_cost = distances[flat_index(k, db, width)] +
364                 (i - k - 1) + 1 + (j - db - 1);
365 
366             let mut substitution_cost = distances[flat_index(i, j, width)] + 1;
367             if a_elems[i - 1] == b_elems[j - 1] {
368                 db = j;
369                 substitution_cost -= 1;
370             }
371 
372             distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost,
373                 min(insertion_cost, min(deletion_cost, transposition_cost)));
374         }
375 
376         elems.insert(a_elems[i - 1].clone(), i);
377     }
378 
379     distances[flat_index(a_len + 1, b_len + 1, width)]
380 }
381 
382 /// Like optimal string alignment, but substrings can be edited an unlimited
383 /// number of times, and the triangle inequality holds.
384 ///
385 /// ```
386 /// use strsim::damerau_levenshtein;
387 ///
388 /// assert_eq!(2, damerau_levenshtein("ab", "bca"));
389 /// ```
damerau_levenshtein(a: &str, b: &str) -> usize390 pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
391     let (x, y): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect());
392     generic_damerau_levenshtein(x.as_slice(), y.as_slice())
393 }
394 
395 /// Calculates a normalized score of the Damerau–Levenshtein algorithm between
396 /// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
397 ///
398 /// ```
399 /// use strsim::normalized_damerau_levenshtein;
400 ///
401 /// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
402 /// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
403 /// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
404 /// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
405 /// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
406 /// ```
normalized_damerau_levenshtein(a: &str, b: &str) -> f64407 pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
408     if a.is_empty() && b.is_empty() {
409         return 1.0;
410     }
411     1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
412 }
413 
414 /// Returns an Iterator of char tuples.
bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_415 fn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ {
416     s.chars().zip(s.chars().skip(1))
417 }
418 
419 
420 /// Calculates a Sørensen-Dice similarity distance using bigrams.
421 /// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient.
422 ///
423 /// ```
424 /// use strsim::sorensen_dice;
425 ///
426 /// assert_eq!(1.0, sorensen_dice("", ""));
427 /// assert_eq!(0.0, sorensen_dice("", "a"));
428 /// assert_eq!(0.0, sorensen_dice("french", "quebec"));
429 /// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
430 /// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
431 /// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris"));
432 /// ```
sorensen_dice(a: &str, b: &str) -> f64433 pub fn sorensen_dice(a: &str, b: &str) -> f64 {
434     // implementation guided by
435     // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6
436 
437     let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
438     let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();
439 
440     if a.len() == 0 && b.len() == 0 {
441         return 1.0;
442     }
443 
444     if a.len() == 0 || b.len() == 0 {
445         return 0.0;
446     }
447 
448     if a == b {
449         return 1.0;
450     }
451 
452     if a.len() == 1 && b.len() == 1 {
453         return 0.0;
454     }
455 
456     if a.len() < 2 || b.len() < 2 {
457         return 0.0;
458     }
459 
460     let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();
461 
462     for bigram in bigrams(&a) {
463         *a_bigrams.entry(bigram).or_insert(0) += 1;
464     }
465 
466     let mut intersection_size = 0;
467 
468     for bigram in bigrams(&b) {
469         a_bigrams.entry(bigram).and_modify(|bi| {
470             if *bi > 0 {
471                 *bi -= 1;
472                 intersection_size += 1;
473             }
474         });
475     }
476 
477     (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
478 }
479 
480 
481 #[cfg(test)]
482 mod tests {
483     use super::*;
484 
485     #[test]
bigrams_iterator()486     fn bigrams_iterator() {
487         let mut bi = bigrams("abcde");
488 
489         assert_eq!(Some(('a', 'b')), bi.next());
490         assert_eq!(Some(('b', 'c')), bi.next());
491         assert_eq!(Some(('c', 'd')), bi.next());
492         assert_eq!(Some(('d', 'e')), bi.next());
493         assert_eq!(None, bi.next());
494     }
495 
assert_hamming_dist(dist: usize, str1: &str, str2: &str)496     fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
497         assert_eq!(Ok(dist), hamming(str1, str2));
498     }
499 
500     #[test]
hamming_empty()501     fn hamming_empty() {
502         assert_hamming_dist(0, "", "")
503     }
504 
505     #[test]
hamming_same()506     fn hamming_same() {
507         assert_hamming_dist(0, "hamming", "hamming")
508     }
509 
510     #[test]
hamming_numbers()511     fn hamming_numbers() {
512         assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
513     }
514 
515     #[test]
hamming_diff()516     fn hamming_diff() {
517         assert_hamming_dist(3, "hamming", "hammers")
518     }
519 
520     #[test]
hamming_diff_multibyte()521     fn hamming_diff_multibyte() {
522         assert_hamming_dist(2, "hamming", "h香mmüng");
523     }
524 
525     #[test]
hamming_unequal_length()526     fn hamming_unequal_length() {
527         assert_eq!(
528             Err(StrSimError::DifferentLengthArgs),
529             generic_hamming("ham".chars(), "hamming".chars())
530         );
531     }
532 
533     #[test]
hamming_names()534     fn hamming_names() {
535         assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
536     }
537 
538     #[test]
jaro_both_empty()539     fn jaro_both_empty() {
540         assert_eq!(1.0, jaro("", ""));
541     }
542 
543     #[test]
jaro_first_empty()544     fn jaro_first_empty() {
545         assert_eq!(0.0, jaro("", "jaro"));
546     }
547 
548     #[test]
jaro_second_empty()549     fn jaro_second_empty() {
550         assert_eq!(0.0, jaro("distance", ""));
551     }
552 
553     #[test]
jaro_same()554     fn jaro_same() {
555         assert_eq!(1.0, jaro("jaro", "jaro"));
556     }
557 
558     #[test]
jaro_multibyte()559     fn jaro_multibyte() {
560         assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001);
561         assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001);
562     }
563 
564     #[test]
jaro_diff_short()565     fn jaro_diff_short() {
566         assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
567     }
568 
569     #[test]
jaro_diff_one_character()570     fn jaro_diff_one_character() {
571         assert_eq!(0.0, jaro("a", "b"));
572     }
573 
574     #[test]
jaro_same_one_character()575     fn jaro_same_one_character() {
576         assert_eq!(1.0, jaro("a", "a"));
577     }
578 
579     #[test]
generic_jaro_diff()580     fn generic_jaro_diff() {
581         assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
582     }
583 
584     #[test]
jaro_diff_one_and_two()585     fn jaro_diff_one_and_two() {
586         assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
587     }
588 
589     #[test]
jaro_diff_two_and_one()590     fn jaro_diff_two_and_one() {
591         assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
592     }
593 
594     #[test]
jaro_diff_no_transposition()595     fn jaro_diff_no_transposition() {
596         assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
597     }
598 
599     #[test]
jaro_diff_with_transposition()600     fn jaro_diff_with_transposition() {
601         assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
602     }
603 
604     #[test]
jaro_names()605     fn jaro_names() {
606         assert!((0.392 - jaro("Friedrich Nietzsche",
607                               "Jean-Paul Sartre")).abs() < 0.001);
608     }
609 
610     #[test]
jaro_winkler_both_empty()611     fn jaro_winkler_both_empty() {
612         assert_eq!(1.0, jaro_winkler("", ""));
613     }
614 
615     #[test]
jaro_winkler_first_empty()616     fn jaro_winkler_first_empty() {
617         assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
618     }
619 
620     #[test]
jaro_winkler_second_empty()621     fn jaro_winkler_second_empty() {
622         assert_eq!(0.0, jaro_winkler("distance", ""));
623     }
624 
625     #[test]
jaro_winkler_same()626     fn jaro_winkler_same() {
627         assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
628     }
629 
630     #[test]
jaro_winkler_multibyte()631     fn jaro_winkler_multibyte() {
632         assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
633             0.001);
634         assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
635             0.001);
636     }
637 
638     #[test]
jaro_winkler_diff_short()639     fn jaro_winkler_diff_short() {
640         assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
641         assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
642     }
643 
644     #[test]
jaro_winkler_diff_one_character()645     fn jaro_winkler_diff_one_character() {
646         assert_eq!(0.0, jaro_winkler("a", "b"));
647     }
648 
649     #[test]
jaro_winkler_same_one_character()650     fn jaro_winkler_same_one_character() {
651         assert_eq!(1.0, jaro_winkler("a", "a"));
652     }
653 
654     #[test]
jaro_winkler_diff_no_transposition()655     fn jaro_winkler_diff_no_transposition() {
656         assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
657     }
658 
659     #[test]
jaro_winkler_diff_with_transposition()660     fn jaro_winkler_diff_with_transposition() {
661         assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
662     }
663 
664     #[test]
jaro_winkler_names()665     fn jaro_winkler_names() {
666         assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
667                                       "Fran-Paul Sartre")).abs() < 0.001);
668     }
669 
670     #[test]
jaro_winkler_long_prefix()671     fn jaro_winkler_long_prefix() {
672         assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
673             0.001);
674     }
675 
676     #[test]
jaro_winkler_more_names()677     fn jaro_winkler_more_names() {
678         assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
679     }
680 
681     #[test]
jaro_winkler_length_of_one()682     fn jaro_winkler_length_of_one() {
683         assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
684     }
685 
686     #[test]
jaro_winkler_very_long_prefix()687     fn jaro_winkler_very_long_prefix() {
688         assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
689                                     "thequickbrownfoxjumpedovery")).abs() <
690             0.001);
691     }
692 
693     #[test]
levenshtein_empty()694     fn levenshtein_empty() {
695         assert_eq!(0, levenshtein("", ""));
696     }
697 
698     #[test]
levenshtein_same()699     fn levenshtein_same() {
700         assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
701     }
702 
703     #[test]
levenshtein_diff_short()704     fn levenshtein_diff_short() {
705         assert_eq!(3, levenshtein("kitten", "sitting"));
706     }
707 
708     #[test]
levenshtein_diff_with_space()709     fn levenshtein_diff_with_space() {
710         assert_eq!(5, levenshtein("hello, world", "bye, world"));
711     }
712 
713     #[test]
levenshtein_diff_multibyte()714     fn levenshtein_diff_multibyte() {
715         assert_eq!(3, levenshtein("öঙ香", "abc"));
716         assert_eq!(3, levenshtein("abc", "öঙ香"));
717     }
718 
719     #[test]
levenshtein_diff_longer()720     fn levenshtein_diff_longer() {
721         let a = "The quick brown fox jumped over the angry dog.";
722         let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
723         assert_eq!(37, levenshtein(a, b));
724     }
725 
726     #[test]
levenshtein_first_empty()727     fn levenshtein_first_empty() {
728         assert_eq!(7, levenshtein("", "sitting"));
729     }
730 
731     #[test]
levenshtein_second_empty()732     fn levenshtein_second_empty() {
733         assert_eq!(6, levenshtein("kitten", ""));
734     }
735 
736     #[test]
normalized_levenshtein_diff_short()737     fn normalized_levenshtein_diff_short() {
738         assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
739     }
740 
741     #[test]
normalized_levenshtein_for_empty_strings()742     fn normalized_levenshtein_for_empty_strings() {
743         assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
744     }
745 
746     #[test]
normalized_levenshtein_first_empty()747     fn normalized_levenshtein_first_empty() {
748         assert!(normalized_levenshtein("", "second").abs() < 0.00001);
749     }
750 
751     #[test]
normalized_levenshtein_second_empty()752     fn normalized_levenshtein_second_empty() {
753         assert!(normalized_levenshtein("first", "").abs() < 0.00001);
754     }
755 
756     #[test]
normalized_levenshtein_identical_strings()757     fn normalized_levenshtein_identical_strings() {
758         assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
759     }
760 
761     #[test]
osa_distance_empty()762     fn osa_distance_empty() {
763         assert_eq!(0, osa_distance("", ""));
764     }
765 
766     #[test]
osa_distance_same()767     fn osa_distance_same() {
768         assert_eq!(0, osa_distance("damerau", "damerau"));
769     }
770 
771     #[test]
osa_distance_first_empty()772     fn osa_distance_first_empty() {
773         assert_eq!(7, osa_distance("", "damerau"));
774     }
775 
776     #[test]
osa_distance_second_empty()777     fn osa_distance_second_empty() {
778         assert_eq!(7, osa_distance("damerau", ""));
779     }
780 
781     #[test]
osa_distance_diff()782     fn osa_distance_diff() {
783         assert_eq!(3, osa_distance("ca", "abc"));
784     }
785 
786     #[test]
osa_distance_diff_short()787     fn osa_distance_diff_short() {
788         assert_eq!(3, osa_distance("damerau", "aderua"));
789     }
790 
791     #[test]
osa_distance_diff_reversed()792     fn osa_distance_diff_reversed() {
793         assert_eq!(3, osa_distance("aderua", "damerau"));
794     }
795 
796     #[test]
osa_distance_diff_multibyte()797     fn osa_distance_diff_multibyte() {
798         assert_eq!(3, osa_distance("öঙ香", "abc"));
799         assert_eq!(3, osa_distance("abc", "öঙ香"));
800     }
801 
802     #[test]
osa_distance_diff_unequal_length()803     fn osa_distance_diff_unequal_length() {
804         assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
805     }
806 
807     #[test]
osa_distance_diff_unequal_length_reversed()808     fn osa_distance_diff_unequal_length_reversed() {
809         assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
810     }
811 
812     #[test]
osa_distance_diff_comedians()813     fn osa_distance_diff_comedians() {
814         assert_eq!(5, osa_distance("Stewart", "Colbert"));
815     }
816 
817     #[test]
osa_distance_many_transpositions()818     fn osa_distance_many_transpositions() {
819         assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
820     }
821 
822     #[test]
osa_distance_diff_longer()823     fn osa_distance_diff_longer() {
824         let a = "The quick brown fox jumped over the angry dog.";
825         let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
826         assert_eq!(36, osa_distance(a, b));
827     }
828 
829     #[test]
osa_distance_beginning_transposition()830     fn osa_distance_beginning_transposition() {
831         assert_eq!(1, osa_distance("foobar", "ofobar"));
832     }
833 
834     #[test]
osa_distance_end_transposition()835     fn osa_distance_end_transposition() {
836         assert_eq!(1, osa_distance("specter", "spectre"));
837     }
838 
839     #[test]
osa_distance_restricted_edit()840     fn osa_distance_restricted_edit() {
841         assert_eq!(4, osa_distance("a cat", "an abct"));
842     }
843 
844     #[test]
damerau_levenshtein_empty()845     fn damerau_levenshtein_empty() {
846         assert_eq!(0, damerau_levenshtein("", ""));
847     }
848 
849     #[test]
damerau_levenshtein_same()850     fn damerau_levenshtein_same() {
851         assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
852     }
853 
854     #[test]
damerau_levenshtein_first_empty()855     fn damerau_levenshtein_first_empty() {
856         assert_eq!(7, damerau_levenshtein("", "damerau"));
857     }
858 
859     #[test]
damerau_levenshtein_second_empty()860     fn damerau_levenshtein_second_empty() {
861         assert_eq!(7, damerau_levenshtein("damerau", ""));
862     }
863 
864     #[test]
damerau_levenshtein_diff()865     fn damerau_levenshtein_diff() {
866         assert_eq!(2, damerau_levenshtein("ca", "abc"));
867     }
868 
869     #[test]
damerau_levenshtein_diff_short()870     fn damerau_levenshtein_diff_short() {
871         assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
872     }
873 
874     #[test]
damerau_levenshtein_diff_reversed()875     fn damerau_levenshtein_diff_reversed() {
876         assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
877     }
878 
879     #[test]
damerau_levenshtein_diff_multibyte()880     fn damerau_levenshtein_diff_multibyte() {
881         assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
882         assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
883     }
884 
885     #[test]
damerau_levenshtein_diff_unequal_length()886     fn damerau_levenshtein_diff_unequal_length() {
887         assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
888     }
889 
890     #[test]
damerau_levenshtein_diff_unequal_length_reversed()891     fn damerau_levenshtein_diff_unequal_length_reversed() {
892         assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
893     }
894 
895     #[test]
damerau_levenshtein_diff_comedians()896     fn damerau_levenshtein_diff_comedians() {
897         assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
898     }
899 
900     #[test]
damerau_levenshtein_many_transpositions()901     fn damerau_levenshtein_many_transpositions() {
902         assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
903     }
904 
905     #[test]
damerau_levenshtein_diff_longer()906     fn damerau_levenshtein_diff_longer() {
907         let a = "The quick brown fox jumped over the angry dog.";
908         let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
909         assert_eq!(36, damerau_levenshtein(a, b));
910     }
911 
912     #[test]
damerau_levenshtein_beginning_transposition()913     fn damerau_levenshtein_beginning_transposition() {
914         assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
915     }
916 
917     #[test]
damerau_levenshtein_end_transposition()918     fn damerau_levenshtein_end_transposition() {
919         assert_eq!(1, damerau_levenshtein("specter", "spectre"));
920     }
921 
922     #[test]
damerau_levenshtein_unrestricted_edit()923     fn damerau_levenshtein_unrestricted_edit() {
924         assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
925     }
926 
927     #[test]
normalized_damerau_levenshtein_diff_short()928     fn normalized_damerau_levenshtein_diff_short() {
929         assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
930     }
931 
932     #[test]
normalized_damerau_levenshtein_for_empty_strings()933     fn normalized_damerau_levenshtein_for_empty_strings() {
934         assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
935     }
936 
937     #[test]
normalized_damerau_levenshtein_first_empty()938     fn normalized_damerau_levenshtein_first_empty() {
939         assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
940     }
941 
942     #[test]
normalized_damerau_levenshtein_second_empty()943     fn normalized_damerau_levenshtein_second_empty() {
944         assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
945     }
946 
947     #[test]
normalized_damerau_levenshtein_identical_strings()948     fn normalized_damerau_levenshtein_identical_strings() {
949         assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
950     }
951 
952     #[test]
sorensen_dice_all()953     fn sorensen_dice_all() {
954         // test cases taken from
955         // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11
956 
957         assert_eq!(1.0, sorensen_dice("a", "a"));
958         assert_eq!(0.0, sorensen_dice("a", "b"));
959         assert_eq!(1.0, sorensen_dice("", ""));
960         assert_eq!(0.0, sorensen_dice("a", ""));
961         assert_eq!(0.0, sorensen_dice("", "a"));
962         assert_eq!(1.0, sorensen_dice("apple event", "apple    event"));
963         assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x"));
964         assert_eq!(0.0, sorensen_dice("french", "quebec"));
965         assert_eq!(1.0, sorensen_dice("france", "france"));
966         assert_eq!(0.2, sorensen_dice("fRaNce", "france"));
967         assert_eq!(0.8, sorensen_dice("healed", "sealed"));
968         assert_eq!(
969             0.7878787878787878,
970             sorensen_dice("web applications", "applications of the web")
971         );
972         assert_eq!(
973             0.92,
974             sorensen_dice(
975                 "this will have a typo somewhere",
976                 "this will huve a typo somewhere"
977             )
978         );
979         assert_eq!(
980             0.6060606060606061,
981             sorensen_dice(
982                 "Olive-green table for sale, in extremely good condition.",
983                 "For sale: table in very good  condition, olive green in colour."
984             )
985         );
986         assert_eq!(
987             0.2558139534883721,
988             sorensen_dice(
989                 "Olive-green table for sale, in extremely good condition.",
990                 "For sale: green Subaru Impreza, 210,000 miles"
991             )
992         );
993         assert_eq!(
994             0.1411764705882353,
995             sorensen_dice(
996                 "Olive-green table for sale, in extremely good condition.",
997                 "Wanted: mountain bike with at least 21 gears."
998             )
999         );
1000         assert_eq!(
1001             0.7741935483870968,
1002             sorensen_dice("this has one extra word", "this has one word")
1003         );
1004     }
1005 }
1006