• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //! Sørensen-Dice coefficient
2 #![cfg(feature = "std")]
3 use crate::counter::Counter;
4 use crate::{Algorithm, Result};
5 
6 /// [Sørensen–Dice similarity] is a ratio of common chars to total chars in the given strings.
7 ///
8 /// [Sørensen–Dice similarity]: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
9 #[derive(Default)]
10 pub struct SorensenDice {}
11 
12 impl Algorithm<f64> for SorensenDice {
for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64> where C: Iterator<Item = E>, E: Eq + core::hash::Hash,13     fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
14     where
15         C: Iterator<Item = E>,
16         E: Eq + core::hash::Hash,
17     {
18         let c1 = Counter::from_iter(s1);
19         let c2 = Counter::from_iter(s2);
20         let cn = c1.count() + c2.count();
21         let res = if cn == 0 {
22             1.
23         } else {
24             let ic = c1.intersect_count(&c2);
25             (2 * ic) as f64 / cn as f64
26         };
27         Result {
28             abs: res,
29             is_distance: false,
30             max: 1.,
31             len1: c1.count(),
32             len2: c2.count(),
33         }
34     }
35 }
36 
37 #[cfg(test)]
38 mod tests {
39     use crate::str::sorensen_dice;
40     use crate::{Algorithm, SorensenDice};
41     use assert2::assert;
42     use rstest::rstest;
43 
is_close(a: f64, b: f64) -> bool44     fn is_close(a: f64, b: f64) -> bool {
45         (a - b).abs() < 1E-5
46     }
47 
48     #[rstest]
49     #[case("", "", 1.)]
50     #[case("nelson", "", 0.)]
51     #[case("", "neilsen", 0.)]
52     // parity with textdistance
53     #[case("test", "text", 2.0 * 3. / 8.)]
function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64)54     fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
55         let act = sorensen_dice(s1, s2);
56         let ok = is_close(act, exp);
57         assert!(ok, "sorensen_dice({}, {}) is {}, not {}", s1, s2, act, exp);
58     }
59 
60     #[rstest]
61     // parity with strsim
62     #[case("a", "a", 1.0)]
63     #[case("", "", 1.0)]
64     #[case("apple event", "apple event", 1.0)]
65     #[case("iphone", "iphone x", 0.9090909090909091)]
66     #[case("french", "quebec", 0.0)]
67     #[case("france", "france", 1.0)]
68     #[case("fRaNce", "france", 0.2)]
69     #[case("healed", "sealed", 0.8)]
70     #[case("web applications", "applications of the web", 0.7878787878)]
71     #[case("this has one extra word", "this has one word", 0.7741935483870968)]
72     #[case(
73         "this will have a typo somewhere",
74         "this will huve a typo somewhere",
75         0.92
76     )]
77     #[case(
78         "Olive-green table for sale, in extremely good condition.",
79         "For sale: table in very good  condition, olive green in colour.",
80         0.6060606060606061
81     )]
82     #[case(
83         "Olive-green table for sale, in extremely good condition.",
84         "For sale: green Subaru Impreza, 210,000 miles",
85         0.2558139534883721
86     )]
87     #[case(
88         "Olive-green table for sale, in extremely good condition.",
89         "Wanted: mountain bike with at least 21 gears.",
90         0.1411764705882353
91     )]
for_bigrams(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64)92     fn for_bigrams(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
93         let s1 = &s1.replace(' ', "");
94         let s2 = &s2.replace(' ', "");
95         let act = SorensenDice::default().for_bigrams(s1, s2).nval();
96         let ok = is_close(act, exp);
97         assert!(ok, "sorensen_dice({}, {}) is {}, not {}", s1, s2, act, exp);
98     }
99 }
100