1 //! Sørensen-Dice coefficient 2 #![cfg(feature = "std")] 3 use crate::counter::Counter; 4 use crate::{Algorithm, Result}; 5 6 /// [Sørensen–Dice similarity] is a ratio of common chars to total chars in the given strings. 7 /// 8 /// [Sørensen–Dice similarity]: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 9 #[derive(Default)] 10 pub struct SorensenDice {} 11 12 impl Algorithm<f64> for SorensenDice { for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64> where C: Iterator<Item = E>, E: Eq + core::hash::Hash,13 fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64> 14 where 15 C: Iterator<Item = E>, 16 E: Eq + core::hash::Hash, 17 { 18 let c1 = Counter::from_iter(s1); 19 let c2 = Counter::from_iter(s2); 20 let cn = c1.count() + c2.count(); 21 let res = if cn == 0 { 22 1. 23 } else { 24 let ic = c1.intersect_count(&c2); 25 (2 * ic) as f64 / cn as f64 26 }; 27 Result { 28 abs: res, 29 is_distance: false, 30 max: 1., 31 len1: c1.count(), 32 len2: c2.count(), 33 } 34 } 35 } 36 37 #[cfg(test)] 38 mod tests { 39 use crate::str::sorensen_dice; 40 use crate::{Algorithm, SorensenDice}; 41 use assert2::assert; 42 use rstest::rstest; 43 is_close(a: f64, b: f64) -> bool44 fn is_close(a: f64, b: f64) -> bool { 45 (a - b).abs() < 1E-5 46 } 47 48 #[rstest] 49 #[case("", "", 1.)] 50 #[case("nelson", "", 0.)] 51 #[case("", "neilsen", 0.)] 52 // parity with textdistance 53 #[case("test", "text", 2.0 * 3. / 8.)] function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64)54 fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) { 55 let act = sorensen_dice(s1, s2); 56 let ok = is_close(act, exp); 57 assert!(ok, "sorensen_dice({}, {}) is {}, not {}", s1, s2, act, exp); 58 } 59 60 #[rstest] 61 // parity with strsim 62 #[case("a", "a", 1.0)] 63 #[case("", "", 1.0)] 64 #[case("apple event", "apple event", 1.0)] 65 #[case("iphone", "iphone x", 0.9090909090909091)] 66 #[case("french", "quebec", 0.0)] 67 #[case("france", "france", 1.0)] 68 #[case("fRaNce", "france", 0.2)] 69 #[case("healed", "sealed", 0.8)] 70 #[case("web applications", "applications of the web", 0.7878787878)] 71 #[case("this has one extra word", "this has one word", 0.7741935483870968)] 72 #[case( 73 "this will have a typo somewhere", 74 "this will huve a typo somewhere", 75 0.92 76 )] 77 #[case( 78 "Olive-green table for sale, in extremely good condition.", 79 "For sale: table in very good condition, olive green in colour.", 80 0.6060606060606061 81 )] 82 #[case( 83 "Olive-green table for sale, in extremely good condition.", 84 "For sale: green Subaru Impreza, 210,000 miles", 85 0.2558139534883721 86 )] 87 #[case( 88 "Olive-green table for sale, in extremely good condition.", 89 "Wanted: mountain bike with at least 21 gears.", 90 0.1411764705882353 91 )] for_bigrams(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64)92 fn for_bigrams(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) { 93 let s1 = &s1.replace(' ', ""); 94 let s2 = &s2.replace(' ', ""); 95 let act = SorensenDice::default().for_bigrams(s1, s2).nval(); 96 let ok = is_close(act, exp); 97 assert!(ok, "sorensen_dice({}, {}) is {}, not {}", s1, s2, act, exp); 98 } 99 } 100