1 #![cfg_attr(not(any(feature = "std", test)), no_std)] 2 #![doc = include_str!("../README.md")] 3 #![deny(missing_docs)] 4 #![deny(clippy::all, clippy::pedantic)] 5 #![allow( 6 clippy::cast_precision_loss, 7 clippy::must_use_candidate, 8 clippy::similar_names, 9 clippy::unreadable_literal, 10 clippy::doc_markdown, 11 clippy::wildcard_imports 12 )] 13 14 extern crate alloc; 15 16 mod algorithm; 17 mod counter; 18 mod result; 19 20 pub mod nstr; 21 pub mod str; 22 23 mod algorithms { 24 pub mod bag; 25 pub mod cosine; 26 pub mod damerau_levenshtein; 27 pub mod entropy_ncd; 28 pub mod hamming; 29 pub mod jaccard; 30 pub mod jaro; 31 pub mod jaro_winkler; 32 pub mod lcsseq; 33 pub mod lcsstr; 34 pub mod length; 35 pub mod levenshtein; 36 pub mod lig3; 37 pub mod mlipns; 38 pub mod overlap; 39 pub mod prefix; 40 pub mod ratcliff_obershelp; 41 pub mod roberts; 42 pub mod sift4_common; 43 pub mod sift4_simple; 44 pub mod smith_waterman; 45 pub mod sorensen_dice; 46 pub mod suffix; 47 pub mod tversky; 48 pub mod yujian_bo; 49 } 50 51 pub use self::algorithm::Algorithm; 52 #[cfg(feature = "std")] 53 pub use self::algorithms::bag::Bag; 54 #[cfg(feature = "std")] 55 pub use self::algorithms::cosine::Cosine; 56 #[cfg(feature = "std")] 57 pub use self::algorithms::damerau_levenshtein::DamerauLevenshtein; 58 #[cfg(feature = "std")] 59 pub use self::algorithms::entropy_ncd::EntropyNCD; 60 pub use self::algorithms::hamming::Hamming; 61 #[cfg(feature = "std")] 62 pub use self::algorithms::jaccard::Jaccard; 63 pub use self::algorithms::jaro::Jaro; 64 pub use self::algorithms::jaro_winkler::JaroWinkler; 65 pub use self::algorithms::lcsseq::LCSSeq; 66 pub use self::algorithms::lcsstr::LCSStr; 67 pub use self::algorithms::length::Length; 68 pub use self::algorithms::levenshtein::Levenshtein; 69 pub use self::algorithms::lig3::LIG3; 70 pub use self::algorithms::mlipns::MLIPNS; 71 #[cfg(feature = "std")] 72 pub use self::algorithms::overlap::Overlap; 73 pub use self::algorithms::prefix::Prefix; 74 pub use self::algorithms::ratcliff_obershelp::RatcliffObershelp; 75 #[cfg(feature = "std")] 76 pub use self::algorithms::roberts::Roberts; 77 pub use self::algorithms::sift4_common::Sift4Common; 78 pub use self::algorithms::sift4_simple::Sift4Simple; 79 pub use self::algorithms::smith_waterman::SmithWaterman; 80 #[cfg(feature = "std")] 81 pub use self::algorithms::sorensen_dice::SorensenDice; 82 pub use self::algorithms::suffix::Suffix; 83 #[cfg(feature = "std")] 84 pub use self::algorithms::tversky::Tversky; 85 pub use self::algorithms::yujian_bo::YujianBo; 86 pub use self::result::Result; 87 88 #[cfg(test)] 89 mod tests { 90 #![allow(clippy::float_cmp)] 91 92 use super::*; 93 use assert2::assert; 94 use proptest::prelude::*; 95 use rstest::rstest; 96 97 const ALGS: usize = 8; 98 get_result(alg: usize, s1: &str, s2: &str) -> Result<usize>99 fn get_result(alg: usize, s1: &str, s2: &str) -> Result<usize> { 100 match alg { 101 1 => Hamming::default().for_str(s1, s2), 102 2 => LCSSeq::default().for_str(s1, s2), 103 3 => LCSStr::default().for_str(s1, s2), 104 4 => RatcliffObershelp::default().for_str(s1, s2), 105 5 => Levenshtein::default().for_str(s1, s2), 106 #[cfg(feature = "std")] 107 6 => DamerauLevenshtein::default().for_str(s1, s2), 108 7 => Sift4Simple::default().for_str(s1, s2), 109 8 => MLIPNS::default().for_str(s1, s2), 110 9 => Prefix::default().for_str(s1, s2), 111 10 => Suffix::default().for_str(s1, s2), 112 11 => Length::default().for_str(s1, s2), 113 12 => Bag::default().for_str(s1, s2), 114 13 => SmithWaterman::default().for_str(s1, s2), 115 14 => Sift4Common::default().for_str(s1, s2), 116 _ => panic!("there are not so many algorithms!"), 117 } 118 } 119 get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64>120 fn get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64> { 121 match alg { 122 1 => Jaro::default().for_str(s1, s2), 123 2 => JaroWinkler::default().for_str(s1, s2), 124 3 => YujianBo::default().for_str(s1, s2), 125 4 => Jaccard::default().for_str(s1, s2), 126 5 => SorensenDice::default().for_str(s1, s2), 127 6 => Tversky::default().for_str(s1, s2), 128 7 => Overlap::default().for_str(s1, s2), 129 8 => Cosine::default().for_str(s1, s2), 130 9 => EntropyNCD::default().for_str(s1, s2), 131 10 => LIG3::default().for_str(s1, s2), 132 11 => Roberts::default().for_str(s1, s2), 133 _ => panic!("there are not so many algorithms!"), 134 } 135 } 136 137 #[rstest] 138 #[case::hamming(1)] 139 #[case::lcsseq(2)] 140 #[case::lcsstr(3)] 141 #[case::ratcliff_obershelp(4)] 142 #[case::levenshtein(5)] 143 #[case::damerau_levenshtein(6)] 144 #[case::sift4_simple(7)] 145 #[case::mlipns(8)] 146 #[case::prefix(9)] 147 #[case::suffix(10)] 148 #[case::length(11)] 149 #[case::bag(12)] 150 #[case::smith_waterman(13)] 151 #[case::sift4_common(14)] basic_usize(#[case] alg: usize)152 fn basic_usize(#[case] alg: usize) { 153 let empty_res = get_result(alg, "", ""); 154 assert!(empty_res.dist() == 0); 155 if alg != 8 { 156 assert!(get_result(alg, "ab", "cde").dist() > 0); 157 assert!(get_result(alg, "ab", "cde").ndist() > 0.); 158 } 159 if alg != 11 { 160 assert!(get_result(alg, "spam", "qwer").sim() == 0); 161 assert!(get_result(alg, "spam", "qwer").nsim() == 0.); 162 } 163 assert!(empty_res.ndist() == 0.); 164 assert!(empty_res.nsim() == 1.); 165 } 166 167 #[rstest] 168 #[case::jaro(1)] 169 #[case::jaro_winkler(2)] 170 #[case::yujian_bo(3)] 171 #[case::jaccard(4)] 172 #[case::sorensen_dice(5)] 173 #[case::tversky(6)] 174 #[case::overlap(7)] 175 #[case::cosine(8)] 176 #[case::entropy_ncd(9)] 177 #[case::lig3(10)] 178 #[case::roberts(11)] basic_f64(#[case] alg: usize)179 fn basic_f64(#[case] alg: usize) { 180 let empty_res = get_result_f64(alg, "", ""); 181 assert!(get_result_f64(alg, "ab", "cde").ndist() > 0.); 182 if alg != 3 && alg != 9 { 183 assert!(get_result_f64(alg, "spam", "qwer").nsim() == 0.); 184 } 185 assert!(empty_res.ndist() == 0.); 186 assert!(empty_res.nsim() == 1.); 187 assert!(empty_res.max == 1.); 188 } 189 is_close(a: f64, b: f64) -> bool190 fn is_close(a: f64, b: f64) -> bool { 191 (a - b).abs() < 1E-9 192 } 193 194 proptest! { 195 #[test] 196 fn prop(s1 in ".*", s2 in ".*") { 197 for alg in 1..ALGS { 198 let res = get_result(alg, &s1, &s2); 199 let d = res.dist(); 200 let s = res.sim(); 201 202 let nd = res.ndist(); 203 prop_assert!(nd.is_finite()); 204 prop_assert!(nd >= 0.); 205 prop_assert!(nd <= 1.); 206 207 let ns = res.nsim(); 208 prop_assert!(ns.is_finite()); 209 prop_assert!(ns >= 0.); 210 prop_assert!(ns <= 1.); 211 212 prop_assert!(is_close(ns + nd, 1.), "{} + {} == 1", nd, ns); 213 214 if d < s { 215 prop_assert!(nd < ns, "{} < {}", nd, ns); 216 } else if d > s { 217 prop_assert!(nd > ns, "{} > {}", nd, ns); 218 } else if !s1.is_empty() && !s2.is_empty() { 219 prop_assert!(nd == ns, "{} == {}", nd, ns); 220 } 221 prop_assert!(res.val() == d || res.val() == s); 222 223 prop_assert_eq!(res.len1, s1.chars().count()); 224 prop_assert_eq!(res.len2, s2.chars().count()); 225 prop_assert!(res.max >= res.len1.min(res.len2)); 226 } 227 } 228 229 #[test] 230 fn prop_same(s in ".*") { 231 for alg in 1..ALGS { 232 let res = get_result(alg, &s, &s); 233 let nd = res.ndist(); 234 prop_assert_eq!(nd, 0., "{}: {} == 0.0", alg, nd); 235 let ns = res.nsim(); 236 prop_assert_eq!(ns, 1., "{}: {} == 1.0", alg, ns); 237 } 238 } 239 240 // strings should have lower distance if you add the same prefix to them 241 fn prop_prefix(prefix in ".+", s1 in ".+", s2 in ".+") { 242 for alg in 1..ALGS { 243 let r1 = get_result(alg, &s1, &s2).ndist(); 244 let mut p1 = prefix.clone(); 245 let mut p2 = prefix.clone(); 246 p1.push_str(&s1); 247 p2.push_str(&s2); 248 let r2 = get_result(alg, &p1, &p2).ndist(); 249 prop_assert!(r1 > r2, "{}: {} > {}", alg, r1, r2); 250 } 251 } 252 } 253 } 254