• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #![cfg_attr(not(any(feature = "std", test)), no_std)]
2 #![doc = include_str!("../README.md")]
3 #![deny(missing_docs)]
4 #![deny(clippy::all, clippy::pedantic)]
5 #![allow(
6     clippy::cast_precision_loss,
7     clippy::must_use_candidate,
8     clippy::similar_names,
9     clippy::unreadable_literal,
10     clippy::doc_markdown,
11     clippy::wildcard_imports
12 )]
13 
14 extern crate alloc;
15 
16 mod algorithm;
17 mod counter;
18 mod result;
19 
20 pub mod nstr;
21 pub mod str;
22 
23 mod algorithms {
24     pub mod bag;
25     pub mod cosine;
26     pub mod damerau_levenshtein;
27     pub mod entropy_ncd;
28     pub mod hamming;
29     pub mod jaccard;
30     pub mod jaro;
31     pub mod jaro_winkler;
32     pub mod lcsseq;
33     pub mod lcsstr;
34     pub mod length;
35     pub mod levenshtein;
36     pub mod lig3;
37     pub mod mlipns;
38     pub mod overlap;
39     pub mod prefix;
40     pub mod ratcliff_obershelp;
41     pub mod roberts;
42     pub mod sift4_common;
43     pub mod sift4_simple;
44     pub mod smith_waterman;
45     pub mod sorensen_dice;
46     pub mod suffix;
47     pub mod tversky;
48     pub mod yujian_bo;
49 }
50 
51 pub use self::algorithm::Algorithm;
52 #[cfg(feature = "std")]
53 pub use self::algorithms::bag::Bag;
54 #[cfg(feature = "std")]
55 pub use self::algorithms::cosine::Cosine;
56 #[cfg(feature = "std")]
57 pub use self::algorithms::damerau_levenshtein::DamerauLevenshtein;
58 #[cfg(feature = "std")]
59 pub use self::algorithms::entropy_ncd::EntropyNCD;
60 pub use self::algorithms::hamming::Hamming;
61 #[cfg(feature = "std")]
62 pub use self::algorithms::jaccard::Jaccard;
63 pub use self::algorithms::jaro::Jaro;
64 pub use self::algorithms::jaro_winkler::JaroWinkler;
65 pub use self::algorithms::lcsseq::LCSSeq;
66 pub use self::algorithms::lcsstr::LCSStr;
67 pub use self::algorithms::length::Length;
68 pub use self::algorithms::levenshtein::Levenshtein;
69 pub use self::algorithms::lig3::LIG3;
70 pub use self::algorithms::mlipns::MLIPNS;
71 #[cfg(feature = "std")]
72 pub use self::algorithms::overlap::Overlap;
73 pub use self::algorithms::prefix::Prefix;
74 pub use self::algorithms::ratcliff_obershelp::RatcliffObershelp;
75 #[cfg(feature = "std")]
76 pub use self::algorithms::roberts::Roberts;
77 pub use self::algorithms::sift4_common::Sift4Common;
78 pub use self::algorithms::sift4_simple::Sift4Simple;
79 pub use self::algorithms::smith_waterman::SmithWaterman;
80 #[cfg(feature = "std")]
81 pub use self::algorithms::sorensen_dice::SorensenDice;
82 pub use self::algorithms::suffix::Suffix;
83 #[cfg(feature = "std")]
84 pub use self::algorithms::tversky::Tversky;
85 pub use self::algorithms::yujian_bo::YujianBo;
86 pub use self::result::Result;
87 
88 #[cfg(test)]
89 mod tests {
90     #![allow(clippy::float_cmp)]
91 
92     use super::*;
93     use assert2::assert;
94     use proptest::prelude::*;
95     use rstest::rstest;
96 
97     const ALGS: usize = 8;
98 
get_result(alg: usize, s1: &str, s2: &str) -> Result<usize>99     fn get_result(alg: usize, s1: &str, s2: &str) -> Result<usize> {
100         match alg {
101             1 => Hamming::default().for_str(s1, s2),
102             2 => LCSSeq::default().for_str(s1, s2),
103             3 => LCSStr::default().for_str(s1, s2),
104             4 => RatcliffObershelp::default().for_str(s1, s2),
105             5 => Levenshtein::default().for_str(s1, s2),
106             #[cfg(feature = "std")]
107             6 => DamerauLevenshtein::default().for_str(s1, s2),
108             7 => Sift4Simple::default().for_str(s1, s2),
109             8 => MLIPNS::default().for_str(s1, s2),
110             9 => Prefix::default().for_str(s1, s2),
111             10 => Suffix::default().for_str(s1, s2),
112             11 => Length::default().for_str(s1, s2),
113             12 => Bag::default().for_str(s1, s2),
114             13 => SmithWaterman::default().for_str(s1, s2),
115             14 => Sift4Common::default().for_str(s1, s2),
116             _ => panic!("there are not so many algorithms!"),
117         }
118     }
119 
get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64>120     fn get_result_f64(alg: usize, s1: &str, s2: &str) -> Result<f64> {
121         match alg {
122             1 => Jaro::default().for_str(s1, s2),
123             2 => JaroWinkler::default().for_str(s1, s2),
124             3 => YujianBo::default().for_str(s1, s2),
125             4 => Jaccard::default().for_str(s1, s2),
126             5 => SorensenDice::default().for_str(s1, s2),
127             6 => Tversky::default().for_str(s1, s2),
128             7 => Overlap::default().for_str(s1, s2),
129             8 => Cosine::default().for_str(s1, s2),
130             9 => EntropyNCD::default().for_str(s1, s2),
131             10 => LIG3::default().for_str(s1, s2),
132             11 => Roberts::default().for_str(s1, s2),
133             _ => panic!("there are not so many algorithms!"),
134         }
135     }
136 
137     #[rstest]
138     #[case::hamming(1)]
139     #[case::lcsseq(2)]
140     #[case::lcsstr(3)]
141     #[case::ratcliff_obershelp(4)]
142     #[case::levenshtein(5)]
143     #[case::damerau_levenshtein(6)]
144     #[case::sift4_simple(7)]
145     #[case::mlipns(8)]
146     #[case::prefix(9)]
147     #[case::suffix(10)]
148     #[case::length(11)]
149     #[case::bag(12)]
150     #[case::smith_waterman(13)]
151     #[case::sift4_common(14)]
basic_usize(#[case] alg: usize)152     fn basic_usize(#[case] alg: usize) {
153         let empty_res = get_result(alg, "", "");
154         assert!(empty_res.dist() == 0);
155         if alg != 8 {
156             assert!(get_result(alg, "ab", "cde").dist() > 0);
157             assert!(get_result(alg, "ab", "cde").ndist() > 0.);
158         }
159         if alg != 11 {
160             assert!(get_result(alg, "spam", "qwer").sim() == 0);
161             assert!(get_result(alg, "spam", "qwer").nsim() == 0.);
162         }
163         assert!(empty_res.ndist() == 0.);
164         assert!(empty_res.nsim() == 1.);
165     }
166 
167     #[rstest]
168     #[case::jaro(1)]
169     #[case::jaro_winkler(2)]
170     #[case::yujian_bo(3)]
171     #[case::jaccard(4)]
172     #[case::sorensen_dice(5)]
173     #[case::tversky(6)]
174     #[case::overlap(7)]
175     #[case::cosine(8)]
176     #[case::entropy_ncd(9)]
177     #[case::lig3(10)]
178     #[case::roberts(11)]
basic_f64(#[case] alg: usize)179     fn basic_f64(#[case] alg: usize) {
180         let empty_res = get_result_f64(alg, "", "");
181         assert!(get_result_f64(alg, "ab", "cde").ndist() > 0.);
182         if alg != 3 && alg != 9 {
183             assert!(get_result_f64(alg, "spam", "qwer").nsim() == 0.);
184         }
185         assert!(empty_res.ndist() == 0.);
186         assert!(empty_res.nsim() == 1.);
187         assert!(empty_res.max == 1.);
188     }
189 
is_close(a: f64, b: f64) -> bool190     fn is_close(a: f64, b: f64) -> bool {
191         (a - b).abs() < 1E-9
192     }
193 
194     proptest! {
195         #[test]
196         fn prop(s1 in ".*", s2 in ".*") {
197             for alg in 1..ALGS {
198                 let res = get_result(alg, &s1, &s2);
199                 let d = res.dist();
200                 let s = res.sim();
201 
202                 let nd = res.ndist();
203                 prop_assert!(nd.is_finite());
204                 prop_assert!(nd >= 0.);
205                 prop_assert!(nd <= 1.);
206 
207                 let ns = res.nsim();
208                 prop_assert!(ns.is_finite());
209                 prop_assert!(ns >= 0.);
210                 prop_assert!(ns <= 1.);
211 
212                 prop_assert!(is_close(ns + nd, 1.), "{} + {} == 1", nd, ns);
213 
214                 if d < s {
215                     prop_assert!(nd < ns, "{} < {}", nd, ns);
216                 } else if d > s {
217                     prop_assert!(nd > ns, "{} > {}", nd, ns);
218                 } else if !s1.is_empty() && !s2.is_empty() {
219                     prop_assert!(nd == ns, "{} == {}", nd, ns);
220                 }
221                 prop_assert!(res.val() == d || res.val() == s);
222 
223                 prop_assert_eq!(res.len1, s1.chars().count());
224                 prop_assert_eq!(res.len2, s2.chars().count());
225                 prop_assert!(res.max >= res.len1.min(res.len2));
226             }
227         }
228 
229         #[test]
230         fn prop_same(s in ".*") {
231             for alg in 1..ALGS {
232                 let res = get_result(alg, &s, &s);
233                 let nd = res.ndist();
234                 prop_assert_eq!(nd, 0., "{}: {} == 0.0", alg, nd);
235                 let ns = res.nsim();
236                 prop_assert_eq!(ns, 1., "{}: {} == 1.0", alg, ns);
237             }
238         }
239 
240         // strings should have lower distance if you add the same prefix to them
241         fn prop_prefix(prefix in ".+", s1 in ".+", s2 in ".+") {
242             for alg in 1..ALGS {
243                 let r1 = get_result(alg, &s1, &s2).ndist();
244                 let mut p1 = prefix.clone();
245                 let mut p2 = prefix.clone();
246                 p1.push_str(&s1);
247                 p2.push_str(&s2);
248                 let r2 = get_result(alg, &p1, &p2).ndist();
249                 prop_assert!(r1 > r2, "{}: {} > {}", alg, r1, r2);
250             }
251         }
252     }
253 }
254