1 //! Smith-Waterman sequence alignment 2 use crate::{Algorithm, Result}; 3 use alloc::vec; 4 use alloc::vec::Vec; 5 6 /// [Smith-Waterman similarity] is edit-based and designed for nucleic acid (and protein) sequences. 7 /// 8 /// [Smith-Waterman similarity]: https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm 9 pub struct SmithWaterman { 10 /// The cost of an alignment gap. Default: 1. 11 pub gap_cost: isize, 12 13 /// The cost of symbols matching. Default: -1. 14 pub match_cost: isize, 15 16 /// The cost of symbols not matching. Default: 0. 17 pub mismatch_cost: isize, 18 } 19 20 impl Default for SmithWaterman { default() -> Self21 fn default() -> Self { 22 Self { 23 gap_cost: 1, 24 match_cost: -1, 25 mismatch_cost: 0, 26 } 27 } 28 } 29 30 impl Algorithm<usize> for SmithWaterman { for_vec<E: Eq>(&self, s1: &[E], s2: &[E]) -> Result<usize>31 fn for_vec<E: Eq>(&self, s1: &[E], s2: &[E]) -> Result<usize> { 32 let l1 = s1.len(); 33 let l2 = s2.len(); 34 let mut dist_mat: Vec<Vec<isize>> = vec![vec![0; l2 + 1]; l1 + 1]; 35 for (i, sc1) in s1.iter().enumerate() { 36 for (j, sc2) in s2.iter().enumerate() { 37 let cost = if sc1 == sc2 { 38 self.match_cost 39 } else { 40 self.mismatch_cost 41 }; 42 let match_ = dist_mat[i][j] - cost; 43 let delete = dist_mat[i][j + 1] - self.gap_cost; 44 let insert = dist_mat[i + 1][j] - self.gap_cost; 45 dist_mat[i + 1][j + 1] = 0.max(match_).max(delete).max(insert); 46 } 47 } 48 let result = dist_mat[l1][l2]; 49 Result { 50 #[allow(clippy::cast_sign_loss)] 51 abs: result as usize, 52 is_distance: false, 53 max: l1.max(l2), 54 len1: l1, 55 len2: l2, 56 } 57 } 58 } 59 60 #[cfg(test)] 61 mod tests { 62 use crate::str::smith_waterman; 63 use assert2::assert; 64 use rstest::rstest; 65 66 #[rstest] 67 // parity with textdistance 68 #[case("hello", "world", 1)] 69 #[case("abcd", "abce", 3)] 70 #[case("AGACTAGTTAC", "CGAGACGT", 3)] 71 #[case("qwe", "rty", 0)] 72 #[case("qwe", "rty", 0)] 73 #[case("check", "shrek", 2)] 74 // parity with abydos 75 #[case("cat", "hat", 2)] 76 #[case("Niall", "Neil", 1)] 77 #[case("aluminum", "Catalan", 0)] 78 #[case("ATCG", "TAGC", 1)] function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: usize)79 fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: usize) { 80 assert!(smith_waterman(s1, s2) == exp); 81 } 82 } 83