1 //! Kernel density estimation 2 3 pub mod kernel; 4 5 use self::kernel::Kernel; 6 use crate::stats::float::Float; 7 use crate::stats::univariate::Sample; 8 #[cfg(feature = "rayon")] 9 use rayon::prelude::*; 10 11 /// Univariate kernel density estimator 12 pub struct Kde<'a, A, K> 13 where 14 A: Float, 15 K: Kernel<A>, 16 { 17 bandwidth: A, 18 kernel: K, 19 sample: &'a Sample<A>, 20 } 21 22 impl<'a, A, K> Kde<'a, A, K> 23 where 24 A: 'a + Float, 25 K: Kernel<A>, 26 { 27 /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating 28 /// the bandwidth using the method `bw` new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K>29 pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> { 30 Kde { 31 bandwidth: bw.estimate(sample), 32 kernel, 33 sample, 34 } 35 } 36 37 /// Returns the bandwidth used by the estimator bandwidth(&self) -> A38 pub fn bandwidth(&self) -> A { 39 self.bandwidth 40 } 41 42 /// Maps the KDE over `xs` 43 /// 44 /// - Multihreaded map(&self, xs: &[A]) -> Box<[A]>45 pub fn map(&self, xs: &[A]) -> Box<[A]> { 46 #[cfg(feature = "rayon")] 47 let iter = xs.par_iter(); 48 49 #[cfg(not(feature = "rayon"))] 50 let iter = xs.iter(); 51 52 iter.map(|&x| self.estimate(x)) 53 .collect::<Vec<_>>() 54 .into_boxed_slice() 55 } 56 57 /// Estimates the probability density of `x` estimate(&self, x: A) -> A58 pub fn estimate(&self, x: A) -> A { 59 let _0 = A::cast(0); 60 let slice = self.sample; 61 let h = self.bandwidth; 62 let n = A::cast(slice.len()); 63 64 let sum = slice 65 .iter() 66 .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h)); 67 68 sum / (h * n) 69 } 70 } 71 72 /// Method to estimate the bandwidth 73 pub enum Bandwidth { 74 /// Use Silverman's rule of thumb to estimate the bandwidth from the sample 75 Silverman, 76 } 77 78 impl Bandwidth { estimate<A: Float>(self, sample: &Sample<A>) -> A79 fn estimate<A: Float>(self, sample: &Sample<A>) -> A { 80 match self { 81 Bandwidth::Silverman => { 82 let factor = A::cast(4. / 3.); 83 let exponent = A::cast(1. / 5.); 84 let n = A::cast(sample.len()); 85 let sigma = sample.std_dev(None); 86 87 sigma * (factor / n).powf(exponent) 88 } 89 } 90 } 91 } 92 93 #[cfg(test)] 94 macro_rules! test { 95 ($ty:ident) => { 96 mod $ty { 97 use approx::relative_eq; 98 use quickcheck::quickcheck; 99 use quickcheck::TestResult; 100 101 use crate::stats::univariate::kde::kernel::Gaussian; 102 use crate::stats::univariate::kde::{Bandwidth, Kde}; 103 use crate::stats::univariate::Sample; 104 105 // The [-inf inf] integral of the estimated PDF should be one 106 quickcheck! { 107 fn integral(size: u8, start: u8) -> TestResult { 108 let size = size as usize; 109 let start = start as usize; 110 const DX: $ty = 1e-3; 111 112 if let Some(v) = crate::stats::test::vec::<$ty>(size, start) { 113 let slice = &v[start..]; 114 let data = Sample::new(slice); 115 let kde = Kde::new(data, Gaussian, Bandwidth::Silverman); 116 let h = kde.bandwidth(); 117 // NB Obviously a [-inf inf] integral is not feasible, but this range works 118 // quite well 119 let (a, b) = (data.min() - 5. * h, data.max() + 5. * h); 120 121 let mut acc = 0.; 122 let mut x = a; 123 let mut y = kde.estimate(a); 124 125 while x < b { 126 acc += DX * y / 2.; 127 128 x += DX; 129 y = kde.estimate(x); 130 131 acc += DX * y / 2.; 132 } 133 134 TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5)) 135 } else { 136 TestResult::discard() 137 } 138 } 139 } 140 } 141 }; 142 } 143 144 #[cfg(test)] 145 mod test { 146 test!(f32); 147 test!(f64); 148 } 149