1 //! Kernel density estimation 2 3 pub mod kernel; 4 5 use self::kernel::Kernel; 6 use crate::stats::float::Float; 7 use crate::stats::univariate::Sample; 8 use rayon::prelude::*; 9 10 /// Univariate kernel density estimator 11 pub struct Kde<'a, A, K> 12 where 13 A: Float, 14 K: Kernel<A>, 15 { 16 bandwidth: A, 17 kernel: K, 18 sample: &'a Sample<A>, 19 } 20 21 impl<'a, A, K> Kde<'a, A, K> 22 where 23 A: 'a + Float, 24 K: Kernel<A>, 25 { 26 /// Creates a new kernel density estimator from the `sample`, using a kernel and estimating 27 /// the bandwidth using the method `bw` new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K>28 pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> { 29 Kde { 30 bandwidth: bw.estimate(sample), 31 kernel, 32 sample, 33 } 34 } 35 36 /// Returns the bandwidth used by the estimator bandwidth(&self) -> A37 pub fn bandwidth(&self) -> A { 38 self.bandwidth 39 } 40 41 /// Maps the KDE over `xs` 42 /// 43 /// - Multihreaded map(&self, xs: &[A]) -> Box<[A]>44 pub fn map(&self, xs: &[A]) -> Box<[A]> { 45 xs.par_iter() 46 .map(|&x| self.estimate(x)) 47 .collect::<Vec<_>>() 48 .into_boxed_slice() 49 } 50 51 /// Estimates the probability density of `x` estimate(&self, x: A) -> A52 pub fn estimate(&self, x: A) -> A { 53 let _0 = A::cast(0); 54 let slice = self.sample; 55 let h = self.bandwidth; 56 let n = A::cast(slice.len()); 57 58 let sum = slice 59 .iter() 60 .fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h)); 61 62 sum / (h * n) 63 } 64 } 65 66 /// Method to estimate the bandwidth 67 pub enum Bandwidth { 68 /// Use Silverman's rule of thumb to estimate the bandwidth from the sample 69 Silverman, 70 } 71 72 impl Bandwidth { estimate<A: Float>(self, sample: &Sample<A>) -> A73 fn estimate<A: Float>(self, sample: &Sample<A>) -> A { 74 match self { 75 Bandwidth::Silverman => { 76 let factor = A::cast(4. / 3.); 77 let exponent = A::cast(1. / 5.); 78 let n = A::cast(sample.len()); 79 let sigma = sample.std_dev(None); 80 81 sigma * (factor / n).powf(exponent) 82 } 83 } 84 } 85 } 86 87 #[cfg(test)] 88 macro_rules! test { 89 ($ty:ident) => { 90 mod $ty { 91 use approx::relative_eq; 92 use quickcheck::quickcheck; 93 use quickcheck::TestResult; 94 95 use crate::stats::univariate::kde::kernel::Gaussian; 96 use crate::stats::univariate::kde::{Bandwidth, Kde}; 97 use crate::stats::univariate::Sample; 98 99 // The [-inf inf] integral of the estimated PDF should be one 100 quickcheck! { 101 fn integral(size: usize, start: usize) -> TestResult { 102 const DX: $ty = 1e-3; 103 104 if let Some(v) = crate::stats::test::vec::<$ty>(size, start) { 105 let slice = &v[start..]; 106 let data = Sample::new(slice); 107 let kde = Kde::new(data, Gaussian, Bandwidth::Silverman); 108 let h = kde.bandwidth(); 109 // NB Obviously a [-inf inf] integral is not feasible, but this range works 110 // quite well 111 let (a, b) = (data.min() - 5. * h, data.max() + 5. * h); 112 113 let mut acc = 0.; 114 let mut x = a; 115 let mut y = kde.estimate(a); 116 117 while x < b { 118 acc += DX * y / 2.; 119 120 x += DX; 121 y = kde.estimate(x); 122 123 acc += DX * y / 2.; 124 } 125 126 TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5)) 127 } else { 128 TestResult::discard() 129 } 130 } 131 } 132 } 133 }; 134 } 135 136 #[cfg(test)] 137 mod test { 138 test!(f32); 139 test!(f64); 140 } 141