1 use std::{mem, ops}; 2 3 use crate::stats::float::Float; 4 use crate::stats::tuple::{Tuple, TupledDistributionsBuilder}; 5 use crate::stats::univariate::Percentiles; 6 use crate::stats::univariate::Resamples; 7 use rayon::prelude::*; 8 9 /// A collection of data points drawn from a population 10 /// 11 /// Invariants: 12 /// 13 /// - The sample contains at least 2 data points 14 /// - The sample contains no `NaN`s 15 pub struct Sample<A>([A]); 16 17 // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module 18 impl<A> Sample<A> 19 where 20 A: Float, 21 { 22 /// Creates a new sample from an existing slice 23 /// 24 /// # Panics 25 /// 26 /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements 27 #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))] new(slice: &[A]) -> &Sample<A>28 pub fn new(slice: &[A]) -> &Sample<A> { 29 assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan())); 30 31 unsafe { mem::transmute(slice) } 32 } 33 34 /// Returns the biggest element in the sample 35 /// 36 /// - Time: `O(length)` max(&self) -> A37 pub fn max(&self) -> A { 38 let mut elems = self.iter(); 39 40 match elems.next() { 41 Some(&head) => elems.fold(head, |a, &b| a.max(b)), 42 // NB `unreachable!` because `Sample` is guaranteed to have at least one data point 43 None => unreachable!(), 44 } 45 } 46 47 /// Returns the arithmetic average of the sample 48 /// 49 /// - Time: `O(length)` mean(&self) -> A50 pub fn mean(&self) -> A { 51 let n = self.len(); 52 53 self.sum() / A::cast(n) 54 } 55 56 /// Returns the median absolute deviation 57 /// 58 /// The `median` can be optionally passed along to speed up (2X) the computation 59 /// 60 /// - Time: `O(length)` 61 /// - Memory: `O(length)` median_abs_dev(&self, median: Option<A>) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,62 pub fn median_abs_dev(&self, median: Option<A>) -> A 63 where 64 usize: cast::From<A, Output = Result<usize, cast::Error>>, 65 { 66 let median = median.unwrap_or_else(|| self.percentiles().median()); 67 68 // NB Although this operation can be SIMD accelerated, the gain is negligible because the 69 // bottle neck is the sorting operation which is part of the computation of the median 70 let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>(); 71 72 let abs_devs: &Self = Self::new(&abs_devs); 73 74 abs_devs.percentiles().median() * A::cast(1.4826) 75 } 76 77 /// Returns the median absolute deviation as a percentage of the median 78 /// 79 /// - Time: `O(length)` 80 /// - Memory: `O(length)` median_abs_dev_pct(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,81 pub fn median_abs_dev_pct(&self) -> A 82 where 83 usize: cast::From<A, Output = Result<usize, cast::Error>>, 84 { 85 let _100 = A::cast(100); 86 let median = self.percentiles().median(); 87 let mad = self.median_abs_dev(Some(median)); 88 89 (mad / median) * _100 90 } 91 92 /// Returns the smallest element in the sample 93 /// 94 /// - Time: `O(length)` min(&self) -> A95 pub fn min(&self) -> A { 96 let mut elems = self.iter(); 97 98 match elems.next() { 99 Some(&elem) => elems.fold(elem, |a, &b| a.min(b)), 100 // NB `unreachable!` because `Sample` is guaranteed to have at least one data point 101 None => unreachable!(), 102 } 103 } 104 105 /// Returns a "view" into the percentiles of the sample 106 /// 107 /// This "view" makes consecutive computations of percentiles much faster (`O(1)`) 108 /// 109 /// - Time: `O(N log N) where N = length` 110 /// - Memory: `O(length)` percentiles(&self) -> Percentiles<A> where usize: cast::From<A, Output = Result<usize, cast::Error>>,111 pub fn percentiles(&self) -> Percentiles<A> 112 where 113 usize: cast::From<A, Output = Result<usize, cast::Error>>, 114 { 115 use std::cmp::Ordering; 116 117 // NB This function assumes that there are no `NaN`s in the sample 118 fn cmp<T>(a: &T, b: &T) -> Ordering 119 where 120 T: PartialOrd, 121 { 122 match a.partial_cmp(b) { 123 Some(o) => o, 124 // Arbitrary way to handle NaNs that should never happen 125 None => Ordering::Equal, 126 } 127 } 128 129 let mut v = self.to_vec().into_boxed_slice(); 130 v.par_sort_unstable_by(cmp); 131 132 // NB :-1: to intra-crate privacy rules 133 unsafe { mem::transmute(v) } 134 } 135 136 /// Returns the standard deviation of the sample 137 /// 138 /// The `mean` can be optionally passed along to speed up (2X) the computation 139 /// 140 /// - Time: `O(length)` std_dev(&self, mean: Option<A>) -> A141 pub fn std_dev(&self, mean: Option<A>) -> A { 142 self.var(mean).sqrt() 143 } 144 145 /// Returns the standard deviation as a percentage of the mean 146 /// 147 /// - Time: `O(length)` std_dev_pct(&self) -> A148 pub fn std_dev_pct(&self) -> A { 149 let _100 = A::cast(100); 150 let mean = self.mean(); 151 let std_dev = self.std_dev(Some(mean)); 152 153 (std_dev / mean) * _100 154 } 155 156 /// Returns the sum of all the elements of the sample 157 /// 158 /// - Time: `O(length)` sum(&self) -> A159 pub fn sum(&self) -> A { 160 crate::stats::sum(self) 161 } 162 163 /// Returns the t score between these two samples 164 /// 165 /// - Time: `O(length)` t(&self, other: &Sample<A>) -> A166 pub fn t(&self, other: &Sample<A>) -> A { 167 let (x_bar, y_bar) = (self.mean(), other.mean()); 168 let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar))); 169 let n_x = A::cast(self.len()); 170 let n_y = A::cast(other.len()); 171 let num = x_bar - y_bar; 172 let den = (s2_x / n_x + s2_y / n_y).sqrt(); 173 174 num / den 175 } 176 177 /// Returns the variance of the sample 178 /// 179 /// The `mean` can be optionally passed along to speed up (2X) the computation 180 /// 181 /// - Time: `O(length)` var(&self, mean: Option<A>) -> A182 pub fn var(&self, mean: Option<A>) -> A { 183 use std::ops::Add; 184 185 let mean = mean.unwrap_or_else(|| self.mean()); 186 let slice = self; 187 188 let sum = slice 189 .iter() 190 .map(|&x| (x - mean).powi(2)) 191 .fold(A::cast(0), Add::add); 192 193 sum / A::cast(slice.len() - 1) 194 } 195 196 // TODO Remove the `T` parameter in favor of `S::Output` 197 /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic 198 /// 199 /// - Multi-threaded 200 /// - Time: `O(nresamples)` 201 /// - Memory: `O(nresamples)` bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions where S: Fn(&Sample<A>) -> T + Sync, T: Tuple + Send, T::Distributions: Send, T::Builder: Send,202 pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions 203 where 204 S: Fn(&Sample<A>) -> T + Sync, 205 T: Tuple + Send, 206 T::Distributions: Send, 207 T::Builder: Send, 208 { 209 (0..nresamples) 210 .into_par_iter() 211 .map_init( 212 || Resamples::new(self), 213 |resamples, _| statistic(resamples.next()), 214 ) 215 .fold( 216 || T::Builder::new(0), 217 |mut sub_distributions, sample| { 218 sub_distributions.push(sample); 219 sub_distributions 220 }, 221 ) 222 .reduce( 223 || T::Builder::new(0), 224 |mut a, mut b| { 225 a.extend(&mut b); 226 a 227 }, 228 ) 229 .complete() 230 } 231 232 #[cfg(test)] iqr(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,233 pub fn iqr(&self) -> A 234 where 235 usize: cast::From<A, Output = Result<usize, cast::Error>>, 236 { 237 self.percentiles().iqr() 238 } 239 240 #[cfg(test)] median(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,241 pub fn median(&self) -> A 242 where 243 usize: cast::From<A, Output = Result<usize, cast::Error>>, 244 { 245 self.percentiles().median() 246 } 247 } 248 249 impl<A> ops::Deref for Sample<A> { 250 type Target = [A]; 251 deref(&self) -> &[A]252 fn deref(&self) -> &[A] { 253 &self.0 254 } 255 } 256