• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::{mem, ops};
2 
3 use crate::stats::float::Float;
4 use crate::stats::tuple::{Tuple, TupledDistributionsBuilder};
5 use crate::stats::univariate::Percentiles;
6 use crate::stats::univariate::Resamples;
7 use rayon::prelude::*;
8 
9 /// A collection of data points drawn from a population
10 ///
11 /// Invariants:
12 ///
13 /// - The sample contains at least 2 data points
14 /// - The sample contains no `NaN`s
15 pub struct Sample<A>([A]);
16 
17 // TODO(rust-lang/rfcs#735) move this `impl` into a private percentiles module
18 impl<A> Sample<A>
19 where
20     A: Float,
21 {
22     /// Creates a new sample from an existing slice
23     ///
24     /// # Panics
25     ///
26     /// Panics if `slice` contains any `NaN` or if `slice` has less than two elements
27     #[cfg_attr(feature = "cargo-clippy", allow(clippy::new_ret_no_self))]
new(slice: &[A]) -> &Sample<A>28     pub fn new(slice: &[A]) -> &Sample<A> {
29         assert!(slice.len() > 1 && slice.iter().all(|x| !x.is_nan()));
30 
31         unsafe { mem::transmute(slice) }
32     }
33 
34     /// Returns the biggest element in the sample
35     ///
36     /// - Time: `O(length)`
max(&self) -> A37     pub fn max(&self) -> A {
38         let mut elems = self.iter();
39 
40         match elems.next() {
41             Some(&head) => elems.fold(head, |a, &b| a.max(b)),
42             // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
43             None => unreachable!(),
44         }
45     }
46 
47     /// Returns the arithmetic average of the sample
48     ///
49     /// - Time: `O(length)`
mean(&self) -> A50     pub fn mean(&self) -> A {
51         let n = self.len();
52 
53         self.sum() / A::cast(n)
54     }
55 
56     /// Returns the median absolute deviation
57     ///
58     /// The `median` can be optionally passed along to speed up (2X) the computation
59     ///
60     /// - Time: `O(length)`
61     /// - Memory: `O(length)`
median_abs_dev(&self, median: Option<A>) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,62     pub fn median_abs_dev(&self, median: Option<A>) -> A
63     where
64         usize: cast::From<A, Output = Result<usize, cast::Error>>,
65     {
66         let median = median.unwrap_or_else(|| self.percentiles().median());
67 
68         // NB Although this operation can be SIMD accelerated, the gain is negligible because the
69         // bottle neck is the sorting operation which is part of the computation of the median
70         let abs_devs = self.iter().map(|&x| (x - median).abs()).collect::<Vec<_>>();
71 
72         let abs_devs: &Self = Self::new(&abs_devs);
73 
74         abs_devs.percentiles().median() * A::cast(1.4826)
75     }
76 
77     /// Returns the median absolute deviation as a percentage of the median
78     ///
79     /// - Time: `O(length)`
80     /// - Memory: `O(length)`
median_abs_dev_pct(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,81     pub fn median_abs_dev_pct(&self) -> A
82     where
83         usize: cast::From<A, Output = Result<usize, cast::Error>>,
84     {
85         let _100 = A::cast(100);
86         let median = self.percentiles().median();
87         let mad = self.median_abs_dev(Some(median));
88 
89         (mad / median) * _100
90     }
91 
92     /// Returns the smallest element in the sample
93     ///
94     /// - Time: `O(length)`
min(&self) -> A95     pub fn min(&self) -> A {
96         let mut elems = self.iter();
97 
98         match elems.next() {
99             Some(&elem) => elems.fold(elem, |a, &b| a.min(b)),
100             // NB `unreachable!` because `Sample` is guaranteed to have at least one data point
101             None => unreachable!(),
102         }
103     }
104 
105     /// Returns a "view" into the percentiles of the sample
106     ///
107     /// This "view" makes consecutive computations of percentiles much faster (`O(1)`)
108     ///
109     /// - Time: `O(N log N) where N = length`
110     /// - Memory: `O(length)`
percentiles(&self) -> Percentiles<A> where usize: cast::From<A, Output = Result<usize, cast::Error>>,111     pub fn percentiles(&self) -> Percentiles<A>
112     where
113         usize: cast::From<A, Output = Result<usize, cast::Error>>,
114     {
115         use std::cmp::Ordering;
116 
117         // NB This function assumes that there are no `NaN`s in the sample
118         fn cmp<T>(a: &T, b: &T) -> Ordering
119         where
120             T: PartialOrd,
121         {
122             match a.partial_cmp(b) {
123                 Some(o) => o,
124                 // Arbitrary way to handle NaNs that should never happen
125                 None => Ordering::Equal,
126             }
127         }
128 
129         let mut v = self.to_vec().into_boxed_slice();
130         v.par_sort_unstable_by(cmp);
131 
132         // NB :-1: to intra-crate privacy rules
133         unsafe { mem::transmute(v) }
134     }
135 
136     /// Returns the standard deviation of the sample
137     ///
138     /// The `mean` can be optionally passed along to speed up (2X) the computation
139     ///
140     /// - Time: `O(length)`
std_dev(&self, mean: Option<A>) -> A141     pub fn std_dev(&self, mean: Option<A>) -> A {
142         self.var(mean).sqrt()
143     }
144 
145     /// Returns the standard deviation as a percentage of the mean
146     ///
147     /// - Time: `O(length)`
std_dev_pct(&self) -> A148     pub fn std_dev_pct(&self) -> A {
149         let _100 = A::cast(100);
150         let mean = self.mean();
151         let std_dev = self.std_dev(Some(mean));
152 
153         (std_dev / mean) * _100
154     }
155 
156     /// Returns the sum of all the elements of the sample
157     ///
158     /// - Time: `O(length)`
sum(&self) -> A159     pub fn sum(&self) -> A {
160         crate::stats::sum(self)
161     }
162 
163     /// Returns the t score between these two samples
164     ///
165     /// - Time: `O(length)`
t(&self, other: &Sample<A>) -> A166     pub fn t(&self, other: &Sample<A>) -> A {
167         let (x_bar, y_bar) = (self.mean(), other.mean());
168         let (s2_x, s2_y) = (self.var(Some(x_bar)), other.var(Some(y_bar)));
169         let n_x = A::cast(self.len());
170         let n_y = A::cast(other.len());
171         let num = x_bar - y_bar;
172         let den = (s2_x / n_x + s2_y / n_y).sqrt();
173 
174         num / den
175     }
176 
177     /// Returns the variance of the sample
178     ///
179     /// The `mean` can be optionally passed along to speed up (2X) the computation
180     ///
181     /// - Time: `O(length)`
var(&self, mean: Option<A>) -> A182     pub fn var(&self, mean: Option<A>) -> A {
183         use std::ops::Add;
184 
185         let mean = mean.unwrap_or_else(|| self.mean());
186         let slice = self;
187 
188         let sum = slice
189             .iter()
190             .map(|&x| (x - mean).powi(2))
191             .fold(A::cast(0), Add::add);
192 
193         sum / A::cast(slice.len() - 1)
194     }
195 
196     // TODO Remove the `T` parameter in favor of `S::Output`
197     /// Returns the bootstrap distributions of the parameters estimated by the 1-sample statistic
198     ///
199     /// - Multi-threaded
200     /// - Time: `O(nresamples)`
201     /// - Memory: `O(nresamples)`
bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions where S: Fn(&Sample<A>) -> T + Sync, T: Tuple + Send, T::Distributions: Send, T::Builder: Send,202     pub fn bootstrap<T, S>(&self, nresamples: usize, statistic: S) -> T::Distributions
203     where
204         S: Fn(&Sample<A>) -> T + Sync,
205         T: Tuple + Send,
206         T::Distributions: Send,
207         T::Builder: Send,
208     {
209         (0..nresamples)
210             .into_par_iter()
211             .map_init(
212                 || Resamples::new(self),
213                 |resamples, _| statistic(resamples.next()),
214             )
215             .fold(
216                 || T::Builder::new(0),
217                 |mut sub_distributions, sample| {
218                     sub_distributions.push(sample);
219                     sub_distributions
220                 },
221             )
222             .reduce(
223                 || T::Builder::new(0),
224                 |mut a, mut b| {
225                     a.extend(&mut b);
226                     a
227                 },
228             )
229             .complete()
230     }
231 
232     #[cfg(test)]
iqr(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,233     pub fn iqr(&self) -> A
234     where
235         usize: cast::From<A, Output = Result<usize, cast::Error>>,
236     {
237         self.percentiles().iqr()
238     }
239 
240     #[cfg(test)]
median(&self) -> A where usize: cast::From<A, Output = Result<usize, cast::Error>>,241     pub fn median(&self) -> A
242     where
243         usize: cast::From<A, Output = Result<usize, cast::Error>>,
244     {
245         self.percentiles().median()
246     }
247 }
248 
249 impl<A> ops::Deref for Sample<A> {
250     type Target = [A];
251 
deref(&self) -> &[A]252     fn deref(&self) -> &[A] {
253         &self.0
254     }
255 }
256