• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1<html>
2<head>
3<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
4<title>Hypergeometric Distribution</title>
5<link rel="stylesheet" href="../../../math.css" type="text/css">
6<meta name="generator" content="DocBook XSL Stylesheets V1.79.1">
7<link rel="home" href="../../../index.html" title="Math Toolkit 2.12.0">
8<link rel="up" href="../dists.html" title="Distributions">
9<link rel="prev" href="hyperexponential_dist.html" title="Hyperexponential Distribution">
10<link rel="next" href="inverse_chi_squared_dist.html" title="Inverse Chi Squared Distribution">
11</head>
12<body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
13<table cellpadding="2" width="100%"><tr>
14<td valign="top"><img alt="Boost C++ Libraries" width="277" height="86" src="../../../../../../../boost.png"></td>
15<td align="center"><a href="../../../../../../../index.html">Home</a></td>
16<td align="center"><a href="../../../../../../../libs/libraries.htm">Libraries</a></td>
17<td align="center"><a href="http://www.boost.org/users/people.html">People</a></td>
18<td align="center"><a href="http://www.boost.org/users/faq.html">FAQ</a></td>
19<td align="center"><a href="../../../../../../../more/index.htm">More</a></td>
20</tr></table>
21<hr>
22<div class="spirit-nav">
23<a accesskey="p" href="hyperexponential_dist.html"><img src="../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../index.html"><img src="../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../doc/src/images/next.png" alt="Next"></a>
24</div>
25<div class="section">
26<div class="titlepage"><div><div><h4 class="title">
27<a name="math_toolkit.dist_ref.dists.hypergeometric_dist"></a><a class="link" href="hypergeometric_dist.html" title="Hypergeometric Distribution">Hypergeometric
28        Distribution</a>
29</h4></div></div></div>
30<pre class="programlisting"><span class="preprocessor">#include</span> <span class="special">&lt;</span><span class="identifier">boost</span><span class="special">/</span><span class="identifier">math</span><span class="special">/</span><span class="identifier">distributions</span><span class="special">/</span><span class="identifier">hypergeometric</span><span class="special">.</span><span class="identifier">hpp</span><span class="special">&gt;</span></pre>
31<pre class="programlisting"><span class="keyword">namespace</span> <span class="identifier">boost</span><span class="special">{</span> <span class="keyword">namespace</span> <span class="identifier">math</span><span class="special">{</span>
32
33<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span> <span class="special">=</span> <span class="keyword">double</span><span class="special">,</span>
34          <span class="keyword">class</span> <a class="link" href="../../../policy.html" title="Chapter 21. Policies: Controlling Precision, Error Handling etc">Policy</a>   <span class="special">=</span> <a class="link" href="../../pol_ref/pol_ref_ref.html" title="Policy Class Reference">policies::policy&lt;&gt;</a> <span class="special">&gt;</span>
35<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span><span class="special">;</span>
36
37<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span><span class="special">,</span> <span class="keyword">class</span> <span class="identifier">Policy</span><span class="special">&gt;</span>
38<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span>
39<span class="special">{</span>
40<span class="keyword">public</span><span class="special">:</span>
41   <span class="keyword">typedef</span> <span class="identifier">RealType</span> <span class="identifier">value_type</span><span class="special">;</span>
42   <span class="keyword">typedef</span> <span class="identifier">Policy</span>   <span class="identifier">policy_type</span><span class="special">;</span>
43   <span class="comment">// Construct:</span>
44   <span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
45   <span class="comment">// Accessors:</span>
46   <span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
47   <span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
48   <span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
49<span class="special">};</span>
50
51<span class="keyword">typedef</span> <span class="identifier">hypergeometric_distribution</span><span class="special">&lt;&gt;</span> <span class="identifier">hypergeometric</span><span class="special">;</span>
52
53<span class="special">}}</span> <span class="comment">// namespaces</span>
54</pre>
55<p>
56          The hypergeometric distribution describes the number of "events"
57          <span class="emphasis"><em>k</em></span> from a sample <span class="emphasis"><em>n</em></span> drawn from
58          a total population <span class="emphasis"><em>N</em></span> <span class="emphasis"><em>without replacement</em></span>.
59        </p>
60<p>
61          Imagine we have a sample of <span class="emphasis"><em>N</em></span> objects of which <span class="emphasis"><em>r</em></span>
62          are "defective" and N-r are "not defective" (the terms
63          "success/failure" or "red/blue" are also used). If
64          we sample <span class="emphasis"><em>n</em></span> items <span class="emphasis"><em>without replacement</em></span>
65          then what is the probability that exactly <span class="emphasis"><em>k</em></span> items
66          in the sample are defective? The answer is given by the pdf of the hypergeometric
67          distribution <code class="computeroutput"><span class="identifier">f</span><span class="special">(</span><span class="identifier">k</span><span class="special">;</span> <span class="identifier">r</span><span class="special">,</span> <span class="identifier">n</span><span class="special">,</span>
68          <span class="identifier">N</span><span class="special">)</span></code>,
69          whilst the probability of <span class="emphasis"><em>k</em></span> defectives or fewer is
70          given by F(k; r, n, N), where F(k) is the CDF of the hypergeometric distribution.
71        </p>
72<div class="note"><table border="0" summary="Note">
73<tr>
74<td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="../../../../../../../doc/src/images/note.png"></td>
75<th align="left">Note</th>
76</tr>
77<tr><td align="left" valign="top"><p>
78            Unlike almost all of the other distributions in this library, the hypergeometric
79            distribution is strictly discrete: it can not be extended to real valued
80            arguments of its parameters or random variable.
81          </p></td></tr>
82</table></div>
83<p>
84          The following graph shows how the distribution changes as the proportion
85          of "defective" items changes, while keeping the population and
86          sample sizes constant:
87        </p>
88<div class="blockquote"><blockquote class="blockquote"><p>
89            <span class="inlinemediaobject"><img src="../../../../graphs/hypergeometric_pdf_1.svg" align="middle"></span>
90
91          </p></blockquote></div>
92<p>
93          Note that since the distribution is symmetrical in parameters <span class="emphasis"><em>n</em></span>
94          and <span class="emphasis"><em>r</em></span>, if we change the sample size and keep the population
95          and proportion "defective" the same then we obtain basically
96          the same graphs:
97        </p>
98<div class="blockquote"><blockquote class="blockquote"><p>
99            <span class="inlinemediaobject"><img src="../../../../graphs/hypergeometric_pdf_2.svg" align="middle"></span>
100
101          </p></blockquote></div>
102<h5>
103<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h0"></a>
104          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.member_functions"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.member_functions">Member
105          Functions</a>
106        </h5>
107<pre class="programlisting"><span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
108</pre>
109<p>
110          Constructs a hypergeometric distribution with a population of <span class="emphasis"><em>N</em></span>
111          objects, of which <span class="emphasis"><em>r</em></span> are defective, and from which
112          <span class="emphasis"><em>n</em></span> are sampled.
113        </p>
114<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
115</pre>
116<p>
117          Returns the total number of objects <span class="emphasis"><em>N</em></span>.
118        </p>
119<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
120</pre>
121<p>
122          Returns the number of objects <span class="emphasis"><em>r</em></span> in population <span class="emphasis"><em>N</em></span>
123          which are defective.
124        </p>
125<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
126</pre>
127<p>
128          Returns the number of objects <span class="emphasis"><em>n</em></span> which are sampled
129          from the population <span class="emphasis"><em>N</em></span>.
130        </p>
131<h5>
132<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h1"></a>
133          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.non_member_accessors"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.non_member_accessors">Non-member
134          Accessors</a>
135        </h5>
136<p>
137          All the <a class="link" href="../nmp.html" title="Non-Member Properties">usual non-member accessor
138          functions</a> that are generic to all distributions are supported:
139          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.cdf">Cumulative Distribution Function</a>,
140          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.pdf">Probability Density Function</a>,
141          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.quantile">Quantile</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.hazard">Hazard Function</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.chf">Cumulative Hazard Function</a>,
142          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.mean">mean</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.median">median</a>,
143          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.mode">mode</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.variance">variance</a>,
144          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.sd">standard deviation</a>,
145          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.skewness">skewness</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.kurtosis">kurtosis</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.kurtosis_excess">kurtosis_excess</a>,
146          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.range">range</a> and <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.support">support</a>.
147        </p>
148<p>
149          The domain of the random variable is the unsigned integers in the range
150          [max(0, n + r - N), min(n, r)]. A <a class="link" href="../../error_handling.html#math_toolkit.error_handling.domain_error">domain_error</a>
151          is raised if the random variable is outside this range, or is not an integral
152          value.
153        </p>
154<div class="caution"><table border="0" summary="Caution">
155<tr>
156<td rowspan="2" align="center" valign="top" width="25"><img alt="[Caution]" src="../../../../../../../doc/src/images/caution.png"></td>
157<th align="left">Caution</th>
158</tr>
159<tr><td align="left" valign="top">
160<p>
161            The quantile function will by default return an integer result that has
162            been <span class="emphasis"><em>rounded outwards</em></span>. That is to say lower quantiles
163            (where the probability is less than 0.5) are rounded downward, and upper
164            quantiles (where the probability is greater than 0.5) are rounded upwards.
165            This behaviour ensures that if an X% quantile is requested, then <span class="emphasis"><em>at
166            least</em></span> the requested coverage will be present in the central
167            region, and <span class="emphasis"><em>no more than</em></span> the requested coverage
168            will be present in the tails.
169          </p>
170<p>
171            This behaviour can be changed so that the quantile functions are rounded
172            differently using <a class="link" href="../../pol_overview.html" title="Policy Overview">Policies</a>.
173            It is strongly recommended that you read the tutorial <a class="link" href="../../pol_tutorial/understand_dis_quant.html" title="Understanding Quantiles of Discrete Distributions">Understanding
174            Quantiles of Discrete Distributions</a> before using the quantile
175            function on the Hypergeometric distribution. The <a class="link" href="../../pol_ref/discrete_quant_ref.html" title="Discrete Quantile Policies">reference
176            docs</a> describe how to change the rounding policy for these distributions.
177          </p>
178<p>
179            However, note that the implementation method of the quantile function
180            always returns an integral value, therefore attempting to use a <a class="link" href="../../../policy.html" title="Chapter 21. Policies: Controlling Precision, Error Handling etc">Policy</a> that requires (or produces) a real valued
181            result will result in a compile time error.
182          </p>
183</td></tr>
184</table></div>
185<h5>
186<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h2"></a>
187          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.accuracy"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.accuracy">Accuracy</a>
188        </h5>
189<p>
190          For small N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
191          then table based lookup of the results gives an accuracy to a few epsilon.
192          <code class="computeroutput"><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code> is 170 at double or long double
193          precision.
194        </p>
195<p>
196          For larger N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code> then only basic arithmetic is required
197          for the calculation and the accuracy is typically &lt; 20 epsilon. This
198          takes care of N up to 104729.
199        </p>
200<p>
201          For <code class="computeroutput"><span class="identifier">N</span> <span class="special">&gt;</span>
202          <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code>
203          then accuracy quickly degrades, with 5 or 6 decimal digits being lost for
204          N = 110000.
205        </p>
206<p>
207          In general for very large N, the user should expect to lose log<sub>10</sub>N decimal
208          digits of precision during the calculation, with the results becoming meaningless
209          for N &gt;= 10<sup>15</sup>.
210        </p>
211<h5>
212<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h3"></a>
213          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.testing"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.testing">Testing</a>
214        </h5>
215<p>
216          There are three sets of tests: our implementation is tested against a table
217          of values produced by Mathematica's implementation of this distribution.
218          We also sanity check our implementation against some spot values computed
219          using the online calculator here <a href="http://stattrek.com/Tables/Hypergeometric.aspx" target="_top">http://stattrek.com/Tables/Hypergeometric.aspx</a>.
220          Finally we test accuracy against some high precision test data using this
221          implementation and NTL::RR.
222        </p>
223<h5>
224<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h4"></a>
225          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.implementation"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.implementation">Implementation</a>
226        </h5>
227<p>
228          The PDF can be calculated directly using the formula:
229        </p>
230<div class="blockquote"><blockquote class="blockquote"><p>
231            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric1.svg"></span>
232
233          </p></blockquote></div>
234<p>
235          However, this can only be used directly when the largest of the factorials
236          is guaranteed not to overflow the floating point representation used. This
237          formula is used directly when <code class="computeroutput"><span class="identifier">N</span>
238          <span class="special">&lt;</span> <span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
239          in which case table lookup of the factorials gives a rapid and accurate
240          implementation method.
241        </p>
242<p>
243          For larger <span class="emphasis"><em>N</em></span> the method described in "An Accurate
244          Computation of the Hypergeometric Distribution Function", Trong Wu,
245          ACM Transactions on Mathematical Software, Vol. 19, No. 1, March 1993,
246          Pages 33-43 is used. The method relies on the fact that there is an easy
247          method for factorising a factorial into the product of prime numbers:
248        </p>
249<div class="blockquote"><blockquote class="blockquote"><p>
250            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric2.svg"></span>
251
252          </p></blockquote></div>
253<p>
254          Where p<sub>i</sub> is the i'th prime number, and e<sub>i</sub> is a small positive integer or
255          zero, which can be calculated via:
256        </p>
257<div class="blockquote"><blockquote class="blockquote"><p>
258            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric3.svg"></span>
259
260          </p></blockquote></div>
261<p>
262          Further we can combine the factorials in the expression for the PDF to
263          yield the PDF directly as the product of prime numbers:
264        </p>
265<div class="blockquote"><blockquote class="blockquote"><p>
266            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric4.svg"></span>
267
268          </p></blockquote></div>
269<p>
270          With this time the exponents e<sub>i</sub> being either positive, negative or zero.
271          Indeed such a degree of cancellation occurs in the calculation of the e<sub>i</sub> that
272          many are zero, and typically most have a magnitude or no more than 1 or
273          2.
274        </p>
275<p>
276          Calculation of the product of the primes requires some care to prevent
277          numerical overflow, we use a novel recursive method which splits the calculation
278          into a series of sub-products, with a new sub-product started each time
279          the next multiplication would cause either overflow or underflow. The sub-products
280          are stored in a linked list on the program stack, and combined in an order
281          that will guarantee no overflow or unnecessary-underflow once the last
282          sub-product has been calculated.
283        </p>
284<p>
285          This method can be used as long as N is smaller than the largest prime
286          number we have stored in our table of primes (currently 104729). The method
287          is relatively slow (calculating the exponents requires the most time),
288          but requires only a small number of arithmetic operations to calculate
289          the result (indeed there is no shorter method involving only basic arithmetic
290          once the exponents have been found), the method is therefore much more
291          accurate than the alternatives.
292        </p>
293<p>
294          For much larger N, we can calculate the PDF from the factorials using either
295          lgamma, or by directly combining lanczos approximations to avoid calculating
296          via logarithms. We use the latter method, as it is usually 1 or 2 decimal
297          digits more accurate than computing via logarithms with lgamma. However,
298          in this area where N &gt; 104729, the user should expect to lose around
299          log<sub>10</sub>N decimal digits during the calculation in the worst case.
300        </p>
301<p>
302          The CDF and its complement is calculated by directly summing the PDF's.
303          We start by deciding whether the CDF, or its complement, is likely to be
304          the smaller of the two and then calculate the PDF at <span class="emphasis"><em>k</em></span>
305          (or <span class="emphasis"><em>k+1</em></span> if we're calculating the complement) and calculate
306          successive PDF values via the recurrence relations:
307        </p>
308<div class="blockquote"><blockquote class="blockquote"><p>
309            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric5.svg"></span>
310
311          </p></blockquote></div>
312<p>
313          Until we either reach the end of the distributions domain, or the next
314          PDF value to be summed would be too small to affect the result.
315        </p>
316<p>
317          The quantile is calculated in a similar manner to the CDF: we first guess
318          which end of the distribution we're nearer to, and then sum PDFs starting
319          from the end of the distribution this time, until we have some value <span class="emphasis"><em>k</em></span>
320          that gives the required CDF.
321        </p>
322<p>
323          The median is simply the quantile at 0.5, and the remaining properties
324          are calculated via:
325        </p>
326<div class="blockquote"><blockquote class="blockquote"><p>
327            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric6.svg"></span>
328
329          </p></blockquote></div>
330</div>
331<table xmlns:rev="http://www.cs.rpi.edu/~gregod/boost/tools/doc/revision" width="100%"><tr>
332<td align="left"></td>
333<td align="right"><div class="copyright-footer">Copyright © 2006-2019 Nikhar
334      Agrawal, Anton Bikineev, Paul A. Bristow, Marco Guazzone, Christopher Kormanyos,
335      Hubert Holin, Bruno Lalande, John Maddock, Jeremy Murphy, Matthew Pulver, Johan
336      Råde, Gautam Sewani, Benjamin Sobotta, Nicholas Thompson, Thijs van den Berg,
337      Daryle Walker and Xiaogang Zhang<p>
338        Distributed under the Boost Software License, Version 1.0. (See accompanying
339        file LICENSE_1_0.txt or copy at <a href="http://www.boost.org/LICENSE_1_0.txt" target="_top">http://www.boost.org/LICENSE_1_0.txt</a>)
340      </p>
341</div></td>
342</tr></table>
343<hr>
344<div class="spirit-nav">
345<a accesskey="p" href="hyperexponential_dist.html"><img src="../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../index.html"><img src="../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../doc/src/images/next.png" alt="Next"></a>
346</div>
347</body>
348</html>
349