OpenHarmony-v3.2.3-Release/s

<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Hypergeometric Distribution</title>
<link rel="stylesheet" href="../../../math.css" type="text/css">
<meta name="generator" content="DocBook XSL Stylesheets V1.79.1">
<link rel="home" href="../../../index.html" title="Math Toolkit 2.12.0">
<link rel="up" href="../dists.html" title="Distributions">
<link rel="prev" href="hyperexponential_dist.html" title="Hyperexponential Distribution">
<link rel="next" href="inverse_chi_squared_dist.html" title="Inverse Chi Squared Distribution">
</head>
<body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF">
<table cellpadding="2" width="100%"><tr>
<td valign="top"><img alt="Boost C++ Libraries" width="277" height="86" src="../../../../../../../boost.png"></td>
<td align="center"><a href="../../../../../../../index.html">Home</a></td>
<td align="center"><a href="../../../../../../../libs/libraries.htm">Libraries</a></td>
<td align="center"><a href="http://www.boost.org/users/people.html">People</a></td>
<td align="center"><a href="http://www.boost.org/users/faq.html">FAQ</a></td>
<td align="center"><a href="../../../../../../../more/index.htm">More</a></td>
</tr></table>
<hr>
<div class="spirit-nav">
<a accesskey="p" href="hyperexponential_dist.html"><img src="../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../index.html"><img src="../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../doc/src/images/next.png" alt="Next"></a>
</div>
<div class="section">
<div class="titlepage"><div><div><h4 class="title">
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist"></a><a class="link" href="hypergeometric_dist.html" title="Hypergeometric Distribution">Hypergeometric
        Distribution</a>
</h4></div></div></div>
<pre class="programlisting"><span class="preprocessor">#include</span> <span class="special">&lt;</span><span class="identifier">boost</span><span class="special">/</span><span class="identifier">math</span><span class="special">/</span><span class="identifier">distributions</span><span class="special">/</span><span class="identifier">hypergeometric</span><span class="special">.</span><span class="identifier">hpp</span><span class="special">&gt;</span></pre>
<pre class="programlisting"><span class="keyword">namespace</span> <span class="identifier">boost</span><span class="special">{</span> <span class="keyword">namespace</span> <span class="identifier">math</span><span class="special">{</span>

<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span> <span class="special">=</span> <span class="keyword">double</span><span class="special">,</span>
          <span class="keyword">class</span> <a class="link" href="../../../policy.html" title="Chapter 21. Policies: Controlling Precision, Error Handling etc">Policy</a>   <span class="special">=</span> <a class="link" href="../../pol_ref/pol_ref_ref.html" title="Policy Class Reference">policies::policy&lt;&gt;</a> <span class="special">&gt;</span>
<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span><span class="special">;</span>

<span class="keyword">template</span> <span class="special">&lt;</span><span class="keyword">class</span> <span class="identifier">RealType</span><span class="special">,</span> <span class="keyword">class</span> <span class="identifier">Policy</span><span class="special">&gt;</span>
<span class="keyword">class</span> <span class="identifier">hypergeometric_distribution</span>
<span class="special">{</span>
<span class="keyword">public</span><span class="special">:</span>
   <span class="keyword">typedef</span> <span class="identifier">RealType</span> <span class="identifier">value_type</span><span class="special">;</span>
   <span class="keyword">typedef</span> <span class="identifier">Policy</span>   <span class="identifier">policy_type</span><span class="special">;</span>
   <span class="comment">// Construct:</span>
   <span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
   <span class="comment">// Accessors:</span>
   <span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
   <span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
   <span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
<span class="special">};</span>

<span class="keyword">typedef</span> <span class="identifier">hypergeometric_distribution</span><span class="special">&lt;&gt;</span> <span class="identifier">hypergeometric</span><span class="special">;</span>

<span class="special">}}</span> <span class="comment">// namespaces</span>
</pre>
<p>
          The hypergeometric distribution describes the number of "events"
          <span class="emphasis"><em>k</em></span> from a sample <span class="emphasis"><em>n</em></span> drawn from
          a total population <span class="emphasis"><em>N</em></span> <span class="emphasis"><em>without replacement</em></span>.
        </p>
<p>
          Imagine we have a sample of <span class="emphasis"><em>N</em></span> objects of which <span class="emphasis"><em>r</em></span>
          are "defective" and N-r are "not defective" (the terms
          "success/failure" or "red/blue" are also used). If
          we sample <span class="emphasis"><em>n</em></span> items <span class="emphasis"><em>without replacement</em></span>
          then what is the probability that exactly <span class="emphasis"><em>k</em></span> items
          in the sample are defective? The answer is given by the pdf of the hypergeometric
          distribution <code class="computeroutput"><span class="identifier">f</span><span class="special">(</span><span class="identifier">k</span><span class="special">;</span> <span class="identifier">r</span><span class="special">,</span> <span class="identifier">n</span><span class="special">,</span>
          <span class="identifier">N</span><span class="special">)</span></code>,
          whilst the probability of <span class="emphasis"><em>k</em></span> defectives or fewer is
          given by F(k; r, n, N), where F(k) is the CDF of the hypergeometric distribution.
        </p>
<div class="note"><table border="0" summary="Note">
<tr>
<td rowspan="2" align="center" valign="top" width="25"><img alt="[Note]" src="../../../../../../../doc/src/images/note.png"></td>
<th align="left">Note</th>
</tr>
<tr><td align="left" valign="top"><p>
            Unlike almost all of the other distributions in this library, the hypergeometric
            distribution is strictly discrete: it can not be extended to real valued
            arguments of its parameters or random variable.
          </p></td></tr>
</table></div>
<p>
          The following graph shows how the distribution changes as the proportion
          of "defective" items changes, while keeping the population and
          sample sizes constant:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../graphs/hypergeometric_pdf_1.svg" align="middle"></span>

          </p></blockquote></div>
<p>
          Note that since the distribution is symmetrical in parameters <span class="emphasis"><em>n</em></span>
          and <span class="emphasis"><em>r</em></span>, if we change the sample size and keep the population
          and proportion "defective" the same then we obtain basically
          the same graphs:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../graphs/hypergeometric_pdf_2.svg" align="middle"></span>

          </p></blockquote></div>
<h5>
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h0"></a>
          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.member_functions"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.member_functions">Member
          Functions</a>
        </h5>
<pre class="programlisting"><span class="identifier">hypergeometric_distribution</span><span class="special">(</span><span class="keyword">unsigned</span> <span class="identifier">r</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">n</span><span class="special">,</span> <span class="keyword">unsigned</span> <span class="identifier">N</span><span class="special">);</span>
</pre>
<p>
          Constructs a hypergeometric distribution with a population of <span class="emphasis"><em>N</em></span>
          objects, of which <span class="emphasis"><em>r</em></span> are defective, and from which
          <span class="emphasis"><em>n</em></span> are sampled.
        </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">total</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
          Returns the total number of objects <span class="emphasis"><em>N</em></span>.
        </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">defective</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
          Returns the number of objects <span class="emphasis"><em>r</em></span> in population <span class="emphasis"><em>N</em></span>
          which are defective.
        </p>
<pre class="programlisting"><span class="keyword">unsigned</span> <span class="identifier">sample_count</span><span class="special">()</span><span class="keyword">const</span><span class="special">;</span>
</pre>
<p>
          Returns the number of objects <span class="emphasis"><em>n</em></span> which are sampled
          from the population <span class="emphasis"><em>N</em></span>.
        </p>
<h5>
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h1"></a>
          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.non_member_accessors"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.non_member_accessors">Non-member
          Accessors</a>
        </h5>
<p>
          All the <a class="link" href="../nmp.html" title="Non-Member Properties">usual non-member accessor
          functions</a> that are generic to all distributions are supported:
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.cdf">Cumulative Distribution Function</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.pdf">Probability Density Function</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.quantile">Quantile</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.hazard">Hazard Function</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.chf">Cumulative Hazard Function</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.mean">mean</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.median">median</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.mode">mode</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.variance">variance</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.sd">standard deviation</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.skewness">skewness</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.kurtosis">kurtosis</a>, <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.kurtosis_excess">kurtosis_excess</a>,
          <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.range">range</a> and <a class="link" href="../nmp.html#math_toolkit.dist_ref.nmp.support">support</a>.
        </p>
<p>
          The domain of the random variable is the unsigned integers in the range
          [max(0, n + r - N), min(n, r)]. A <a class="link" href="../../error_handling.html#math_toolkit.error_handling.domain_error">domain_error</a>
          is raised if the random variable is outside this range, or is not an integral
          value.
        </p>
<div class="caution"><table border="0" summary="Caution">
<tr>
<td rowspan="2" align="center" valign="top" width="25"><img alt="[Caution]" src="../../../../../../../doc/src/images/caution.png"></td>
<th align="left">Caution</th>
</tr>
<tr><td align="left" valign="top">
<p>
            The quantile function will by default return an integer result that has
            been <span class="emphasis"><em>rounded outwards</em></span>. That is to say lower quantiles
            (where the probability is less than 0.5) are rounded downward, and upper
            quantiles (where the probability is greater than 0.5) are rounded upwards.
            This behaviour ensures that if an X% quantile is requested, then <span class="emphasis"><em>at
            least</em></span> the requested coverage will be present in the central
            region, and <span class="emphasis"><em>no more than</em></span> the requested coverage
            will be present in the tails.
          </p>
<p>
            This behaviour can be changed so that the quantile functions are rounded
            differently using <a class="link" href="../../pol_overview.html" title="Policy Overview">Policies</a>.
            It is strongly recommended that you read the tutorial <a class="link" href="../../pol_tutorial/understand_dis_quant.html" title="Understanding Quantiles of Discrete Distributions">Understanding
            Quantiles of Discrete Distributions</a> before using the quantile
            function on the Hypergeometric distribution. The <a class="link" href="../../pol_ref/discrete_quant_ref.html" title="Discrete Quantile Policies">reference
            docs</a> describe how to change the rounding policy for these distributions.
          </p>
<p>
            However, note that the implementation method of the quantile function
            always returns an integral value, therefore attempting to use a <a class="link" href="../../../policy.html" title="Chapter 21. Policies: Controlling Precision, Error Handling etc">Policy</a> that requires (or produces) a real valued
            result will result in a compile time error.
          </p>
</td></tr>
</table></div>
<h5>
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h2"></a>
          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.accuracy"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.accuracy">Accuracy</a>
        </h5>
<p>
          For small N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
          then table based lookup of the results gives an accuracy to a few epsilon.
          <code class="computeroutput"><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code> is 170 at double or long double
          precision.
        </p>
<p>
          For larger N such that <code class="computeroutput"><span class="identifier">N</span> <span class="special">&lt;</span> <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code> then only basic arithmetic is required
          for the calculation and the accuracy is typically &lt; 20 epsilon. This
          takes care of N up to 104729.
        </p>
<p>
          For <code class="computeroutput"><span class="identifier">N</span> <span class="special">&gt;</span>
          <span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">prime</span><span class="special">(</span><span class="identifier">boost</span><span class="special">::</span><span class="identifier">math</span><span class="special">::</span><span class="identifier">max_prime</span><span class="special">)</span></code>
          then accuracy quickly degrades, with 5 or 6 decimal digits being lost for
          N = 110000.
        </p>
<p>
          In general for very large N, the user should expect to lose log<sub>10</sub>N decimal
          digits of precision during the calculation, with the results becoming meaningless
          for N &gt;= 10<sup>15</sup>.
        </p>
<h5>
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h3"></a>
          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.testing"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.testing">Testing</a>
        </h5>
<p>
          There are three sets of tests: our implementation is tested against a table
          of values produced by Mathematica's implementation of this distribution.
          We also sanity check our implementation against some spot values computed
          using the online calculator here <a href="http://stattrek.com/Tables/Hypergeometric.aspx" target="_top">http://stattrek.com/Tables/Hypergeometric.aspx</a>.
          Finally we test accuracy against some high precision test data using this
          implementation and NTL::RR.
        </p>
<h5>
<a name="math_toolkit.dist_ref.dists.hypergeometric_dist.h4"></a>
          <span class="phrase"><a name="math_toolkit.dist_ref.dists.hypergeometric_dist.implementation"></a></span><a class="link" href="hypergeometric_dist.html#math_toolkit.dist_ref.dists.hypergeometric_dist.implementation">Implementation</a>
        </h5>
<p>
          The PDF can be calculated directly using the formula:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric1.svg"></span>

          </p></blockquote></div>
<p>
          However, this can only be used directly when the largest of the factorials
          is guaranteed not to overflow the floating point representation used. This
          formula is used directly when <code class="computeroutput"><span class="identifier">N</span>
          <span class="special">&lt;</span> <span class="identifier">max_factorial</span><span class="special">&lt;</span><span class="identifier">RealType</span><span class="special">&gt;::</span><span class="identifier">value</span></code>
          in which case table lookup of the factorials gives a rapid and accurate
          implementation method.
        </p>
<p>
          For larger <span class="emphasis"><em>N</em></span> the method described in "An Accurate
          Computation of the Hypergeometric Distribution Function", Trong Wu,
          ACM Transactions on Mathematical Software, Vol. 19, No. 1, March 1993,
          Pages 33-43 is used. The method relies on the fact that there is an easy
          method for factorising a factorial into the product of prime numbers:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric2.svg"></span>

          </p></blockquote></div>
<p>
          Where p<sub>i</sub> is the i'th prime number, and e<sub>i</sub> is a small positive integer or
          zero, which can be calculated via:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric3.svg"></span>

          </p></blockquote></div>
<p>
          Further we can combine the factorials in the expression for the PDF to
          yield the PDF directly as the product of prime numbers:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric4.svg"></span>

          </p></blockquote></div>
<p>
          With this time the exponents e<sub>i</sub> being either positive, negative or zero.
          Indeed such a degree of cancellation occurs in the calculation of the e<sub>i</sub> that
          many are zero, and typically most have a magnitude or no more than 1 or
          2.
        </p>
<p>
          Calculation of the product of the primes requires some care to prevent
          numerical overflow, we use a novel recursive method which splits the calculation
          into a series of sub-products, with a new sub-product started each time
          the next multiplication would cause either overflow or underflow. The sub-products
          are stored in a linked list on the program stack, and combined in an order
          that will guarantee no overflow or unnecessary-underflow once the last
          sub-product has been calculated.
        </p>
<p>
          This method can be used as long as N is smaller than the largest prime
          number we have stored in our table of primes (currently 104729). The method
          is relatively slow (calculating the exponents requires the most time),
          but requires only a small number of arithmetic operations to calculate
          the result (indeed there is no shorter method involving only basic arithmetic
          once the exponents have been found), the method is therefore much more
          accurate than the alternatives.
        </p>
<p>
          For much larger N, we can calculate the PDF from the factorials using either
          lgamma, or by directly combining lanczos approximations to avoid calculating
          via logarithms. We use the latter method, as it is usually 1 or 2 decimal
          digits more accurate than computing via logarithms with lgamma. However,
          in this area where N &gt; 104729, the user should expect to lose around
          log<sub>10</sub>N decimal digits during the calculation in the worst case.
        </p>
<p>
          The CDF and its complement is calculated by directly summing the PDF's.
          We start by deciding whether the CDF, or its complement, is likely to be
          the smaller of the two and then calculate the PDF at <span class="emphasis"><em>k</em></span>
          (or <span class="emphasis"><em>k+1</em></span> if we're calculating the complement) and calculate
          successive PDF values via the recurrence relations:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric5.svg"></span>

          </p></blockquote></div>
<p>
          Until we either reach the end of the distributions domain, or the next
          PDF value to be summed would be too small to affect the result.
        </p>
<p>
          The quantile is calculated in a similar manner to the CDF: we first guess
          which end of the distribution we're nearer to, and then sum PDFs starting
          from the end of the distribution this time, until we have some value <span class="emphasis"><em>k</em></span>
          that gives the required CDF.
        </p>
<p>
          The median is simply the quantile at 0.5, and the remaining properties
          are calculated via:
        </p>
<div class="blockquote"><blockquote class="blockquote"><p>
            <span class="inlinemediaobject"><img src="../../../../equations/hypergeometric6.svg"></span>

          </p></blockquote></div>
</div>
<table xmlns:rev="http://www.cs.rpi.edu/~gregod/boost/tools/doc/revision" width="100%"><tr>
<td align="left"></td>
<td align="right"><div class="copyright-footer">Copyright © 2006-2019 Nikhar
      Agrawal, Anton Bikineev, Paul A. Bristow, Marco Guazzone, Christopher Kormanyos,
      Hubert Holin, Bruno Lalande, John Maddock, Jeremy Murphy, Matthew Pulver, Johan
      Råde, Gautam Sewani, Benjamin Sobotta, Nicholas Thompson, Thijs van den Berg,
      Daryle Walker and Xiaogang Zhang<p>
        Distributed under the Boost Software License, Version 1.0. (See accompanying
        file LICENSE_1_0.txt or copy at <a href="http://www.boost.org/LICENSE_1_0.txt" target="_top">http://www.boost.org/LICENSE_1_0.txt</a>)
      </p>
</div></td>
</tr></table>
<hr>
<div class="spirit-nav">
<a accesskey="p" href="hyperexponential_dist.html"><img src="../../../../../../../doc/src/images/prev.png" alt="Prev"></a><a accesskey="u" href="../dists.html"><img src="../../../../../../../doc/src/images/up.png" alt="Up"></a><a accesskey="h" href="../../../index.html"><img src="../../../../../../../doc/src/images/home.png" alt="Home"></a><a accesskey="n" href="inverse_chi_squared_dist.html"><img src="../../../../../../../doc/src/images/next.png" alt="Next"></a>
</div>
</body>
</html>