#!/usr/local/bin/perl
#  ********************************************************************
#  * COPYRIGHT:
#  * © 2016 and later: Unicode, Inc. and others.
#  * License & terms of use: http://www.unicode.org/copyright.html#License
#  * Copyright (c) 2006, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************

package Dataset;

use strict;
use warnings;

use Statistics::Descriptive;
use Statistics::Distributions;

# Create a new Dataset with the given data.
#
# Usage:   my $d = Dataset->new(@values);
# Params:  @values - zero or more numeric samples.
# Returns: a blessed Dataset holding
#            _data  - array ref with a private copy of the samples
#            _scale - scaling factor applied by getMean()/getError() (1.0)
#            _mean  - mean of the samples (0.0 when there are none)
#            _error - error half-width (0.0 with fewer than 2 samples)
sub new {
    # Copy the arguments: the elements of @_ alias the caller's variables,
    # so keeping a reference to @_ itself would let later changes made by
    # the caller leak into this Dataset.
    my ($class, @values) = @_;

    my $self = bless {
        _data  => \@values,
        _scale => 1.0,
        _mean  => 0.0,
        _error => 0.0,
    }, $class;

    my $n = scalar @values;
    return $self if $n < 1;

    my $stats = Statistics::Descriptive::Full->new();
    $stats->add_data(@values);
    $self->{_mean} = $stats->mean();

    if ($n >= 2) {
        # Use a t distribution rather than Gaussian because (a) we
        # assume an underlying normal dist, (b) we do not know the
        # standard deviation -- we estimate it from the data, and (c)
        # we MAY have a small sample size (also works for large n).
        # 0.005 is the upper-tail probability, i.e. 99% two-sided.
        my $t = Statistics::Distributions::tdistr($n - 1, 0.005);

        # NOTE(review): this is t * (standard deviation), not
        # t * s / sqrt(n); confirm that is the intended half-width.
        $self->{_error} = $t * $stats->standard_deviation();
    }

    return $self;
}

# Set a scaling factor for all data; 1.0 means no scaling.
# Scale must be > 0 (not enforced here).
# Returns the new scaling factor.
sub setScale {
    my ($self, $scale) = @_;
    return $self->{_scale} = $scale;
}

# Multiply the scaling factor by a value.
# Returns the new scaling factor.
sub scaleBy {
    # Named $factor rather than $a so the sort() global is not shadowed.
    my ($self, $factor) = @_;
    return $self->{_scale} *= $factor;
}

# Return the mean, multiplied by the current scaling factor.
sub getMean {
    my ($self) = @_;
    return $self->{_mean} * $self->{_scale};
}

# Return a 99% error based on the t distribution, multiplied by the
# current scaling factor.  The dataset is described as
# getMean() +/- getError().
sub getError {
    my ($self) = @_;
    return $self->{_error} * $self->{_scale};
}

# Divide two Datasets and return a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
sub divide {
    # Usage:   my $ratio = $lhs->divide($rhs);
    # Params:  $rhs - the Dataset to divide by.
    # Returns: a new Dataset whose mean+/-error spans the two ratios
    #          (lhs low / rhs high) and (lhs high / rhs low), and whose
    #          scale is the quotient of the two scales.
    # Dies:    when the divisor's mean - error or mean + error is 0.
    # NOTE(review): divisor intervals that contain or cross zero are not
    # handled specially; the two ratios above are used as-is.
    my ($self, $rhs) = @_;

    my $denLow  = $rhs->{_mean} - $rhs->{_error};
    my $denHigh = $rhs->{_mean} + $rhs->{_error};
    if ($denLow == 0 || $denHigh == 0) {
        require Carp;
        Carp::croak("Dataset::divide: illegal division by zero "
                  . "(divisor mean +/- error reaches 0)");
    }

    my $minratio = ($self->{_mean} - $self->{_error}) / $denHigh;
    my $maxratio = ($self->{_mean} + $self->{_error}) / $denLow;

    my $result = Dataset->new();
    $result->{_mean}  = ($minratio + $maxratio) / 2;
    $result->{_error} = $result->{_mean} - $minratio;
    $result->{_scale} = $self->{_scale} / $rhs->{_scale};
    return $result;
}

# Private helper: factor that converts $rhs's stored values into $self's
# scale, so that (value * factor) * self-scale == value * rhs-scale.
sub _scaleRatio {
    my ($self, $rhs) = @_;

    # Equal scales need no conversion; returning exactly 1 keeps the
    # arithmetic identical to the unconverted case and avoids a division
    # when both scales are 0.
    return 1 if $rhs->{_scale} == $self->{_scale};
    return $rhs->{_scale} / $self->{_scale};
}

# Subtracts two Datasets and returns a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
# The result keeps the left operand's scale; the right operand is
# converted into that scale first, so operands with different scales
# subtract correctly in terms of getMean()/getError().
sub subtract {
    my ($self, $rhs) = @_;
    my $ratio = _scaleRatio($self, $rhs);

    my $result = Dataset->new();
    $result->{_mean}  = $self->{_mean} - $rhs->{_mean} * $ratio;
    # Errors accumulate: the uncertainty of a difference is the sum.
    $result->{_error} = $self->{_error} + $rhs->{_error} * $ratio;
    $result->{_scale} = $self->{_scale};
    return $result;
}

# Adds two Datasets and returns a new one, maintaining the
# mean+/-error. The new Dataset has no data points.
# The result keeps the left operand's scale; the right operand is
# converted into that scale first, so operands with different scales
# add correctly in terms of getMean()/getError().
sub add {
    my ($self, $rhs) = @_;
    my $ratio = _scaleRatio($self, $rhs);

    my $result = Dataset->new();
    $result->{_mean}  = $self->{_mean} + $rhs->{_mean} * $ratio;
    $result->{_error} = $self->{_error} + $rhs->{_error} * $ratio;
    $result->{_scale} = $self->{_scale};
    return $result;
}

# Divides a dataset by a scalar.
# The new Dataset has no data points.
# The error stays non-negative when the scalar is negative.
# Dies when the scalar is 0.
sub divideByScalar {
    my ($self, $s) = @_;
    if ($s == 0) {
        require Carp;
        Carp::croak("Dataset::divideByScalar: illegal division by zero");
    }

    my $result = Dataset->new();
    $result->{_mean}  = $self->{_mean} / $s;
    $result->{_error} = $self->{_error} / abs($s);
    $result->{_scale} = $self->{_scale};
    return $result;
}

# Multiplies a dataset by a scalar.
# The new Dataset has no data points.
# The error stays non-negative when the scalar is negative.
sub multiplyByScalar {
    my ($self, $s) = @_;

    my $result = Dataset->new();
    $result->{_mean}  = $self->{_mean} * $s;
    $result->{_error} = $self->{_error} * abs($s);
    $result->{_scale} = $self->{_scale};
    return $result;
}

1;