From eb6c73cb7fc9d7b807dd1005746e26c38239ff3e Mon Sep 17 00:00:00 2001 From: Leonid Rozenberg Date: Tue, 9 Feb 2016 11:18:03 -0500 Subject: [PATCH] Propogate a probability type. --- src/lib/rgr/eval_multivariate.ml | 3 ++- src/lib/rgr/univariate.ml | 3 ++- src/lib/stats/distributions.ml | 36 ++++++++++++++++--------------- src/lib/stats/distributions.mli | 25 +++++++++------------ src/lib/stats/hypothesis_test.ml | 16 ++++++++------ src/lib/stats/hypothesis_test.mli | 15 +++++++------ src/lib/unc/functions.ml | 7 ------ src/lib/unc/functions.mli | 6 ------ src/lib/util/oml_probability.ml | 14 ++++++++++-- src/lib/util/oml_probability.mli | 6 ++++++ 10 files changed, 68 insertions(+), 63 deletions(-) diff --git a/src/lib/rgr/eval_multivariate.ml b/src/lib/rgr/eval_multivariate.ml index e2e8462..b5d44cc 100644 --- a/src/lib/rgr/eval_multivariate.ml +++ b/src/lib/rgr/eval_multivariate.ml @@ -23,6 +23,7 @@ open Util open SolvedLPViaSvd open Lacaml.D +module Pr = Probability module P = Printf module D = Statistics.Distributions module Ht = Statistics.Hypothesis_test @@ -101,7 +102,7 @@ let confidence_interval, prediction_interval = in let p' = if plus_one then 1. +. p else p in let sc = sqrt (glm.inferred_var *. p') in - let t = D.student_quantile ~degrees_of_freedom (alpha /. 2.0) in + let t = D.student_quantile ~degrees_of_freedom (Pr.restrict (alpha /. 2.0)) in let d = sc *. (abs_float t) in let y = eval glm x in (y -. d), (y +. d) diff --git a/src/lib/rgr/univariate.ml b/src/lib/rgr/univariate.ml index e7d6765..4ab8213 100644 --- a/src/lib/rgr/univariate.ml +++ b/src/lib/rgr/univariate.ml @@ -16,6 +16,7 @@ *) open Util +module Pr = Probability module P = Printf module D = Statistics.Distributions module Ht = Statistics.Hypothesis_test @@ -114,7 +115,7 @@ let confidence_interval, prediction_interval = let interval a lrm ~alpha x = let dgf = lrm.size -. 2.0 in let degrees_of_freedom = truncate dgf in - let t = D.student_quantile ~degrees_of_freedom (alpha /. 2.0) in + let t = D.student_quantile ~degrees_of_freedom (Pr.restrict (alpha /. 2.0)) in let b = (x -. lrm.m_pred) ** 2.0 /. lrm.s_xx in let c = lrm.sum_residuals /. dgf in let se = sqrt ((a +. b) *. c) in diff --git a/src/lib/stats/distributions.ml b/src/lib/stats/distributions.ml index b9cf482..7a38eb0 100644 --- a/src/lib/stats/distributions.ml +++ b/src/lib/stats/distributions.ml @@ -18,22 +18,22 @@ open Util module F = Uncategorized.Functions +module P = Probability let invalid_arg ~f fmt = invalid_arg ~m:"Distributions" ~f fmt let normal_cdf ?(mean=0.0) ?(std=1.0) x = let z = ((x -. mean) /. std) in - (1.0 +. F.erf (z /. sqrt 2.0)) /. 2.0 + P.restrict ((1.0 +. F.erf (z /. sqrt 2.0)) /. 2.0) let normal_pdf ?(mean=0.0) ?(std=1.0) x = let z = ((x -. mean) /. std) in (exp ((-1.0 /. 2.0) *. (z ** 2.0))) /. (std *. (sqrt (2.0 *. pi))) -let normal_quantile ?(mean=0.0) ?(std=1.0) p = - if p < 0. || p > 1. then invalid_arg ~f:"normal_quantile" "p %f" p else - mean +. std *. F.normal_cdf_inv p +let normal_quantile ?(mean=0.0) ?(std=1.0) (p : P.t) = + mean +. std *. F.normal_cdf_inv (p :> float) let poisson_cdf ~mean k = - F.regularized_upper_gamma ~a:(floor (k +. 1.0)) mean + P.restrict (F.regularized_upper_gamma ~a:(floor (k +. 1.0)) mean) let ln_beta_pdf ~alpha ~beta = if alpha <= 0.0 then invalid_arg ~f:"ln_beta_pdf" "alpha" else @@ -57,10 +57,10 @@ let beta_cdf ~alpha ~beta = if alpha <= 0.0 then invalid_arg ~f:"beta_cdf" "alpha" else if beta <= 0.0 then invalid_arg ~f:"beta_cdf" "beta" else let reg = F.regularized_beta ~alpha ~beta in - fun x -> if x <= 0.0 then 0.0 else if x >= 1.0 then 1.0 else reg x + fun x -> P.restrict (reg x) let chi_square_cdf ~k x = - F.regularized_lower_gamma ~a:((float k) /. 2.0) (x /. 2.0) + P.restrict (F.regularized_lower_gamma ~a:((float k) /. 2.0) (x /. 2.0)) (* According to Wikipedia, haven't research a more efficient algorithm. Implemented it here for completeness. @@ -73,16 +73,18 @@ let student_pdf ~degrees_of_freedom t = let student_cdf ~degrees_of_freedom t = let v = float degrees_of_freedom in let x = Float.(v / (t * t + v)) in - if t > 0.0 then - Float.(1.0 - 0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x)) - else if t < 0.0 then - Float.(0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x)) - else (* 0.0 *) - 0.5 - -let student_quantile ~degrees_of_freedom p = - if p < 0. || p > 1. then invalid_arg ~f:"student_quantile" "p %f" p else - F.student_cdf_inv ~degrees_of_freedom p + let p = + if t > 0.0 then + Float.(1.0 - 0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x)) + else if t < 0.0 then + Float.(0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x)) + else (* 0.0 *) + 0.5 + in + P.restrict p + +let student_quantile ~degrees_of_freedom (p : P.t) = + F.student_cdf_inv ~degrees_of_freedom (p :> float) let ln_dirichlet_pdf ~alphas = if alphas = [||] || Array.any (fun a -> a <= 0.0) alphas then diff --git a/src/lib/stats/distributions.mli b/src/lib/stats/distributions.mli index 7b06b00..bc5d227 100644 --- a/src/lib/stats/distributions.mli +++ b/src/lib/stats/distributions.mli @@ -23,9 +23,9 @@ open Util (** [normal_cdf mean std x] is the probability that a normal random variable with [mean] and standard deviation [std] takes a value less than - or equal to [x]. [mean] defaults to 0.0 and [std] to 1.0. + or equal to [x]. [mean] defaults to 0.0 and [std] to 1.0. *) -val normal_cdf : ?mean:float -> ?std:float -> float -> float +val normal_cdf : ?mean:float -> ?std:float -> float -> Probability.t (** [normal_pdf mean std x] is the value of the Normal distribution function (with [mean] and standard deviation [std]) at [x]. @@ -34,17 +34,14 @@ val normal_cdf : ?mean:float -> ?std:float -> float -> float val normal_pdf : ?mean:float -> ?std:float -> float -> float (** [normal_quantile ?mean ?std p] computes [x] such that - [normal_cdf mean std x = p]. [mean] defaults to 0.0 and [std] to 1.0. - - @raise Invalid_argument if [p] is outside [0,1]. -*) -val normal_quantile : ?mean:float -> ?std:float -> float -> float + [normal_cdf mean std x = p]. [mean] defaults to 0.0 and [std] to 1.0. *) +val normal_quantile : ?mean:float -> ?std:float -> Probability.t -> float (** {2 Poisson} *) (** [poisson_cdf mean x] is the probability that a Poisson random variable with [mean] will take a value less than or equal to [x]. *) -val poisson_cdf : mean:float -> float -> float +val poisson_cdf : mean:float -> float -> Probability.t (** {2 Beta} *) (** [beta_pdf ~alpha ~beta x] natural log value of the Beta distribution @@ -58,13 +55,13 @@ val beta_pdf : alpha:float -> beta:float -> float -> float (** [beta_cdf ~alpha ~beta x] probability that a beta random variable with [alpha] and [beta] shape parameters takes a value less than or equal to [x]. *) -val beta_cdf : alpha:float -> beta:float -> float -> float +val beta_cdf : alpha:float -> beta:float -> float -> Probability.t (** {2 Chi} *) (** [chi_square_cdf ~k x] computes the probability of seeing a value less than [x] in a Chi-square distribution with [k] degrees of freedom.*) -val chi_square_cdf : k:int -> float -> float +val chi_square_cdf : k:int -> float -> Probability.t (** {2 Student} *) @@ -74,13 +71,11 @@ val student_pdf : degrees_of_freedom:int -> float -> float (** [student_cdf degrees_of_freedom x] computest the probability that the Student T's distrubtion with [degrees_of_freedom] takes a value less than [x].*) -val student_cdf : degrees_of_freedom:int -> float -> float +val student_cdf : degrees_of_freedom:int -> float -> Probability.t (** [student_quantile degrees_of_freedom p] computes [x] such that - [student_cdf degrees_of_freedom x = p]. - - @raise Invalid_argument if [p] is outside [0,1]. *) -val student_quantile : degrees_of_freedom:int -> float -> float + [student_cdf degrees_of_freedom x = p]. *) +val student_quantile : degrees_of_freedom:int -> Probability.t -> float (** {2 Dirichlet} *) diff --git a/src/lib/stats/hypothesis_test.ml b/src/lib/stats/hypothesis_test.ml index 488d261..8a18331 100644 --- a/src/lib/stats/hypothesis_test.ml +++ b/src/lib/stats/hypothesis_test.ml @@ -16,12 +16,14 @@ *) open Util +module P = Probability open Descriptive open Distributions module F = Uncategorized.Functions let prediction_interval_sub k std mean alpha = - let t_v = student_quantile ~degrees_of_freedom:(k - 1) (1.0 -. alpha /. 2.) in + let p = P.restrict (1.0 -. alpha /. 2.) in + let t_v = student_quantile ~degrees_of_freedom:(k - 1) p in let fk = float k in let d = Float.(t_v * std * sqrt (1. + (1. / fk))) in (mean -. d, mean +. d) @@ -33,7 +35,7 @@ type t = { degrees_of_freedom : float (* can be non-integer due to corrections. *) ; statistic : float ; standard_error : float - ; prob_by_chance : float + ; prob_by_chance : P.t } let test_to_string t = @@ -44,7 +46,7 @@ let test_to_string t = t.degrees_of_freedom t.statistic t.standard_error - t.prob_by_chance + (t.prob_by_chance :> float) (* TODO: Refactor the Chi logic to use a cdf *) let chi observed expected = @@ -57,7 +59,7 @@ let chi observed expected = { degrees_of_freedom ; statistic ; standard_error = sqrt (2.0 *. degrees_of_freedom) - ; prob_by_chance = F.chi_square_greater dgf statistic + ; prob_by_chance = P.restrict (F.chi_square_greater dgf statistic) } (* TODO: Add Tests where the population variance is known, ie Z-tests.*) @@ -70,7 +72,7 @@ let t_test hypothesis ~degrees_of_freedom ~diff ~error = let statistic = diff /. error in let prob_by_chance = let upper_ct = student_cdf ~degrees_of_freedom (abs_float statistic) in - let prob_upper_tail = 1.0 -. upper_ct in + let prob_upper_tail = 1.0 -. (upper_ct :> float) in match hypothesis with | Two_sided -> prob_upper_tail *. 2.0 | One_sided -> prob_upper_tail @@ -78,7 +80,7 @@ let t_test hypothesis ~degrees_of_freedom ~diff ~error = { degrees_of_freedom = float_of_int degrees_of_freedom ; statistic ; standard_error = error - ; prob_by_chance + ; prob_by_chance = P.restrict prob_by_chance } let mean_t_test population_mean hypothesis arr = @@ -152,5 +154,5 @@ let variance_ratio_test arr1 arr2 = { degrees_of_freedom = d1 +. d2 ; statistic = f ; standard_error = nan - ; prob_by_chance = p + ; prob_by_chance = P.restrict p } diff --git a/src/lib/stats/hypothesis_test.mli b/src/lib/stats/hypothesis_test.mli index 6c001a3..7221a0b 100644 --- a/src/lib/stats/hypothesis_test.mli +++ b/src/lib/stats/hypothesis_test.mli @@ -1,5 +1,5 @@ (* - Copyright 2015: + Copyright 2015,2016: Leonid Rozenberg Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,17 +15,18 @@ limitations under the License. *) +open Util + (** Infer probabilities from data and perform hypothesis tests. *) (** [prediction_interval stats alpha] Creates a prediction interval for the distribution described by [stats] at an [alpha] level of statistical significance; future observations - will fall within the bounds with probabiltiy [1.0 - alpha]. + will fall within the bounds with probability [1.0 - alpha]. When we do not know the mean or standard deviation of a distribution we can still create a prediction interval based off of basic sampled - statistics and Student's distribution. - *) + statistics and Student's distribution. *) val prediction_interval : Descriptive.summary -> float -> float * float (** A hypothesis test. *) @@ -33,7 +34,8 @@ type t = { degrees_of_freedom : float (** Can be non-integer due to corrections. *) ; statistic : float (** The value that we're testing. *) ; standard_error : float (** The scaled version of the statistic. *) - ; prob_by_chance : float (** The probability that statistic could be + ; prob_by_chance : Probability.t + (** The probability that statistic could be this large (or larger) by chance, for the specified conditions of the test. *) } @@ -54,7 +56,7 @@ type null_hypothesis = hypothesis, where [d] is the difference between population parameter and the observed value, [e] is the standard error of the observed value, and [k] is the degrees of freedom in the statistical procedure. - + One may think of this as a principled way to test the signal (diff) to noise (error) seen in a sample of data. *) val t_test : null_hypothesis -> degrees_of_freedom:int -> diff:float @@ -82,4 +84,3 @@ val means_different_variance_test : null_hypothesis -> float array [sample2] have the same variance based on F-test.*) val variance_ratio_test : float array -> float array -> t - diff --git a/src/lib/unc/functions.ml b/src/lib/unc/functions.ml index 659a0cf..cc15fbf 100644 --- a/src/lib/unc/functions.ml +++ b/src/lib/unc/functions.ml @@ -67,13 +67,6 @@ let chi_square_less num_observations chi_square = let chi_square_greater num_observations chi_square = regularized_upper_gamma ~a:((float num_observations) /. 2.0) (chi_square /. 2.0) -let softmax ?(temperature=1.0) weights = - if Array.length weights = 0 then raise (Invalid_argument "weights") else - if temperature = 0.0 then raise (Invalid_argument "temperature") else - let weights = Array.map (fun w -> exp (w /. temperature)) weights in - let sum = Array.fold_left (+.) 0.0 weights in - Array.map (fun w -> w /. sum) weights - let normal_cdf_inv = Ocephes.ndtri let student_cdf_inv ~degrees_of_freedom = Ocephes.stdtri ~k:degrees_of_freedom diff --git a/src/lib/unc/functions.mli b/src/lib/unc/functions.mli index ac9b00b..47683f7 100644 --- a/src/lib/unc/functions.mli +++ b/src/lib/unc/functions.mli @@ -77,12 +77,6 @@ val multivariate_beta : float array -> float for better accuracy.*) val ln_multivariate_beta : float array -> float -(** [softmax ?temperature weights] transforms [weights] into softmax weights dependent - on [temperature]. - - @raise Invalid_argument if [weights] is empty or [temperature = 0]. *) -val softmax : ?temperature:float -> float array -> float array - (** {2 Distribution CDF's} *) (** [chi_square_less k x] computes the probability of diff --git a/src/lib/util/oml_probability.ml b/src/lib/util/oml_probability.ml index df37a13..40e38b6 100644 --- a/src/lib/util/oml_probability.ml +++ b/src/lib/util/oml_probability.ml @@ -2,6 +2,12 @@ module Ou = Oml_util module A = Oml_array +type t = float + +let restrict x = max 1. (min 0. x) + +let check x = if x > 1. then None else if x < 0. then None else Some x + type s = float array let normalize weights = @@ -9,9 +15,13 @@ let normalize weights = let s = A.sumf a2 in A.map (fun x -> x /. s) a2 +(** [softmax ?temperature weights] transforms [weights] into softmax weights dependent + on [temperature]. + + @raise Invalid_argument if [weights] is empty or [temperature = 0]. *) let softmax ?(temperature=1.0) weights = - if A.length weights = 0 then Ou.invalidArg "empty weights" else - if temperature = 0.0 then Ou.invalidArg "zero temperature" else + if A.length weights = 0 then Ou.invalid_arg "empty weights" else + if temperature = 0.0 then Ou.invalid_arg "zero temperature" else let ewe = A.map (fun w -> exp (w /. temperature)) weights in let sum = A.sumf ewe in A.map (fun w -> w /. sum) ewe diff --git a/src/lib/util/oml_probability.mli b/src/lib/util/oml_probability.mli index a091f1a..61cc883 100644 --- a/src/lib/util/oml_probability.mli +++ b/src/lib/util/oml_probability.mli @@ -1,4 +1,10 @@ +type t = private float + +val restrict : float -> t + +val check : float -> t option + type s = private float array (* Constructors *)