From eb6c73cb7fc9d7b807dd1005746e26c38239ff3e Mon Sep 17 00:00:00 2001
From: Leonid Rozenberg <leonidr@gmail.com>
Date: Tue, 9 Feb 2016 11:18:03 -0500
Subject: [PATCH] Propogate a probability type.

---
 src/lib/rgr/eval_multivariate.ml  |  3 ++-
 src/lib/rgr/univariate.ml         |  3 ++-
 src/lib/stats/distributions.ml    | 36 ++++++++++++++++---------------
 src/lib/stats/distributions.mli   | 25 +++++++++------------
 src/lib/stats/hypothesis_test.ml  | 16 ++++++++------
 src/lib/stats/hypothesis_test.mli | 15 +++++++------
 src/lib/unc/functions.ml          |  7 ------
 src/lib/unc/functions.mli         |  6 ------
 src/lib/util/oml_probability.ml   | 14 ++++++++++--
 src/lib/util/oml_probability.mli  |  6 ++++++
 10 files changed, 68 insertions(+), 63 deletions(-)

diff --git a/src/lib/rgr/eval_multivariate.ml b/src/lib/rgr/eval_multivariate.ml
index e2e8462..b5d44cc 100644
--- a/src/lib/rgr/eval_multivariate.ml
+++ b/src/lib/rgr/eval_multivariate.ml
@@ -23,6 +23,7 @@
 open Util
 open SolvedLPViaSvd
 open Lacaml.D
+module Pr = Probability
 module P = Printf
 module D = Statistics.Distributions
 module Ht = Statistics.Hypothesis_test
@@ -101,7 +102,7 @@ let confidence_interval, prediction_interval =
     in
     let p' = if plus_one then 1. +. p else p in
     let sc = sqrt (glm.inferred_var  *. p') in
-    let t  = D.student_quantile ~degrees_of_freedom (alpha /. 2.0) in
+    let t  = D.student_quantile ~degrees_of_freedom (Pr.restrict (alpha /. 2.0)) in
     let d  = sc *. (abs_float t) in
     let y  = eval glm x in
     (y -. d), (y +. d)
diff --git a/src/lib/rgr/univariate.ml b/src/lib/rgr/univariate.ml
index e7d6765..4ab8213 100644
--- a/src/lib/rgr/univariate.ml
+++ b/src/lib/rgr/univariate.ml
@@ -16,6 +16,7 @@
 *)
 
 open Util
+module Pr = Probability
 module P = Printf
 module D = Statistics.Distributions
 module Ht = Statistics.Hypothesis_test
@@ -114,7 +115,7 @@ let confidence_interval, prediction_interval =
   let interval a lrm ~alpha x =
     let dgf = lrm.size -. 2.0 in
     let degrees_of_freedom = truncate dgf in
-    let t  = D.student_quantile ~degrees_of_freedom (alpha /. 2.0) in
+    let t  = D.student_quantile ~degrees_of_freedom (Pr.restrict (alpha /. 2.0)) in
     let b  = (x -. lrm.m_pred) ** 2.0 /. lrm.s_xx in
     let c  = lrm.sum_residuals /. dgf in
     let se = sqrt ((a +. b) *. c) in
diff --git a/src/lib/stats/distributions.ml b/src/lib/stats/distributions.ml
index b9cf482..7a38eb0 100644
--- a/src/lib/stats/distributions.ml
+++ b/src/lib/stats/distributions.ml
@@ -18,22 +18,22 @@
 
 open Util
 module F = Uncategorized.Functions
+module P = Probability
 let invalid_arg ~f fmt = invalid_arg ~m:"Distributions" ~f fmt 
 
 let normal_cdf ?(mean=0.0) ?(std=1.0) x =
   let z = ((x -. mean) /. std) in
-  (1.0 +. F.erf (z /. sqrt 2.0)) /. 2.0
+  P.restrict ((1.0 +. F.erf (z /. sqrt 2.0)) /. 2.0)
 
 let normal_pdf ?(mean=0.0) ?(std=1.0) x =
   let z = ((x -. mean) /. std) in
   (exp ((-1.0 /. 2.0) *. (z ** 2.0))) /.  (std *. (sqrt (2.0 *. pi)))
 
-let normal_quantile ?(mean=0.0) ?(std=1.0) p =
-  if p < 0. || p > 1. then invalid_arg ~f:"normal_quantile" "p %f" p else
-    mean +. std *. F.normal_cdf_inv p
+let normal_quantile ?(mean=0.0) ?(std=1.0) (p : P.t) =
+  mean +. std *. F.normal_cdf_inv (p :> float)
 
 let poisson_cdf ~mean k =
-  F.regularized_upper_gamma ~a:(floor (k +. 1.0)) mean
+  P.restrict (F.regularized_upper_gamma ~a:(floor (k +. 1.0)) mean)
 
 let ln_beta_pdf ~alpha ~beta =
   if alpha <= 0.0 then invalid_arg ~f:"ln_beta_pdf" "alpha" else
@@ -57,10 +57,10 @@ let beta_cdf ~alpha ~beta =
   if alpha <= 0.0 then invalid_arg ~f:"beta_cdf" "alpha" else
     if beta <= 0.0 then invalid_arg ~f:"beta_cdf" "beta" else
       let reg = F.regularized_beta ~alpha ~beta in
-      fun x -> if x <= 0.0 then 0.0 else if x >= 1.0 then 1.0 else reg x
+      fun x -> P.restrict (reg x)
 
 let chi_square_cdf ~k x =
-  F.regularized_lower_gamma ~a:((float k) /. 2.0) (x /. 2.0)
+  P.restrict (F.regularized_lower_gamma ~a:((float k) /. 2.0) (x /. 2.0))
 
 (* According to Wikipedia, haven't research a more efficient algorithm.
    Implemented it here for completeness.
@@ -73,16 +73,18 @@ let student_pdf ~degrees_of_freedom t =
 let student_cdf ~degrees_of_freedom t =
   let v = float degrees_of_freedom in
   let x = Float.(v / (t * t + v)) in
-  if t > 0.0 then
-    Float.(1.0 - 0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x))
-  else if t < 0.0 then
-    Float.(0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x))
-  else  (* 0.0 *)
-    0.5
-
-let student_quantile ~degrees_of_freedom p =
-  if p < 0. || p > 1. then invalid_arg ~f:"student_quantile" "p %f" p else
-    F.student_cdf_inv ~degrees_of_freedom p
+  let p =
+    if t > 0.0 then
+      Float.(1.0 - 0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x))
+    else if t < 0.0 then
+      Float.(0.5 * (F.regularized_beta ~alpha:(v/2.) ~beta:0.5 x))
+    else  (* 0.0 *)
+      0.5
+  in
+  P.restrict p
+
+let student_quantile ~degrees_of_freedom (p : P.t) =
+  F.student_cdf_inv ~degrees_of_freedom (p :> float)
 
 let ln_dirichlet_pdf ~alphas =
   if alphas = [||] || Array.any (fun a -> a <= 0.0) alphas then
diff --git a/src/lib/stats/distributions.mli b/src/lib/stats/distributions.mli
index 7b06b00..bc5d227 100644
--- a/src/lib/stats/distributions.mli
+++ b/src/lib/stats/distributions.mli
@@ -23,9 +23,9 @@ open Util
 
 (** [normal_cdf mean std x] is the probability that a normal random
     variable with [mean] and standard deviation [std] takes a value less than
-    or equal to [x].  [mean] defaults to 0.0 and [std] to 1.0.
+    or equal to [x]. [mean] defaults to 0.0 and [std] to 1.0.
 *)
-val normal_cdf : ?mean:float -> ?std:float -> float -> float
+val normal_cdf : ?mean:float -> ?std:float -> float -> Probability.t
 
 (** [normal_pdf mean std x] is the value of the Normal distribution
     function (with [mean] and standard deviation [std]) at [x].
@@ -34,17 +34,14 @@ val normal_cdf : ?mean:float -> ?std:float -> float -> float
 val normal_pdf : ?mean:float -> ?std:float -> float -> float
 
 (** [normal_quantile ?mean ?std p] computes [x] such that
-    [normal_cdf mean std x = p]. [mean] defaults to 0.0 and [std] to 1.0.
-
-    @raise Invalid_argument if [p] is outside [0,1].
-*)
-val normal_quantile : ?mean:float -> ?std:float -> float -> float
+    [normal_cdf mean std x = p]. [mean] defaults to 0.0 and [std] to 1.0. *)
+val normal_quantile : ?mean:float -> ?std:float -> Probability.t -> float
 
 (** {2 Poisson} *)
 
 (** [poisson_cdf mean x] is the probability that a Poisson random variable with
     [mean] will take a value less than or equal to [x]. *)
-val poisson_cdf : mean:float -> float -> float
+val poisson_cdf : mean:float -> float -> Probability.t
 
 (** {2 Beta} *)
 (** [beta_pdf ~alpha ~beta x] natural log value of the Beta distribution
@@ -58,13 +55,13 @@ val beta_pdf : alpha:float -> beta:float -> float -> float
 (** [beta_cdf ~alpha ~beta x] probability that a beta random variable with
     [alpha] and [beta] shape parameters takes a value less than or equal to
     [x]. *)
-val beta_cdf : alpha:float -> beta:float -> float -> float
+val beta_cdf : alpha:float -> beta:float -> float -> Probability.t
 
 (** {2 Chi} *)
 
 (** [chi_square_cdf ~k x] computes the probability of seeing a value less than
     [x] in a Chi-square distribution with [k] degrees of freedom.*)
-val chi_square_cdf : k:int -> float -> float
+val chi_square_cdf : k:int -> float -> Probability.t
 
 (** {2 Student} *)
 
@@ -74,13 +71,11 @@ val student_pdf : degrees_of_freedom:int -> float -> float
 
 (** [student_cdf degrees_of_freedom x] computest the probability that the Student T's
     distrubtion with [degrees_of_freedom] takes a value less than [x].*)
-val student_cdf : degrees_of_freedom:int -> float -> float
+val student_cdf : degrees_of_freedom:int -> float -> Probability.t
 
 (** [student_quantile degrees_of_freedom p] computes [x] such that
-    [student_cdf degrees_of_freedom x = p].
-
-    @raise Invalid_argument if [p] is outside [0,1]. *)
-val student_quantile : degrees_of_freedom:int -> float -> float
+    [student_cdf degrees_of_freedom x = p]. *)
+val student_quantile : degrees_of_freedom:int -> Probability.t -> float
 
 (** {2 Dirichlet} *)
 
diff --git a/src/lib/stats/hypothesis_test.ml b/src/lib/stats/hypothesis_test.ml
index 488d261..8a18331 100644
--- a/src/lib/stats/hypothesis_test.ml
+++ b/src/lib/stats/hypothesis_test.ml
@@ -16,12 +16,14 @@
 *)
 
 open Util
+module P = Probability
 open Descriptive
 open Distributions
 module F = Uncategorized.Functions
 
 let prediction_interval_sub k std mean alpha =
-  let t_v = student_quantile ~degrees_of_freedom:(k - 1) (1.0 -. alpha /. 2.) in
+  let p   = P.restrict (1.0 -. alpha /. 2.) in
+  let t_v = student_quantile ~degrees_of_freedom:(k - 1) p in
   let fk  = float k in
   let d   = Float.(t_v * std * sqrt (1. + (1. / fk))) in
   (mean -. d, mean +. d)
@@ -33,7 +35,7 @@ type t =
   { degrees_of_freedom  : float   (* can be non-integer due to corrections. *)
   ; statistic           : float
   ; standard_error      : float
-  ; prob_by_chance      : float
+  ; prob_by_chance      : P.t
   }
 
 let test_to_string t =
@@ -44,7 +46,7 @@ let test_to_string t =
         t.degrees_of_freedom
         t.statistic
         t.standard_error
-        t.prob_by_chance
+        (t.prob_by_chance :> float)
 
 (* TODO: Refactor the Chi logic to use a cdf *)
 let chi observed expected =
@@ -57,7 +59,7 @@ let chi observed expected =
   { degrees_of_freedom
   ; statistic
   ; standard_error      = sqrt (2.0 *. degrees_of_freedom)
-  ; prob_by_chance      = F.chi_square_greater dgf statistic
+  ; prob_by_chance      = P.restrict (F.chi_square_greater dgf statistic)
   }
 
 (* TODO: Add Tests where the population variance is known, ie Z-tests.*)
@@ -70,7 +72,7 @@ let t_test hypothesis ~degrees_of_freedom ~diff ~error =
   let statistic = diff /. error in
   let prob_by_chance =
     let upper_ct = student_cdf ~degrees_of_freedom (abs_float statistic) in
-    let prob_upper_tail = 1.0 -. upper_ct in
+    let prob_upper_tail = 1.0 -. (upper_ct :> float) in
     match hypothesis with
     | Two_sided -> prob_upper_tail *. 2.0
     | One_sided -> prob_upper_tail
@@ -78,7 +80,7 @@ let t_test hypothesis ~degrees_of_freedom ~diff ~error =
   { degrees_of_freedom = float_of_int degrees_of_freedom
   ; statistic
   ; standard_error = error
-  ; prob_by_chance
+  ; prob_by_chance = P.restrict prob_by_chance
   }
 
 let mean_t_test population_mean hypothesis arr =
@@ -152,5 +154,5 @@ let variance_ratio_test arr1 arr2 =
   { degrees_of_freedom = d1 +. d2
   ; statistic = f
   ; standard_error = nan
-  ; prob_by_chance = p
+  ; prob_by_chance = P.restrict p
   }
diff --git a/src/lib/stats/hypothesis_test.mli b/src/lib/stats/hypothesis_test.mli
index 6c001a3..7221a0b 100644
--- a/src/lib/stats/hypothesis_test.mli
+++ b/src/lib/stats/hypothesis_test.mli
@@ -1,5 +1,5 @@
 (*
-   Copyright 2015:
+   Copyright 2015,2016:
      Leonid Rozenberg <leonidr@gmail.com>
 
    Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,17 +15,18 @@
    limitations under the License.
 *)
 
+open Util
+
 (** Infer probabilities from data and perform hypothesis tests. *)
 
 (** [prediction_interval stats alpha]
   Creates a prediction interval for the distribution described by [stats]
   at an [alpha] level of statistical significance; future observations
-  will fall within the bounds with probabiltiy [1.0 - alpha].
+  will fall within the bounds with probability [1.0 - alpha].
 
   When we do not know the mean or standard deviation of a distribution
   we can still create a prediction interval based off of basic sampled
-  statistics and Student's distribution.
-  *)
+  statistics and Student's distribution. *)
 val prediction_interval : Descriptive.summary -> float -> float * float
 
 (** A hypothesis test.  *)
@@ -33,7 +34,8 @@ type t =
   { degrees_of_freedom : float  (** Can be non-integer due to corrections. *)
   ; statistic          : float  (** The value that we're testing. *)
   ; standard_error     : float  (** The scaled version of the statistic. *)
-  ; prob_by_chance     : float  (** The probability that statistic could be
+  ; prob_by_chance     : Probability.t
+                                (** The probability that statistic could be
                                     this large (or larger) by chance, for the
                                     specified conditions of the test. *)
   }
@@ -54,7 +56,7 @@ type null_hypothesis =
     hypothesis, where [d] is the difference between population parameter and
     the observed value, [e] is the standard error of the observed value, and
     [k] is the degrees of freedom in the statistical procedure.
-    
+
     One may think of this as a principled way to test the signal (diff)
     to noise (error) seen in a sample of data. *)
 val t_test : null_hypothesis -> degrees_of_freedom:int -> diff:float
@@ -82,4 +84,3 @@ val means_different_variance_test : null_hypothesis -> float array
     [sample2] have the same variance based on F-test.*)
 val variance_ratio_test : float array -> float array -> t
 
-
diff --git a/src/lib/unc/functions.ml b/src/lib/unc/functions.ml
index 659a0cf..cc15fbf 100644
--- a/src/lib/unc/functions.ml
+++ b/src/lib/unc/functions.ml
@@ -67,13 +67,6 @@ let chi_square_less num_observations chi_square =
 let chi_square_greater num_observations chi_square =
   regularized_upper_gamma ~a:((float num_observations) /. 2.0) (chi_square /. 2.0)
 
-let softmax ?(temperature=1.0) weights =
-  if Array.length weights = 0 then raise (Invalid_argument "weights") else
-  if temperature = 0.0 then raise (Invalid_argument "temperature") else
-    let weights = Array.map (fun w -> exp (w /. temperature)) weights in
-    let sum = Array.fold_left (+.) 0.0 weights in
-    Array.map (fun w -> w /. sum) weights
-
 let normal_cdf_inv = Ocephes.ndtri
 
 let student_cdf_inv ~degrees_of_freedom = Ocephes.stdtri ~k:degrees_of_freedom
diff --git a/src/lib/unc/functions.mli b/src/lib/unc/functions.mli
index ac9b00b..47683f7 100644
--- a/src/lib/unc/functions.mli
+++ b/src/lib/unc/functions.mli
@@ -77,12 +77,6 @@ val multivariate_beta : float array -> float
     for better accuracy.*)
 val ln_multivariate_beta : float array -> float
 
-(** [softmax ?temperature weights] transforms [weights] into softmax weights dependent
-    on [temperature].
-
-    @raise Invalid_argument if [weights] is empty or [temperature = 0]. *)
-val softmax : ?temperature:float -> float array -> float array
-
 (** {2 Distribution CDF's} *)
 
 (** [chi_square_less k x] computes the probability of
diff --git a/src/lib/util/oml_probability.ml b/src/lib/util/oml_probability.ml
index df37a13..40e38b6 100644
--- a/src/lib/util/oml_probability.ml
+++ b/src/lib/util/oml_probability.ml
@@ -2,6 +2,12 @@
 module Ou = Oml_util
 module A = Oml_array
 
+type t = float
+
+let restrict x = max 1. (min 0. x)
+
+let check x = if x > 1. then None else if x < 0. then None else Some x
+
 type s = float array
 
 let normalize weights =
@@ -9,9 +15,13 @@ let normalize weights =
   let s = A.sumf a2 in
   A.map (fun x -> x /. s) a2
 
+(** [softmax ?temperature weights] transforms [weights] into softmax weights dependent
+    on [temperature].
+
+    @raise Invalid_argument if [weights] is empty or [temperature = 0]. *)
 let softmax ?(temperature=1.0) weights =
-  if A.length weights = 0 then Ou.invalidArg "empty weights" else
-  if temperature = 0.0 then Ou.invalidArg "zero temperature" else
+  if A.length weights = 0 then Ou.invalid_arg "empty weights" else
+  if temperature = 0.0 then Ou.invalid_arg "zero temperature" else
     let ewe = A.map (fun w -> exp (w /. temperature)) weights in
     let sum = A.sumf ewe in
     A.map (fun w -> w /. sum) ewe
diff --git a/src/lib/util/oml_probability.mli b/src/lib/util/oml_probability.mli
index a091f1a..61cc883 100644
--- a/src/lib/util/oml_probability.mli
+++ b/src/lib/util/oml_probability.mli
@@ -1,4 +1,10 @@
 
+type t = private float
+
+val restrict : float -> t
+
+val check : float -> t option
+
 type s = private float array
 
 (* Constructors *)