From b1ba4c5fdd5030b98a8cf9fe9c46319e5f5eb20e Mon Sep 17 00:00:00 2001
From: Phillip Alday <palday@users.noreply.github.com>
Date: Thu, 14 Sep 2023 04:46:34 -0500
Subject: [PATCH] [G]VIF (#548)

* [G]VIF

* add reference value source

* more tests

* glm tests
---
 Project.toml     |  2 +-
 src/GLM.jl       |  3 ++-
 src/linpred.jl   |  2 +-
 test/runtests.jl | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/Project.toml b/Project.toml
index cf5d12bf..7b4f561e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -27,7 +27,7 @@ SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1, 2.0"
 StatsAPI = "1.4"
 StatsBase = "0.33.5, 0.34"
 StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0"
-StatsModels = "0.6.23, 0.7"
+StatsModels = "0.7.3"
 Tables = "1"
 julia = "1.6"
 
diff --git a/src/GLM.jl b/src/GLM.jl
index 24f48843..59c327db 100644
--- a/src/GLM.jl
+++ b/src/GLM.jl
@@ -10,6 +10,7 @@ module GLM
     import Base: (\), convert, show, size
     import LinearAlgebra: cholesky, cholesky!
     import Statistics: cor
+    using StatsAPI
     import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual,
                       loglikelihood, nullloglikelihood, nobs, stderror, vcov,
                       residuals, predict, predict!,
@@ -21,7 +22,7 @@ module GLM
     export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
            loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
            fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
-           cooksdistance, hasintercept, dispersion
+           cooksdistance, hasintercept, dispersion, vif, gvif, termnames
 
     export
         # types
diff --git a/src/linpred.jl b/src/linpred.jl
index 4b6471ad..4e64ae50 100644
--- a/src/linpred.jl
+++ b/src/linpred.jl
@@ -362,7 +362,7 @@ fitted(m::LinPredModel) = m.rr.mu
 predict(mm::LinPredModel) = fitted(mm)
 residuals(obj::LinPredModel) = residuals(obj.rr)
 
-function formula(obj::LinPredModel)
+function StatsModels.formula(obj::LinPredModel) # extend StatsModels.formula explicitly
     obj.formula === nothing && throw(ArgumentError("model was fitted without a formula"))
     return obj.formula
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index cbb21f1b..61d1dec3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2011,3 +2011,46 @@ end
     @test_throws ArgumentError lm(@formula(OptDen ~ Carb), form; method=:pr)
     @test_throws ArgumentError glm(@formula(OptDen ~ Carb), form, Normal(); method=:pr)
 end
+
+@testset "[G]VIF" begin
+    # vif/gvif/termnames checks; reference values from car::vif in R:
+    # > library(car)
+    # > data(Duncan)
+    # > lm1 = lm(prestige ~ 1 + income + education, Duncan)
+    # > vif(lm1)
+    #    income education
+    #    2.1049    2.1049
+    # > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
+    # > vif(lm2)
+    #               GVIF Df GVIF^(1/(2*Df))
+    # income    2.209178  1        1.486330
+    # education 5.297584  1        2.301648
+    # type      5.098592  2        1.502666
+    duncan = RDatasets.dataset("car", "Duncan")
+    lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan)
+    @test termnames(lm1)[2] == coefnames(lm1)
+    @test vif(lm1) ≈ gvif(lm1)
+    @test vif(lm1) ≈ [2.1049, 2.1049] atol=1e-4
+    # refit from the bare matrix: vif still works, gvif needs the formula and throws
+    lm1_noform = lm(modelmatrix(lm1), response(lm1))
+    @test vif(lm1) ≈ vif(lm1_noform)
+    @test_throws ArgumentError("model was fitted without a formula") gvif(lm1_noform)
+    # function terms: exp(log(x)) == x, so the VIFs must match the plain fit
+    lm1log = lm(@formula(Prestige ~ 1 + exp(log(Income)) + exp(log(Education))), duncan)
+    @test termnames(lm1log)[2] == coefnames(lm1log) == ["(Intercept)", "exp(log(Income))", "exp(log(Education))"]
+    @test vif(lm1) ≈ vif(lm1log)
+    # a Normal GLM on the same matrix must give the same VIFs as the linear model
+    gm1 = glm(modelmatrix(lm1), response(lm1), Normal())
+    @test vif(lm1) ≈ vif(gm1)
+    # Type has Df = 2 above: terms != coefficients; scale=true is GVIF^(1/(2*Df))
+    lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan)
+    @test termnames(lm2)[2] != coefnames(lm2)
+    @test gvif(lm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4
+    # same model fitted through glm with Normal() must reproduce the R values too
+    gm2 = glm(@formula(Prestige ~ 1 + Income + Education + Type), duncan, Normal())
+    @test termnames(gm2)[2] != coefnames(gm2)
+    @test gvif(gm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4
+
+    # VIF only needs modelmatrix, vcov and stderror to return valid values; links,
+    # offsets, etc. do not matter, so no further GLM variants are tested here.
+end