From b1ba4c5fdd5030b98a8cf9fe9c46319e5f5eb20e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 14 Sep 2023 04:46:34 -0500 Subject: [PATCH] [G]VIF (#548) * [G]VIF * add reference value source * more tests * glm tests --- Project.toml | 2 +- src/GLM.jl | 3 ++- src/linpred.jl | 2 +- test/runtests.jl | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index cf5d12bf..7b4f561e 100644 --- a/Project.toml +++ b/Project.toml @@ -27,7 +27,7 @@ SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1, 2.0" StatsAPI = "1.4" StatsBase = "0.33.5, 0.34" StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0" -StatsModels = "0.6.23, 0.7" +StatsModels = "0.7.3" Tables = "1" julia = "1.6" diff --git a/src/GLM.jl b/src/GLM.jl index 24f48843..59c327db 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -10,6 +10,7 @@ module GLM import Base: (\), convert, show, size import LinearAlgebra: cholesky, cholesky! import Statistics: cor + using StatsAPI import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, predict!, @@ -21,7 +22,7 @@ module GLM export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², - cooksdistance, hasintercept, dispersion + cooksdistance, hasintercept, dispersion, vif, gvif, termnames export # types diff --git a/src/linpred.jl b/src/linpred.jl index 4b6471ad..4e64ae50 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -362,7 +362,7 @@ fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) residuals(obj::LinPredModel) = residuals(obj.rr) -function formula(obj::LinPredModel) +function StatsModels.formula(obj::LinPredModel) obj.formula === nothing && throw(ArgumentError("model was fitted without a formula")) return obj.formula end diff --git a/test/runtests.jl b/test/runtests.jl index cbb21f1b..61d1dec3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2011,3 +2011,46 @@ end @test_throws ArgumentError lm(@formula(OptDen ~ Carb), form; method=:pr) @test_throws ArgumentError glm(@formula(OptDen ~ Carb), form, Normal(); method=:pr) end + +@testset "[G]VIF" begin + # Reference values from car::vif in R: + # > library(car) + # > data(Duncan) + # > lm1 = lm(prestige ~ 1 + income + education, Duncan) + # > vif(lm1) + # income education + # 2.1049 2.1049 + # > lm2 = lm(prestige ~ 1 + income + education + type, Duncan) + # > vif(lm2) + # GVIF Df GVIF^(1/(2*Df)) + # income 2.209178 1 1.486330 + # education 5.297584 1 2.301648 + # type 5.098592 2 1.502666 + duncan = RDatasets.dataset("car", "Duncan") + lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan) + @test termnames(lm1)[2] == coefnames(lm1) + @test vif(lm1) ≈ gvif(lm1) + + lm1_noform = lm(modelmatrix(lm1), response(lm1)) + @test vif(lm1) ≈ vif(lm1_noform) + @test_throws ArgumentError("model was fitted without a formula") gvif(lm1_noform) + + lm1log = lm(@formula(Prestige ~ 1 + exp(log(Income)) + exp(log(Education))), duncan) + @test termnames(lm1log)[2] == coefnames(lm1log) == ["(Intercept)", "exp(log(Income))", "exp(log(Education))"] + @test vif(lm1) ≈ vif(lm1log) + + gm1 = glm(modelmatrix(lm1), response(lm1), Normal()) + @test vif(lm1) ≈ vif(gm1) + + lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan) + @test termnames(lm2)[2] != coefnames(lm2) + @test gvif(lm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4 + + gm2 = glm(@formula(Prestige ~ 1 + Income + Education + Type), duncan, Normal()) + @test termnames(gm2)[2] != coefnames(gm2) + @test gvif(gm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4 + + # the VIF definition depends on modelmatrix, vcov and stderror returning valid + # values. It doesn't care about links, offsets, etc. as long as the model matrix, + # vcov matrix and stderrors are well defined. +end