Machine Learning
|
Nesse tutorial serão apresentadas técnicas para seleção de variáveis em modelos de regressão múltipla. Serão considerados os métodos de todos os subconjuntos possíveis, seleção passo a passo de variáveis (stepwise) e regularização.
rm(list = objects())
library(lattice)
library(latticeExtra)
teca_crapar.csv
contém os parâmetros estimados da curva de retenção de água do solo.teca_qui.csv
contém valores das variáveis qúmicas do solo.teca_arv.csv
contém os valores de produção de madeira dos nas localidades onde as variáveis preditoras foram determinadas.O objetivo destes dados é fazer um modelo para predição da produção de madeira em função do conjunto de preditoras. A tabela de dados não é grande pois contém apenas 50 registros.
# Endereço das tabelas.
pre <- "https://raw.githubusercontent.com/walmes/EACS/master/data-raw/"
files <- c(hidric = "teca_crapar.csv",
quimic = "teca_qui.csv",
prod = "teca_arv.csv")
urls <- paste0(pre, files)
names(urls) <- names(files)
# Lista com as tabelas.
da <- sapply(urls,
FUN = read.table,
header = TRUE,
sep = ";",
simplify = FALSE)
str(da)
## List of 3
## $ hidric:'data.frame': 141 obs. of 10 variables:
## ..$ loc: int [1:141] 1 1 1 2 2 2 3 3 3 4 ...
## ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 3 2 1 1 3 2 1 3 2 3 ...
## ..$ Ur : num [1:141] 0.178 0.188 0.223 0.131 0.165 ...
## ..$ Us : num [1:141] 0.58 0.647 0.648 0.697 0.573 ...
## ..$ alp: num [1:141] -0.833 -0.722 -0.724 -0.548 -0.547 ...
## ..$ n : num [1:141] 3.11 3.25 3.59 4.01 2.92 ...
## ..$ I : num [1:141] 0.957 0.835 0.815 0.62 0.691 ...
## ..$ Ui : num [1:141] 0.396 0.435 0.45 0.431 0.387 ...
## ..$ S : num [1:141] -0.273 -0.329 -0.341 -0.515 -0.257 ...
## ..$ cad: num [1:141] 0.217 0.247 0.227 0.3 0.222 ...
## $ quimic:'data.frame': 150 obs. of 15 variables:
## ..$ loc: int [1:150] 1 1 1 2 2 2 3 3 3 4 ...
## ..$ cam: Factor w/ 3 levels "[0, 5)","[40, 80)",..: 1 3 2 1 3 2 1 3 2 1 ...
## ..$ ph : num [1:150] 6.8 6.7 6.7 4.7 4.7 4.9 7.6 6.8 6.9 6.6 ...
## ..$ p : num [1:150] 22.51 0.83 0.01 3.89 0.69 ...
## ..$ k : num [1:150] 72.24 13.42 7.23 48.13 12.34 ...
## ..$ ca : num [1:150] 8.27 2.91 2.33 0.97 0.76 0.21 7.63 3.22 2.22 5.54 ...
## ..$ mg : num [1:150] 1.7 1.77 0.51 0.16 0.14 0 1.53 1.31 1.09 1.77 ...
## ..$ al : num [1:150] 0 0 0 0.3 0.6 0.6 0 0 0 0 ...
## ..$ ctc: num [1:150] 12.47 6.57 4.52 5.3 4.17 ...
## ..$ sat: num [1:150] 81.4 71.7 63.2 23.7 22.4 ...
## ..$ mo : num [1:150] 72.2 25.6 9.7 34.4 8.7 9.7 50.4 15.6 9.5 50.2 ...
## ..$ arg: num [1:150] 184 215 286 232 213 ...
## ..$ are: num [1:150] 770 750 674 741 775 ...
## ..$ cas: num [1:150] 1.8 2.2 3.7 0.4 1.1 1.7 8.4 19 14.3 5.5 ...
## ..$ acc: num [1:150] 770 750 676 741 775 ...
## $ prod :'data.frame': 50 obs. of 5 variables:
## ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ alt : num [1:50] 12.9 11.9 22.5 20.4 15.6 ...
## ..$ dap : num [1:50] 0.194 0.182 0.348 0.304 0.227 ...
## ..$ vol : num [1:50] 0.2707 0.0963 0.4974 0.4074 0.2165 ...
## ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Manipular as tabelas para fazer o merge.
da$quimic <- subset(da$quimic, cam == "[0, 5)", select = -cam)
da$hidric <- subset(da$hidric, cam == "[0, 5)", select = -c(cam, cad))
da$prod <- subset(da$prod, select = c(loc, prod))
str(da)
## List of 3
## $ hidric:'data.frame': 48 obs. of 8 variables:
## ..$ loc: int [1:48] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ Ur : num [1:48] 0.223 0.131 0.225 0.233 0.172 ...
## ..$ Us : num [1:48] 0.648 0.697 0.614 0.647 0.844 ...
## ..$ alp: num [1:48] -0.724 -0.548 -0.707 -0.555 -0.25 ...
## ..$ n : num [1:48] 3.59 4.01 2.53 2.91 2.1 ...
## ..$ I : num [1:48] 0.815 0.62 0.905 0.7 0.557 ...
## ..$ Ui : num [1:48] 0.45 0.431 0.44 0.459 0.556 ...
## ..$ S : num [1:48] -0.341 -0.515 -0.206 -0.26 -0.278 ...
## $ quimic:'data.frame': 50 obs. of 14 variables:
## ..$ loc: int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ ph : num [1:50] 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
## ..$ p : num [1:50] 22.51 3.89 11.52 26.43 3.79 ...
## ..$ k : num [1:50] 72.2 48.1 114.5 54.3 56.8 ...
## ..$ ca : num [1:50] 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
## ..$ mg : num [1:50] 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
## ..$ al : num [1:50] 0 0.3 0 0 0 0 0 0 0 0.1 ...
## ..$ ctc: num [1:50] 12.47 5.3 12.41 9.11 7.57 ...
## ..$ sat: num [1:50] 81.4 23.7 76.1 81.8 69.4 ...
## ..$ mo : num [1:50] 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
## ..$ arg: num [1:50] 184 232 193 142 236 ...
## ..$ are: num [1:50] 770 741 716 790 733 ...
## ..$ cas: num [1:50] 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
## ..$ acc: num [1:50] 770 741 718 791 734 ...
## $ prod :'data.frame': 50 obs. of 2 variables:
## ..$ loc : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ prod: num [1:50] 68.2 24.3 125.3 102.7 54.6 ...
# Aplica o merge recursivamente.
db <- Reduce(f = merge, x = da)
# Elimina a variável identificadora (agora desnecessária).
db$loc <- NULL
# Estrutura da tabela.
str(db)
## 'data.frame': 48 obs. of 21 variables:
## $ Ur : num 0.223 0.131 0.225 0.233 0.172 ...
## $ Us : num 0.648 0.697 0.614 0.647 0.844 ...
## $ alp : num -0.724 -0.548 -0.707 -0.555 -0.25 ...
## $ n : num 3.59 4.01 2.53 2.91 2.1 ...
## $ I : num 0.815 0.62 0.905 0.7 0.557 ...
## $ Ui : num 0.45 0.431 0.44 0.459 0.556 ...
## $ S : num -0.341 -0.515 -0.206 -0.26 -0.278 ...
## $ ph : num 6.8 4.7 7.6 6.6 6 5.8 5.9 6.5 5.5 5.8 ...
## $ p : num 22.51 3.89 11.52 26.43 3.79 ...
## $ k : num 72.2 48.1 114.5 54.3 56.8 ...
## $ ca : num 8.27 0.97 7.63 5.54 2.99 1.88 1.6 5.82 1.91 1.73 ...
## $ mg : num 1.7 0.16 1.53 1.77 2.12 1.53 1.42 1.67 0.21 0.69 ...
## $ al : num 0 0.3 0 0 0 0 0 0 0 0.1 ...
## $ ctc : num 12.47 5.3 12.41 9.11 7.57 ...
## $ sat : num 81.4 23.7 76.1 81.8 69.4 ...
## $ mo : num 72.2 34.4 50.4 50.2 39.1 35 28.6 42.3 19.9 24.3 ...
## $ arg : num 184 232 193 142 236 ...
## $ are : num 770 741 716 790 733 ...
## $ cas : num 1.8 0.4 8.4 5.5 5.5 20.9 12.4 2 1.4 0.8 ...
## $ acc : num 770 741 718 791 734 ...
## $ prod: num 68.2 24.3 125.3 102.7 54.6 ...
# Padronizando as variáveis.
db[, -ncol(db)] <- sapply(db[, -ncol(db)],
FUN = scale,
center = TRUE,
scale = TRUE)
summary(db)
## Ur Us alp
## Min. :-2.26093 Min. :-2.426035 Min. :-1.3255
## 1st Qu.:-0.77012 1st Qu.:-0.528697 1st Qu.:-0.5094
## Median :-0.08204 Median :-0.002851 Median :-0.1950
## Mean : 0.00000 Mean : 0.000000 Mean : 0.0000
## 3rd Qu.: 0.72066 3rd Qu.: 0.391848 3rd Qu.: 0.2485
## Max. : 2.31946 Max. : 3.337896 Max. : 5.0748
## n I Ui S
## Min. :-1.6016 Min. :-4.9012 Min. :-3.33749 Min. :-2.7513
## 1st Qu.:-0.7363 1st Qu.:-0.3495 1st Qu.:-0.64438 1st Qu.:-0.4586
## Median :-0.2119 Median : 0.1196 Median :-0.07202 Median : 0.1355
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.5255 3rd Qu.: 0.4806 3rd Qu.: 0.70223 3rd Qu.: 0.7451
## Max. : 2.6804 Max. : 1.7862 Max. : 2.19968 Max. : 1.3082
## ph p k ca
## Min. :-2.76562 Min. :-0.51945 Min. :-1.2338 Min. :-1.4550
## 1st Qu.:-0.73771 1st Qu.:-0.41300 1st Qu.:-0.7332 1st Qu.:-0.8709
## Median : 0.04225 Median :-0.31581 Median :-0.3878 Median :-0.0794
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.66622 3rd Qu.:-0.06664 3rd Qu.: 0.6222 3rd Qu.: 0.5032
## Max. : 2.38214 Max. : 5.76973 Max. : 2.7657 Max. : 2.4702
## mg al ctc sat
## Min. :-1.9339 Min. :-0.2984 Min. :-1.5625 Min. :-3.1413
## 1st Qu.:-0.5999 1st Qu.:-0.2984 1st Qu.:-0.8460 1st Qu.:-0.6637
## Median :-0.1453 Median :-0.2984 Median :-0.1239 Median : 0.2956
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5707 3rd Qu.:-0.2984 3rd Qu.: 0.6344 3rd Qu.: 0.8398
## Max. : 2.2374 Max. : 5.0720 Max. : 2.3522 Max. : 1.3802
## mo arg are cas
## Min. :-1.7892 Min. :-1.8237 Min. :-2.3056 Min. :-0.7396
## 1st Qu.:-0.7080 1st Qu.:-0.6009 1st Qu.:-0.4985 1st Qu.:-0.6269
## Median :-0.2265 Median :-0.1888 Median : 0.1126 Median :-0.3890
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5854 3rd Qu.: 0.3499 3rd Qu.: 0.7998 3rd Qu.: 0.3114
## Max. : 2.5356 Max. : 2.6955 Max. : 1.4942 Max. : 5.1934
## acc prod
## Min. :-2.3356 Min. : 20.90
## 1st Qu.:-0.5648 1st Qu.: 52.91
## Median : 0.1237 Median : 88.01
## Mean : 0.0000 Mean : 82.81
## 3rd Qu.: 0.8066 3rd Qu.:105.59
## Max. : 1.6078 Max. :170.76
# Gráfico de pares de diagrama de dispersão.
splom(db, as.matrix = TRUE, type = c("p", "r"))
Ao todo são 48 registros completos e 21 variáveis.
A técnica do melhor subconjunto de variáveis visa determinar, para uma quantidade fixa de variáveis, o melhor subconjunto para a predição da resposta.
# Ajuste do modelo com todas as variáveis.
m0 <- lm(prod ~ ., data = db)
summary(m0)
##
## Call:
## lm(formula = prod ~ ., data = db)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.487 -11.679 -2.066 7.435 45.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.8139 3.4921 23.715 <2e-16 ***
## Ur -277.1771 265.6470 -1.043 0.306
## Us -407.1099 375.2216 -1.085 0.288
## alp -229.6934 254.4104 -0.903 0.375
## n -71.0049 75.3781 -0.942 0.355
## I -187.8809 214.1330 -0.877 0.388
## Ui 529.2090 486.7247 1.087 0.287
## S -112.2970 96.0927 -1.169 0.253
## ph 24.4055 10.1260 2.410 0.023 *
## p -3.8043 8.2028 -0.464 0.647
## k -3.5993 6.1756 -0.583 0.565
## ca -45.3885 52.9315 -0.857 0.399
## mg -15.5749 16.1081 -0.967 0.342
## al 3.7825 4.6485 0.814 0.423
## ctc 53.3969 47.5504 1.123 0.271
## sat 18.0690 19.3197 0.935 0.358
## mo -4.4989 7.3634 -0.611 0.546
## arg 0.6019 9.8178 0.061 0.952
## are -152.8940 151.6733 -1.008 0.322
## cas -31.3163 33.3893 -0.938 0.357
## acc 151.9501 149.7914 1.014 0.319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.19 on 27 degrees of freedom
## Multiple R-squared: 0.7501, Adjusted R-squared: 0.5649
## F-statistic: 4.051 on 20 and 27 DF, p-value: 0.0004454
# Quantos subconjuntos podem ser formados de tamanho 1 até k?
k <- ncol(db) - 1
k
## [1] 20
# É a quantidade total de modelos! Mais de 1 milhão com 20 variáveis.
sum(sapply(1:k, choose, n = k))
## [1] 1048575
leaps
#-----------------------------------------------------------------------
# Recursos para avaliar todas as combinações possíveis.
library(leaps)
ls("package:leaps")
## [1] "leaps" "regsubsets"
# help(regsubsets, h = "html")
# Melhor modelo de cada tamanho.
# `nvmax` é o limite superior para o tamanho do modelo.
b0 <- regsubsets(prod ~ .,
data = db,
method = "exhaustive",
nvmax = 12)
sel <- summary(b0)
sel
## Subset selection object
## Call: regsubsets.formula(prod ~ ., data = db, method = "exhaustive",
## nvmax = 12)
## 20 Variables (and intercept)
## Forced in Forced out
## Ur FALSE FALSE
## Us FALSE FALSE
## alp FALSE FALSE
## n FALSE FALSE
## I FALSE FALSE
## Ui FALSE FALSE
## S FALSE FALSE
## ph FALSE FALSE
## p FALSE FALSE
## k FALSE FALSE
## ca FALSE FALSE
## mg FALSE FALSE
## al FALSE FALSE
## ctc FALSE FALSE
## sat FALSE FALSE
## mo FALSE FALSE
## arg FALSE FALSE
## are FALSE FALSE
## cas FALSE FALSE
## acc FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## Ur Us alp n I Ui S ph p k ca mg al ctc sat mo
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " "*" " " " "
## 4 ( 1 ) " " " " " " "*" " " " " " " "*" "*" " " " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " " " " " " " " " "*" "*" " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " " " " " "*" " " " " "*" "*" " " " " " " " " "*" " " " "
## 8 ( 1 ) " " " " "*" " " " " "*" " " "*" "*" " " " " " " " " "*" " " " "
## 9 ( 1 ) " " " " " " "*" "*" "*" " " "*" "*" " " " " " " " " "*" " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " " "*" "*" " " " " " " " " "*" " " " "
## 11 ( 1 ) "*" "*" "*" " " " " "*" "*" "*" "*" " " "*" " " " " " " " " " "
## 12 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" " " "*" " " " " " " " " " "
## arg are cas acc
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) "*" " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " "*" "*" "*"
## 6 ( 1 ) " " "*" "*" "*"
## 7 ( 1 ) " " "*" "*" "*"
## 8 ( 1 ) " " "*" "*" "*"
## 9 ( 1 ) " " "*" "*" "*"
## 10 ( 1 ) " " "*" "*" "*"
## 11 ( 1 ) " " "*" "*" "*"
## 12 ( 1 ) " " "*" "*" "*"
# Extrai as medidas de ajuste.
mea <- sapply(c("rsq", "rss", "adjr2", "cp", "bic"),
FUN = function(i) sel[[i]])
mea <- cbind(mod = 1:nrow(mea), as.data.frame(mea))
mea
## mod rsq rss adjr2 cp bic
## 1 1 0.5863481 26157.17 0.5773556 0.6865037 -34.62866
## 2 2 0.6553706 21792.54 0.6400537 -4.7699474 -39.52011
## 3 3 0.6626247 21333.83 0.6396219 -3.5536049 -36.67004
## 4 4 0.6731607 20667.59 0.6427571 -2.6918010 -34.32176
## 5 5 0.6773063 20405.45 0.6388903 -1.1396425 -31.06327
## 6 6 0.6886258 19689.66 0.6430589 -0.3624878 -28.90607
## 7 7 0.6992983 19014.79 0.6466755 0.4845787 -26.70894
## 8 8 0.7057321 18607.95 0.6453695 1.7895341 -23.87590
## 9 9 0.7105030 18306.26 0.6419380 3.2741373 -20.78929
## 10 10 0.7181393 17823.39 0.6419607 4.4492018 -18.20121
## 11 11 0.7232111 17502.67 0.6386368 5.9012907 -15.20160
## 12 12 0.7257293 17343.43 0.6316936 7.6292562 -11.76909
# Exibe as medidas de ajuste.
f <- paste(paste(colnames(mea)[-1], collapse = " + "), " ~ mod")
xyplot(as.formula(f),
data = mea,
outer = TRUE,
scales = "free",
type = "h",
lwd = 3,
as.table = TRUE,
xlab = "Modelo",
ylab = "Medida de ajuste")
str(sel)
## List of 8
## $ which : logi [1:12, 1:21] TRUE TRUE TRUE TRUE TRUE TRUE ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:12] "1" "2" "3" "4" ...
## .. ..$ : chr [1:21] "(Intercept)" "Ur" "Us" "alp" ...
## $ rsq : num [1:12] 0.586 0.655 0.663 0.673 0.677 ...
## $ rss : num [1:12] 26157 21793 21334 20668 20405 ...
## $ adjr2 : num [1:12] 0.577 0.64 0.64 0.643 0.639 ...
## $ cp : num [1:12] 0.687 -4.77 -3.554 -2.692 -1.14 ...
## $ bic : num [1:12] -34.6 -39.5 -36.7 -34.3 -31.1 ...
## $ outmat: chr [1:12, 1:20] " " " " " " " " ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:12] "1 ( 1 )" "2 ( 1 )" "3 ( 1 )" "4 ( 1 )" ...
## .. ..$ : chr [1:20] "Ur" "Us" "alp" "n" ...
## $ obj :List of 28
## ..$ np : int 21
## ..$ nrbar : int 210
## ..$ d : num [1:21] 48 47 47 43.4 43.9 ...
## ..$ rbar : num [1:210] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ thetab : num [1:21] 82.814 -2.417 -6.133 4.685 0.211 ...
## ..$ first : int 2
## ..$ last : int 21
## ..$ vorder : int [1:21] 1 8 3 10 20 7 15 18 11 19 ...
## ..$ tol : num [1:21] 3.46e-09 6.66e-09 3.60e-09 9.31e-09 9.17e-09 ...
## ..$ rss : num [1:21] 63235 62960 61193 60241 60239 ...
## ..$ bound : num [1:21] 63235 26157 21793 21334 20668 ...
## ..$ nvmax : int 13
## ..$ ress : num [1:13, 1] 63235 26157 21793 21334 20668 ...
## ..$ ir : int 13
## ..$ nbest : int 1
## ..$ lopt : int [1:91, 1] 1 1 12 1 9 15 1 9 18 15 ...
## ..$ il : int 91
## ..$ ier : int 0
## ..$ xnames : chr [1:21] "(Intercept)" "Ur" "Us" "alp" ...
## ..$ method : chr "exhaustive"
## ..$ force.in : Named logi [1:21] TRUE FALSE FALSE FALSE FALSE FALSE ...
## .. ..- attr(*, "names")= chr [1:21] "" "Ur" "Us" "alp" ...
## ..$ force.out: Named logi [1:21] FALSE FALSE FALSE FALSE FALSE FALSE ...
## .. ..- attr(*, "names")= chr [1:21] "" "Ur" "Us" "alp" ...
## ..$ sserr : num 15804
## ..$ intercept: logi TRUE
## ..$ lindep : logi [1:21] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ nullrss : num 63235
## ..$ nn : int 48
## ..$ call : language regsubsets.formula(prod ~ ., data = db, method = "exhaustive", nvmax = 12)
## ..- attr(*, "class")= chr "regsubsets"
## - attr(*, "class")= chr "summary.regsubsets"
# Quais as variáveis do melhor modelo?
v <- names(which(sel$which[2, ])[-1])
v
## [1] "ph" "ctc"
summary(lm(prod ~ ., data = db[, c("prod", v)]))
##
## Call:
## lm(formula = prod ~ ., data = db[, c("prod", v)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.972 -13.535 -2.684 15.982 54.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.814 3.176 26.072 < 2e-16 ***
## ph 14.833 4.587 3.234 0.002291 **
## ctc 17.225 4.587 3.755 0.000495 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.01 on 45 degrees of freedom
## Multiple R-squared: 0.6554, Adjusted R-squared: 0.6401
## F-statistic: 42.79 on 2 and 45 DF, p-value: 3.894e-11
A validação cruzada será usada para selecionar o número ideal de variáveis para a predição. Dessa forma, a função custo será o erro quadrático médio (MSE) da predição no conjunto de teste.
IMPORTANTE: Em cada fold gerado pelas partições o conjunto das variáveis selecionadas não necessariamente é o mesmo. Ou seja, no fold 1, as melhores 3 variáveis podem ser x1
, x4
e x8
e no fold 2 podem ser x4
, x7
e x10
. Isso deve acontecer principalmente entre as variáveis preditoras mais correlacionadas. No entanto, isso não é um problema porque a validação cruzada será feita para selecionar o número ideal de variáveis. Depois será feita a determinação de quais são essas variáveis.
Aqui será usando validação cruzada 5-fold com 10 repetições independentes.
# Para fazer a predição no conjunto de teste com o ajuste do conjunto de
# treino que considera as variáveis selecionadas.
mse_fit_reg <- function(variables, dt_train, dt_test) {
# print(variables)
# print(str(train[, c("prod", variables)]))
# Ajusta o modelo com os dados de treino.
tra <- dt_train[, c("prod", variables)]
m0 <- lm(prod ~ ., data = tra)
# Predição com os dados de teste.
fitted_vals <- predict(m0, newdata = dt_test[, c("prod", variables)])
# Mean esquare error.
crossprod((dt_test$prod - fitted_vals))/nrow(dt_test)
}
# Ajusta o modelo em cada fold e retorna a medida de ajuste.
fit_fold <- function(fold, indexes) {
# Vetor lógico para particionar treino/teste.
is_test <- indexes == fold
dt_test <- db[is_test, ] # Teste.
dt_train <- db[!is_test, ] # Treino.
# Os melhores ajuste com tamanho máximo de 12 variáveis.
fit <- regsubsets(prod ~ .,
data = dt_train,
method = "exhaustive",
nvmax = 12)
sel <- summary(fit)
# Variáveis mantidas em cada ajuste.
variables <- lapply(apply(sel$which[, -1],
MARGIN = 1,
FUN = which),
FUN = names)
# Medida da função custo nos dados de teste.
mse_test <- sapply(variables,
FUN = mse_fit_reg,
dt_train = dt_train,
dt_test = dt_test)
return(mse_test)
}
n <- nrow(db) # Número de observações.
k <- 5 # Número de folds
i <- ceiling((1:n)/(n/k)) # Indicador de fold
res <- replicate(n = 10, simplify = FALSE, {
ii <- sample(i) # Embaralha os índices.
res <- lapply(1:k, fit_fold, indexes = ii)
res <- as.data.frame(do.call(cbind, res))
res$vars <- 1:nrow(res)
res
})
res <- do.call(rbind, res)
xyplot(V1 + V2 + V3 + V4 + V5 ~ vars,
data = res,
xlab = "Número de variáveis no modelo",
ylab = "Função custo no conjunto de teste",
type = c("p", "a")) +
layer({
m <- aggregate(y ~ x, FUN = mean)
panel.lines(m$x, m$y, col = 1, lwd = 2)
})
# Quais são as melhores variáveis então?
fit <- regsubsets(prod ~ .,
data = db,
method = "exhaustive",
nvmax = 2)
sel <- summary(fit)
v <- names(which(sel$which[2, ])[-1])
summary(lm(prod ~ .,
data = db[, c("prod", v)]))
##
## Call:
## lm(formula = prod ~ ., data = db[, c("prod", v)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.972 -13.535 -2.684 15.982 54.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.814 3.176 26.072 < 2e-16 ***
## ph 14.833 4.587 3.234 0.002291 **
## ctc 17.225 4.587 3.755 0.000495 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.01 on 45 degrees of freedom
## Multiple R-squared: 0.6554, Adjusted R-squared: 0.6401
## F-statistic: 42.79 on 2 and 45 DF, p-value: 3.894e-11
summary(m0)
##
## Call:
## lm(formula = prod ~ ., data = db)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.487 -11.679 -2.066 7.435 45.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.8139 3.4921 23.715 <2e-16 ***
## Ur -277.1771 265.6470 -1.043 0.306
## Us -407.1099 375.2216 -1.085 0.288
## alp -229.6934 254.4104 -0.903 0.375
## n -71.0049 75.3781 -0.942 0.355
## I -187.8809 214.1330 -0.877 0.388
## Ui 529.2090 486.7247 1.087 0.287
## S -112.2970 96.0927 -1.169 0.253
## ph 24.4055 10.1260 2.410 0.023 *
## p -3.8043 8.2028 -0.464 0.647
## k -3.5993 6.1756 -0.583 0.565
## ca -45.3885 52.9315 -0.857 0.399
## mg -15.5749 16.1081 -0.967 0.342
## al 3.7825 4.6485 0.814 0.423
## ctc 53.3969 47.5504 1.123 0.271
## sat 18.0690 19.3197 0.935 0.358
## mo -4.4989 7.3634 -0.611 0.546
## arg 0.6019 9.8178 0.061 0.952
## are -152.8940 151.6733 -1.008 0.322
## cas -31.3163 33.3893 -0.938 0.357
## acc 151.9501 149.7914 1.014 0.319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.19 on 27 degrees of freedom
## Multiple R-squared: 0.7501, Adjusted R-squared: 0.5649
## F-statistic: 4.051 on 20 and 27 DF, p-value: 0.0004454
# Critério de seleção AIC: k = 2.
m1 <- step(m0, direction = "both")
## Start: AIC=320.25
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg +
## al + ctc + sat + mo + arg + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - arg 1 2.2 15807 318.26
## - p 1 125.9 15930 318.63
## - k 1 198.8 16003 318.85
## - mo 1 218.5 16023 318.91
## - al 1 387.6 16192 319.41
## - ca 1 430.4 16235 319.54
## - I 1 450.6 16255 319.60
## - alp 1 477.1 16282 319.68
## - sat 1 512.0 16316 319.78
## - cas 1 514.9 16319 319.79
## - n 1 519.4 16324 319.80
## - mg 1 547.2 16352 319.88
## - are 1 594.8 16399 320.02
## - acc 1 602.3 16407 320.04
## - Ur 1 637.3 16442 320.15
## <none> 15804 320.25
## - Us 1 689.1 16494 320.30
## - Ui 1 692.0 16496 320.31
## - ctc 1 738.1 16542 320.44
## - S 1 799.4 16604 320.62
## - ph 1 3400.3 19205 327.60
##
## Step: AIC=318.26
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg +
## al + ctc + sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - p 1 138.1 15945 316.67
## - mo 1 218.7 16025 316.91
## - k 1 227.4 16034 316.94
## - al 1 390.4 16197 317.43
## - ca 1 428.5 16235 317.54
## - I 1 459.3 16266 317.63
## - alp 1 485.8 16292 317.71
## - sat 1 509.8 16316 317.78
## - cas 1 542.4 16349 317.87
## - n 1 543.7 16350 317.88
## - mg 1 547.5 16354 317.89
## - acc 1 632.8 16439 318.14
## - are 1 643.2 16450 318.17
## - Ur 1 654.8 16461 318.20
## <none> 15807 318.26
## - Ui 1 713.4 16520 318.37
## - Us 1 713.5 16520 318.37
## - ctc 1 736.0 16543 318.44
## - S 1 839.6 16646 318.74
## + arg 1 2.2 15804 320.25
## - ph 1 3401.5 19208 325.61
##
## Step: AIC=316.67
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + k + ca + mg + al +
## ctc + sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - k 1 277.9 16223 315.50
## - mo 1 386.5 16331 315.82
## - cas 1 424.1 16369 315.93
## - al 1 428.8 16374 315.95
## - acc 1 507.9 16452 316.18
## - are 1 520.1 16465 316.21
## - ca 1 530.0 16475 316.24
## - sat 1 553.6 16498 316.31
## - mg 1 584.3 16529 316.40
## <none> 15945 316.67
## - I 1 899.1 16844 317.31
## - ctc 1 920.3 16865 317.37
## - alp 1 929.6 16874 317.39
## - Ur 1 1074.9 17020 317.80
## - Ui 1 1199.5 17144 318.15
## - Us 1 1225.5 17170 318.23
## + p 1 138.1 15807 318.26
## + arg 1 14.3 15930 318.63
## - n 1 1684.2 17629 319.49
## - S 1 1969.1 17914 320.26
## - ph 1 3322.9 19268 323.76
##
## Step: AIC=315.5
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + al + ctc +
## sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - mo 1 329.14 16552 314.47
## - al 1 379.31 16602 314.61
## - ca 1 442.05 16665 314.79
## - mg 1 486.78 16709 314.92
## - sat 1 500.75 16723 314.96
## - cas 1 624.60 16847 315.32
## <none> 16223 315.50
## - acc 1 695.30 16918 315.52
## - are 1 715.82 16938 315.57
## - ctc 1 754.20 16977 315.68
## - I 1 893.21 17116 316.07
## - alp 1 923.70 17146 316.16
## - Ur 1 1042.24 17265 316.49
## + k 1 277.90 15945 316.67
## - Ui 1 1158.05 17381 316.81
## - Us 1 1185.83 17408 316.89
## + p 1 188.53 16034 316.94
## + arg 1 72.15 16150 317.29
## - n 1 1639.95 17862 318.12
## - S 1 1871.31 18094 318.74
## - ph 1 3053.35 19276 321.78
##
## Step: AIC=314.47
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + al + ctc +
## sat + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - al 1 505.1 17057 313.91
## - cas 1 511.2 17063 313.93
## - acc 1 590.8 17142 314.15
## - are 1 617.3 17169 314.22
## - I 1 680.8 17232 314.40
## - sat 1 689.8 17242 314.43
## <none> 16552 314.47
## - alp 1 707.4 17259 314.47
## - ca 1 757.0 17309 314.61
## - Ur 1 842.2 17394 314.85
## - mg 1 854.2 17406 314.88
## - Ui 1 942.1 17494 315.12
## - Us 1 967.3 17519 315.19
## - ctc 1 982.6 17534 315.23
## + p 1 350.5 16201 315.44
## + mo 1 329.1 16223 315.50
## + k 1 220.6 16331 315.82
## - n 1 1335.4 17887 316.19
## + arg 1 28.9 16523 316.38
## - S 1 1572.1 18124 316.82
## - ph 1 3193.6 19745 320.93
##
## Step: AIC=313.91
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + ctc +
## sat + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - sat 1 431.90 17489 313.11
## - cas 1 502.36 17559 313.30
## - ca 1 543.74 17601 313.42
## - acc 1 550.92 17608 313.43
## - are 1 573.54 17630 313.50
## - I 1 587.80 17645 313.54
## - alp 1 613.44 17670 313.60
## - mg 1 613.95 17671 313.61
## <none> 17057 313.91
## - Ur 1 758.89 17816 314.00
## - ctc 1 762.26 17819 314.01
## - Ui 1 847.42 17904 314.24
## - Us 1 868.57 17925 314.29
## + al 1 505.12 16552 314.47
## + mo 1 454.96 16602 314.61
## + p 1 444.84 16612 314.64
## - n 1 1128.12 18185 314.98
## + k 1 161.53 16895 315.45
## - S 1 1402.45 18459 315.70
## + arg 1 29.54 17027 315.83
## - ph 1 2938.39 19995 319.54
##
## Step: AIC=313.11
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + ctc +
## are + cas + acc
##
## Df Sum of Sq RSS AIC
## - ca 1 128.5 17617 311.46
## - mg 1 185.8 17675 311.62
## - ctc 1 330.9 17820 312.01
## - cas 1 512.1 18001 312.49
## - acc 1 567.0 18056 312.64
## - are 1 591.0 18080 312.70
## <none> 17489 313.11
## + mo 1 587.2 16902 313.47
## + p 1 528.0 16961 313.64
## + sat 1 431.9 17057 313.91
## - I 1 1098.5 18587 314.03
## - alp 1 1151.1 18640 314.17
## + al 1 247.3 17242 314.43
## - Ur 1 1336.6 18825 314.64
## + k 1 126.7 17362 314.76
## - n 1 1396.6 18885 314.80
## - Ui 1 1407.3 18896 314.82
## - Us 1 1417.2 18906 314.85
## + arg 1 10.1 17479 315.08
## - S 1 1642.0 19131 315.42
## - ph 1 4927.7 22416 323.02
##
## Step: AIC=311.46
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + mg + ctc + are +
## cas + acc
##
## Df Sum of Sq RSS AIC
## - mg 1 57.4 17675 309.62
## - cas 1 591.8 18209 311.05
## - acc 1 651.1 18268 311.20
## - are 1 669.4 18287 311.25
## - ctc 1 721.6 18339 311.39
## <none> 17617 311.46
## + mo 1 688.8 16928 311.55
## + p 1 626.6 16991 311.72
## - I 1 1096.3 18714 312.36
## - alp 1 1151.8 18769 312.50
## + al 1 255.3 17362 312.76
## - Ur 1 1339.5 18957 312.98
## + ca 1 128.5 17489 313.11
## - Ui 1 1419.9 19037 313.18
## + k 1 88.5 17529 313.22
## - Us 1 1436.3 19054 313.22
## + sat 1 16.7 17601 313.42
## - n 1 1514.9 19132 313.42
## + arg 1 8.1 17609 313.44
## - S 1 1736.5 19354 313.97
## - ph 1 7964.8 25582 327.37
##
## Step: AIC=309.62
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ctc + are + cas +
## acc
##
## Df Sum of Sq RSS AIC
## - ctc 1 676.8 18351 309.42
## - cas 1 723.3 18398 309.54
## + mo 1 732.3 16942 309.59
## <none> 17675 309.62
## - acc 1 795.6 18470 309.73
## - are 1 816.8 18491 309.79
## + p 1 525.7 17149 310.17
## - I 1 1041.7 18716 310.37
## - alp 1 1097.9 18772 310.51
## + al 1 246.4 17428 310.94
## - Ur 1 1293.4 18968 311.01
## - Ui 1 1377.0 19052 311.22
## - Us 1 1393.1 19068 311.26
## + k 1 76.2 17598 311.41
## - n 1 1464.6 19139 311.44
## + mg 1 57.4 17617 311.46
## + sat 1 3.5 17671 311.61
## + arg 1 0.1 17675 311.62
## + ca 1 0.0 17675 311.62
## - S 1 1696.7 19371 312.02
## - ph 1 7922.4 25597 325.39
##
## Step: AIC=309.42
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## <none> 18351 309.42
## - I 1 857.7 19209 309.61
## + ctc 1 676.8 17675 309.62
## - alp 1 918.7 19270 309.77
## + p 1 606.9 17744 309.81
## + ca 1 518.4 17833 310.05
## - Ur 1 1147.4 19499 310.33
## - Ui 1 1254.5 19606 310.59
## + al 1 298.5 18053 310.63
## - Us 1 1278.8 19630 310.65
## + sat 1 140.9 18211 311.05
## - n 1 1496.3 19848 311.18
## + mo 1 46.0 18305 311.30
## + mg 1 12.6 18339 311.39
## + arg 1 12.2 18339 311.39
## - cas 1 1590.7 19942 311.41
## + k 1 0.5 18351 311.42
## - acc 1 1668.2 20020 311.60
## - are 1 1774.3 20126 311.85
## - S 1 1778.6 20130 311.86
## - ph 1 21933.6 40285 345.16
summary(m1)
##
## Call:
## lm(formula = prod ~ Ur + Us + alp + n + I + Ui + S + ph + are +
## cas + acc, data = db)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.879 -14.736 -2.808 11.697 51.090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.814 3.259 25.412 < 2e-16 ***
## Ur -291.046 193.993 -1.500 0.1423
## Us -428.370 270.464 -1.584 0.1220
## alp -240.393 179.071 -1.342 0.1879
## n -77.122 45.014 -1.713 0.0953 .
## I -194.908 150.260 -1.297 0.2028
## Ui 553.542 352.863 1.569 0.1255
## S -116.858 62.562 -1.868 0.0699 .
## ph 25.891 3.947 6.560 1.25e-07 ***
## are -210.011 112.568 -1.866 0.0703 .
## cas -44.003 24.909 -1.767 0.0858 .
## acc 203.894 112.712 1.809 0.0788 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.58 on 36 degrees of freedom
## Multiple R-squared: 0.7098, Adjusted R-squared: 0.6211
## F-statistic: 8.004 on 11 and 36 DF, p-value: 8.054e-07
# Critério de seleção BIC: k = log(n).
m2 <- step(m0, direction = "both", k = log(nrow(db)))
## Start: AIC=359.54
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg +
## al + ctc + sat + mo + arg + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - arg 1 2.2 15807 355.68
## - p 1 125.9 15930 356.05
## - k 1 198.8 16003 356.27
## - mo 1 218.5 16023 356.33
## - al 1 387.6 16192 356.84
## - ca 1 430.4 16235 356.96
## - I 1 450.6 16255 357.02
## - alp 1 477.1 16282 357.10
## - sat 1 512.0 16316 357.20
## - cas 1 514.9 16319 357.21
## - n 1 519.4 16324 357.22
## - mg 1 547.2 16352 357.31
## - are 1 594.8 16399 357.45
## - acc 1 602.3 16407 357.47
## - Ur 1 637.3 16442 357.57
## - Us 1 689.1 16494 357.72
## - Ui 1 692.0 16496 357.73
## - ctc 1 738.1 16542 357.86
## - S 1 799.4 16604 358.04
## <none> 15804 359.54
## - ph 1 3400.3 19205 365.03
##
## Step: AIC=355.68
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + p + k + ca + mg +
## al + ctc + sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - p 1 138.1 15945 352.23
## - mo 1 218.7 16025 352.47
## - k 1 227.4 16034 352.49
## - al 1 390.4 16197 352.98
## - ca 1 428.5 16235 353.09
## - I 1 459.3 16266 353.18
## - alp 1 485.8 16292 353.26
## - sat 1 509.8 16316 353.33
## - cas 1 542.4 16349 353.43
## - n 1 543.7 16350 353.43
## - mg 1 547.5 16354 353.44
## - acc 1 632.8 16439 353.69
## - are 1 643.2 16450 353.72
## - Ur 1 654.8 16461 353.76
## - Ui 1 713.4 16520 353.93
## - Us 1 713.5 16520 353.93
## - ctc 1 736.0 16543 353.99
## - S 1 839.6 16646 354.29
## <none> 15807 355.68
## + arg 1 2.2 15804 359.54
## - ph 1 3401.5 19208 361.16
##
## Step: AIC=352.23
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + k + ca + mg + al +
## ctc + sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - k 1 277.9 16223 349.18
## - mo 1 386.5 16331 349.50
## - cas 1 424.1 16369 349.61
## - al 1 428.8 16374 349.63
## - acc 1 507.9 16452 349.86
## - are 1 520.1 16465 349.89
## - ca 1 530.0 16475 349.92
## - sat 1 553.6 16498 349.99
## - mg 1 584.3 16529 350.08
## - I 1 899.1 16844 350.99
## - ctc 1 920.3 16865 351.05
## - alp 1 929.6 16874 351.07
## - Ur 1 1074.9 17020 351.49
## - Ui 1 1199.5 17144 351.84
## - Us 1 1225.5 17170 351.91
## <none> 15945 352.23
## - n 1 1684.2 17629 353.17
## - S 1 1969.1 17914 353.94
## + p 1 138.1 15807 355.68
## + arg 1 14.3 15930 356.05
## - ph 1 3322.9 19268 357.44
##
## Step: AIC=349.18
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + al + ctc +
## sat + mo + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - mo 1 329.14 16552 346.28
## - al 1 379.31 16602 346.42
## - ca 1 442.05 16665 346.60
## - mg 1 486.78 16709 346.73
## - sat 1 500.75 16723 346.77
## - cas 1 624.60 16847 347.13
## - acc 1 695.30 16918 347.33
## - are 1 715.82 16938 347.38
## - ctc 1 754.20 16977 347.49
## - I 1 893.21 17116 347.89
## - alp 1 923.70 17146 347.97
## - Ur 1 1042.24 17265 348.30
## - Ui 1 1158.05 17381 348.62
## - Us 1 1185.83 17408 348.70
## <none> 16223 349.18
## - n 1 1639.95 17862 349.93
## - S 1 1871.31 18094 350.55
## + k 1 277.90 15945 352.23
## + p 1 188.53 16034 352.49
## + arg 1 72.15 16150 352.84
## - ph 1 3053.35 19276 353.59
##
## Step: AIC=346.28
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + al + ctc +
## sat + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - al 1 505.1 17057 343.85
## - cas 1 511.2 17063 343.87
## - acc 1 590.8 17142 344.09
## - are 1 617.3 17169 344.16
## - I 1 680.8 17232 344.34
## - sat 1 689.8 17242 344.37
## - alp 1 707.4 17259 344.41
## - ca 1 757.0 17309 344.55
## - Ur 1 842.2 17394 344.79
## - mg 1 854.2 17406 344.82
## - Ui 1 942.1 17494 345.06
## - Us 1 967.3 17519 345.13
## - ctc 1 982.6 17534 345.17
## - n 1 1335.4 17887 346.13
## <none> 16552 346.28
## - S 1 1572.1 18124 346.76
## + p 1 350.5 16201 349.12
## + mo 1 329.1 16223 349.18
## + k 1 220.6 16331 349.50
## + arg 1 28.9 16523 350.06
## - ph 1 3193.6 19745 350.87
##
## Step: AIC=343.85
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + ctc +
## sat + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - sat 1 431.90 17489 341.18
## - cas 1 502.36 17559 341.37
## - ca 1 543.74 17601 341.48
## - acc 1 550.92 17608 341.50
## - are 1 573.54 17630 341.56
## - I 1 587.80 17645 341.60
## - alp 1 613.44 17670 341.67
## - mg 1 613.95 17671 341.67
## - Ur 1 758.89 17816 342.07
## - ctc 1 762.26 17819 342.08
## - Ui 1 847.42 17904 342.30
## - Us 1 868.57 17925 342.36
## - n 1 1128.12 18185 343.05
## - S 1 1402.45 18459 343.77
## <none> 17057 343.85
## + al 1 505.12 16552 346.28
## + mo 1 454.96 16602 346.42
## + p 1 444.84 16612 346.45
## + k 1 161.53 16895 347.26
## - ph 1 2938.39 19995 347.61
## + arg 1 29.54 17027 347.64
##
## Step: AIC=341.18
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ca + mg + ctc +
## are + cas + acc
##
## Df Sum of Sq RSS AIC
## - ca 1 128.5 17617 337.66
## - mg 1 185.8 17675 337.81
## - ctc 1 330.9 17820 338.21
## - cas 1 512.1 18001 338.69
## - acc 1 567.0 18056 338.84
## - are 1 591.0 18080 338.90
## - I 1 1098.5 18587 340.23
## - alp 1 1151.1 18640 340.37
## - Ur 1 1336.6 18825 340.84
## - n 1 1396.6 18885 340.99
## - Ui 1 1407.3 18896 341.02
## - Us 1 1417.2 18906 341.05
## <none> 17489 341.18
## - S 1 1642.0 19131 341.61
## + mo 1 587.2 16902 343.41
## + p 1 528.0 16961 343.58
## + sat 1 431.9 17057 343.85
## + al 1 247.3 17242 344.37
## + k 1 126.7 17362 344.70
## + arg 1 10.1 17479 345.02
## - ph 1 4927.7 22416 349.22
##
## Step: AIC=337.66
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + mg + ctc + are +
## cas + acc
##
## Df Sum of Sq RSS AIC
## - mg 1 57.4 17675 333.94
## - cas 1 591.8 18209 335.37
## - acc 1 651.1 18268 335.53
## - are 1 669.4 18287 335.58
## - ctc 1 721.6 18339 335.71
## - I 1 1096.3 18714 336.68
## - alp 1 1151.8 18769 336.83
## - Ur 1 1339.5 18957 337.30
## - Ui 1 1419.9 19037 337.51
## - Us 1 1436.3 19054 337.55
## <none> 17617 337.66
## - n 1 1514.9 19132 337.75
## - S 1 1736.5 19354 338.30
## + mo 1 688.8 16928 339.61
## + p 1 626.6 16991 339.79
## + al 1 255.3 17362 340.83
## + ca 1 128.5 17489 341.18
## + k 1 88.5 17529 341.29
## + sat 1 16.7 17601 341.48
## + arg 1 8.1 17609 341.51
## - ph 1 7964.8 25582 351.69
##
## Step: AIC=333.94
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + ctc + are + cas +
## acc
##
## Df Sum of Sq RSS AIC
## - ctc 1 676.8 18351 331.87
## - cas 1 723.3 18398 332.00
## - acc 1 795.6 18470 332.18
## - are 1 816.8 18491 332.24
## - I 1 1041.7 18716 332.82
## - alp 1 1097.9 18772 332.96
## - Ur 1 1293.4 18968 333.46
## - Ui 1 1377.0 19052 333.67
## - Us 1 1393.1 19068 333.71
## - n 1 1464.6 19139 333.89
## <none> 17675 333.94
## - S 1 1696.7 19371 334.47
## + mo 1 732.3 16942 335.78
## + p 1 525.7 17149 336.36
## + al 1 246.4 17428 337.14
## + k 1 76.2 17598 337.61
## + mg 1 57.4 17617 337.66
## + sat 1 3.5 17671 337.80
## + arg 1 0.1 17675 337.81
## + ca 1 0.0 17675 337.81
## - ph 1 7922.4 25597 347.85
##
## Step: AIC=331.87
## prod ~ Ur + Us + alp + n + I + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - I 1 857.7 19209 330.20
## - alp 1 918.7 19270 330.35
## - Ur 1 1147.4 19499 330.91
## - Ui 1 1254.5 19606 331.18
## - Us 1 1278.8 19630 331.24
## - n 1 1496.3 19848 331.77
## <none> 18351 331.87
## - cas 1 1590.7 19942 331.99
## - acc 1 1668.2 20020 332.18
## - are 1 1774.3 20126 332.43
## - S 1 1778.6 20130 332.44
## + ctc 1 676.8 17675 333.94
## + p 1 606.9 17744 334.13
## + ca 1 518.4 17833 334.37
## + al 1 298.5 18053 334.96
## + sat 1 140.9 18211 335.38
## + mo 1 46.0 18305 335.63
## + mg 1 12.6 18339 335.71
## + arg 1 12.2 18339 335.71
## + k 1 0.5 18351 335.74
## - ph 1 21933.6 40285 365.74
##
## Step: AIC=330.2
## prod ~ Ur + Us + alp + n + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - Ur 1 664.5 19874 327.96
## - n 1 835.3 20044 328.37
## - Ui 1 1059.6 20269 328.90
## - Us 1 1088.3 20297 328.97
## - alp 1 1102.4 20312 329.00
## - S 1 1299.9 20509 329.47
## <none> 19209 330.20
## + p 1 1237.8 17971 330.87
## - acc 1 2222.9 21432 331.58
## - cas 1 2261.8 21471 331.67
## + I 1 857.7 18351 331.87
## - are 1 2364.5 21574 331.90
## + ctc 1 492.9 18716 332.82
## + sat 1 331.4 18878 333.23
## + ca 1 251.7 18957 333.43
## + al 1 129.3 19080 333.74
## + mg 1 93.1 19116 333.83
## + mo 1 4.8 19204 334.06
## + arg 1 2.3 19207 334.06
## + k 1 1.9 19207 334.06
## - ph 1 21404.9 40614 362.26
##
## Step: AIC=327.96
## prod ~ Us + alp + n + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - alp 1 528.2 20402 325.35
## - Us 1 610.6 20484 325.54
## - S 1 759.6 20633 325.89
## - n 1 791.9 20666 325.96
## - Ui 1 926.4 20800 326.27
## <none> 19874 327.96
## + p 1 767.8 19106 329.94
## + ctc 1 697.8 19176 330.11
## + Ur 1 664.5 19209 330.20
## - acc 1 2713.3 22587 330.23
## - cas 1 2750.8 22624 330.31
## - are 1 2930.0 22804 330.69
## + sat 1 417.9 19456 330.81
## + I 1 374.8 19499 330.91
## + ca 1 336.3 19537 331.01
## + mg 1 199.1 19675 331.35
## + al 1 188.0 19686 331.37
## + k 1 16.5 19857 331.79
## + mo 1 13.5 19860 331.80
## + arg 1 1.7 19872 331.82
## - ph 1 21246.5 41120 358.99
##
## Step: AIC=325.35
## prod ~ Us + n + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - n 1 1355.2 21757 324.56
## - Us 1 1550.5 21952 324.99
## - S 1 1602.8 22005 325.10
## <none> 20402 325.35
## - Ui 1 1790.6 22193 325.51
## - acc 1 2266.9 22669 326.53
## - cas 1 2317.5 22719 326.64
## - are 1 2468.9 22871 326.96
## + ctc 1 871.7 19530 327.12
## + p 1 714.5 19687 327.51
## + I 1 649.8 19752 327.66
## + alp 1 528.2 19874 327.96
## + sat 1 500.8 19901 328.02
## + ca 1 459.6 19942 328.12
## + mg 1 164.8 20237 328.83
## + al 1 130.2 20272 328.91
## + Ur 1 90.4 20312 329.00
## + arg 1 30.6 20371 329.14
## + k 1 15.2 20387 329.18
## + mo 1 0.4 20402 329.22
## - ph 1 21300.4 41702 355.79
##
## Step: AIC=324.56
## prod ~ Us + Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - Us 1 195.3 21952 321.12
## - S 1 342.3 22099 321.44
## - Ui 1 437.9 22195 321.65
## + p 1 1934.3 19823 323.96
## - acc 1 1700.8 23458 324.30
## - cas 1 1741.3 23498 324.39
## <none> 21757 324.56
## - are 1 1891.6 23649 324.69
## + ctc 1 1608.9 20148 324.74
## + n 1 1355.2 20402 325.35
## + I 1 1353.2 20404 325.35
## + alp 1 1091.5 20666 325.96
## + ca 1 849.8 20907 326.52
## + mg 1 287.3 21470 327.79
## + mo 1 249.0 21508 327.88
## + sat 1 130.8 21626 328.14
## + al 1 88.3 21669 328.24
## + k 1 85.8 21671 328.24
## + arg 1 40.9 21716 328.34
## + Ur 1 5.0 21752 328.42
## - ph 1 20030.9 41788 352.02
##
## Step: AIC=321.12
## prod ~ Ui + S + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - S 1 164.0 22116 317.61
## - Ui 1 306.3 22259 317.91
## - acc 1 1565.9 23518 320.56
## - cas 1 1620.2 23573 320.67
## - are 1 1751.1 23704 320.93
## + ctc 1 1715.9 20237 321.08
## <none> 21952 321.12
## + I 1 1548.5 20404 321.48
## + alp 1 1271.2 20681 322.13
## + p 1 959.3 20993 322.85
## + ca 1 942.8 21010 322.88
## + mg 1 268.5 21684 324.40
## + mo 1 234.2 21718 324.48
## + Us 1 195.3 21757 324.56
## + Ur 1 182.5 21770 324.59
## + sat 1 131.6 21821 324.70
## + al 1 117.8 21835 324.73
## + k 1 88.9 21864 324.80
## + arg 1 48.7 21904 324.88
## + n 1 0.0 21952 324.99
## - ph 1 20940.9 42893 349.40
##
## Step: AIC=317.61
## prod ~ Ui + ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## - Ui 1 165.6 22282 314.09
## + ctc 1 1745.7 20371 317.53
## <none> 22116 317.61
## + I 1 1587.7 20529 317.90
## - cas 1 2090.8 24207 318.07
## - acc 1 2116.5 24233 318.12
## + alp 1 1412.7 20704 318.31
## - are 1 2348.4 24465 318.58
## + ca 1 1003.4 21113 319.25
## + p 1 928.2 21188 319.42
## + mo 1 343.9 21772 320.72
## + mg 1 275.2 21841 320.88
## + al 1 200.8 21916 321.04
## + S 1 164.0 21952 321.12
## + arg 1 139.7 21977 321.17
## + k 1 138.9 21978 321.17
## + sat 1 98.8 22018 321.26
## + n 1 97.2 22019 321.27
## + Ur 1 81.0 22035 321.30
## + Us 1 17.0 22099 321.44
## - ph 1 21090.6 43207 345.88
##
## Step: AIC=314.09
## prod ~ ph + are + cas + acc
##
## Df Sum of Sq RSS AIC
## + ctc 1 1876.6 20405 313.74
## <none> 22282 314.09
## - cas 1 1953.7 24236 314.26
## - acc 1 1977.6 24260 314.30
## - are 1 2201.9 24484 314.74
## + ca 1 1020.0 21262 315.71
## + I 1 1009.8 21272 315.74
## + p 1 958.7 21323 315.85
## + alp 1 724.8 21557 316.38
## + mo 1 430.1 21852 317.03
## + mg 1 401.5 21881 317.09
## + k 1 220.3 22062 317.49
## + Ur 1 217.9 22064 317.49
## + arg 1 212.8 22069 317.50
## + al 1 168.9 22113 317.60
## + Ui 1 165.6 22116 317.61
## + sat 1 121.1 22161 317.70
## + Us 1 57.4 22225 317.84
## + S 1 23.3 22259 317.91
## + n 1 3.4 22279 317.96
## - ph 1 20959.6 43242 342.05
##
## Step: AIC=313.74
## prod ~ ph + are + cas + acc + ctc
##
## Df Sum of Sq RSS AIC
## - cas 1 923.2 21329 311.99
## - acc 1 965.7 21371 312.09
## - are 1 1020.7 21426 312.21
## <none> 20405 313.74
## - ctc 1 1876.6 22282 314.09
## + p 1 715.8 19690 315.90
## + I 1 551.2 19854 316.30
## + alp 1 433.8 19972 316.58
## + ca 1 153.6 20252 317.25
## + al 1 107.5 20298 317.36
## + Ur 1 55.3 20350 317.48
## + S 1 51.9 20354 317.49
## + arg 1 46.6 20359 317.50
## + mo 1 36.0 20369 317.53
## + Ui 1 34.7 20371 317.53
## + n 1 24.7 20381 317.55
## + Us 1 16.1 20389 317.57
## + k 1 10.8 20395 317.59
## + mg 1 6.8 20399 317.60
## + sat 1 0.2 20405 317.61
## - ph 1 5977.3 26383 322.20
##
## Step: AIC=311.99
## prod ~ ph + are + acc + ctc
##
## Df Sum of Sq RSS AIC
## - acc 1 47.8 21376 308.23
## - are 1 160.4 21489 308.48
## <none> 21329 311.99
## + cas 1 923.2 20405 313.74
## - ctc 1 2907.1 24236 314.26
## + p 1 312.6 21016 315.16
## + S 1 279.5 21049 315.23
## + alp 1 197.3 21131 315.42
## + I 1 185.6 21143 315.44
## + al 1 164.8 21164 315.49
## + n 1 161.0 21168 315.50
## + k 1 86.3 21242 315.67
## + arg 1 85.2 21243 315.67
## + ca 1 46.9 21282 315.76
## + Us 1 20.9 21308 315.82
## + Ur 1 10.4 21318 315.84
## + mo 1 9.0 21320 315.84
## + mg 1 8.3 21320 315.85
## + Ui 1 0.2 21328 315.86
## + sat 1 0.0 21329 315.86
## - ph 1 5444.6 26773 319.03
##
## Step: AIC=308.23
## prod ~ ph + are + ctc
##
## Df Sum of Sq RSS AIC
## - are 1 416.1 21792 305.28
## <none> 21376 308.23
## - ctc 1 2886.2 24263 310.44
## + S 1 320.5 21056 311.38
## + p 1 294.6 21082 311.43
## + n 1 179.8 21197 311.69
## + alp 1 169.9 21207 311.72
## + I 1 151.9 21224 311.76
## + al 1 145.4 21231 311.77
## + arg 1 80.4 21296 311.92
## + k 1 51.4 21325 311.99
## + acc 1 47.8 21329 311.99
## + ca 1 34.8 21342 312.02
## + Us 1 32.2 21344 312.03
## + Ur 1 22.0 21354 312.05
## + mg 1 13.0 21363 312.07
## + mo 1 12.1 21364 312.07
## + cas 1 5.3 21371 312.09
## + sat 1 0.5 21376 312.10
## + Ui 1 0.4 21376 312.10
## - ph 1 5431.5 26808 315.23
##
## Step: AIC=305.28
## prod ~ ph + ctc
##
## Df Sum of Sq RSS AIC
## <none> 21792 305.28
## + arg 1 458.7 21334 308.13
## + p 1 426.4 21366 308.21
## + are 1 416.1 21376 308.23
## + S 1 411.1 21381 308.24
## + acc 1 303.5 21489 308.48
## + n 1 240.0 21553 308.62
## + alp 1 140.3 21652 308.85
## + I 1 120.1 21672 308.89
## + al 1 108.1 21684 308.92
## + k 1 101.5 21691 308.93
## + Ur 1 81.3 21711 308.98
## + mo 1 63.7 21729 309.01
## + Ui 1 29.6 21763 309.09
## + cas 1 21.6 21771 309.11
## + mg 1 19.3 21773 309.11
## + sat 1 15.3 21777 309.12
## + Us 1 7.1 21786 309.14
## + ca 1 4.3 21788 309.14
## - ph 1 5064.2 26857 311.44
## - ctc 1 6829.3 28622 314.50
summary(m2)
##
## Call:
## lm(formula = prod ~ ph + ctc, data = db)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.972 -13.535 -2.684 15.982 54.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.814 3.176 26.072 < 2e-16 ***
## ph 14.833 4.587 3.234 0.002291 **
## ctc 17.225 4.587 3.755 0.000495 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.01 on 45 degrees of freedom
## Multiple R-squared: 0.6554, Adjusted R-squared: 0.6401
## F-statistic: 42.79 on 2 and 45 DF, p-value: 3.894e-11
O pacote MASS
tem a função lm.ridge()
para ajuste de modelos de regressão com regularização de norma \(L_2\) ou regressão ridge.
#-----------------------------------------------------------------------
# Usando o pacote MASS.
library(MASS)
m3 <- lm.ridge(prod ~ .,
data = db,
lambda = seq(from = 0,
to = 100,
by = 0.1))
# head(m3)
# Traço das estimativas como função de lambda.
plot(m3)
mtext(side = 1, line = 2, "Valor do parâmetro de regularização")
mtext(side = 2, line = 2, "Valores estimados para os parâmetros")
# Seleção do melhor valor.
select(m3)
## modified HKB estimator is 0.01577956
## modified L-W estimator is 10.66282
## smallest value of GCV at 66.6
ATENÇÃO! As variáveis preditoras não foram transformadas para a mesma escala. Os resultados podem ser diferentes.
#-----------------------------------------------------------------------
db_norm <- db
i <- names(db) != "prod"
db_norm[, i] <- as.data.frame(scale(db[, i]))
m3 <- lm.ridge(prod ~ .,
data = db_norm,
lambda = seq(from = 0,
to = 100,
by = 0.1))
# Traço das estimativas como função de lambda.
plot(m3)
mtext(side = 1, line = 2, "Valor do parâmetro de regularização")
mtext(side = 2, line = 2, "Valores estimados para os parâmetros")
# Seleção do melhor valor.
select(m3)
## modified HKB estimator is 0.01577956
## modified L-W estimator is 10.66282
## smallest value of GCV at 66.6
Pelos testes, o parâmetro de regularização escolhido por GCV foi o mesmo com a normalização ou não das preditoras. Pode-se acreditar então a função lm.ridge()
faça isso internamente. No entanto, a respectiva documentação não diz nada a respeito.
#-----------------------------------------------------------------------
# Implementando a regressão ridge (vanilla flavor).
rid <- function(lambda, y, X) {
XlX <- crossprod(X)
Xly <- crossprod(X, y)
II <- diag(ncol(X))
II[1, 1] <- 0
beta <- solve(XlX + lambda * II, Xly)
return(beta)
}
# Vetor resposta e matriz do modelo.
y <- cbind(db_norm$prod)
X <- model.matrix(~ ph + ctc + mg + I + are, data = db_norm)
# X <- model.matrix(~ . - prod, data = db_norm)
# Valores bem próximos. Diferenças são de sofisticação de implementação.
rid(lambda = 0.5, y = y, X = X)
## [,1]
## (Intercept) 82.8138526
## ph 15.7393005
## ctc 13.6045415
## mg -0.1493336
## I 1.7814979
## are -4.2334755
lm.ridge(prod ~ .,
data = db_norm[, c("prod", colnames(X)[-1])],
lambda = 0.5)
## ph ctc mg I are
## 82.8138526 15.7419859 13.6084765 -0.1524604 1.7813377 -4.2319936
# Sequencia de valores de lambda e estimativas dos parâmetros.
l <- 10^seq(-3, 3, length.out = 200)
r <- matrix(0, nrow = ncol(X), ncol = length(l))
for (i in seq_along(l)) {
r[, i] <- rid(lambda = l[i], y = y, X = X)
}
# Traço.
matplot(log10(l),
t(r[-1, ]),
type = "o",
lty = 1,
pch = 1,
cex = 0.5,
xlab = expression(log(lambda)),
ylab = expression(hat(beta)))
abline(h = 0, lty = 2)
legend("topright",
legend = colnames(X)[-1],
col = palette()[1:(ncol(X) - 1)],
lty = 1,
bty = "n")
Para regressão LASSO será utilizado um otimizador geral da função custo que contém a penalidade de norma \(L_1\). O pacote penalized
tem funções para o ajuste de regressão LASSO.
#-----------------------------------------------------------------------
# Implementação baunilha.
las <- function(beta, lambda, y, X) {
# Valores preditos.
ypred <- X %*% beta
# Sum of squares.
rss <- sum((y - ypred)^2)
# Penalidade no tamanho dos coeficientes.
penalty <- lambda * sum(abs(beta[-1]))
return(rss + penalty)
}
# Valores iniciais.
start <- coef(lm(y ~ 0 + X))
start
## X(Intercept) Xph Xctc Xmg XI
## 82.8138526 15.8675758 13.8014031 -0.3041363 1.7727697
## Xare
## -4.1583341
# Otimiza nos parâmetros para minimizar a função objetivo.
op <- optim(par = start,
fn = las,
lambda = 0,
method = "BFGS",
y = y,
X = X)
op$par
## X(Intercept) Xph Xctc Xmg XI
## 82.8138526 15.8675758 13.8014031 -0.3041363 1.7727697
## Xare
## -4.1583341
# Avalia uma sequência de valores de lambda.
l <- 10^seq(-3, 3, length.out = 200)
r <- sapply(l,
FUN = function(lam) {
op <- optim(start,
fn = las,
lambda = lam,
method = "BFGS",
y = y,
X = X)
start <<- op$par
return(op$par)
})
# Traço.
matplot(log10(l),
t(r[-1, ]),
type = "o",
lty = 1,
pch = 1,
cex = 0.5,
xlab = expression(log(lambda)),
ylab = expression(hat(beta)))
abline(h = 0, lty = 2)
legend("right",
legend = colnames(X)[-1],
col = palette()[1:(ncol(X) - 1)],
lty = 1,
bty = "n")
glmnet
O pacote glmnet
é o recomendado para a aplicação de regularização em modelos de regressão. O pacote permite fazer regularização com elastic-net da qual LASSO e Ridge são casos extremos quando o parâmetro \(\alpha\) é 1 ou 0, respectivamete.
library(glmnet)
y <- cbind(db$prod)
X <- model.matrix(~ . - prod, data = db_norm)
X <- X[, -1] # Remove a coluna do intercepto.
# X <- model.matrix(~ ph + ctc + ca + mg + S, data = db)
# X <- model.matrix(~ ph + ctc, data = db)
# Ajuste com penalização lasso, ridge e elastic net de mistura 1:1.
mridge <- glmnet(x = X, y = y, family = "gaussian", alpha = 0)
melnet <- glmnet(x = X, y = y, family = "gaussian", alpha = 0.5)
mlasso <- glmnet(x = X, y = y, family = "gaussian", alpha = 1)
# Traços.
par(mfrow = c(1, 3))
plot(mridge, xvar = "lambda", main = "RIDGE")
abline(h = 0, lty = 2)
plot(melnet, xvar = "lambda", main = "ELNET")
abline(h = 0, lty = 2)
plot(mlasso, xvar = "lambda", main = "LASSO")
abline(h = 0, lty = 2)
layout(1)
#-----------------------------------------------------------------------
# Obtendo o valor do parâmetro de penalização.
cvfit_ridge <- cv.glmnet(x = X, y = y, alpha = 0, nfolds = 10, type.measure = "mse")
cvfit_lasso <- cv.glmnet(x = X, y = y, alpha = 1, nfolds = 10, type.measure = "mse")
# Perfil do MSE de validação em função de lambda.
par(mfrow = c(2, 1))
plot(cvfit_ridge, main = "RIDGE")
abline(v = log(cvfit_ridge$lambda.min), col = 2)
plot(cvfit_lasso, main = "LASSO")
abline(v = log(cvfit_lasso$lambda.min), col = 2)
layout(1)
# Parâmetros de penalização ótimos.
c(Ridge = cvfit_ridge$lambda.min,
LASSO = cvfit_lasso$lambda.min)
## Ridge LASSO
## 59.878135 5.207895
# Coeficientes estimados (variáveis selecionadas).
cbind(round(coef(cvfit_ridge, s = "lambda.min"), digits = 4),
round(coef(cvfit_lasso, s = "lambda.min"), digits = 4))
## 21 x 2 sparse Matrix of class "dgCMatrix"
## 1 1
## (Intercept) 82.8139 82.8139
## Ur -0.1033 .
## Us -0.4570 .
## alp -0.3924 .
## n 0.5797 .
## I 0.5600 .
## Ui -0.3534 .
## S -0.4521 .
## ph 4.6358 11.1912
## p 0.1650 .
## k 2.5016 .
## ca 3.8835 0.1462
## mg 1.8002 .
## al -0.8545 .
## ctc 3.8234 13.7662
## sat 3.8627 0.7710
## mo 1.7607 .
## arg 1.3466 .
## are -1.8590 -0.0868
## cas 0.0610 .
## acc -1.7163 .
par(mfrow = c(1, 2))
plot(mridge, xvar = "lambda", main = "RIDGE")
abline(v = log(cvfit_ridge$lambda.min), lty = 2)
plot(mlasso, xvar = "lambda", main = "LASSO")
abline(v = log(cvfit_lasso$lambda.min), lty = 2)
layout(1)
Machine Learning | Prof. Eduardo V. Ferreira & Prof. Walmes M. Zeviani |