Url encurtada do markup quando estiver na fase de submissão.
Esclarecer o porquê do uso da inversa da diagonal da informação observada ((diag(H)) − 1) ao invés da diagonal da inversa que (diag(H − 1)) representa as variâncias das estimativas.
Desenvolver o conceito de time médio. Estabelecer comparações de alguns times contra o time médio. Tentar interpretar a diferença com relação ao time médio quanto da probabilidade de fazer gols ou ganhar a partida.
DONE Gráficos de barras com ordem homeWin, draw awayWin.
DONE Tabela de dupla entrada ou gráfico que permita saber a probabilidade do time Home ganhar (gráfico de contornos ou supercície, prob no eixo z) a partir da diferença γH − δA (eixo x) e γA − δH (eixo y).
DONE Revisar a implementação das medidas de DeFinetti.
##----------------------------------------------------------------------
## Load the data of the season 2011.
## UNIX command to get the charset/encoding of a file.
system("file -bi brasileiro2004-2011.txt")
## iso-8859-1 aka latin1.
## Importing the dataset with seasons 2004 to 2011.
da <- read.table(
file="brasileiro2004-2011.txt",
header=TRUE,
sep="\t",
encoding="latin1",
colClasses=c("factor","character","integer","NULL")[
c(2,4,3,4,4,4,1,1,2)])
## Short names are more manageable.
## man: home team, away: away team.
names(da) <- substr(names(da), 1, 3)
names(da)[3:4] <- c("home", "away")
str(da)
## Same order in the factor levels.
l <- union(levels(da$home), levels(da$away))
da$home <- factor(da$home, levels=l)
da$away <- factor(da$away, levels=l)
## Converting string to data (calendar format).
da$dat <- as.POSIXct(x=da$dat, format="%d/%m/%Y")
da$yea <- as.integer(format(da$dat, format="%Y"))
## Get the goals of each team in a match.
da$h <- as.integer(gsub("^(\\d+)x\\d+$", "\\1", tolower(da$pla)))
da$a <- as.integer(gsub("^\\d+x(\\d+)$", "\\1", tolower(da$pla)))
da <- transform(da, d=abs(h-a), s=h+a)
str(da)
##----------------------------------------------------------------------
## Just the 2011 season.
dc <- droplevels(subset(da, yea==2011))
rownames(dc) <- NULL
str(dc)
##----------------------------------------------------------------------
## Home and away points for each match.
colnames(aux) <- c("hpt", "apt")
dc <- cbind(dc, aux)
str(dc)
pfm$pch <- c(6,1,2)[match(pfm$pt, table=c(0,1,3))]
## By team and round.
## xyplot(adv~rod|team, data=pfm, col=1, pch=pfm$pch,
## layout=c(4,5), as.table=TRUE,
## ylab="y = 1: played at home, y = 0: played away",
## xlab="Champonship round",
## key=list(
## text=list(c("Win", "Draw", "Lose")),
## points=list(pch=c(2,1,6))
## ))
Exploratory analysis to the 2011 season
##----------------------------------------------------------------------
## Distribution of goals.
histogram(~h+a+d+s, data=dc,
breaks=with(dc, seq(min(d), max(s)+1, by=1)-0.5),
xlab="Goals",
ylab="Relative frequency (%)",
strip=strip.custom(
factor.levels=c("Goals of Home","Goals of Away",
"Absolute difference","Total of goals")))
##----------------------------------------------------------------------
## Set operation.
## Teams in 2010 that aren't in 2011 (the worst in seria A).
wA <- setdiff(levels(db$home), levels(dc$home))
wA
## The bests of serie B, came to 2011 season.
bB <- setdiff(levels(dc$home), levels(db$home))
bB
[1] "América/MG" "Bahia" "Coritiba" "Figueirense"
##----------------------------------------------------------------------
## Prior for the new teams, those in serie B that went to seria A.
## Save the priors of the teams in 2010.
priors_2010 <- list(priors1, priors2)
str(priors_2010)
List of 2
$ :'data.frame': 20 obs. of 5 variables:
..$ teams : Factor w/ 20 levels "Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ mu : num [1:20] 1.007 1.016 0.957 1 1.044 ...
..$ sigma2: num [1:20] 0.0217 0.0213 0.0237 0.0221 0.02 ...
..$ theta : num [1:20] 0.97 0.96 1.033 0.941 1.002 ...
..$ psi2 : num [1:20] 0.0197 0.0193 0.0222 0.0186 0.0211 ...
..- attr(*, "hf")=List of 1
.. ..$ omega:'data.frame': 1 obs. of 3 variables:
.. .. ..$ teams : chr "homefield"
.. .. ..$ mu0 : num 0.571
.. .. ..$ sigma20: num 0.00179
$ :'data.frame': 20 obs. of 5 variables:
..$ teams : Factor w/ 20 levels "Atlético/GO",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ mu : num [1:20] 1.05 1.07 0.98 1.05 1.02 ...
..$ sigma2: num [1:20] 0.0207 0.0199 0.0234 0.0206 0.0219 ...
..$ theta : num [1:20] 0.958 0.934 1.052 0.937 0.999 ...
..$ psi2 : num [1:20] 0.0189 0.0181 0.0227 0.0182 0.0204 ...
..- attr(*, "hf")=List of 2
.. ..$ omega:'data.frame': 1 obs. of 3 variables:
.. .. ..$ teams : chr "homefield"
.. .. ..$ mu0 : num 0.596
.. .. ..$ sigma20: num 0.00166
.. ..$ kappa:'data.frame': 1 obs. of 3 variables:
.. .. ..$ teams : chr "awayfield"
.. .. ..$ theta0: num 0.342
.. .. ..$ psi20 : num 0.00266
priors1 <- droplevels(subset(priors1, !teams%in%wA))
## The team at 16th position.
t16 <- subset(priors1, teams==as.character(th16$team))
t16 <- t16[rep(1, 4),]
t16$teams <- bB
## Priors1 for all teams in the 2011 season.
priors1 <- rbind(priors1, t16)
rownames(priors1) <- NULL
priors1
## Prior mean and variance for the home-field advantage parameter.
attr(priors1, "hf") <- list(omega=L$omega)
attr(priors1, "hf")
$omega
teams mu0 sigma20
41 homefield 0.5706891 0.001790835
## Same order in the factor levels.
priors1$teams <- factor(as.character(priors1$teams),
levels=levels(dc$home))
##----------------------------------------------------------------------
priors2 <- droplevels(subset(priors2, !teams%in%wA))
## Priors2 for all teams in the 2011 season.
priors2 <- rbind(priors2, t16)
rownames(priors2) <- NULL
priors2
'data.frame': 2000 obs. of 5 variables:
$ teams : Factor w/ 20 levels "Corinthians",..: 1 1 1 1 1 1 1 1 1 1 ...
$ x : num 0.2 0.233 0.265 0.298 0.331 ...
$ ygamma: num 7.96e-07 2.62e-06 8.23e-06 2.46e-05 6.98e-05 ...
$ ydelta: num 8.25e-07 2.88e-06 9.50e-06 2.97e-05 8.81e-05 ...
$ pr : num 1 1 1 1 1 1 1 1 1 1 ...
## Legend for gamma and delta.
key.gd <- list(columns=1,
title="Parameter", cex.title=1.1,
lines=list(
col=ps$superpose.line$col[1:2]),
text=list(c(expression(gamma), expression(delta))))
## Legend for 1 and 2 field parameter models.
key.12 <- list(columns=1,
title="Model", cex.title=1.1,
lines=list(
lty=1:2),
text=list(c("1 field parameter",
"2 field parameter")))
## Create a grob by merging the two keys (latticeExtra).
keys <- mergedTrellisLegendGrob(
a=list(fun=draw.key, args=list(key=key.gd)),
b=list(fun=draw.key, args=list(key=key.12)),
vertical=FALSE)
xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==1),
type="l", as.table=TRUE, layout=c(4,5),
xlab="Parameter space",
ylab="Prior density",
legend=list(top=list(fun=keys))
)+
as.layer(xyplot(ygamma+ydelta~x|teams, data=subset(L, pr==2),
type="l", lty=2))
## About multiple legends:
## https://procomun.wordpress.com/2012/02/18/maps_with_r_1/
MAP-1 model fitting and results
Parameter estimates
##----------------------------------------------------------------------
## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)
[1] 20 38
## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)
## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH
i <- 1
gamma[,i] <- priors1$mu
delta[,i] <- priors1$theta
omega[i] <- attr(priors1, "hf")$omega$mu0
## Fitting for each round.
for(i in 1:nrounds){
##-------------------------------------------
## Subset each round (10 matches).
##
dbi <- subset(dc, rod==i)
##
##-------------------------------------------
## Model matrices, which teams confronted each other.
##
Xh <- model.matrix(~-1+home, data=dbi)
Xa <- model.matrix(~-1+away, data=dbi)
##
##-------------------------------------------
## Columns of Home and Away teams.
##
posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
##
##-------------------------------------------
## Lambda values.
##
lambdaH[,i] <- as.vector(
exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
)
lambdaA[,i] <- as.vector(
exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
)
##
##-------------------------------------------
## Goals expected and goals scored.
##
## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
## goalScorHome=dbi$x, goalScorAway=dbi$y)
##
##-------------------------------------------
## Updating priors for the next round.
##
gamma[posh,i+1] <-
Xh%*%gamma[,i]+Xh%*%priors1$sigma2*(dbi$h-lambdaH[,i])
gamma[posa,i+1] <-
Xa%*%gamma[,i]+Xa%*%priors1$sigma2*(dbi$a-lambdaA[,i])
delta[posh,i+1] <-
Xh%*%delta[,i]-Xh%*%priors1$psi2*(dbi$h-lambdaA[,i])
delta[posa,i+1] <-
Xa%*%delta[,i]-Xa%*%priors1$psi2*(dbi$h-lambdaH[,i])
omega[i+1] <-
omega[i]+attr(priors1, "hf")$omega$sigma20*
sum(dbi$h-lambdaH[,i])
}
gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]
##----------------------------------------------------------------------
## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
id.vars="team", variable.name="rod", value.name ="gamma"),
delta=dd$delta)
dd$rod <- as.integer(dd$rod)
dd$team <- factor(dd$team, levels=as.character(pl11$team))
str(dd)
'data.frame': 760 obs. of 4 variables:
$ team : Factor w/ 20 levels "Corinthians",..: 19 12 15 17 20 14 9 18 1 8 ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ gamma: num 1.015 1.017 0.986 0.979 1.024 ...
$ delta: num 0.951 0.998 0.988 0.918 0.953 ...
## Home field parameter.
xyplot(omega~seq_along(omega), type="b",
scales=sc,
xlab="Championship round",
ylab="Home field advantage parameter")
##----------------------------------------------------------------------
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- as.character(pl11$team[c(1,2,19,20)])
de <- droplevels(subset(dd, team%in%sel))
## Selected teams.
xyplot(gamma+delta~rod|team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.gd,
xlab="Championship round",
ylab="Parameter value")
##----------------------------------------------------------------------
## Estimated probability of HomeWin Draw AwayWin matrix.
## NOTE: this uses the home field advatage parameter.
ephda.hf <- t(apply(
dg[, c("hgamma","hdelta","agamma","adelta","omega")],
MARGIN=1,
FUN=function(i){
g <- 0:10
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, theta=i, hf=TRUE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## NOTE: this DOESN'T use the home field advatage parameter.
ephda <- t(apply(
dg[, c("hgamma","hdelta","agamma","adelta","omega")],
MARGIN=1,
FUN=function(i){
g <- 0:10
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, theta=i, hf=FALSE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## head(ephda)
## Per match DeFinetti Index.
dfi.hf <- rowSums((ephda.hf-hda)^2)
dfi <- rowSums((ephda-hda)^2)
xyplot(dfi.hf+dfi~rod, data=dg, outer=TRUE,
xlab="Round", ylab="DeFinetti Index")
## Average DeFinetti Index.
## Lazy model: Win Draw Lose = 1/3 in each, so DeFinetti Index is
c("with hf"=mean(dfi.hf), "no hf"=mean(dfi), "Lazy"=(2/3)^2+2*(1/3)^2)
(0,0.667] (0.667,2] Sum
(0,0.667] 0.3710526 0.1157895 0.4868421
(0.667,2] 0.1921053 0.3210526 0.5131579
Sum 0.5631579 0.4368421 1.0000000
## NOTE: the DeFinetti mean measure is not a good choice to evaluate the
## model prediction capability.
xyplot(dfi.hf+dfi~factor(hpt, levels=c(3,1,0), labels=c("W","D","L")),
data=dg, layout=c(NA,1),
outer=TRUE, jitter.x=TRUE,
ylab="DeFinetti Index",
xlab="Outcome for the home team of the match",
strip=strip.custom(
factor.levels=c(
"with field parameters", "without field parameters")))+
layer(panel.abline(h=2/3, lty=2))
##----------------------------------------------------------------------
## Saving the results of MAP-1 approach.
map1 <- list(dd=dd, fr=fr, omega=omega, dg=dg, dfi=dfi)
MAP-2 model fitting and results
Parameter estimates
##----------------------------------------------------------------------
## Number of teams and number of rounds.
nteams <- nlevels(db$home)
nrounds <- length(unique(db$rod))
c(nteams, nrounds)
[1] 20 38
## gamma: teams attacking strength.
## delta: teams deffensive strength.
## omega: home-field advantage parameter.
gamma <- matrix(numeric(), nrow=nteams, ncol=nrounds+1)
delta <- gamma
omega <- numeric(nrounds)
kappa <- omega
## To store values in each match inside a round.
lambdaH <- matrix(numeric(), nrow=nteams/2, ncol=nrounds)
lambdaA <- lambdaH
i <- 1
gamma[,i] <- priors2$mu
delta[,i] <- priors2$theta
omega[i] <- attr(priors2, "hf")$omega$mu0
kappa[i] <- attr(priors2, "hf")$kappa$theta0
## Fitting for each round.
for(i in 1:nrounds){
##-------------------------------------------
## Subset each round (10 matches).
##
dbi <- subset(dc, rod==i)
##
##-------------------------------------------
## Model matrices, which teams confronted each other.
##
Xh <- model.matrix(~-1+home, data=dbi)
Xa <- model.matrix(~-1+away, data=dbi)
##
##-------------------------------------------
## Columns of Home and Away teams.
##
posh <- apply(apply(Xh, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
posa <- apply(apply(Xa, MARGIN=1, FUN="==", 1), MARGIN=2, FUN=which)
##
##-------------------------------------------
## Lambda values.
##
lambdaH[,i] <- as.vector(
exp(Xh%*%gamma[,i]-Xa%*%delta[,i]+omega[i])
)
lambdaA[,i] <- as.vector(
exp(Xa%*%gamma[,i]-Xh%*%delta[,i])
)
##
##-------------------------------------------
## Goals expected and goals scored.
##
## cbind(goalRateHome=lambdaA[,i], goalRateAway=lambdaB[,i],
## goalScorHome=dbi$x, goalScorAway=dbi$y)
##
##-------------------------------------------
## Updating priors for the next round.
##
gamma[posh,i+1] <-
Xh%*%gamma[,i]+Xh%*%priors2$sigma2*(dbi$h-lambdaH[,i])
gamma[posa,i+1] <-
Xa%*%gamma[,i]+Xa%*%priors2$sigma2*(dbi$a-lambdaA[,i])
delta[posh,i+1] <-
Xh%*%delta[,i]-Xh%*%priors2$psi2*(dbi$h-lambdaA[,i])
delta[posa,i+1] <-
Xa%*%delta[,i]-Xa%*%priors2$psi2*(dbi$h-lambdaH[,i])
omega[i+1] <-
omega[i]+attr(priors2, "hf")$omega$sigma20*
sum(dbi$h-lambdaH[,i])
kappa[i+1] <-
kappa[i]-attr(priors2, "hf")$kappa$psi20*
sum(dbi$a-lambdaA[,i])
}
gamma <- gamma[,-1]
delta <- delta[,-1]
omega <- omega[-1]
kappa <- kappa[-1]
##----------------------------------------------------------------------
## Series for all teams parameters along the rounds.
dd <- melt(data=as.data.frame(delta), value.name="delta")
dd <- cbind(
melt(data=cbind(team=levels(dc$home), as.data.frame(gamma)),
id.vars="team", variable.name="rod", value.name ="gamma"),
delta=dd$delta)
dd$rod <- as.integer(dd$rod)
dd$team <- factor(dd$team, levels=as.character(pl11$team))
str(dd)
'data.frame': 760 obs. of 4 variables:
$ team : Factor w/ 20 levels "Corinthians",..: 19 12 15 17 20 14 9 18 1 8 ...
$ rod : int 1 1 1 1 1 1 1 1 1 1 ...
$ gamma: num 1.052 1.066 1.006 1.028 0.995 ...
$ delta: num 0.938 0.974 1.006 0.917 0.954 ...
## Legend for omega and kappa.
key.ok <- list(columns=2,
title="Parameter", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c(expression(omega), expression(kappa))))
## Field parameters.
xyplot(omega+kappa~seq_along(omega), type="b",
scales=sc, key=key.ok,
xlab="Championship round",
ylab="Field parameters")
##----------------------------------------------------------------------
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- as.character(pl11$team)[c(1,2,19,20)]
de <- droplevels(subset(dd, team%in%sel))
## Selected teams.
xyplot(gamma+delta~rod|team,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.gd,
xlab="Championship round",
ylab="Parameter value")
##----------------------------------------------------------------------
## Estimated probability of HomeWin Draw AwayWin matrix.
## NOTE: this uses the home field advatage parameter.
ephda.hf <- t(apply(
dg[, c("hgamma","hdelta","agamma","adelta","omega","kappa")],
MARGIN=1,
FUN=function(i){
g <- 0:10
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, theta=i, hf=TRUE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## NOTE: this DOESN'T use the home field advatage parameter.
ephda <- t(apply(
dg[, c("hgamma","hdelta","agamma","adelta","omega","kappa")],
MARGIN=1,
FUN=function(i){
g <- 0:10
pla <- outer(g, g,
function(x, y){
Prob(gh=x, ga=y, theta=i, hf=FALSE)
})
res <- c(homeWin=sum(pla[lower.tri(pla)]),
draw=sum(diag(pla)),
awayWin=sum(pla[upper.tri(pla)]))
return(res)
}))
## head(ephda)
## Per match DeFinetti Index.
dfi.hf <- rowSums((ephda.hf-hda)^2)
dfi <- rowSums((ephda-hda)^2)
xyplot(dfi.hf+dfi~rod, data=dg, outer=TRUE,
xlab="Round", ylab="DeFinetti Index")
## Average DeFinetti Index.
## Lazy model: Win Draw Lose = 1/3 in each, so DeFinetti Index is
c("with hf"=mean(dfi.hf), "no hf"=mean(dfi), "Lazy"=(2/3)^2+2*(1/3)^2)
(0,0.667] (0.667,2] Sum
(0,0.667] 0.3578947 0.1263158 0.4842105
(0.667,2] 0.1763158 0.3394737 0.5157895
Sum 0.5342105 0.4657895 1.0000000
## NOTE: the DeFinetti mean measure is not a good choice to evaluate the
## model prediction capability.
xyplot(dfi.hf+dfi~factor(hpt, levels=c(3,1,0), labels=c("W","D","L")),
data=dg, layout=c(NA,1),
outer=TRUE, jitter.x=TRUE,
ylab="DeFinetti Index",
xlab="Outcome for the home team of the match",
strip=strip.custom(
factor.levels=c(
"with field parameters", "without field parameters")))+
layer(panel.abline(h=2/3, lty=2))
##----------------------------------------------------------------------
## Saving the results of MAP-2 approach.
map2 <- list(dd=dd, fr=fr, omega=omega, kappa=kappa, dg=dg, dfi=dfi)
Comparing MAP-1 and MAP-2
##----------------------------------------------------------------------
## fp: number of field parameters.
map12 <- rbind(cbind(map1$dd, fp=1),
cbind(map2$dd, fp=2))
## Select som teams to a depper view, 2 bests and 2 worsts.
sel <- as.character(pl11$team)[c(1,2,19,20)]
de <- droplevels(subset(map12, team%in%sel))
key.map <- list(columns=2,
title="Model", cex.title=1.1,
type="o", divide=1,
lines=list(
col=ps$superpose.line$col[1:2],
pch=ps$superpose.symbol$pch[1:2]),
text=list(c("MAP-1","MAP-2")))
sc <- list(x=list(at=c(1,10,20,30,38)),
y=list(relation="free"))
useOuterStrips(combineLimits(
xyplot(gamma+delta~rod|team, outer=TRUE, groups=fp,
data=de, type="o",
as.table=TRUE,
scales=sc,
key=key.map,
xlab="Championship round",
ylab="Parameter value")),
strip.left=strip.custom(factor.levels=key.gd$text[[1]])
)