--- title: "Statistical Rethinking Chapter 6 Problems" author: "John Fox" date: "2016-11-16" output: html_document --- ```{r echo=FALSE} # include this code chunk as-is to set options knitr::opts_chunk$set(comment=NA, prompt=TRUE, fig.height=5, fig.width=5, R.options=list(digits=6)) ``` Preliminaries: ```{r message=FALSE} library(rethinking) data(Howell1) d <- Howell1 d$age <- with(d, (age - mean(age))/sd(age)) set.seed(1000) # for reproducibility i <- sample(nrow(d), size=nrow(d)/2) d1 <- d[i, ] d2 <- d[-i, ] nrow(d1) #check nrow(d2) ``` Aside: I know what a year of age represents; a standard deviation of age, not so much. 6H1 ---- Fitting the various models (using flat priors for a and sigma). I continue to have concerns about where the priors come from (and was unable to fit model `m6` without specifying a more restrictive prior for the 6th-degree term). ```{r} m1 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age, a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), sigma ~ dunif(0, 30) ), data=d1) m1 m2 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2), a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), b2 ~ dnorm(0, 10), sigma ~ dunif(0, 30) ), data=d1) m2 m3 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2) + b3*I(age^3), a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), b2 ~ dnorm(0, 10), b3 ~ dnorm(0, 10), sigma ~ dunif(0, 30) ), data=d1) m3 m4 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4), a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), b2 ~ dnorm(0, 10), b3 ~ dnorm(0, 10), b4 ~ dnorm(0, 10), sigma ~ dunif(0, 30) ), data=d1) m4 m5 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5), a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), b2 ~ dnorm(0, 10), b3 ~ dnorm(0, 10), b4 ~ dnorm(0, 10), b5 ~ dnorm(0, 10), sigma ~ dunif(0, 30) ), data=d1) m5 m6 <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6), a ~ dnorm(150, 100), b1 ~ dnorm(0, 10), b2 ~ dnorm(0, 10), b3 ~ dnorm(0, 10), b4 ~ dnorm(0, 10), b5 ~ dnorm(0, 10), b6 ~ dnorm(0, 5), sigma ~ dunif(0, 30) ), data=d1) m6 ``` Comparing the models using WAIC: ```{r} (comparison <- compare(m1, m2, m3, m4, m5, m6)) ``` The 4th, 5th, and 6th degree polynomials have similar WAIS values; the linear, quadratic, and cubic models fit much worse (to varying extent). 6H2 --- ```{r} age.seq <- seq(min(d1$age), max(d1$age), length=50) d.predict <- list( age = age.seq ) pred <- link(m1, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m1") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) pred <- link(m2, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m2") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) pred <- link(m3, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m3") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) pred <- link(m4, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m4") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) pred <- link(m5, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m5") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) pred <- link(m6, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m6") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) ``` Models 4, 5, and 6 produce roughly similar predictions and at least seem to follow the trend in the scatterplot. Recall that these are the models that get non-zero WAIC-based weights. Frankly, this doesn't make a whole lot of sense to me as a strategy, other than as an exercise, since I'd expect average height to approach an asymptote. On the other hand, the higher-order polynomials produce what appear to be reasonable fits in this example within the range of ages in the data. 6H3 --- Computing and plotting the model-averaged predictions: ```{r message=FALSE} height.ensemble <- ensemble(m1, m2, m3, m4, m5, m6, data=d.predict) mu <- apply(height.ensemble$link, 2, mean) mu.PI <- apply(height.ensemble$link, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model-averaged") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) ``` The model-averaged predictions and intervals look similar to those for models 4, 5, and 6. 6H4 --- Computing the test-sample deviance for each model: ```{r} d2.predict <- list( age <- d2$age ) mods <- list(m1, m2, m3, m4, m5, m6) deviances <- sapply(mods, function(mod) { pred <- link(mod, data=d2.predict) mu <- apply(pred, 2, mean) -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE)) }) names(deviances) <- c("m1", "m2", "m3", "m4", "m5", "m6") deviances <- sort(deviances) deviances ``` 6H5 --- Comparison of test-sample deviances to WAIC values, setting the minimum deviance to 0: ```{r} comparison deviances - min(deviances) ``` Both criteria have models 4, 5, and 6 close, but not in exactly the same order, and the deviances for the other, more poorly fitting, models are considerably larger than the corresponding dWAIC values. 6H6 --- Fitting the model 6 with "more strongly regularizing" priors: ```{r} m6r <- map( alist( height ~ dnorm(mu, sigma), mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6), a ~ dnorm(150, 100), b1 ~ dnorm(0, 5), b2 ~ dnorm(0, 5), b3 ~ dnorm(0, 5), b4 ~ dnorm(0, 5), b5 ~ dnorm(0, 5), b6 ~ dnorm(0, 5), sigma ~ dunif(0, 30) ), data=d1) m6r ``` Plot of predictions: ```{r} pred <- link(m6r, data=d.predict) mu <- apply(pred, 2, mean) mu.PI <- apply(pred, 2, PI, prob=0.97) plot(height ~ age, data=d1, main="model m6a") lines(age.seq, mu, lty=2, lwd=2) lines(age.seq, mu.PI[1, ], lty=2) lines(age.seq, mu.PI[2, ], lty=2) ``` Out-of-sample deviances and model WAIC comparison to previously "best" models: ```{r} d2.predict <- list( age = d2$age ) mods <- list(m4, m6, m6r) deviances <- sapply(mods, function(mod) { pred <- link(mod, data=d2.predict) mu <- apply(pred, 2, mean) -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE)) }) names(deviances) <- c("m4", "m6", "m6r") deviances <- sort(deviances) deviances compare(m4, m6, m6r) ``` Model `m6r` (with the more strongly regularized prior) looks best with respect to out-of-sample prediction but not according to the WAIC. Addendum -------- I was curious about what a least-squares analysis of the data would show. For this purpose, I'll use unstandardized age. ```{r} d <- Howell1 # restoring the unstandardized data d1 <- d[i, ] d2 <- d[-i, ] mod.6 <- lm(height ~ poly(age, 6), data=d1) summary(mod.6) ``` That suggests trying a 4th-degree polynomial (via the no-no of using significance tests for model selection): ```{r} mod.4 <- update(mod.6, . ~ poly(age, 4)) summary(mod.4) anova(mod.4, mod.6) ``` What does this fit look like? I'll also compare to a 4-df regression spline. ```{r} library(splines) mod.spline <- update(mod.4, . ~ ns(age, 4)) age.seq <- seq(min(d1$age), max(d1$age), length=50) d.predict.us <- list( age = age.seq ) yhat.4 <- predict(mod.4, newdata=d.predict.us) yhat.spline <- predict(mod.spline, newdata=d.predict.us) plot(height ~ age, data=d1) lines(age.seq, yhat.4, lwd=2, col="magenta") lines(age.seq, yhat.spline, lwd=2, lty=2, col="blue") ``` The two fits are remarkably similar. How does the LS fit do with respect to out-of-sample prediction relative to the best Bayesian models? ```{r} d2.predict <- list( age = d2$age ) mu <- predict(mod.4, newdata=d2.predict) deviances[4] <- -2*sum(dnorm(d2$height, mu, summary(mod.4)$sigma, log=TRUE)) names(deviances)[4] <- "mod.4" deviances ``` Answer: Pretty similar prediction error, but quite as good as the best Bayesian polynomial model fits. Finally, I'll try a three-parameter logistic growth model, which on general grounds seems a more reasonable choice than a high-degree polynomial. In this model, $\theta_1$ is the asymptotic mean height, $\theta_2$ adjusts the initial level of mean height at age 0, and $theta_3$ is a growth parameter. The start value for $\theta_1$ is a guessed value, and the other start values are determined by linear least squares for a linearized version of the model. ```{r} library(nlme) th1.start <- 1.01*max(d1$height) mod.start <- lm(log((height/th1.start)/(1 - height/th1.start)) ~ age, data=d1) start <- coef(mod.start) mod.logistic <- nls(height ~ theta1/(1 + exp(-(theta2 + theta3*age))), data=d1, start=list(theta1=th1.start, theta2=start[1], theta3=start[2])) summary(mod.logistic) plot(height ~ age, data=d1) ages <- seq(min(d1$age), max(d1$age), length=100) yhat <- predict(mod.logistic, newdata=list(age=ages)) lines(ages, yhat) ``` Let's see how this model does in the test sample: ```{r} mu <- predict(mod.logistic, newdata=d2.predict) deviances[5] <- -2*sum(dnorm(d2$height, mu, summary(mod.logistic)$sigma, log=TRUE)) names(deviances)[5] <- "mod.logistic" deviances ``` So, the predictive performance of the logistic model is considerably worse than the previously fit "better" models.