Statistical Rethinking Chapter 6 Problems

Preliminaries:

> library(rethinking)
> data(Howell1)
> d <- Howell1
> d$age <- with(d, (age - mean(age))/sd(age))
> set.seed(1000) # for reproducibility
> i <- sample(nrow(d), size=nrow(d)/2)
> d1 <- d[i, ]
> d2 <- d[-i, ]
> nrow(d1) #check

[1] 272

> nrow(d2)

[1] 272

Aside: I know what a year of age represents; a standard deviation of age, not so much.

6H1

Fitting the various models (using flat priors for a and sigma). I continue to have concerns about where the priors come from (and was unable to fit model m6 without specifying a more restrictive prior for the 6th-degree term).

> m1 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age,
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m1


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)

MAP values:
       a       b1    sigma 
138.4458  18.3889  19.5417 

Log-likelihood: -1194.49

> m2 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         b2 ~ dnorm(0, 10),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m2


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)

MAP values:
       a       b1       b2    sigma 
152.2580  25.5806 -13.9791  12.3795 

Log-likelihood: -1070.31

> m3 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2) + b3*I(age^3),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         b2 ~ dnorm(0, 10),
+         b3 ~ dnorm(0, 10),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m3


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)

MAP values:
        a        b1        b2        b3     sigma 
158.01841  11.36137 -23.99399   8.03899   8.58993 

Log-likelihood: -970.91

> m4 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         b2 ~ dnorm(0, 10),
+         b3 ~ dnorm(0, 10),
+         b4 ~ dnorm(0, 10),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m4


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)

MAP values:
        a        b1        b2        b3        b4     sigma 
156.58189   5.89587 -19.06828  12.38996  -2.35959   8.16611 

Log-likelihood: -957.15

> m5 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         b2 ~ dnorm(0, 10),
+         b3 ~ dnorm(0, 10),
+         b4 ~ dnorm(0, 10),
+         b5 ~ dnorm(0, 10),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m5


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) + 
    b5 * I(age^5)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
b5 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)

MAP values:
          a          b1          b2          b3          b4          b5 
156.6584932   5.8187186 -19.4034784  12.5300962  -2.1925476  -0.0681921 
      sigma 
  8.1634807 

Log-likelihood: -957.06

> m6 <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 10),
+         b2 ~ dnorm(0, 10),
+         b3 ~ dnorm(0, 10),
+         b4 ~ dnorm(0, 10),
+         b5 ~ dnorm(0, 10),
+         b6 ~ dnorm(0, 5),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m6


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) + 
    b5 * I(age^5) + b6 * I(age^6)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
b5 ~ dnorm(0, 10)
b6 ~ dnorm(0, 5)
sigma ~ dunif(0, 30)

MAP values:
         a         b1         b2         b3         b4         b5 
156.649555   4.412549 -19.170857  15.274650  -2.810915  -1.107829 
        b6      sigma 
  0.348599   8.142637 

Log-likelihood: -956.37

Comparing the models using WAIC:

> (comparison <- compare(m1, m2, m3, m4, m5, m6))

     WAIC pWAIC dWAIC weight    SE   dSE
m4 1926.0   5.5   0.0   0.57 25.43    NA
m5 1927.5   6.3   1.5   0.27 25.37  0.45
m6 1928.5   7.4   2.5   0.16 25.19  1.68
m3 1952.3   5.4  26.3   0.00 24.20 11.07
m2 2150.1   5.2 224.1   0.00 22.77 26.71
m1 2395.4   3.4 469.4   0.00 23.14 31.05

The 4th, 5th, and 6th degree polynomials have similar WAIS values; the linear, quadratic, and cubic models fit much worse (to varying extent).

6H2

> age.seq <- seq(min(d1$age), max(d1$age), length=50)
> d.predict <- list(
+     age = age.seq
+ )
> 
> pred <- link(m1, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m1")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

> pred <- link(m2, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m2")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

> pred <- link(m3, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m3")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

> pred <- link(m4, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m4")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

> pred <- link(m5, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m5")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

> pred <- link(m6, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m6")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

Models 4, 5, and 6 produce roughly similar predictions and at least seem to follow the trend in the scatterplot. Recall that these are the models that get non-zero WAIC-based weights. Frankly, this doesn’t make a whole lot of sense to me as a strategy, other than as an exercise, since I’d expect average height to approach an asymptote. On the other hand, the higher-order polynomials produce what appear to be reasonable fits in this example within the range of ages in the data.

6H3

Computing and plotting the model-averaged predictions:

> height.ensemble <- ensemble(m1, m2, m3, m4, m5, m6, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(height.ensemble$link, 2, mean)
> mu.PI <- apply(height.ensemble$link, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model-averaged")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

The model-averaged predictions and intervals look similar to those for models 4, 5, and 6.

6H4

Computing the test-sample deviance for each model:

> d2.predict <- list(
+     age <- d2$age
+ )
> 
> mods <- list(m1, m2, m3, m4, m5, m6)
> deviances <- sapply(mods, function(mod) {
+     pred <- link(mod, data=d2.predict) 
+     mu <- apply(pred, 2, mean)
+     -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE))
+ })

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> names(deviances) <- c("m1", "m2", "m3", "m4", "m5", "m6")
> deviances <- sort(deviances)
> deviances

     m6      m4      m5      m3      m2      m1 
1875.99 1876.39 1876.52 1932.45 2137.98 2422.24

6H5

Comparison of test-sample deviances to WAIC values, setting the minimum deviance to 0:

> comparison

     WAIC pWAIC dWAIC weight    SE   dSE
m4 1926.0   5.5   0.0   0.57 25.43    NA
m5 1927.5   6.3   1.5   0.27 25.37  0.45
m6 1928.5   7.4   2.5   0.16 25.19  1.68
m3 1952.3   5.4  26.3   0.00 24.20 11.07
m2 2150.1   5.2 224.1   0.00 22.77 26.71
m1 2395.4   3.4 469.4   0.00 23.14 31.05

> deviances - min(deviances)

        m6         m4         m5         m3         m2         m1 
  0.000000   0.400422   0.536945  56.461066 261.993945 546.252649

Both criteria have models 4, 5, and 6 close, but not in exactly the same order, and the deviances for the other, more poorly fitting, models are considerably larger than the corresponding dWAIC values.

6H6

Fitting the model 6 with “more strongly regularizing” priors:

> m6r <- map(
+     alist(
+         height ~ dnorm(mu, sigma),
+         mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6),
+         a ~ dnorm(150, 100), 
+         b1 ~ dnorm(0, 5),
+         b2 ~ dnorm(0, 5),
+         b3 ~ dnorm(0, 5),
+         b4 ~ dnorm(0, 5),
+         b5 ~ dnorm(0, 5),
+         b6 ~ dnorm(0, 5),
+         sigma ~ dunif(0, 30)
+     ),
+     data=d1)
> m6r


Maximum a posteriori (MAP) model fit

Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) + 
    b5 * I(age^5) + b6 * I(age^6)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 5)
b2 ~ dnorm(0, 5)
b3 ~ dnorm(0, 5)
b4 ~ dnorm(0, 5)
b5 ~ dnorm(0, 5)
b6 ~ dnorm(0, 5)
sigma ~ dunif(0, 30)

MAP values:
          a          b1          b2          b3          b4          b5 
155.8559187   5.9710498 -16.6031204  12.0942108  -3.5113773   0.2316938 
         b6       sigma 
  0.0509131   8.1979731 

Log-likelihood: -958.21

Plot of predictions:

> pred <- link(m6r, data=d.predict)

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m6a")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)

Out-of-sample deviances and model WAIC comparison to previously “best” models:

> d2.predict <- list(
+     age = d2$age
+ )
> 
> mods <- list(m4, m6, m6r)
> deviances <- sapply(mods, function(mod) {
+     pred <- link(mod, data=d2.predict) 
+     mu <- apply(pred, 2, mean)
+     -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE))
+ })

[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]

> names(deviances) <- c("m4", "m6", "m6r")
> deviances <- sort(deviances)
> deviances

    m6r      m6      m4 
1875.22 1875.43 1876.35

> compare(m4, m6, m6r)

      WAIC pWAIC dWAIC weight    SE  dSE
m4  1926.6   5.9   0.0   0.68 25.58   NA
m6  1928.6   7.5   2.0   0.25 25.31 1.74
m6r 1931.1   6.9   4.5   0.07 25.71 2.04

Model m6r (with the more strongly regularized prior) looks best with respect to out-of-sample prediction but not according to the WAIC.

Addendum

I was curious about what a least-squares analysis of the data would show. For this purpose, I’ll use unstandardized age.

> d <- Howell1  # restoring the unstandardized data
> d1 <- d[i, ]
> d2 <- d[-i, ]
> 
> mod.6 <- lm(height ~ poly(age, 6), data=d1)
> summary(mod.6)


Call:
lm(formula = height ~ poly(age, 6), data = d1)

Residuals:
   Min     1Q Median     3Q    Max 
-24.97  -5.83  -0.08   5.98  28.87 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)     138.22       0.50  276.55  < 2e-16 ***
poly(age, 6)1   304.78       8.24   36.98  < 2e-16 ***
poly(age, 6)2  -249.36       8.24  -30.25  < 2e-16 ***
poly(age, 6)3   146.99       8.24   17.83  < 2e-16 ***
poly(age, 6)4   -43.94       8.24   -5.33  2.1e-07 ***
poly(age, 6)5    -5.09       8.24   -0.62     0.54    
poly(age, 6)6    10.26       8.24    1.25     0.21    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.24 on 265 degrees of freedom
Multiple R-squared:  0.908, Adjusted R-squared:  0.906 
F-statistic:  438 on 6 and 265 DF,  p-value: <2e-16

That suggests trying a 4th-degree polynomial (via the no-no of using significance tests for model selection):

> mod.4 <- update(mod.6, . ~ poly(age, 4))
> summary(mod.4)


Call:
lm(formula = height ~ poly(age, 4), data = d1)

Residuals:
   Min     1Q Median     3Q    Max 
-25.90  -6.00  -0.24   5.63  29.12 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)     138.22       0.50  276.59  < 2e-16 ***
poly(age, 4)1   304.78       8.24   36.98  < 2e-16 ***
poly(age, 4)2  -249.36       8.24  -30.26  < 2e-16 ***
poly(age, 4)3   146.99       8.24   17.83  < 2e-16 ***
poly(age, 4)4   -43.94       8.24   -5.33  2.1e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.24 on 267 degrees of freedom
Multiple R-squared:  0.908, Adjusted R-squared:  0.906 
F-statistic:  657 on 4 and 267 DF,  p-value: <2e-16

> anova(mod.4, mod.6)

Analysis of Variance Table

Model 1: height ~ poly(age, 4)
Model 2: height ~ poly(age, 6)
  Res.Df   RSS Df Sum of Sq     F Pr(>F)
1    267 18136                          
2    265 18004  2     131.3 0.966  0.382

What does this fit look like? I’ll also compare to a 4-df regression spline.

> library(splines)
> mod.spline <- update(mod.4, . ~ ns(age, 4))
> 
> age.seq <- seq(min(d1$age), max(d1$age), length=50)
> d.predict.us <- list(
+     age = age.seq
+ )
> 
> yhat.4 <- predict(mod.4, newdata=d.predict.us)
> yhat.spline <- predict(mod.spline, newdata=d.predict.us)
> 
> plot(height ~ age, data=d1)
> lines(age.seq, yhat.4, lwd=2, col="magenta")
> lines(age.seq, yhat.spline, lwd=2, lty=2, col="blue")

The two fits are remarkably similar.

How does the LS fit do with respect to out-of-sample prediction relative to the best Bayesian models?

> d2.predict <- list(
+     age = d2$age
+ )
> 
> mu <- predict(mod.4, newdata=d2.predict) 
> deviances[4]  <-  -2*sum(dnorm(d2$height, mu, summary(mod.4)$sigma, log=TRUE))
> names(deviances)[4] <- "mod.4"
> deviances

    m6r      m6      m4   mod.4 
1875.22 1875.43 1876.35 1877.30

Answer: Pretty similar prediction error, but quite as good as the best Bayesian polynomial model fits.

Finally, I’ll try a three-parameter logistic growth model, which on general grounds seems a more reasonable choice than a high-degree polynomial. In this model, \(\theta_1\) is the asymptotic mean height, \(\theta_2\) adjusts the initial level of mean height at age 0, and \(theta_3\) is a growth parameter. The start value for \(\theta_1\) is a guessed value, and the other start values are determined by linear least squares for a linearized version of the model.

> library(nlme)
> th1.start <- 1.01*max(d1$height)
> mod.start <- lm(log((height/th1.start)/(1 - height/th1.start)) ~ age, data=d1)
> start <- coef(mod.start)
> 
> mod.logistic <- nls(height ~ theta1/(1 + exp(-(theta2 + theta3*age))), data=d1, 
+                     start=list(theta1=th1.start, theta2=start[1], theta3=start[2]))
> summary(mod.logistic)


Formula: height ~ theta1/(1 + exp(-(theta2 + theta3 * age)))

Parameters:
        Estimate Std. Error t value Pr(>|t|)    
theta1 155.77722    0.73229  212.73  < 2e-16 ***
theta2  -0.28392    0.04502   -6.31  1.2e-09 ***
theta3   0.16200    0.00725   22.35  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.42 on 269 degrees of freedom

Number of iterations to convergence: 10 
Achieved convergence tolerance: 1.81e-06

> plot(height ~ age, data=d1)
> ages <- seq(min(d1$age), max(d1$age), length=100)
> yhat <- predict(mod.logistic, newdata=list(age=ages))
> lines(ages, yhat)

Let’s see how this model does in the test sample:

> mu <- predict(mod.logistic, newdata=d2.predict) 
> deviances[5]  <-  -2*sum(dnorm(d2$height, mu, summary(mod.logistic)$sigma, log=TRUE))
> names(deviances)[5] <- "mod.logistic"
> deviances

         m6r           m6           m4        mod.4 mod.logistic 
     1875.22      1875.43      1876.35      1877.30      1890.83

So, the predictive performance of the logistic model is considerably worse than the previously fit “better” models.

Statistical Rethinking Chapter 6 Problems

John Fox

2016-11-16

6H1

6H2

6H3

6H4

6H5

6H6

Addendum