Preliminaries:
> library(rethinking)
> data(Howell1)
> d <- Howell1
> d$age <- with(d, (age - mean(age))/sd(age))
> set.seed(1000) # for reproducibility
> i <- sample(nrow(d), size=nrow(d)/2)
> d1 <- d[i, ]
> d2 <- d[-i, ]
> nrow(d1) #check
[1] 272
> nrow(d2)
[1] 272
Aside: I know what a year of age represents; a standard deviation of age, not so much.
Fitting the various models (using flat priors for a and sigma). I continue to have concerns about where the priors come from (and was unable to fit model m6
without specifying a more restrictive prior for the 6th-degree term).
> m1 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age,
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m1
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)
MAP values:
a b1 sigma
138.4458 18.3889 19.5417
Log-likelihood: -1194.49
> m2 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ b2 ~ dnorm(0, 10),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m2
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 sigma
152.2580 25.5806 -13.9791 12.3795
Log-likelihood: -1070.31
> m3 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2) + b3*I(age^3),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ b2 ~ dnorm(0, 10),
+ b3 ~ dnorm(0, 10),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m3
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 b3 sigma
158.01841 11.36137 -23.99399 8.03899 8.58993
Log-likelihood: -970.91
> m4 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ b2 ~ dnorm(0, 10),
+ b3 ~ dnorm(0, 10),
+ b4 ~ dnorm(0, 10),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m4
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 b3 b4 sigma
156.58189 5.89587 -19.06828 12.38996 -2.35959 8.16611
Log-likelihood: -957.15
> m5 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ b2 ~ dnorm(0, 10),
+ b3 ~ dnorm(0, 10),
+ b4 ~ dnorm(0, 10),
+ b5 ~ dnorm(0, 10),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m5
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) +
b5 * I(age^5)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
b5 ~ dnorm(0, 10)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 b3 b4 b5
156.6584932 5.8187186 -19.4034784 12.5300962 -2.1925476 -0.0681921
sigma
8.1634807
Log-likelihood: -957.06
> m6 <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 10),
+ b2 ~ dnorm(0, 10),
+ b3 ~ dnorm(0, 10),
+ b4 ~ dnorm(0, 10),
+ b5 ~ dnorm(0, 10),
+ b6 ~ dnorm(0, 5),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m6
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) +
b5 * I(age^5) + b6 * I(age^6)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 10)
b2 ~ dnorm(0, 10)
b3 ~ dnorm(0, 10)
b4 ~ dnorm(0, 10)
b5 ~ dnorm(0, 10)
b6 ~ dnorm(0, 5)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 b3 b4 b5
156.649555 4.412549 -19.170857 15.274650 -2.810915 -1.107829
b6 sigma
0.348599 8.142637
Log-likelihood: -956.37
Comparing the models using WAIC:
> (comparison <- compare(m1, m2, m3, m4, m5, m6))
WAIC pWAIC dWAIC weight SE dSE
m4 1926.0 5.5 0.0 0.57 25.43 NA
m5 1927.5 6.3 1.5 0.27 25.37 0.45
m6 1928.5 7.4 2.5 0.16 25.19 1.68
m3 1952.3 5.4 26.3 0.00 24.20 11.07
m2 2150.1 5.2 224.1 0.00 22.77 26.71
m1 2395.4 3.4 469.4 0.00 23.14 31.05
The 4th, 5th, and 6th degree polynomials have similar WAIS values; the linear, quadratic, and cubic models fit much worse (to varying extent).
> age.seq <- seq(min(d1$age), max(d1$age), length=50)
> d.predict <- list(
+ age = age.seq
+ )
>
> pred <- link(m1, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m1")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
> pred <- link(m2, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m2")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
> pred <- link(m3, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m3")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
> pred <- link(m4, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m4")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
> pred <- link(m5, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m5")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
> pred <- link(m6, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m6")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
Models 4, 5, and 6 produce roughly similar predictions and at least seem to follow the trend in the scatterplot. Recall that these are the models that get non-zero WAIC-based weights. Frankly, this doesn’t make a whole lot of sense to me as a strategy, other than as an exercise, since I’d expect average height to approach an asymptote. On the other hand, the higher-order polynomials produce what appear to be reasonable fits in this example within the range of ages in the data.
Computing and plotting the model-averaged predictions:
> height.ensemble <- ensemble(m1, m2, m3, m4, m5, m6, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(height.ensemble$link, 2, mean)
> mu.PI <- apply(height.ensemble$link, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model-averaged")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
The model-averaged predictions and intervals look similar to those for models 4, 5, and 6.
Computing the test-sample deviance for each model:
> d2.predict <- list(
+ age <- d2$age
+ )
>
> mods <- list(m1, m2, m3, m4, m5, m6)
> deviances <- sapply(mods, function(mod) {
+ pred <- link(mod, data=d2.predict)
+ mu <- apply(pred, 2, mean)
+ -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE))
+ })
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> names(deviances) <- c("m1", "m2", "m3", "m4", "m5", "m6")
> deviances <- sort(deviances)
> deviances
m6 m4 m5 m3 m2 m1
1875.99 1876.39 1876.52 1932.45 2137.98 2422.24
Comparison of test-sample deviances to WAIC values, setting the minimum deviance to 0:
> comparison
WAIC pWAIC dWAIC weight SE dSE
m4 1926.0 5.5 0.0 0.57 25.43 NA
m5 1927.5 6.3 1.5 0.27 25.37 0.45
m6 1928.5 7.4 2.5 0.16 25.19 1.68
m3 1952.3 5.4 26.3 0.00 24.20 11.07
m2 2150.1 5.2 224.1 0.00 22.77 26.71
m1 2395.4 3.4 469.4 0.00 23.14 31.05
> deviances - min(deviances)
m6 m4 m5 m3 m2 m1
0.000000 0.400422 0.536945 56.461066 261.993945 546.252649
Both criteria have models 4, 5, and 6 close, but not in exactly the same order, and the deviances for the other, more poorly fitting, models are considerably larger than the corresponding dWAIC values.
Fitting the model 6 with “more strongly regularizing” priors:
> m6r <- map(
+ alist(
+ height ~ dnorm(mu, sigma),
+ mu <- a + b1*age + b2*I(age^2) + b3*I(age^3) + b4*I(age^4) + b5*I(age^5) + b6*I(age^6),
+ a ~ dnorm(150, 100),
+ b1 ~ dnorm(0, 5),
+ b2 ~ dnorm(0, 5),
+ b3 ~ dnorm(0, 5),
+ b4 ~ dnorm(0, 5),
+ b5 ~ dnorm(0, 5),
+ b6 ~ dnorm(0, 5),
+ sigma ~ dunif(0, 30)
+ ),
+ data=d1)
> m6r
Maximum a posteriori (MAP) model fit
Formula:
height ~ dnorm(mu, sigma)
mu <- a + b1 * age + b2 * I(age^2) + b3 * I(age^3) + b4 * I(age^4) +
b5 * I(age^5) + b6 * I(age^6)
a ~ dnorm(150, 100)
b1 ~ dnorm(0, 5)
b2 ~ dnorm(0, 5)
b3 ~ dnorm(0, 5)
b4 ~ dnorm(0, 5)
b5 ~ dnorm(0, 5)
b6 ~ dnorm(0, 5)
sigma ~ dunif(0, 30)
MAP values:
a b1 b2 b3 b4 b5
155.8559187 5.9710498 -16.6031204 12.0942108 -3.5113773 0.2316938
b6 sigma
0.0509131 8.1979731
Log-likelihood: -958.21
Plot of predictions:
> pred <- link(m6r, data=d.predict)
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> mu <- apply(pred, 2, mean)
> mu.PI <- apply(pred, 2, PI, prob=0.97)
> plot(height ~ age, data=d1, main="model m6a")
> lines(age.seq, mu, lty=2, lwd=2)
> lines(age.seq, mu.PI[1, ], lty=2)
> lines(age.seq, mu.PI[2, ], lty=2)
Out-of-sample deviances and model WAIC comparison to previously “best” models:
> d2.predict <- list(
+ age = d2$age
+ )
>
> mods <- list(m4, m6, m6r)
> deviances <- sapply(mods, function(mod) {
+ pred <- link(mod, data=d2.predict)
+ mu <- apply(pred, 2, mean)
+ -2*sum(dnorm(d2$height, mu, coef(mod)["sigma"], log=TRUE))
+ })
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
[ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
> names(deviances) <- c("m4", "m6", "m6r")
> deviances <- sort(deviances)
> deviances
m6r m6 m4
1875.22 1875.43 1876.35
> compare(m4, m6, m6r)
WAIC pWAIC dWAIC weight SE dSE
m4 1926.6 5.9 0.0 0.68 25.58 NA
m6 1928.6 7.5 2.0 0.25 25.31 1.74
m6r 1931.1 6.9 4.5 0.07 25.71 2.04
Model m6r
(with the more strongly regularized prior) looks best with respect to out-of-sample prediction but not according to the WAIC.
I was curious about what a least-squares analysis of the data would show. For this purpose, I’ll use unstandardized age.
> d <- Howell1 # restoring the unstandardized data
> d1 <- d[i, ]
> d2 <- d[-i, ]
>
> mod.6 <- lm(height ~ poly(age, 6), data=d1)
> summary(mod.6)
Call:
lm(formula = height ~ poly(age, 6), data = d1)
Residuals:
Min 1Q Median 3Q Max
-24.97 -5.83 -0.08 5.98 28.87
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 138.22 0.50 276.55 < 2e-16 ***
poly(age, 6)1 304.78 8.24 36.98 < 2e-16 ***
poly(age, 6)2 -249.36 8.24 -30.25 < 2e-16 ***
poly(age, 6)3 146.99 8.24 17.83 < 2e-16 ***
poly(age, 6)4 -43.94 8.24 -5.33 2.1e-07 ***
poly(age, 6)5 -5.09 8.24 -0.62 0.54
poly(age, 6)6 10.26 8.24 1.25 0.21
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.24 on 265 degrees of freedom
Multiple R-squared: 0.908, Adjusted R-squared: 0.906
F-statistic: 438 on 6 and 265 DF, p-value: <2e-16
That suggests trying a 4th-degree polynomial (via the no-no of using significance tests for model selection):
> mod.4 <- update(mod.6, . ~ poly(age, 4))
> summary(mod.4)
Call:
lm(formula = height ~ poly(age, 4), data = d1)
Residuals:
Min 1Q Median 3Q Max
-25.90 -6.00 -0.24 5.63 29.12
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 138.22 0.50 276.59 < 2e-16 ***
poly(age, 4)1 304.78 8.24 36.98 < 2e-16 ***
poly(age, 4)2 -249.36 8.24 -30.26 < 2e-16 ***
poly(age, 4)3 146.99 8.24 17.83 < 2e-16 ***
poly(age, 4)4 -43.94 8.24 -5.33 2.1e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.24 on 267 degrees of freedom
Multiple R-squared: 0.908, Adjusted R-squared: 0.906
F-statistic: 657 on 4 and 267 DF, p-value: <2e-16
> anova(mod.4, mod.6)
Analysis of Variance Table
Model 1: height ~ poly(age, 4)
Model 2: height ~ poly(age, 6)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 267 18136
2 265 18004 2 131.3 0.966 0.382
What does this fit look like? I’ll also compare to a 4-df regression spline.
> library(splines)
> mod.spline <- update(mod.4, . ~ ns(age, 4))
>
> age.seq <- seq(min(d1$age), max(d1$age), length=50)
> d.predict.us <- list(
+ age = age.seq
+ )
>
> yhat.4 <- predict(mod.4, newdata=d.predict.us)
> yhat.spline <- predict(mod.spline, newdata=d.predict.us)
>
> plot(height ~ age, data=d1)
> lines(age.seq, yhat.4, lwd=2, col="magenta")
> lines(age.seq, yhat.spline, lwd=2, lty=2, col="blue")
The two fits are remarkably similar.
How does the LS fit do with respect to out-of-sample prediction relative to the best Bayesian models?
> d2.predict <- list(
+ age = d2$age
+ )
>
> mu <- predict(mod.4, newdata=d2.predict)
> deviances[4] <- -2*sum(dnorm(d2$height, mu, summary(mod.4)$sigma, log=TRUE))
> names(deviances)[4] <- "mod.4"
> deviances
m6r m6 m4 mod.4
1875.22 1875.43 1876.35 1877.30
Answer: Pretty similar prediction error, but quite as good as the best Bayesian polynomial model fits.
Finally, I’ll try a three-parameter logistic growth model, which on general grounds seems a more reasonable choice than a high-degree polynomial. In this model, \(\theta_1\) is the asymptotic mean height, \(\theta_2\) adjusts the initial level of mean height at age 0, and \(theta_3\) is a growth parameter. The start value for \(\theta_1\) is a guessed value, and the other start values are determined by linear least squares for a linearized version of the model.
> library(nlme)
> th1.start <- 1.01*max(d1$height)
> mod.start <- lm(log((height/th1.start)/(1 - height/th1.start)) ~ age, data=d1)
> start <- coef(mod.start)
>
> mod.logistic <- nls(height ~ theta1/(1 + exp(-(theta2 + theta3*age))), data=d1,
+ start=list(theta1=th1.start, theta2=start[1], theta3=start[2]))
> summary(mod.logistic)
Formula: height ~ theta1/(1 + exp(-(theta2 + theta3 * age)))
Parameters:
Estimate Std. Error t value Pr(>|t|)
theta1 155.77722 0.73229 212.73 < 2e-16 ***
theta2 -0.28392 0.04502 -6.31 1.2e-09 ***
theta3 0.16200 0.00725 22.35 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.42 on 269 degrees of freedom
Number of iterations to convergence: 10
Achieved convergence tolerance: 1.81e-06
> plot(height ~ age, data=d1)
> ages <- seq(min(d1$age), max(d1$age), length=100)
> yhat <- predict(mod.logistic, newdata=list(age=ages))
> lines(ages, yhat)
Let’s see how this model does in the test sample:
> mu <- predict(mod.logistic, newdata=d2.predict)
> deviances[5] <- -2*sum(dnorm(d2$height, mu, summary(mod.logistic)$sigma, log=TRUE))
> names(deviances)[5] <- "mod.logistic"
> deviances
m6r m6 m4 mod.4 mod.logistic
1875.22 1875.43 1876.35 1877.30 1890.83
So, the predictive performance of the logistic model is considerably worse than the previously fit “better” models.