##------------------------------------------------------------------------## ## Script for Review of Linear Models ## ## John Fox ## ## York SPIDA 2010 ## ##------------------------------------------------------------------------## # multiple regression library(car) # load car package ?Duncan # help page for data set some(Duncan) # first few rows View(Duncan) # browse data set scatterplot.matrix(~ prestige + income + education, data=Duncan) scatterplot(income ~ education, data=Duncan, labels=rownames(Duncan)) duncan.mod <- lm(prestige ~ income + education, data=Duncan) summary(duncan.mod) confint(duncan.mod) # incremental F-test duncan.mod.inc <- update(duncan.mod, . ~ . - education) # remove education anova(duncan.mod.inc, duncan.mod) # test if education coefficient = 0 anova(duncan.mod) # sequential ("Type-I") tests Anova(duncan.mod) # partial ("Type-II") tests (equivalent to t-tests) # linear hypothesis (L <- matrix(c(0, 1, -1), 1, 3)) # hypothesis matrix linear.hypothesis(duncan.mod, L, 0) linear.hypothesis(duncan.mod, "education = income") # equivalent # dummy regression # dichotomous factor some(Davis) scatterplot(repwt ~ weight | sex, data=Davis, smooth=FALSE, labels=rownames(Davis)) fix(Davis) # edit data attach(Davis) # attach data set to search path (generally not a good idea) sex class(sex) contrasts(sex) # dummy regressors for sex options("contrasts") mod.davis.1 <- lm(weight ~ repwt + sex) # additive model summary(mod.davis.1) mod.davis.2 <- lm(weight ~ repwt*sex) # model with interaction summary(mod.davis.2) anova(mod.davis.1, mod.davis.2) # test of interaction Anova(mod.davis.2) # type II tests, obeying marginality linear.hypothesis(mod.davis.2, # slopes AND intercepts c("sexM = 0", "repwt:sexM = 0")) # the same for M and F confidence.ellipse(mod.davis.2, which=3:4) # 95% confidence ellipse abline(h=0, v=0, lty=2) detach(Davis) # polytomous factor View(Prestige) Prestige <- na.omit(Prestige) attach(Prestige) # treatment of factors in R type class(type) contrasts(type) # dummy regressors type <- factor(type, levels=c("bc", "wc", "prof")) # change order of levels contrasts(type) # additive dummy-regression model mod.prestige.1 <- lm(prestige ~ income + education + type) summary(mod.prestige.1) mod.prestige.0 <- lm(prestige ~ income + education) anova(mod.prestige.0, mod.prestige.1) # incremental F-test for type linear.hypothesis(mod.prestige.1, c("typewc = 0", "typeprof = 0")) # dummy regression with interactions mod.prestige.2 <- lm(prestige ~ income + education + type + income:type + education:type) mod.prestige.2 <- lm(prestige ~ income*type + education*type) # equivalent mod.prestige.2 <- lm(prestige ~ type*(income + education)) # equivalent summary(mod.prestige.2) Anova(mod.prestige.2) # type-II tests # "effect" plots library(effects) # for "effect" displays plot(allEffects(mod.prestige.2), ask=FALSE) plot(allEffects(mod.prestige.2), multiline=TRUE, ask=FALSE) # regression diagnostics # unusual data # preliminary example remove(Davis) # get rid of fixed data set scatterplot(repwt ~ weight|sex, smooth=FALSE, labels=rownames(Davis), data=Davis) scatterplot(weight ~ repwt|sex, smooth=FALSE, labels=rownames(Davis), data=Davis) davis.1 <- lm(repwt ~ weight*sex, data=Davis) summary(davis.1) davis.2 <- lm(weight ~ repwt*sex, data=Davis) summary(davis.2) plot(hatvalues(davis.1)) # index plot of hat-values abline(h=c(2,3)*4/183) # twice and three-times average hat-value identify(hatvalues(davis.1)) plot(rstudent(davis.1)) # index plot of studentized residuals abline(h=c(-2, 0, 2)) identify(rstudent(davis.1)) outlier.test(davis.1) # Bonferroni t-test qq.plot(davis.1, simulate=TRUE, line="none") # QQ plot for stud. res. # Duncan's regression duncan <- lm(prestige ~ income + education, data=Duncan) summary(duncan) av.plots(duncan, ask=FALSE) # added-variable plots which.names(c("minister", "conductor"), Duncan) influencePlot(duncan, labels=rownames(Duncan)) qq.plot(duncan, simulate=TRUE) outlier.test(duncan) duncan.2 <- update(duncan, subset= -c(6,16)) # removing ministers & conductors summary(duncan.2) linear.hypothesis(duncan.2, "income = education") # non-normality SLID <- read.table( "http://socserv.socsci.mcmaster.ca/jfox/Books/Applied-Regression-2E/datasets/SLID-Ontario.txt", header=TRUE) slid0 <- lm(compositeHourlyWages ~ sex + age + yearsEducation, data=SLID) summary(slid0) qq.plot(slid0, simulate=TRUE, line="none", labels=FALSE) plot(density(rstudent(slid0))) slid1 <- lm(log2(compositeHourlyWages) ~ sex + age + yearsEducation, data=SLID) summary(slid1) qq.plot(slid1, simulate=TRUE, line="none", labels=FALSE) plot(density(rstudent(slid1))) boxplot(rstudent(slid1)) # non-constant error variance plot(fitted(slid0), rstudent(slid0), col="gray") abline(h=0, lty=2) lines(lowess(fitted(slid0), rstudent(slid0))) spread.level.plot(slid0) plot(fitted(slid1), rstudent(slid1), col="gray") abline(h=0, lty=2) lines(lowess(fitted(slid1), rstudent(slid1))) spread.level.plot(slid1) # nonlinearity cr.plots(slid1, ask=FALSE) slid2 <- lm(log2(compositeHourlyWages) ~ sex + poly(age, degree=2) + I(yearsEducation^2), data=SLID) # orthogonal polynomial in age summary(slid2) lm(log2(compositeHourlyWages) ~ sex + poly(age, degree=2, raw=TRUE) + I(yearsEducation^2), data=SLID) # "raw" quadratic lm(log2(compositeHourlyWages) ~ sex + age + I(age^2) + I(yearsEducation^2), data=SLID) # equivalent cr.plots(slid2, ask=FALSE) plot(allEffects(slid2), ask=FALSE) # effect plots plot(allEffects(slid2, transformation=list(link=log2, inverse=function(x) 2^x)), ylab="composite hourly wages", ask=FALSE) # plot on original dollar scale