##------------------------------------------------------------------------## ## Script for Chapter 3, An R and S-PLUS Companion to Applied Regression ## ## John Fox ## ## Sage Publications, 2002 ## ##------------------------------------------------------------------------## # preliminaries options(width=65) options(digits=5) library(car) data(Prestige) attach(Prestige) Prestige[1:5,] # first 5 obs. # histograms, density plots, stem-and-leaf displays hist(income) n.bins(income) hist(income, nclass=n.bins(income), col=1) box() stem(income) hist(income, nclass=n.bins(income), probability=T, ylab='Density') lines(density(income), lwd=2) points(income, rep(0, length(income)), pch="|") box() lines(density(income, adjust=.5), lwd=1) ## S alternative: lines(density(income, from=min(income), to=max(income))) # quantile-comparison plots qq.plot(income, labels=rownames(Prestige)) qq.plot(rchisq(100,3), distribution="chisq", df=3) # boxplots boxplot(income, ylab="income") identify(rep(1,length(income)), income, rownames(Prestige)) # S: identify(rep(50,length(income)), income, rownames(Prestige)) # scatterplots plot(income, prestige) scatterplot(income, prestige, span=.6, lwd=3, labels=rownames(Prestige), las=0) # coded scatterplot(prestige ~ income | type, span=.8) # jittered detach(Prestige) data(Vocab) attach(Vocab) plot(education, vocabulary) plot(jitter(education), jitter(vocabulary)) plot(jitter(education, factor=2), jitter(vocabulary, factor=2)) abline(lm(vocabulary ~ education), lwd=3, lty=2) lines(lowess(education, vocabulary, f=.2), lwd=3) # bivariate density estimates detach(Vocab) data(SLID) attach(SLID) plot(education, wages) valid<-!(is.na(wages) | is.na(education)) ## R equivalent: valid <- complete.cases(wages, education) sum(valid) sort(unique(education[valid])) length(.Last.value) length(unique(wages[valid])) library(sm) sm.density(cbind(education[valid], wages[valid]), display="image", xlab="Education", ylab="Wages", col=gray(seq(1,0,length=100))) points(jitter(education, amount=.25), wages, cex=.15) box() lines(lowess(education[valid], wages[valid], f=1/3), lwd=3) abline(lm(wages~education), lty=2, lwd=3) remove(valid) # parallel boxplots detach(SLID) data(Ornstein) attach(Ornstein) Ornstein[sort(sample(248,5)),] boxplot(interlocks~nation, ylab="Number of Interlocks") identify(as.numeric(nation),interlocks) # scatterplot matrix detach(Vocab) attach(Prestige) pairs(cbind(prestige, income, education, women)) scatterplot.matrix(cbind(prestige,income,education, women), diag="density", span=.75) # conditioning plots detach(Prestige) attach(SLID) coplot(log(wages)~education|age+sex, panel=panel.car, col=gray(.5), lwd=3, cex=.4) # Transformations box.cox(1:5, 2) box.cox(0:5, 2) logit(seq(.1, .9, .1)) logit(seq(.0, 1, .1)) detach(SLID) attach(Prestige) plot(density(women, from=0, to=100)) plot(density(logit(women), adjust=.75)) # for symmetry Ask(p, function(p) qq.plot(box.cox(income, p), ylab="transformed income")) for (p in c(1/2, 1/3, 0, -1/3, -1/2, -1)) qq.plot(box.cox(income, p), ylab=paste("transformed income, p =",p)) summary(box.cox.powers(income)) summary(box.cox.powers(cbind(income,education))) scatterplot(income^.25 ,education^.5, span=.75, lwd=3) # to equalize spreads (spread-level plots) detach(Prestige) attach(Ornstein) spread.level.plot(interlocks+1~nation) old.margins<-par('mar') old.margins par(mar=c(5.1, 4.1, 4.1, 4.1)) boxplot(log(interlocks+1,10)~nation, ylab='log10(Interlocks + 1)') power.axis(power=0, base=10, at=c(1,10,100), axis.title='Interlocks + 1') par(mar=old.margins) remove(old.margins) # for linearity detach(Ornstein) data(UN) attach(UN) scatterplot(gdp, infant.mortality, labels=rownames(UN)) Ask(p, function(p) scatterplot(box.cox(gdp,p[1]), box.cox(infant.mortality, p[2]), xlab="transformed GDP/capita", ylab="transformed infant mortality", labels=row.names(UN)))