##----------------------------------## ## Script for Part 6: Odds & Ends ## ## John Fox ## ## Statistical Computing in R ## ## McMaster University ## ## Fall 2005 ## ##----------------------------------## # Classes & object-orinted programming (S3 objects) library(car) # for data data(Prestige) mod.prestige <- lm(prestige ~ income + education + women, data=Prestige) attributes(mod.prestige) class(mod.prestige) print methods("print") print.lm mod.prestige print(mod.prestige) print.lm(mod.prestige) methods(class="lm") data(Mroz) mod.mroz <- glm(lfp ~ ., family=binomial, data=Mroz) class(mod.mroz) mod.mroz summary methods("summary") # writing method functions lreg.3 <- function(X, y, predictors=colnames(X), max.iter=10, tol=1E-6, constant=TRUE){ if (!is.numeric(X) || !is.matrix(X)) stop('X must be a numeric matrix') if (!is.numeric(y) || !all(y == 0 | y == 1)) stop('y must contain only 0s and 1s') if (nrow(X) != length(y)) stop('X and y contain different numbers of observations') if (constant) { X <- cbind(1, X) colnames(X)[1] <- 'Constant' } b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ p <- as.vector(1/(1 + exp(-X %*% b))) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } dev <- -2*sum(y*log(p) + (1 - y)*log(1 - p)) if (it > max.iter) warning('maximum iterations exceeded') result <- list(coefficients=as.vector(b), var=var.b, deviance=dev, converged= it <= max.iter, predictors=predictors) class(result) <- 'lreg' result } attach(Mroz) some(Mroz) lfp <- ifelse(lfp == "yes", 1, 0) wc <- ifelse(wc == "yes", 1, 0) hc <- ifelse(hc == "yes", 1, 0) mod.mroz.3 <- lreg.3(cbind(k5, k618, age, wc, hc, lwg, inc), lfp) class(mod.mroz.3) mod.mroz.3 print.lreg <- function(x) { coef <- x$coefficients names(coef) <- x$predictors print(coef) if (!x$converged) cat('\n *** lreg did not converge ***\n') invisible(x) } summary.lreg <- function(object) { b <- object$coefficients se <- sqrt(diag(object$var)) z <- b/se table <- cbind(b, se, z, 2*(1-pnorm(abs(z)))) colnames(table) <- c('Coefficient', 'Std.Error', 'z', 'p') rownames(table) <- object$predictors print(table) cat('\nDeviance =', object$deviance,'\n') if (!object$converged) cat('\n Note: *** lreg did not converge ***\n') } # Note: It's conventional for a summary method to produce an object # to be printed (here, it would be of class print.summary.lreg), # but I opted for a simpler approach. mod.mroz.3 summary(mod.mroz.3) # writing a generic function Rsq <- function(model, ...) UseMethod("Rsq") Rsq.lm <- function(model, ...){ summary(model)$r.squared } Rsq.glm <- function(model, ...){ sumry <- summary(model) 1 - sumry$deviance/sumry$null.deviance } Rsq(mod.prestige) Rsq(mod.mroz) Rsq(Mroz) Rsq.default <- function(model, ...){ stop(paste("Rsq not available for objects of class", class(model))) } Rsq(Mroz) # Debugging lreg.4 <- function(X, y, predictors=colnames(X), max.iter=10, # bugged! tol=1E-6, constant=TRUE){ if (!is.numeric(X) || !is.matrix(X)) stop('X must be a numeric matrix') if (!is.numeric(y) || !all(y == 0 | y == 1)) stop('y must contain only 0s and 1s') if (nrow(X) != length(y)) stop('X and y contain different numbers of observations') if (constant) { X <- cbind(1, X) colnames(X)[1] <- 'Constant' } b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } dev <- -2*sum(y*log(p) + (1 - y)*log(1 - p)) if (it > max.iter) warning('maximum iterations exceeded') result <- list(coefficients=as.vector(b), var=var.b, deviance=dev, converged= it <= max.iter, predictors=predictors) class(result) <- 'lreg' result } mod.mroz.4 <- lreg.4(cbind(k5, k618, age, wc, hc, lwg, inc), lfp) # locating the error with traceback() traceback() # stopping execution with browser() # for details: ?browser # or c: continue execution # n: execute next line and enter debugger # Q: quit lreg.4 <- function(X, y, predictors=colnames(X), max.iter=10, # bugged! tol=1E-6, constant=TRUE){ if (!is.numeric(X) || !is.matrix(X)) stop('X must be a numeric matrix') if (!is.numeric(y) || !all(y == 0 | y == 1)) stop('y must contain only 0s and 1s') if (nrow(X) != length(y)) stop('X and y contain different numbers of observations') if (constant) { X <- cbind(1, X) colnames(X)[1] <- 'Constant' } b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) browser() var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } dev <- -2*sum(y*log(p) + (1 - y)*log(1 - p)) if (it > max.iter) warning('maximum iterations exceeded') result <- list(coefficients=as.vector(b), var=var.b, deviance=dev, converged= it <= max.iter, predictors=predictors) class(result) <- 'lreg' result } mod.mroz.4 <- lreg.4(cbind(k5, k618, age, wc, hc, lwg, inc), lfp) # stepping through the function with debug() # for details: ?debug # or n: execute next line # c: continue execution # Q: quit debug(lreg.4) # after reentering original bugged function! mod.mroz <- lreg.4(cbind(k5, k618, age, wc, hc, lwg, inc), lfp) undebug(lreg.4) # Putting together complex graphs # Diagram of the standard normal density function oldpar <- par(mar = c(5, 6, 4, 2) + 0.1) # leave room on the left oldpar # old parameter saved z <- seq(-4, 4, length=1000) p <- dnorm(z) plot(z, p, type="l", lwd=2, main=expression("The Standard Normal Density Function" ~~ phi(z)), ylab=expression(phi(z) == frac(1, sqrt(2*pi)) * ~~ e^- ~~ frac(z^2, 2))) abline(h=0, col="gray") abline(v=0, col="gray") z0 <- z[z >= 1.96] # define region to fill z0 <- c(z0[1], z0) p0 <- p[z >= 1.96] p0 <- c(0, p0) polygon(z0, p0, col="gray") coords <- locator(2) # locate head and tail of arrow arrows(coords$x[1], coords$y[1], coords$x[2], coords$y[2], code=1, length=0.125) text(coords$x[2], coords$y[2], pos=3, # text above tail of arrow expression(integral(phi(z)*dz, 1.96, infinity) == .025)) # a four-panel display explaining kernel regression (time permitting) UN <- read.table( "http://socserv.socsci.mcmaster.ca/jfox/Courses/MacRCourse/UnitedNations.txt") UN.2 <- na.omit(UN[, c("life.female", "gdp.capita")]) # valid data only head(UN.2) par(mfrow=c(2,2)) # 2 x 2 array of graphs gdp <- UN.2$gdp.capita life <- UN.2$life.female ord <- order(gdp) # sort data by gdp gdp <- gdp[ord] life <- life[ord] x0 <- gdp[120] # focal x = x_(120) dist <- abs(gdp - x0) # distance from focal x h <- sort(dist)[95] # bandwidth for span of .5 (where n = 190) pick <- dist <= h # observations within window plot(gdp, life, xlab="GDP per Capita", ylab="Female Expectation of Life", type="n", main="(a) Observations Within the Window\nspan = 0.5") points(gdp[pick], life[pick], col="gray20") points(gdp[!pick], life[!pick], col="gray60") abline(v=x0) # focal x abline(v=c(x0 - h, x0 + h), lty=2) # window text(locator(1), expression(x[(120)]), xpd=TRUE) plot(range(gdp), c(0,1), xlab="GDP per Capita", ylab="Tricube Kernel Weight", type="n", main="(b) Tricube Weights") abline(v=x0) abline(v=c(x0 - h, x0 + h), lty=2) # function to calculate tricube weights: tricube <- function(z) ifelse(abs(z) < 1, (1 - (abs(z))^3)^3, 0) x <- seq(min(gdp), max(gdp), length=1000) z <- (x - x0)/h lines(spline(x, tricube(z)), lwd=2) points(gdp[pick], tricube(((gdp-x0)/h)[pick]), col="gray20") abline(h=c(0,1), col="gray") plot(gdp, life, xlab="GDP per Capita", ylab="Female Expectation of Life", type="n", main="(c) Weighted Average (Kernal Estimate)") points(gdp[pick], life[pick], col="gray20") points(gdp[!pick], life[!pick], col="gray60") abline(v=x0) abline(v=c(x0 - h, x0 + h), lty=2) yhat <- weighted.mean(life, tricube((gdp - x0)/h)) # kernel estimate lines(c(x0 - h, x0 + h), c(yhat, yhat), lwd=3) plot(gdp, life, xlab="GDP per Capita", ylab="Female Expectation of Life", main="(d) Complete Kernel Estimate") yhat <- rep(0, length(gdp)) for (i in 1:length(gdp)){ # kernel estimate at each x x0 <- gdp[i] dist <- abs(gdp - x0) h <- sort(dist)[95] yhat[i] <- weighted.mean(life, tricube((gdp - x0)/h)) } lines(gdp, yhat, lwd=2)