##------------------------------------------------------------------------## ## Script for web appendix on Cox regression ## ## An R and S-PLUS Companion to Applied Regression ## ## John Fox ## ## Sage Publications, 2002 ## ##------------------------------------------------------------------------## # preliminaries options(width=65) options(digits=5) # time-constant covariates library(survival) # R only args(coxph) Rossi <- read.table('Rossi.txt', header=T) Rossi[1:5, 1:10] mod.allison <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio, data=Rossi) mod.allison summary(mod.allison) plot(survfit(mod.allison), ylim=c(.7, 1), xlab='Weeks', ylab='Proportion Not Rearrested') attach(Rossi) Rossi.fin <- data.frame(fin=c(0,1), age=rep(mean(age),2), race=rep(mean(race),2), wexp=rep(mean(wexp),2), mar=rep(mean(mar),2), paro=rep(mean(paro),2), prio=rep(mean(prio),2)) detach() plot(survfit(mod.allison, newdata=Rossi.fin), conf.int=T, lty=c(1,2), ylim=c(.6, 1), xlab='Weeks', ylab='Proportion Not Rearrested') legend(locator(1), legend=c('fin = 0', 'fin = 1'), lty=c(1,2)) # time-dependent covariate Rossi[1,] names(Rossi)[11:62] sum(!is.na(Rossi[,11:62])) # record count Rossi.2 <- matrix(0, 19809, 14) # to hold new data set # in S-PLUS the next command needs the colnames function in the car library colnames(Rossi.2) <- c('start', 'stop', 'arrest.time', names(Rossi)[1:10], 'employed') row <- 0 # set record counter to 0 for (i in 1:nrow(Rossi)) { # loop over individuals for (j in 11:62) { # loop over 52 weeks if (is.na(Rossi[i, j])) next # skip missing data else { row <- row + 1 # increment row counter start <- j - 11 # start time (previous week) stop <- start + 1 # stop time (current week) arrest.time <- if (stop == Rossi[i, 1] && Rossi[i, 2] ==1) 1 else 0 # construct record: Rossi.2[row,] <- c(start, stop, arrest.time, unlist(Rossi[i, c(1:10, j)])) } } } Rossi.2 <- as.data.frame(Rossi.2) remove(i, j, row, start, stop, arrest.time) # clean up # S-PLUS: remove(c('row', 'start', 'stop', 'arrest.time')) Rossi.2 <- fold(Rossi, time='week', event='arrest', cov=11:62, cov.names='employed') Rossi.2[1:50,] mod.allison.2 <- coxph(Surv(start, stop, arrest.time) ~ fin + age + race + wexp + mar + paro + prio + employed, data=Rossi.2) summary(mod.allison.2) # lagged, time-dependent covariate Rossi.3 <- fold(Rossi, 'week', 'arrest', 11:62, 'employed', lag=1) mod.allison.3 <- coxph(Surv(start, stop, arrest.time) ~ fin + age + race + wexp + mar + paro + prio + employed, data=Rossi.3) summary(mod.allison.3) # diagnostics mod.allison.4 <- coxph(Surv(week, arrest) ~ fin + age + prio, data=Rossi) mod.allison.4 # proportional hazards cox.zph(mod.allison.4) par(mfrow=c(2,2)) plot(cox.zph(mod.allison.4)) # modeling an interaction with time mod.allison.5 <- coxph(Surv(start, stop, arrest.time) ~ fin + age + age:stop + prio, data=Rossi.2) mod.allison.5 # fitting by strata library(car) Rossi$age.cat <- recode(Rossi$age, " lo:19=1; 20:25=2; 26:30=3; 31:hi=4 ") table(Rossi$age.cat) ## Rossi$age.cat <- cut(Rossi$age, c(0,19,25,30,Inf)) # alternative mod.allison.6 <- coxph(Surv(week, arrest) ~ fin + prio + strata(age.cat), data=Rossi) mod.allison.6 cox.zph(mod.allison.6) # influence dfbeta <- residuals(mod.allison.4, type='dfbeta') par(mfrow=c(2,2)) for (j in 1:3) { plot(dfbeta[,j], ylab=names(coef(mod.allison.4))[j]) abline(h=0, lty=2) } # nonlinearity par(mfrow=c(2,2)) res <- residuals(mod.allison.4, type='martingale') X <- as.matrix(Rossi[,c("age", "prio")]) # matrix of covariates par(mfrow=c(2,2)) for (j in 1:2) { # residual plots plot(X[,j], res, xlab=c("age", "prio")[j], ylab="residuals") abline(h=0, lty=2) lines(lowess(X[,j], res, iter=0)) } b <- coef(mod.allison.4)[c(2,3)] # regression coefficients for (j in 1:2) { # partial-residual plots plot(X[,j], b[j]*X[,j] + res, xlab=c("age", "prio")[j], ylab="component+residual") abline(lm(b[j]*X[,j] + res ~ X[,j]), lty=2) lines(lowess(X[,j], b[j]*X[,j] + res, iter=0)) } ##---------------------------------------------------------------------------------- ## The fold function to create datasets with time-varying covariates ##---------------------------------------------------------------------------------- # R version (last modified 11 June 2006) fold <- function(data, time, event, cov, cov.names=paste('covariate', '.', 1:ncovs, sep=""), suffix='.time', cov.times=0:ncov, common.times=TRUE, lag=0){ ## Arguments: ## data: A data frame or numeric matrix (with column names) to be `unfolded.' ## For reasons of efficiency, if there are factors in data these will be ## converted to numeric variables in the output data frame. ## time: The quoted name of the event/censoring-time variable in data. ## event: The quoted name of the event/censoring-indicator variable in data. ## cov: A vector giving the column numbers of the time-dependent covariate ## in data, or a list of vectors if there is more than one time-varying ## covariate. ## cov.names: A character string or character vector giving the name or names ## to be assigned to the time-dependent covariate(s) in the output data set. ## suffix: The suffix to be attached to the name of the time-to-event variable ## in the output data set; defaults to '.time'. ## cov.times: The observation times for the covariate values, including the start ## time. This argument can take several forms: ## * The default is integers from 0 to the number of covariate values (i.e., ## one more than the length of each vector in cov). ## * An arbitrary numerical vector with one more entry than the length of each ## * vector in cov. ## * The columns in the input data set that give the observations times for each ## individual. There should be one more column than the length of each ## vector in cov. ## common.times: A logical value indicating whether the times of observation ## are the same for all individuals; defaults to TRUE. ## lag: Number of observation periods to lag each value of the time-varying ## covariate(s); defaults to 0. vlag <- function(x, lag) c(rep(NA, lag), x[1:(length(x)-lag)]) xlag <- function(x, lag) apply(as.matrix(x), 2, vlag, lag=lag) all.cov <- unlist(cov) if (!is.list(cov)) cov <- list(cov) ncovs <- length(cov) nrow <- nrow(data) ncol <- ncol(data) ncov <- length(cov[[1]]) nobs <- nrow*ncov if (length(unique(c(sapply(cov, length), length(cov.times)-1))) > 1) stop(paste( "all elements of cov must be of the same length and \n", "cov.times must have one more entry than each element of cov.")) var.names <- names(data) subjects <- rownames(data) omit.cols <- if (!common.times) c(all.cov, cov.times) else all.cov keep.cols <- (1:ncol)[-omit.cols] nkeep <- length(keep.cols) if (is.numeric(event)) event <- var.names[event] times <- if (common.times) matrix(cov.times, nrow, ncov+1, byrow=TRUE) else data[, cov.times] new.data <- matrix(Inf, nobs, 3 + ncovs + nkeep) rownames <- rep("", nobs) colnames(new.data) <- c('start', 'stop', paste(event, suffix, sep=""), var.names[-omit.cols], cov.names) end.row <- 0 for (i in 1:nrow){ start.row <- end.row + 1 end.row <- end.row + ncov start <- as.numeric(times[i, 1:ncov]) stop <- as.numeric(times[i, 2:(ncov+1)]) event.time <- as.numeric(ifelse (stop == data[i, time] & data[i, event] == 1, 1, 0)) keep <- matrix(unlist(data[i, -omit.cols]), ncov, nkeep, byrow=TRUE) select <- apply(matrix(!is.na(data[i, all.cov]), ncol=ncovs), 1, all) rows <- start.row:end.row cov.mat <- xlag(matrix(unlist(data[i, all.cov]), nrow=length(rows)), lag) new.data[rows[select], ] <- cbind(start, stop, event.time, keep, cov.mat)[select,] rownames[rows] <- paste(subjects[i], '.', seq(along=rows), sep="") } row.names(new.data) <- rownames as.data.frame(new.data[new.data[, 1] != Inf & apply(as.matrix(!is.na(new.data[, cov.names])), 1, all), ]) } # S-PLUS version (requires the car library; last modified 11 June 2006) vlag <- function(x, lag) c(rep(NA, lag), x[1:(length(x)-lag)]) xlag <- function(x, lag) apply(as.matrix(x), 2, vlag, lag=lag) fold <- function(data, time, event, cov, cov.names=paste('covariate', '.', 1:ncovs, sep=""), suffix='.time', cov.times=0:ncov, common.times=TRUE, lag=0){ ## Arguments: same as the R version all.cov <- unlist(cov) if (!is.list(cov)) cov <- list(cov) ncovs <- length(cov) nrow <- nrow(data) ncol <- ncol(data) ncov <- length(cov[[1]]) nobs <- nrow*ncov if (length(unique(c(sapply(cov, length), length(cov.times)-1))) > 1) stop(paste( "all elements of cov must be of the same length and \n", "cov.times must have one more entry than each element of cov.")) var.names <- names(data) subjects <- rownames(data) omit.cols <- if (!common.times) c(all.cov, cov.times) else all.cov keep.cols <- (1:ncol)[-omit.cols] nkeep <- length(keep.cols) if (is.numeric(event)) event <- var.names[event] times <- if (common.times) matrix(cov.times, nrow, ncov+1, byrow=T) else data[, cov.times] new.data <- matrix(Inf, nobs, 3 + ncovs + nkeep) rownames <- rep("", nobs) colnames(new.data) <- c('start', 'stop', paste(event, suffix, sep=""), var.names[-omit.cols], cov.names) end.row <- 0 for (i in 1:nrow){ start.row <- end.row + 1 end.row <- end.row + ncov start <- as.numeric(times[i, 1:ncov]) stop <- as.numeric(times[i, 2:(ncov+1)]) event.time <- as.numeric(ifelse (stop == data[i, time] & data[i, event] == 1, 1, 0)) keep <- matrix(unlist(data[i, -omit.cols]), ncov, nkeep, byrow=T) select <- apply(matrix(!is.na(data[i, all.cov]), ncol=ncovs), 1, all) rows <- start.row:end.row cov.mat <- xlag(matrix(unlist(data[i, all.cov]), nrow=length(rows)), lag) new.data[rows[select], ] <- cbind(start, stop, event.time, keep, cov.mat)[select,] rownames[rows] <- paste(subjects[i], '.', seq(along=rows), sep="") } row.names(new.data) <- rownames as.data.frame(new.data[new.data[, 1] != Inf & apply(as.matrix(!is.na(new.data[, cov.names])), 1, all), ]) }