##------------------------------------------------------------------------##
## Script for web appendix on Cox regression                              ##
## An R and S-PLUS Companion to Applied Regression                        ##
##    John Fox                                                            ##
##    Sage Publications, 2002                                             ##
##------------------------------------------------------------------------##

# preliminaries

options(width=65)
options(digits=5)

# time-constant covariates

library(survival)  # R only
args(coxph)

Rossi <- read.table('Rossi.txt', header=T)
Rossi[1:5, 1:10]

mod.allison <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio,
    data=Rossi)
mod.allison

summary(mod.allison)

plot(survfit(mod.allison), ylim=c(.7, 1), xlab='Weeks', 
    ylab='Proportion Not Rearrested')

attach(Rossi)
Rossi.fin <- data.frame(fin=c(0,1), age=rep(mean(age),2), race=rep(mean(race),2),
    wexp=rep(mean(wexp),2), mar=rep(mean(mar),2), paro=rep(mean(paro),2), 
    prio=rep(mean(prio),2))
detach()
plot(survfit(mod.allison, newdata=Rossi.fin), conf.int=T, 
    lty=c(1,2), ylim=c(.6, 1), xlab='Weeks', 
    ylab='Proportion Not Rearrested')
legend(locator(1), legend=c('fin = 0', 'fin = 1'), lty=c(1,2))    


# time-dependent covariate

Rossi[1,]

names(Rossi)[11:62]
sum(!is.na(Rossi[,11:62]))  # record count


Rossi.2 <- matrix(0, 19809, 14)  # to hold new data set
# in S-PLUS the next command needs the colnames function in the car library
colnames(Rossi.2) <- c('start', 'stop', 'arrest.time', names(Rossi)[1:10], 'employed')

row <- 0  # set record counter to 0
for (i in 1:nrow(Rossi)) {  # loop over individuals
    for (j in 11:62) {      # loop over 52 weeks
        if (is.na(Rossi[i, j])) next # skip missing data
        else {
            row <- row + 1     # increment row counter
            start <- j - 11    # start time (previous week)
            stop <- start + 1  # stop time (current week)
            arrest.time <- if (stop == Rossi[i, 1] && Rossi[i, 2] ==1) 1 else 0
             # construct record:
            Rossi.2[row,] <- c(start, stop, arrest.time, unlist(Rossi[i, c(1:10, j)]))
            }
        }
    }
Rossi.2 <- as.data.frame(Rossi.2)
remove(i, j, row, start, stop, arrest.time) # clean up

# S-PLUS: remove(c('row', 'start', 'stop', 'arrest.time'))

Rossi.2 <- fold(Rossi, time='week', 
    event='arrest', cov=11:62, cov.names='employed')

Rossi.2[1:50,]

mod.allison.2 <- coxph(Surv(start, stop, arrest.time) ~ 
    fin + age + race + wexp + mar + paro + prio + employed,
    data=Rossi.2)
summary(mod.allison.2)

# lagged, time-dependent covariate
    
Rossi.3 <- fold(Rossi, 'week', 'arrest', 11:62, 'employed', lag=1)
mod.allison.3 <- coxph(Surv(start, stop, arrest.time) ~ 
    fin + age + race + wexp + mar + paro + prio + employed,
    data=Rossi.3)
summary(mod.allison.3)


# diagnostics
mod.allison.4 <- coxph(Surv(week, arrest) ~ fin + age + prio,
    data=Rossi)

mod.allison.4

    # proportional hazards
    
cox.zph(mod.allison.4)
par(mfrow=c(2,2))
plot(cox.zph(mod.allison.4))

    # modeling an interaction with time
    
mod.allison.5 <- coxph(Surv(start, stop, arrest.time) ~ 
    fin + age + age:stop + prio,
    data=Rossi.2)
mod.allison.5

    # fitting by strata
    
library(car)
Rossi$age.cat <- recode(Rossi$age, " lo:19=1; 20:25=2; 26:30=3; 31:hi=4 ")
table(Rossi$age.cat)

## Rossi$age.cat <- cut(Rossi$age, c(0,19,25,30,Inf)) # alternative

mod.allison.6 <- coxph(Surv(week, arrest) ~ 
    fin + prio + strata(age.cat),
    data=Rossi)
mod.allison.6
cox.zph(mod.allison.6)

    # influence
    
dfbeta <- residuals(mod.allison.4, type='dfbeta')
par(mfrow=c(2,2))
for (j in 1:3) {
    plot(dfbeta[,j], ylab=names(coef(mod.allison.4))[j])
    abline(h=0, lty=2)
    }

    # nonlinearity

par(mfrow=c(2,2))        
res <- residuals(mod.allison.4, type='martingale')
X <- as.matrix(Rossi[,c("age", "prio")]) # matrix of covariates
par(mfrow=c(2,2))
for (j in 1:2) {  # residual plots
    plot(X[,j], res, xlab=c("age", "prio")[j], ylab="residuals")
    abline(h=0, lty=2)
    lines(lowess(X[,j], res, iter=0))
    }

b <- coef(mod.allison.4)[c(2,3)]  # regression coefficients
for (j in 1:2) {  # partial-residual plots
    plot(X[,j], b[j]*X[,j] + res, xlab=c("age", "prio")[j], 
        ylab="component+residual")
    abline(lm(b[j]*X[,j] + res ~ X[,j]), lty=2)
    lines(lowess(X[,j], b[j]*X[,j] + res, iter=0))
    }

##----------------------------------------------------------------------------------
##      The fold function to create datasets with time-varying covariates
##----------------------------------------------------------------------------------

# R version (last modified 11 June 2006)
fold <- function(data, time, event, cov, 
    cov.names=paste('covariate', '.', 1:ncovs, sep=""), 
    suffix='.time', cov.times=0:ncov, common.times=TRUE, lag=0){
##  Arguments:
##  data: A data frame or numeric matrix (with column names) to be `unfolded.' 
##      For reasons of efficiency, if there are factors in data these will be 
##      converted to numeric variables in the output data frame.
##  time: The quoted name of the event/censoring-time variable in data.
##  event: The quoted name of the event/censoring-indicator variable in data.
##  cov: A vector giving the column numbers of the time-dependent covariate 
##      in data, or a list of vectors if there is more than one time-varying 
##      covariate.
##  cov.names: A character string or character vector giving the name or names 
##      to be assigned to the time-dependent covariate(s) in the output data set.
##  suffix: The suffix to be attached to the name of the time-to-event variable 
##      in the output data set; defaults to '.time'.
##  cov.times: The observation times for the covariate values, including the start 
##      time. This argument can take several forms: 
##      *   The default is integers from 0 to the number of covariate values (i.e., 
##              one more than the length of each vector in cov).
##      *   An arbitrary numerical vector with one more entry than the length of each 
##      *       vector in cov.
##      *   The columns in the input data set that give the observations times for each
##              individual. There should be one more column than the length of each 
##              vector in cov.
##  common.times: A logical value indicating whether the times of observation 
##      are the same for all individuals; defaults to TRUE.
##  lag: Number of observation periods to lag each value of the time-varying 
##      covariate(s); defaults to 0.
    vlag <- function(x, lag) c(rep(NA, lag), x[1:(length(x)-lag)])
    xlag <- function(x, lag) apply(as.matrix(x), 2, vlag, lag=lag)
    all.cov <- unlist(cov)
    if (!is.list(cov)) cov <- list(cov)
    ncovs <- length(cov)
    nrow <- nrow(data)
    ncol <- ncol(data)
    ncov <- length(cov[[1]])
    nobs <- nrow*ncov
    if (length(unique(c(sapply(cov, length), length(cov.times)-1))) > 1)
        stop(paste(
            "all elements of cov must be of the same length and \n",
            "cov.times must have one more entry than each element of cov."))
    var.names <- names(data)
    subjects <- rownames(data)
    omit.cols <- if (!common.times) c(all.cov, cov.times) else all.cov
    keep.cols <- (1:ncol)[-omit.cols]
    nkeep <- length(keep.cols)
    if (is.numeric(event)) event <- var.names[event]
    times <- if (common.times) matrix(cov.times, nrow, ncov+1, byrow=TRUE)
        else data[, cov.times]
    new.data <- matrix(Inf, nobs, 3 + ncovs + nkeep)
    rownames <- rep("", nobs)
    colnames(new.data) <- c('start', 'stop', paste(event, suffix, sep=""), 
        var.names[-omit.cols], cov.names)
    end.row <- 0
    for (i in 1:nrow){
        start.row <- end.row + 1
        end.row <- end.row + ncov
        start <- as.numeric(times[i, 1:ncov])
        stop <- as.numeric(times[i, 2:(ncov+1)])
        event.time <- as.numeric(ifelse (stop == data[i, time] & data[i, event] == 1, 1, 0))
        keep <- matrix(unlist(data[i, -omit.cols]), ncov, nkeep, byrow=TRUE)
        select <- apply(matrix(!is.na(data[i, all.cov]), ncol=ncovs), 1, all)
        rows <- start.row:end.row
        cov.mat <- xlag(matrix(unlist(data[i, all.cov]), nrow=length(rows)), lag)
        new.data[rows[select], ] <- 
            cbind(start, stop, event.time, keep, cov.mat)[select,]
        rownames[rows] <- paste(subjects[i], '.', seq(along=rows), sep="")
        }
    row.names(new.data) <- rownames
    as.data.frame(new.data[new.data[, 1] != Inf & 
        apply(as.matrix(!is.na(new.data[, cov.names])), 1, all), ])
    }


# S-PLUS version (requires the car library; last modified 11 June 2006)
vlag <- function(x, lag) c(rep(NA, lag), x[1:(length(x)-lag)])
xlag <- function(x, lag) apply(as.matrix(x), 2, vlag, lag=lag)
fold <- function(data, time, event, cov, 
    cov.names=paste('covariate', '.', 1:ncovs, sep=""), 
    suffix='.time', cov.times=0:ncov, common.times=TRUE, lag=0){
##  Arguments: same as the R version
    all.cov <- unlist(cov)
    if (!is.list(cov)) cov <- list(cov)
    ncovs <- length(cov)
    nrow <- nrow(data)
    ncol <- ncol(data)
    ncov <- length(cov[[1]])
    nobs <- nrow*ncov
    if (length(unique(c(sapply(cov, length), length(cov.times)-1))) > 1)
        stop(paste(
            "all elements of cov must be of the same length and \n",
            "cov.times must have one more entry than each element of cov."))
    var.names <- names(data)
    subjects <- rownames(data)
    omit.cols <- if (!common.times) c(all.cov, cov.times) else all.cov
    keep.cols <- (1:ncol)[-omit.cols]
    nkeep <- length(keep.cols)
    if (is.numeric(event)) event <- var.names[event]
    times <- if (common.times) matrix(cov.times, nrow, ncov+1, byrow=T)
        else data[, cov.times]
    new.data <- matrix(Inf, nobs, 3 + ncovs + nkeep)
    rownames <- rep("", nobs)
    colnames(new.data) <- c('start', 'stop', paste(event, suffix, sep=""), 
        var.names[-omit.cols], cov.names)
    end.row <- 0
    for (i in 1:nrow){
        start.row <- end.row + 1
        end.row <- end.row + ncov
        start <- as.numeric(times[i, 1:ncov])
        stop <- as.numeric(times[i, 2:(ncov+1)])
        event.time <- as.numeric(ifelse (stop == data[i, time] & data[i, event] == 1, 1, 0))
        keep <- matrix(unlist(data[i, -omit.cols]), ncov, nkeep, byrow=T)
        select <- apply(matrix(!is.na(data[i, all.cov]), ncol=ncovs), 1, all)
        rows <- start.row:end.row
        cov.mat <- xlag(matrix(unlist(data[i, all.cov]), nrow=length(rows)), lag)
        new.data[rows[select], ] <- 
            cbind(start, stop, event.time, keep, cov.mat)[select,]
        rownames[rows] <- paste(subjects[i], '.', seq(along=rows), sep="")
        }
    row.names(new.data) <- rownames
    as.data.frame(new.data[new.data[, 1] != Inf & 
        apply(as.matrix(!is.na(new.data[, cov.names])), 1, all), ])
    }